Publishing R5 content (#72)
author	Alexey Suhov <asuhov@users.noreply.github.com>
Mon, 21 Jan 2019 18:31:31 +0000 (21:31 +0300)
committer	openvino-pushbot <44090433+openvino-pushbot@users.noreply.github.com>
Mon, 21 Jan 2019 18:31:31 +0000 (21:31 +0300)
* Publishing R5 content

* Updated ade revision

* Updated README

* Added the ability to build the CPU plugin with the Intel MKL package (see the build sketch below)
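
A minimal sketch of how such a build might be configured (hypothetical: the GEMM and MKLROOT cache variables and the MKL install path are assumptions based on R5-era build scripts, not confirmed by this commit):

    # Configure the Inference Engine CPU (MKL-DNN) plugin against a standalone Intel MKL package.
    cd inference-engine && mkdir build && cd build
    cmake -DGEMM=MKL -DMKLROOT=/opt/intel/mkl ..   # point MKLROOT at your MKL installation
    make -j8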

1616 files changed:
inference-engine/CMakeLists.txt
inference-engine/README.md
inference-engine/cmake/FindlibGNA.cmake [new file with mode: 0644]
inference-engine/cmake/arm.toolchain.cmake [new file with mode: 0644]
inference-engine/cmake/check_features.cmake
inference-engine/cmake/config.cmake.in
inference-engine/cmake/debug.cmake
inference-engine/cmake/dependencies.cmake
inference-engine/cmake/download_and_extract.cmake
inference-engine/cmake/features.cmake
inference-engine/cmake/ie_parallel.cmake [new file with mode: 0644]
inference-engine/cmake/omp.cmake [deleted file]
inference-engine/cmake/options.cmake
inference-engine/cmake/os_flags.cmake
inference-engine/cmake/sanitizer.cmake
inference-engine/cmake/sdl.cmake
inference-engine/cmake/share/InferenceEngineConfig-version.cmake.in [moved from inference-engine/cmake/share/InferenceEngineConfig-version.cmake with 92% similarity]
inference-engine/cmake/share/InferenceEngineConfig.cmake.in [moved from inference-engine/cmake/share/InferenceEngineConfig.cmake with 74% similarity]
inference-engine/ie_bridges/python/CMakeLists.txt
inference-engine/ie_bridges/python/README.md
inference-engine/ie_bridges/python/cmake/CopyIeLibs.cmake [deleted file]
inference-engine/ie_bridges/python/cmake/FindCython.cmake
inference-engine/ie_bridges/python/cmake/ReplicatePythonSourceTree.cmake [deleted file]
inference-engine/ie_bridges/python/cmake/UseCython.cmake
inference-engine/ie_bridges/python/docs/api_overview.md
inference-engine/ie_bridges/python/inference_engine/CMakeLists.txt [deleted file]
inference-engine/ie_bridges/python/inference_engine/__init__.py [deleted file]
inference-engine/ie_bridges/python/inference_engine/ie_api_impl.hpp [deleted file]
inference-engine/ie_bridges/python/sample/benchmark_app/README.md [new file with mode: 0644]
inference-engine/ie_bridges/python/sample/benchmark_app/benchmark.py [new file with mode: 0644]
inference-engine/ie_bridges/python/sample/benchmark_app/utils/benchmark_utils.py [new file with mode: 0644]
inference-engine/ie_bridges/python/sample/benchmark_app/utils/constants.py [new file with mode: 0644]
inference-engine/ie_bridges/python/sample/classification_sample.py
inference-engine/ie_bridges/python/sample/classification_sample_async.py
inference-engine/ie_bridges/python/sample/greengrass_samples/greengrass_classification_sample.py
inference-engine/ie_bridges/python/sample/greengrass_samples/greengrass_object_detection_sample_ssd.py
inference-engine/ie_bridges/python/sample/jupyter_notebooks/classification_demo/classification_demo.ipynb [new file with mode: 0644]
inference-engine/ie_bridges/python/sample/jupyter_notebooks/classification_demo/image_net_synset.txt [new file with mode: 0644]
inference-engine/ie_bridges/python/sample/segmentation_sample.py [deleted file]
inference-engine/ie_bridges/python/sample/style_transfer_sample.py
inference-engine/ie_bridges/python/setup.py
inference-engine/ie_bridges/python/src/openvino/__init__.py [moved from model-optimizer/extensions/front/kaldi/__init__.py with 100% similarity]
inference-engine/ie_bridges/python/src/openvino/inference_engine/CMakeLists.txt [new file with mode: 0644]
inference-engine/ie_bridges/python/src/openvino/inference_engine/__init__.py [new file with mode: 0644]
inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/CMakeLists.txt [new file with mode: 0644]
inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/__init__.py [new file with mode: 0644]
inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder.pxd [new file with mode: 0644]
inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder.pyx [new file with mode: 0644]
inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl.cpp [new file with mode: 0644]
inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl.hpp [new file with mode: 0644]
inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl_defs.pxd [new file with mode: 0644]
inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pxd [moved from inference-engine/ie_bridges/python/inference_engine/ie_api.pxd with 78% similarity]
inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pyx [moved from inference-engine/ie_bridges/python/inference_engine/ie_api.pyx with 75% similarity]
inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.cpp [moved from inference-engine/ie_bridges/python/inference_engine/ie_api_impl.cpp with 51% similarity]
inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.hpp [new file with mode: 0644]
inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl_defs.pxd [moved from inference-engine/ie_bridges/python/inference_engine/ie_api_impl_defs.pxd with 75% similarity]
inference-engine/include/builders/ie_argmax_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_batch_normalization_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_clamp_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_concat_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_const_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_convolution_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_crop_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_ctc_greedy_decoder_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_deconvolution_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_detection_output_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_eltwise_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_elu_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_fully_connected_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_grn_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_input_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_layer_builder.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_layer_fragment.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_memory_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_mvn_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_network_builder.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_norm_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_normalize_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_output_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_permute_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_pooling_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_power_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_prelu_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_prior_box_clustered_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_prior_box_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_proposal_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_psroi_pooling_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_region_yolo_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_relu6_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_relu_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_reorg_yolo_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_reshape_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_roi_pooling_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_scale_shift_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_sigmoid_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_simpler_nms_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_softmax_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_split_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_tanh_layer.hpp [new file with mode: 0644]
inference-engine/include/builders/ie_tile_layer.hpp [new file with mode: 0644]
inference-engine/include/cldnn/cldnn_config.hpp
inference-engine/include/cpp/ie_cnn_net_reader.h
inference-engine/include/cpp/ie_cnn_network.h
inference-engine/include/cpp/ie_executable_network.hpp
inference-engine/include/cpp/ie_infer_request.hpp
inference-engine/include/cpp/ie_memory_state.hpp
inference-engine/include/cpp/ie_plugin_cpp.hpp
inference-engine/include/details/caseless.hpp
inference-engine/include/details/ie_blob_iterator.hpp
inference-engine/include/details/ie_cnn_network_iterator.hpp
inference-engine/include/details/ie_cnn_network_tools.h
inference-engine/include/details/ie_exception.hpp
inference-engine/include/details/ie_exception_conversion.hpp
inference-engine/include/details/ie_inetwork_iterator.hpp [new file with mode: 0644]
inference-engine/include/details/ie_irelease.hpp
inference-engine/include/details/ie_no_copy.hpp
inference-engine/include/details/ie_no_release.hpp
inference-engine/include/details/ie_pre_allocator.hpp
inference-engine/include/details/ie_so_loader.h
inference-engine/include/details/ie_so_pointer.hpp
inference-engine/include/details/os/lin_shared_object_loader.h
inference-engine/include/details/os/win_shared_object_loader.h
inference-engine/include/gna/gna_config.hpp [new file with mode: 0644]
inference-engine/include/hetero/hetero_plugin_config.hpp
inference-engine/include/ie_allocator.hpp
inference-engine/include/ie_api.h
inference-engine/include/ie_blob.h
inference-engine/include/ie_builders.hpp [new file with mode: 0644]
inference-engine/include/ie_common.h
inference-engine/include/ie_context.hpp [new file with mode: 0644]
inference-engine/include/ie_data.h
inference-engine/include/ie_device.hpp
inference-engine/include/ie_error.hpp
inference-engine/include/ie_extension.h
inference-engine/include/ie_icnn_net_reader.h
inference-engine/include/ie_icnn_network.hpp
inference-engine/include/ie_icnn_network_stats.hpp
inference-engine/include/ie_iexecutable_network.hpp
inference-engine/include/ie_iextension.h
inference-engine/include/ie_ihetero_plugin.hpp
inference-engine/include/ie_iinfer_request.hpp
inference-engine/include/ie_imemory_state.hpp
inference-engine/include/ie_inetwork.hpp [new file with mode: 0644]
inference-engine/include/ie_input_info.hpp
inference-engine/include/ie_layers.h
inference-engine/include/ie_layers_property.hpp
inference-engine/include/ie_layouts.h
inference-engine/include/ie_locked_memory.hpp
inference-engine/include/ie_parallel.hpp
inference-engine/include/ie_parameter.hpp [new file with mode: 0644]
inference-engine/include/ie_plugin.hpp
inference-engine/include/ie_plugin_config.hpp
inference-engine/include/ie_plugin_dispatcher.hpp
inference-engine/include/ie_plugin_ptr.hpp
inference-engine/include/ie_precision.hpp
inference-engine/include/ie_preprocess.hpp
inference-engine/include/ie_primitive_info.hpp
inference-engine/include/ie_tensor_info.hpp
inference-engine/include/ie_unicode.hpp [new file with mode: 0644]
inference-engine/include/ie_utils.hpp
inference-engine/include/ie_version.hpp
inference-engine/include/inference_engine.hpp
inference-engine/install_dependencies.sh
inference-engine/samples/CMakeLists.txt
inference-engine/samples/benchmark_app/CMakeLists.txt
inference-engine/samples/benchmark_app/README.md
inference-engine/samples/benchmark_app/benchmark_app.h
inference-engine/samples/benchmark_app/main.cpp
inference-engine/samples/calibration_tool/CMakeLists.txt
inference-engine/samples/calibration_tool/README.md
inference-engine/samples/calibration_tool/calibrator_processors.cpp
inference-engine/samples/calibration_tool/calibrator_processors.h
inference-engine/samples/calibration_tool/data_stats.cpp
inference-engine/samples/calibration_tool/data_stats.h
inference-engine/samples/calibration_tool/main.cpp
inference-engine/samples/calibration_tool/network_serializer.h [deleted file]
inference-engine/samples/classification_sample/CMakeLists.txt
inference-engine/samples/classification_sample/README.md
inference-engine/samples/classification_sample/classification_sample.h
inference-engine/samples/classification_sample/main.cpp
inference-engine/samples/classification_sample_async/CMakeLists.txt
inference-engine/samples/classification_sample_async/README.md
inference-engine/samples/classification_sample_async/classification_sample_async.h
inference-engine/samples/classification_sample_async/main.cpp
inference-engine/samples/common/format_reader/CMakeLists.txt
inference-engine/samples/common/format_reader/MnistUbyte.cpp
inference-engine/samples/common/format_reader/MnistUbyte.h
inference-engine/samples/common/format_reader/bmp.cpp
inference-engine/samples/common/format_reader/bmp.h
inference-engine/samples/common/format_reader/format_reader.cpp
inference-engine/samples/common/format_reader/format_reader.h
inference-engine/samples/common/format_reader/format_reader_ptr.h
inference-engine/samples/common/format_reader/opencv_wraper.cpp
inference-engine/samples/common/format_reader/opencv_wraper.h
inference-engine/samples/common/format_reader/register.h
inference-engine/samples/common/os/windows/w_dirent.h
inference-engine/samples/common/samples/args_helper.hpp
inference-engine/samples/common/samples/common.hpp
inference-engine/samples/common/samples/ocv_common.hpp [new file with mode: 0644]
inference-engine/samples/common/samples/slog.hpp
inference-engine/samples/hello_autoresize_classification/CMakeLists.txt
inference-engine/samples/hello_autoresize_classification/README.md
inference-engine/samples/hello_autoresize_classification/main.cpp
inference-engine/samples/hello_classification/CMakeLists.txt
inference-engine/samples/hello_classification/main.cpp
inference-engine/samples/hello_request_classification/CMakeLists.txt
inference-engine/samples/hello_request_classification/README.md
inference-engine/samples/hello_request_classification/main.cpp
inference-engine/samples/hello_shape_infer_ssd/CMakeLists.txt
inference-engine/samples/hello_shape_infer_ssd/README.md
inference-engine/samples/hello_shape_infer_ssd/main.cpp
inference-engine/samples/hello_shape_infer_ssd/shape_infer_extension.hpp
inference-engine/samples/lenet_network_graph_builder/CMakeLists.txt [new file with mode: 0644]
inference-engine/samples/lenet_network_graph_builder/LeNet.bin [new file with mode: 0644]
inference-engine/samples/lenet_network_graph_builder/README.md [new file with mode: 0644]
inference-engine/samples/lenet_network_graph_builder/lenet_network_graph_builder.hpp [new file with mode: 0644]
inference-engine/samples/lenet_network_graph_builder/main.cpp [new file with mode: 0644]
inference-engine/samples/object_detection_sample_ssd/CMakeLists.txt
inference-engine/samples/object_detection_sample_ssd/README.md
inference-engine/samples/object_detection_sample_ssd/main.cpp
inference-engine/samples/object_detection_sample_ssd/object_detection_sample_ssd.h
inference-engine/samples/perfcheck/CMakeLists.txt [new file with mode: 0644]
inference-engine/samples/perfcheck/README.md [new file with mode: 0644]
inference-engine/samples/perfcheck/main.cpp [new file with mode: 0644]
inference-engine/samples/perfcheck/perfcheck.h [new file with mode: 0644]
inference-engine/samples/speech_sample/CMakeLists.txt [new file with mode: 0644]
inference-engine/samples/speech_sample/README.md [new file with mode: 0644]
inference-engine/samples/speech_sample/main.cpp [new file with mode: 0644]
inference-engine/samples/speech_sample/speech_sample.hpp [new file with mode: 0644]
inference-engine/samples/style_transfer_sample/CMakeLists.txt
inference-engine/samples/style_transfer_sample/README.md
inference-engine/samples/style_transfer_sample/main.cpp
inference-engine/samples/style_transfer_sample/style_transfer_sample.h
inference-engine/samples/validation_app/CMakeLists.txt
inference-engine/samples/validation_app/ClassificationProcessor.cpp
inference-engine/samples/validation_app/ClassificationProcessor.hpp
inference-engine/samples/validation_app/ObjectDetectionProcessor.cpp
inference-engine/samples/validation_app/ObjectDetectionProcessor.hpp
inference-engine/samples/validation_app/PreprocessingOptions.hpp
inference-engine/samples/validation_app/Processor.cpp
inference-engine/samples/validation_app/Processor.hpp
inference-engine/samples/validation_app/README.md
inference-engine/samples/validation_app/SSDObjectDetectionProcessor.hpp
inference-engine/samples/validation_app/VOCAnnotationParser.cpp
inference-engine/samples/validation_app/VOCAnnotationParser.hpp
inference-engine/samples/validation_app/YOLOObjectDetectionProcessor.hpp
inference-engine/samples/validation_app/classification_set_generator.cpp
inference-engine/samples/validation_app/classification_set_generator.hpp
inference-engine/samples/validation_app/console_progress.hpp
inference-engine/samples/validation_app/csv_dumper.hpp
inference-engine/samples/validation_app/image_decoder.cpp
inference-engine/samples/validation_app/image_decoder.hpp
inference-engine/samples/validation_app/main.cpp
inference-engine/samples/validation_app/pugixml/pugiconfig.hpp
inference-engine/samples/validation_app/pugixml/pugixml.cpp
inference-engine/samples/validation_app/pugixml/pugixml.hpp
inference-engine/samples/validation_app/user_exception.hpp
inference-engine/src/CMakeLists.txt
inference-engine/src/cldnn_engine/CMakeLists.txt
inference-engine/src/cldnn_engine/cldnn_custom_layer.cpp
inference-engine/src/cldnn_engine/cldnn_custom_layer.h
inference-engine/src/cldnn_engine/cldnn_engine.cpp
inference-engine/src/cldnn_engine/cldnn_engine.h
inference-engine/src/cldnn_engine/cldnn_graph.cpp
inference-engine/src/cldnn_engine/cldnn_graph.h
inference-engine/src/cldnn_engine/cldnn_infer_request.cpp
inference-engine/src/cldnn_engine/cldnn_infer_request.h
inference-engine/src/cldnn_engine/debug_options.cpp
inference-engine/src/cldnn_engine/debug_options.h
inference-engine/src/cldnn_engine/dllmain.cpp
inference-engine/src/cldnn_engine/simple_math.cpp
inference-engine/src/cldnn_engine/simple_math.h
inference-engine/src/extension/CMakeLists.txt
inference-engine/src/extension/README.md
inference-engine/src/extension/cmake/OptimizationFlags.cmake
inference-engine/src/extension/cmake/feature_defs.cmake
inference-engine/src/extension/common/defs.h
inference-engine/src/extension/common/fast_exp.h
inference-engine/src/extension/common/matrixmult.h
inference-engine/src/extension/common/opt_exp.h
inference-engine/src/extension/common/softmax.h
inference-engine/src/extension/ext_argmax.cpp
inference-engine/src/extension/ext_base.cpp
inference-engine/src/extension/ext_base.hpp
inference-engine/src/extension/ext_ctc_greedy.cpp
inference-engine/src/extension/ext_detectionoutput.cpp
inference-engine/src/extension/ext_gather.cpp [new file with mode: 0644]
inference-engine/src/extension/ext_grn.cpp
inference-engine/src/extension/ext_interp.cpp
inference-engine/src/extension/ext_list.cpp
inference-engine/src/extension/ext_list.hpp
inference-engine/src/extension/ext_mvn.cpp
inference-engine/src/extension/ext_normalize.cpp
inference-engine/src/extension/ext_pad.cpp [new file with mode: 0644]
inference-engine/src/extension/ext_powerfile.cpp
inference-engine/src/extension/ext_priorbox.cpp
inference-engine/src/extension/ext_priorbox_clustered.cpp
inference-engine/src/extension/ext_proposal.cpp
inference-engine/src/extension/ext_psroi.cpp
inference-engine/src/extension/ext_region_yolo.cpp
inference-engine/src/extension/ext_reorg_yolo.cpp
inference-engine/src/extension/ext_resample.cpp
inference-engine/src/extension/ext_simplernms.cpp
inference-engine/src/extension/ext_spatial_transformer.cpp
inference-engine/src/extension/simple_copy.cpp [new file with mode: 0644]
inference-engine/src/extension/simple_copy.h [new file with mode: 0644]
inference-engine/src/gna_plugin/CMakeLists.txt [new file with mode: 0644]
inference-engine/src/gna_plugin/dnn.cpp [new file with mode: 0644]
inference-engine/src/gna_plugin/dnn.h [new file with mode: 0644]
inference-engine/src/gna_plugin/dnn_memory.cpp [new file with mode: 0644]
inference-engine/src/gna_plugin/dnn_memory.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/dnn_traits.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/floatmath.cpp [new file with mode: 0644]
inference-engine/src/gna_plugin/floatmath.h [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_allocator.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_api_wrapper.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_device.cpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_device.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_executable_network.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_helper.cpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_infer_request.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_layer_info.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_mem_requests.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_memory.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_memory_state.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_model_serial.cpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_model_serial.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_plugin.cpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_plugin.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_plugin_config.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_plugin_internal.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_plugin_log.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_plugin_passes.cpp [new file with mode: 0644]
inference-engine/src/gna_plugin/lstm.cpp [new file with mode: 0644]
inference-engine/src/gna_plugin/lstm.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/polymorh_allocator.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/pwl.h [new file with mode: 0644]
inference-engine/src/gna_plugin/pwl_design.cpp [new file with mode: 0644]
inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/quantization/model_quantizer.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/quantization/precision_ex.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/quantization/quantization.cpp [new file with mode: 0644]
inference-engine/src/gna_plugin/quantization/quantization.h [new file with mode: 0644]
inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/util.cpp [new file with mode: 0644]
inference-engine/src/gna_plugin/util.h [new file with mode: 0644]
inference-engine/src/hetero_plugin/CMakeLists.txt
inference-engine/src/hetero_plugin/fallback_policy.cpp
inference-engine/src/hetero_plugin/fallback_policy.h
inference-engine/src/hetero_plugin/hetero_async_infer_request.cpp
inference-engine/src/hetero_plugin/hetero_async_infer_request.h
inference-engine/src/hetero_plugin/hetero_device_loader.cpp
inference-engine/src/hetero_plugin/hetero_device_loader.h
inference-engine/src/hetero_plugin/hetero_executable_network.cpp
inference-engine/src/hetero_plugin/hetero_executable_network.h
inference-engine/src/hetero_plugin/hetero_infer_request.cpp
inference-engine/src/hetero_plugin/hetero_infer_request.h
inference-engine/src/hetero_plugin/hetero_plugin.cpp
inference-engine/src/hetero_plugin/hetero_plugin.h
inference-engine/src/hetero_plugin/hetero_plugin_base.hpp
inference-engine/src/inference_engine/CMakeLists.txt
inference-engine/src/inference_engine/ade_util.cpp
inference-engine/src/inference_engine/ade_util.hpp
inference-engine/src/inference_engine/blob_factory.cpp
inference-engine/src/inference_engine/blob_factory.hpp
inference-engine/src/inference_engine/blob_transform.cpp
inference-engine/src/inference_engine/blob_transform.hpp
inference-engine/src/inference_engine/builders/ie_argmax_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_batch_normalization_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_clamp_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_concat_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_const_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_convolution_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_crop_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_ctc_greedy_decoder_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_deconvolution_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_detection_output_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_eltwise_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_elu_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_fully_connected_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_grn_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_input_layer_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_layer_builder.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_layer_fragment.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_memory_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_mvn_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_network_builder.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_norm_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_normalize_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_output_layer_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_permute_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_pooling_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_power_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_prelu_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_prior_box_clustered_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_prior_box_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_proposal_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_psroi_pooling_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_region_yolo_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_relu6_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_relu_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_reorg_yolo_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_reshape_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_roi_pooling_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_scale_shift_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_sigmoid_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_simpler_nms_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_softmax_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_split_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_tanh_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/builders/ie_tile_layer.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/cnn_network_impl.cpp
inference-engine/src/inference_engine/cnn_network_impl.hpp
inference-engine/src/inference_engine/cnn_network_int8_normalizer.cpp
inference-engine/src/inference_engine/cnn_network_int8_normalizer.hpp
inference-engine/src/inference_engine/cnn_network_stats_impl.cpp
inference-engine/src/inference_engine/cnn_network_stats_impl.hpp
inference-engine/src/inference_engine/cpp_interfaces/base/ie_executable_network_base.hpp
inference-engine/src/inference_engine/cpp_interfaces/base/ie_infer_async_request_base.hpp
inference-engine/src/inference_engine/cpp_interfaces/base/ie_memory_state_base.hpp
inference-engine/src/inference_engine/cpp_interfaces/base/ie_plugin_base.hpp
inference-engine/src/inference_engine/cpp_interfaces/exception2status.hpp
inference-engine/src/inference_engine/cpp_interfaces/ie_executor_manager.cpp
inference-engine/src/inference_engine/cpp_interfaces/ie_executor_manager.hpp
inference-engine/src/inference_engine/cpp_interfaces/ie_itask_executor.hpp
inference-engine/src/inference_engine/cpp_interfaces/ie_task.cpp
inference-engine/src/inference_engine/cpp_interfaces/ie_task.hpp
inference-engine/src/inference_engine/cpp_interfaces/ie_task_executor.cpp
inference-engine/src/inference_engine/cpp_interfaces/ie_task_executor.hpp
inference-engine/src/inference_engine/cpp_interfaces/ie_task_synchronizer.hpp
inference-engine/src/inference_engine/cpp_interfaces/ie_task_with_stages.cpp
inference-engine/src/inference_engine/cpp_interfaces/ie_task_with_stages.hpp
inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_internal.hpp
inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_thread_safe_async_only.hpp
inference-engine/src/inference_engine/cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp
inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_internal.hpp
inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_thread_safe_default.hpp
inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_async_request_thread_safe_internal.hpp
inference-engine/src/inference_engine/cpp_interfaces/impl/ie_infer_request_internal.hpp
inference-engine/src/inference_engine/cpp_interfaces/impl/ie_memory_state_internal.hpp
inference-engine/src/inference_engine/cpp_interfaces/impl/ie_plugin_internal.hpp
inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iexecutable_network_internal.hpp
inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iinfer_async_request_internal.hpp
inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iinfer_request_internal.hpp
inference-engine/src/inference_engine/cpp_interfaces/interface/ie_imemory_state_internal.hpp
inference-engine/src/inference_engine/cpp_interfaces/interface/ie_iplugin_internal.hpp
inference-engine/src/inference_engine/cpu_detector.cpp
inference-engine/src/inference_engine/cpu_detector.hpp
inference-engine/src/inference_engine/cpu_x86_sse42/blob_transform_sse42.cpp
inference-engine/src/inference_engine/cpu_x86_sse42/blob_transform_sse42.hpp
inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.cpp
inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_data_sse42.hpp
inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp [new file with mode: 0644]
inference-engine/src/inference_engine/data_stats.cpp
inference-engine/src/inference_engine/data_stats.h
inference-engine/src/inference_engine/debug.h
inference-engine/src/inference_engine/description_buffer.hpp
inference-engine/src/inference_engine/dll_main.hpp
inference-engine/src/inference_engine/file_utils.cpp
inference-engine/src/inference_engine/file_utils.h
inference-engine/src/inference_engine/graph_tools.cpp
inference-engine/src/inference_engine/graph_tools.hpp
inference-engine/src/inference_engine/graph_transformer.cpp
inference-engine/src/inference_engine/graph_transformer.h
inference-engine/src/inference_engine/ie_algorithm.hpp
inference-engine/src/inference_engine/ie_blob_common.cpp
inference-engine/src/inference_engine/ie_blob_proxy.hpp
inference-engine/src/inference_engine/ie_cnn_layer_builder.h [new file with mode: 0644]
inference-engine/src/inference_engine/ie_cnn_net_reader_impl.cpp
inference-engine/src/inference_engine/ie_cnn_net_reader_impl.h
inference-engine/src/inference_engine/ie_context.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/ie_data.cpp
inference-engine/src/inference_engine/ie_device.cpp
inference-engine/src/inference_engine/ie_format_parser.cpp [moved from inference-engine/src/inference_engine/v2_format_parser.cpp with 74% similarity]
inference-engine/src/inference_engine/ie_format_parser.h [moved from inference-engine/src/inference_engine/v2_format_parser.h with 91% similarity]
inference-engine/src/inference_engine/ie_graph_splitter.cpp
inference-engine/src/inference_engine/ie_graph_splitter.hpp
inference-engine/src/inference_engine/ie_layer_parsers.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/ie_layer_parsers.h [moved from inference-engine/src/inference_engine/v2_layer_parsers.h with 81% similarity]
inference-engine/src/inference_engine/ie_layer_validators.cpp
inference-engine/src/inference_engine/ie_layer_validators.hpp
inference-engine/src/inference_engine/ie_layers_internal.cpp
inference-engine/src/inference_engine/ie_layers_internal.hpp
inference-engine/src/inference_engine/ie_layers_prv.h [new file with mode: 0644]
inference-engine/src/inference_engine/ie_layouts.cpp
inference-engine/src/inference_engine/ie_memcpy.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/ie_memcpy.h [new file with mode: 0644]
inference-engine/src/inference_engine/ie_network.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/ie_network.hpp [new file with mode: 0644]
inference-engine/src/inference_engine/ie_preprocess_data.cpp
inference-engine/src/inference_engine/ie_preprocess_data.hpp
inference-engine/src/inference_engine/ie_preprocess_gapi.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/ie_preprocess_gapi.hpp [new file with mode: 0644]
inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.hpp [new file with mode: 0644]
inference-engine/src/inference_engine/ie_preprocess_gapi_kernels_impl.hpp [new file with mode: 0644]
inference-engine/src/inference_engine/ie_profiling.hpp
inference-engine/src/inference_engine/ie_util_internal.cpp
inference-engine/src/inference_engine/ie_util_internal.hpp
inference-engine/src/inference_engine/ie_utils.cpp
inference-engine/src/inference_engine/ie_version.cpp
inference-engine/src/inference_engine/layer_transform.hpp
inference-engine/src/inference_engine/memory_solver.cpp
inference-engine/src/inference_engine/memory_solver.hpp
inference-engine/src/inference_engine/net_pass.cpp [new file with mode: 0644]
inference-engine/src/inference_engine/net_pass.h [new file with mode: 0644]
inference-engine/src/inference_engine/network_serializer.cpp [moved from inference-engine/samples/calibration_tool/network_serializer.cpp with 87% similarity]
inference-engine/src/inference_engine/network_serializer.h [new file with mode: 0644]
inference-engine/src/inference_engine/parsers.h
inference-engine/src/inference_engine/precision_utils.cpp
inference-engine/src/inference_engine/precision_utils.h
inference-engine/src/inference_engine/range_iterator.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_argmax_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_built_in_holder.cpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_built_in_holder.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_built_in_impl.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_concat_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_conv_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_crop_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_ctc_greedy_decoder_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_deconv_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_detection_output_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_eltwise_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_equal_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_flatten_shape_infer.hpp [new file with mode: 0644]
inference-engine/src/inference_engine/shape_infer/built-in/ie_gather_shape_infer.hpp [new file with mode: 0644]
inference-engine/src/inference_engine/shape_infer/built-in/ie_gemm_shape_infer.hpp [new file with mode: 0644]
inference-engine/src/inference_engine/shape_infer/built-in/ie_inner_product_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_interp_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_pad_shape_infer.hpp [new file with mode: 0644]
inference-engine/src/inference_engine/shape_infer/built-in/ie_permute_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_pool_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_priorbox_clustered_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_priorbox_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_proposal_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_psroi_pooling_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_region_yolo_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_reorg_yolo_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_resample_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_reshape_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_roi_pooling_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_simpler_nms_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_spatial_transformer_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_split_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_tile_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/ie_upsampling_shape_infer.hpp
inference-engine/src/inference_engine/shape_infer/built-in/impl_register.hpp
inference-engine/src/inference_engine/shape_infer/ie_reshape_io_controllers.cpp
inference-engine/src/inference_engine/shape_infer/ie_reshape_io_controllers.hpp
inference-engine/src/inference_engine/shape_infer/ie_reshape_launcher.cpp
inference-engine/src/inference_engine/shape_infer/ie_reshape_launcher.hpp
inference-engine/src/inference_engine/shape_infer/ie_reshaper.cpp
inference-engine/src/inference_engine/shape_infer/ie_reshaper.hpp
inference-engine/src/inference_engine/system_alllocator.cpp
inference-engine/src/inference_engine/system_alllocator.hpp
inference-engine/src/inference_engine/v2_layer_parsers.cpp [deleted file]
inference-engine/src/inference_engine/w_dirent.h
inference-engine/src/inference_engine/w_unistd.h
inference-engine/src/inference_engine/xml_parse_utils.cpp
inference-engine/src/inference_engine/xml_parse_utils.h
inference-engine/src/mkldnn_plugin/CMakeLists.txt
inference-engine/src/mkldnn_plugin/config.cpp
inference-engine/src/mkldnn_plugin/config.h
inference-engine/src/mkldnn_plugin/mean_image.cpp
inference-engine/src/mkldnn_plugin/mean_image.h
inference-engine/src/mkldnn_plugin/mkldnn/cpu_engine.h
inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_layer.h
inference-engine/src/mkldnn_plugin/mkldnn/cpu_prim_tensor.h
inference-engine/src/mkldnn_plugin/mkldnn/desc_iterator.hpp
inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.cpp
inference-engine/src/mkldnn_plugin/mkldnn/iml_type_mapper.h
inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.h
inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.cpp
inference-engine/src/mkldnn_plugin/mkldnn/os/lin/lin_omp_manager.h
inference-engine/src/mkldnn_plugin/mkldnn/os/osx/osx_omp_manager.h [deleted file]
inference-engine/src/mkldnn_plugin/mkldnn/os/win/win_omp_manager.h [deleted file]
inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.cpp
inference-engine/src/mkldnn_plugin/mkldnn_async_infer_request.h
inference-engine/src/mkldnn_plugin/mkldnn_descriptor.cpp
inference-engine/src/mkldnn_plugin/mkldnn_descriptor.h
inference-engine/src/mkldnn_plugin/mkldnn_dims.h
inference-engine/src/mkldnn_plugin/mkldnn_edge.cpp
inference-engine/src/mkldnn_plugin/mkldnn_edge.h
inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.cpp
inference-engine/src/mkldnn_plugin/mkldnn_extension_mngr.h
inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp
inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.h
inference-engine/src/mkldnn_plugin/mkldnn_graph.cpp
inference-engine/src/mkldnn_plugin/mkldnn_graph.h
inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.h
inference-engine/src/mkldnn_plugin/mkldnn_infer_request.cpp
inference-engine/src/mkldnn_plugin/mkldnn_infer_request.h
inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp
inference-engine/src/mkldnn_plugin/mkldnn_memory.h
inference-engine/src/mkldnn_plugin/mkldnn_node.cpp
inference-engine/src/mkldnn_plugin/mkldnn_node.h
inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp
inference-engine/src/mkldnn_plugin/mkldnn_plugin.h
inference-engine/src/mkldnn_plugin/mkldnn_primitive.cpp
inference-engine/src/mkldnn_plugin/mkldnn_primitive.h
inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/mkldnn_streams.h [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_activation_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_batchnorm_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_concat_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_crop_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_deconv_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_depthwise_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_generic_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_memory_node.hpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_permute_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_power_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_reorder_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_reshape_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_rnn.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_roi_pooling_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_softmax_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_split_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_tile_node.h
inference-engine/src/mkldnn_plugin/perf_count.h
inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/utils/blob_dump.h [new file with mode: 0644]
inference-engine/tests/CMakeLists.txt
inference-engine/tests/helpers/CMakeLists.txt
inference-engine/tests/helpers/disable_tests.hpp
inference-engine/tests/helpers/ir_gen_helper.cpp [new file with mode: 0644]
inference-engine/tests/helpers/ir_gen_helper.hpp [new file with mode: 0644]
inference-engine/tests/helpers/single_layer_common.cpp [new file with mode: 0644]
inference-engine/tests/helpers/single_layer_common.hpp
inference-engine/tests/helpers/test_assertions.hpp
inference-engine/tests/helpers/test_model_path.hpp
inference-engine/tests/helpers/test_model_repo.hpp.in
inference-engine/tests/helpers/test_models_path.cpp
inference-engine/tests/helpers/tests_common.hpp
inference-engine/tests/helpers/tests_common_func.hpp
inference-engine/tests/helpers/tests_file_utils.cpp
inference-engine/tests/helpers/tests_file_utils.hpp
inference-engine/tests/helpers/tests_utils.hpp
inference-engine/tests/helpers/version_printer.cpp
inference-engine/tests/helpers/xml_father.hpp
inference-engine/tests/helpers/xml_helper.hpp
inference-engine/tests/helpers/xml_net_builder.cpp
inference-engine/tests/helpers/xml_net_builder.hpp
inference-engine/tests/mock_engine/CMakeLists.txt
inference-engine/tests/mock_engine/dllmain.cpp
inference-engine/tests/mock_engine/mock_plugin.cpp
inference-engine/tests/mock_engine/mock_plugin.hpp
inference-engine/tests/mock_engine/stub_inference_engine.xpp
inference-engine/tests/unit/CMakeLists.txt
inference-engine/tests/unit/builders/batch_normalization_layer_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/builders/builder_test.hpp [new file with mode: 0644]
inference-engine/tests/unit/builders/input_layer_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/builders/network_builder_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/cnn_network/cnn_net_reader_impl_test.cpp
inference-engine/tests/unit/cnn_network/cnn_network_impl_test.cpp
inference-engine/tests/unit/cnn_network/layout_tests.cpp
inference-engine/tests/unit/cnn_network/mean_image.cpp
inference-engine/tests/unit/cnn_network/mean_image.h
inference-engine/tests/unit/cnn_network/parser_tests_base.hpp
inference-engine/tests/unit/cnn_network/v2_format_parser_test.cpp
inference-engine/tests/unit/cnn_network/v3_format_parser_test.cpp
inference-engine/tests/unit/cnn_network/xml_father_tests.cpp
inference-engine/tests/unit/engines/gna/I8_quantisation_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/configuration_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/gna_allocator_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/gna_api_stub.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/gna_graph_aot_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/gna_hardware_precision_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/gna_matcher.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/gna_matcher.hpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/gna_memory_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/gna_mock_api.hpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/gna_proc_type_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/gna_pwl_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/gna_query_state_tests.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/i16_quantisation_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/matchers/conv_matcher.hpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/matchers/copy_matcher.hpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/matchers/diag_matcher.hpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/matchers/nnet_base_matcher.hpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/matchers/pool_matcher.hpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/matchers/precision_matcher.hpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/matchers/pwl_matcher.hpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/matchers/pwl_quantization_metrics_matcher.hpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/test_irs.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/gna/test_irs.hpp [new file with mode: 0644]
inference-engine/tests/unit/engines/mkldnn/constant_propagation_test.cpp
inference-engine/tests/unit/engines/mkldnn/convert_desc_test.cpp
inference-engine/tests/unit/engines/mkldnn/dump_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/mkldnn/dumper_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/fake_layer.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/gather_tests.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/graph_generic_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/interp_tests.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/mvn_tests.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/resample_tests.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_activation_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_batchnorm_scaleshift_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_batchnorm_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_concat_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_conv_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_crop_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_deconv_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_depthwise_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_eltwise_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_fullyconnected_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_gemm_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_input_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_leaks_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_lrn_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_permute_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_pooling_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_power_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_relu_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_reorder_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_reshape_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_roi_pooling_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_simplernms_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_softmax_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_split_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_tile_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_conv_depthwise_fusing_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_dw_conv_fusing_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_optimization_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/structure/graph_structure_test.cpp
inference-engine/tests/unit/engines/mkldnn/graph/test_graph.hpp
inference-engine/tests/unit/engines/mkldnn/mkldnn_primitive_test.cpp
inference-engine/tests/unit/engines/mkldnn/test_layers.cpp
inference-engine/tests/unit/graph_tools/graph_copy_tests.cpp
inference-engine/tests/unit/graph_tools/graph_test_base.hpp
inference-engine/tests/unit/graph_tools/graph_tools_test.cpp
inference-engine/tests/unit/inference_engine_tests/alocator_tests.cpp
inference-engine/tests/unit/inference_engine_tests/blob_proxy_test.cpp
inference-engine/tests/unit/inference_engine_tests/blob_test.cpp
inference-engine/tests/unit/inference_engine_tests/caslesseq_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cnn_network_test.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_base_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_thread_safe_default_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/async_infer_request_thread_safe_internal.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/callback_manager_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_base_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_thread_safe_async_only_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executable_network_thread_safe_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/executor_manager_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/iinference_plugin_internal_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/memory_state_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/plugin_base_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_common_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_executor_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_synchronizer_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_tests.cpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_tests_utils.hpp
inference-engine/tests/unit/inference_engine_tests/cpp_interfaces/task_with_stages_tests.cpp
inference-engine/tests/unit/inference_engine_tests/data_test.cpp
inference-engine/tests/unit/inference_engine_tests/debug_tests.cpp
inference-engine/tests/unit/inference_engine_tests/device_tests.cpp
inference-engine/tests/unit/inference_engine_tests/exception_test.cpp
inference-engine/tests/unit/inference_engine_tests/inference_engine_plugin_test.cpp
inference-engine/tests/unit/inference_engine_tests/inference_engine_test.cpp
inference-engine/tests/unit/inference_engine_tests/layer_transform_test.cpp
inference-engine/tests/unit/inference_engine_tests/layers_test.cpp
inference-engine/tests/unit/inference_engine_tests/locked_memory_test.cpp
inference-engine/tests/unit/inference_engine_tests/plugin_dispatcher_tests.cpp
inference-engine/tests/unit/inference_engine_tests/pointer_test.cpp
inference-engine/tests/unit/inference_engine_tests/pre_allocator_test.cpp
inference-engine/tests/unit/inference_engine_tests/precision_test.cpp
inference-engine/tests/unit/inference_engine_tests/preprocess_test.cpp
inference-engine/tests/unit/inference_engine_tests/range_iterator_tests.cpp
inference-engine/tests/unit/inference_engine_tests/response_buffer_test.cpp
inference-engine/tests/unit/inference_engine_tests/shared_object_loader_test.cpp
inference-engine/tests/unit/inference_engine_tests/so_pointer_tests.cpp
inference-engine/tests/unit/inference_engine_tests/tensor_desc_test.cpp
inference-engine/tests/unit/inference_engine_tests/util_test.cpp
inference-engine/tests/unit/mem_solver/mem_solver_test.cpp
inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_default.hpp
inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_internal.hpp
inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_async_infer_request_thread_safe_internal.hpp
inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_network_internal.hpp
inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_thread_safe_async_only.hpp
inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_executable_thread_safe_default.hpp
inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_infer_request_internal.hpp
inference-engine/tests/unit/mocks/cpp_interfaces/impl/mock_inference_plugin_internal.hpp
inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iasync_infer_request_internal.hpp
inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iexecutable_network_internal.hpp
inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_iinfer_request_internal.hpp
inference-engine/tests/unit/mocks/cpp_interfaces/interface/mock_imemory_state_internal.hpp
inference-engine/tests/unit/mocks/cpp_interfaces/mock_plugin_impl.hpp
inference-engine/tests/unit/mocks/cpp_interfaces/mock_task_executor.hpp
inference-engine/tests/unit/mocks/cpp_interfaces/mock_task_synchronizer.hpp
inference-engine/tests/unit/mocks/mock_allocator.hpp
inference-engine/tests/unit/mocks/mock_error_listener.hpp
inference-engine/tests/unit/mocks/mock_iasync_infer_request.hpp
inference-engine/tests/unit/mocks/mock_icnn_network.hpp
inference-engine/tests/unit/mocks/mock_iexecutable_network.hpp
inference-engine/tests/unit/mocks/mock_iformat_parser.hpp
inference-engine/tests/unit/mocks/mock_inference_engine.hpp
inference-engine/tests/unit/mocks/mock_not_empty_icnn_network.hpp
inference-engine/tests/unit/mocks/mock_plugin_dispatcher.hpp
inference-engine/tests/unit/mocks/shape_infer/mock_input_controller.hpp
inference-engine/tests/unit/mocks/shape_infer/mock_ishape_infer_impl.hpp
inference-engine/tests/unit/mocks/shape_infer/mock_output_controller.hpp
inference-engine/tests/unit/mocks/shape_infer/mock_reshaper_launcher.hpp
inference-engine/tests/unit/mocks/shape_infer/mock_shape_infer_extension.hpp
inference-engine/tests/unit/opencv_test_gapi/CMakeLists.txt [new file with mode: 0644]
inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests.cpp [moved from inference-engine/tests/unit/topology_verification_tests/v1_topology_verification_test.cpp with 70% similarity]
inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests.hpp [new file with mode: 0644]
inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests_inl.hpp [new file with mode: 0644]
inference-engine/tests/unit/opencv_test_gapi/common/gapi_tests_common.hpp [new file with mode: 0644]
inference-engine/tests/unit/opencv_test_gapi/cpu/gapi_core_tests_fluid.cpp [new file with mode: 0644]
inference-engine/tests/unit/shape_infer/built_in_holder_test.cpp
inference-engine/tests/unit/shape_infer/built_in_shape_infer_batch_test.cpp
inference-engine/tests/unit/shape_infer/built_in_shape_infer_conv_test.cpp
inference-engine/tests/unit/shape_infer/built_in_shape_infer_fake_test.cpp
inference-engine/tests/unit/shape_infer/built_in_shape_infer_general_test.cpp
inference-engine/tests/unit/shape_infer/built_in_shape_infer_general_test.hpp
inference-engine/tests/unit/shape_infer/built_in_shape_infer_pool_test.cpp
inference-engine/tests/unit/shape_infer/cpu_ext_shape_infer_general_test.cpp
inference-engine/tests/unit/shape_infer/input_controller_test.cpp
inference-engine/tests/unit/shape_infer/input_reshape_launcher_test.cpp
inference-engine/tests/unit/shape_infer/output_controller_test.cpp
inference-engine/tests/unit/shape_infer/reshape_launcher_test.cpp
inference-engine/tests/unit/shape_infer/reshaper_test.cpp
inference-engine/tests/unit/stress_tests/stress_tests.cpp
inference-engine/tests/unit/topology_verification_tests/v2_topology_verification_test.cpp
inference-engine/thirdparty/CMakeLists.txt
inference-engine/thirdparty/MKL.cmake [deleted file]
inference-engine/thirdparty/ade
inference-engine/thirdparty/clDNN/api/C/border.h
inference-engine/thirdparty/clDNN/api/CPP/border.hpp
inference-engine/thirdparty/clDNN/kernel_selector/common/common_types.h
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_base.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_base.h
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/border/border_kernel_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/roi_pooling/roi_pooling_kernel_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/tile/tile_kernel_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/border_gpu_ref.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/roi_pooling_ref.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/tile_ref.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp
inference-engine/thirdparty/clDNN/src/border.cpp
inference-engine/thirdparty/clDNN/src/gpu/border_gpu.cpp
inference-engine/thirdparty/clDNN/src/memory_pool.cpp
inference-engine/thirdparty/clDNN/src/program.cpp
inference-engine/thirdparty/clDNN/src/roi_pooling.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/border_gpu_test.cpp
inference-engine/thirdparty/fluid/README.md [new file with mode: 0644]
inference-engine/thirdparty/fluid/check.sh [new file with mode: 0644]
inference-engine/thirdparty/fluid/checksum.sh [new file with mode: 0644]
inference-engine/thirdparty/fluid/checksum.txt [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/CMakeLists.txt [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/cmake/DownloadADE.cmake [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/cmake/init.cmake [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/cmake/standalone.cmake [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/doc/00-root.markdown [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/doc/01-background.markdown [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/doc/10-hld-overview.md [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/doc/20-kernel-api.markdown [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/doc/30-implementation.markdown [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/doc/dot/kernel_hierarchy.dot [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/doc/pics/demo.jpg [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/doc/pics/gapi_scheme.png [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/doc/pics/kernel_hierarchy.png [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/core.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/core.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/gcpukernel.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/imgproc.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/core.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/gfluidbuffer.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/imgproc.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/garg.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/garray.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcall.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcommon.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcompiled.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcompoundkernel.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcomputation.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gkernel.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gmat.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gmetaarg.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gproto.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/core.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/ggpukernel.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/imgproc.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gscalar.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gtype_traits.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gtyped.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/imgproc.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/opencv_includes.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/operators.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/assert.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/convert.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/cvdefs.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/exports.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/mat.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/saturate.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/scalar.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/types.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/any.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/compiler_hints.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/optional.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/throw.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/util.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/variant.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_cpu.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/perf/gpu/gapi_imgproc_perf_tests_gpu.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/perf/internal/gapi_compiler_perf_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/perf/perf_main.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/perf/perf_precomp.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/samples/api_example.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/samples/api_ref_snippets.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/samples/kernel_api_snippets.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/README.md [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gapi_priv.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gapi_priv.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/garray.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gbackend.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gbackend_priv.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gcall.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gcall_priv.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gcomputation.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gcomputation_priv.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gkernel.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gmat.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode_priv.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gproto.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gproto_priv.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/gscalar.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/kernels_core.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/kernels_imgproc.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/api/operators.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/README.md [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gbackend.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gcompoundbackend.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gcompoundkernel.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpubackend.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpubackend.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpucore.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpucore.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpuimgproc.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpuimgproc.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpukernel.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbackend.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbackend.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbuffer.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbuffer_priv.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidcore.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidutils.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpubackend.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpubackend.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpucore.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpucore.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpuimgproc.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpuimgproc.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpukernel.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/README.md [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiled.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiled_priv.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiler.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiler.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gislandmodel.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gislandmodel.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodel.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodel.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodelbuilder.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodelbuilder.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gobjref.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/dump_dot.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/exec.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/helpers.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/helpers.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/islands.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/kernels.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/meta.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/passes.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/compiler/transactions.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/executor/gexecutor.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/executor/gexecutor.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/logger.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/src/precomp.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_compoundkernel_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests_inl.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests_inl.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests_inl.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_tests_common.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_imgproc_tests_cpu.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_imgproc_tests_fluid.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_operators_tests_cpu.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_operators_tests_fluid.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_array_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_basic_hetero_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_desc_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_resize_test.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_roi_test.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test_kernels.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test_kernels.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gcompiled_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gcomputation_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_kernel_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_mock_kernels.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_sample_pipelines.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_scalar_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_smoke_test.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_typed_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gapi_util_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_imgproc_tests_gpu.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_operators_tests_gpu.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_backend_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_executor_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_garg_test.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_gmetaarg_test.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_gmodel_builder_test.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_island_fusion_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_island_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_recompilation_test.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_resolve_kernel_test.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_vectorref_test.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_transactions_test.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/own/gapi_types_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/own/mat_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/own/scalar_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/test_main.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/test_precomp.hpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/util/any_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/util/optional_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/modules/gapi/test/util/variant_tests.cpp [new file with mode: 0644]
inference-engine/thirdparty/fluid/revision.txt [new file with mode: 0644]
inference-engine/thirdparty/fluid/update.sh [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/CMakeLists.txt
inference-engine/thirdparty/mkl-dnn/README.md
inference-engine/thirdparty/mkl-dnn/cmake/MKL.cmake
inference-engine/thirdparty/mkl-dnn/cmake/OpenMP.cmake
inference-engine/thirdparty/mkl-dnn/cmake/TBB.cmake
inference-engine/thirdparty/mkl-dnn/cmake/Threading.cmake [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/cmake/lnx/TBBConfig.cmake
inference-engine/thirdparty/mkl-dnn/cmake/mac/TBBConfig.cmake
inference-engine/thirdparty/mkl-dnn/cmake/options.cmake
inference-engine/thirdparty/mkl-dnn/cmake/platform.cmake
inference-engine/thirdparty/mkl-dnn/cmake/profiling.cmake
inference-engine/thirdparty/mkl-dnn/cmake/utils.cmake
inference-engine/thirdparty/mkl-dnn/cmake/win/TBBConfig.cmake
inference-engine/thirdparty/mkl-dnn/doc/design/understanding_memory_formats.md
inference-engine/thirdparty/mkl-dnn/doc/mainpage.md
inference-engine/thirdparty/mkl-dnn/examples/CMakeLists.txt
inference-engine/thirdparty/mkl-dnn/examples/simple_net.c
inference-engine/thirdparty/mkl-dnn/examples/simple_rnn_training.cpp
inference-engine/thirdparty/mkl-dnn/examples/simple_training_net.c
inference-engine/thirdparty/mkl-dnn/include/mkldnn.h
inference-engine/thirdparty/mkl-dnn/include/mkldnn.hpp
inference-engine/thirdparty/mkl-dnn/include/mkldnn_types.h
inference-engine/thirdparty/mkl-dnn/scripts/prepare_mkl.bat
inference-engine/thirdparty/mkl-dnn/scripts/prepare_mkl.sh
inference-engine/thirdparty/mkl-dnn/src/CMakeLists.txt
inference-engine/thirdparty/mkl-dnn/src/common/batch_normalization.cpp
inference-engine/thirdparty/mkl-dnn/src/common/c_types_map.hpp
inference-engine/thirdparty/mkl-dnn/src/common/convolution.cpp
inference-engine/thirdparty/mkl-dnn/src/common/convolution_pd.hpp
inference-engine/thirdparty/mkl-dnn/src/common/depthwise_pd.hpp
inference-engine/thirdparty/mkl-dnn/src/common/eltwise.cpp
inference-engine/thirdparty/mkl-dnn/src/common/format_traits.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/common/lrn.cpp
inference-engine/thirdparty/mkl-dnn/src/common/math_utils.hpp
inference-engine/thirdparty/mkl-dnn/src/common/memory_desc_wrapper.cpp
inference-engine/thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp
inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_debug.cpp
inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_thread.hpp
inference-engine/thirdparty/mkl-dnn/src/common/mkldnn_traits.hpp
inference-engine/thirdparty/mkl-dnn/src/common/primitive.cpp
inference-engine/thirdparty/mkl-dnn/src/common/primitive_attr.cpp
inference-engine/thirdparty/mkl-dnn/src/common/primitive_attr.hpp
inference-engine/thirdparty/mkl-dnn/src/common/primitive_desc.cpp
inference-engine/thirdparty/mkl-dnn/src/common/reorder.cpp
inference-engine/thirdparty/mkl-dnn/src/common/rnn.cpp
inference-engine/thirdparty/mkl-dnn/src/common/rnn_pd.hpp
inference-engine/thirdparty/mkl-dnn/src/common/shuffle.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/common/shuffle_pd.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/common/type_helpers.hpp
inference-engine/thirdparty/mkl-dnn/src/common/utils.hpp
inference-engine/thirdparty/mkl-dnn/src/common/verbose.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_concat.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_convolution_pd.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_engine.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_inner_product_pd.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_memory.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reducer.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reorder.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_reorder_pd.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_shuffle_pd.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_sum.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm_utils.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/gemm_utils.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx512_common_gemm_f32.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx512_common_gemm_f32.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx_gemm_f32.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/jit_avx_gemm_f32.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm/ref_gemm.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution_utils.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_convolution_utils.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_inner_product.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_u8s8s32x_inner_product.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_convolution.cpp [moved from inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_u8s8s32x_convolution.cpp with 80% similarity]
inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_x8s8s32x_convolution.hpp [moved from inference-engine/thirdparty/mkl-dnn/src/cpu/gemm_u8s8s32x_convolution.hpp with 90% similarity]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_conv_kernel_f32.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_conv_kernel_f32.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_convolution.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_1x1_convolution.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_conv_kernel_f32.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_conv_kernel_f32.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_convolution.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx2_convolution.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_conv_kernel.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_convolution.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_1x1_convolution.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_kernel.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_kernel.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_conv_winograd_kernel_f32.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_convolution_winograd.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_common_lrn.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_2x3.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_2x3.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3_kernel.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_fp32_wino_conv_4x3_kernel.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_convolution.cpp [deleted file]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_deconvolution.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_deconvolution.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_wino_convolution.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_wino_convolution.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_conv_kernel.cpp [moved from inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_1x1_conv_kernel.cpp with 86% similarity]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_conv_kernel.hpp [moved from inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_1x1_conv_kernel.hpp with 87% similarity]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.cpp [moved from inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_1x1_convolution.cpp with 70% similarity]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_1x1_convolution.hpp [moved from inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_1x1_convolution.hpp with 58% similarity]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_conv_kernel.cpp [moved from inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_conv_kernel.cpp with 53% similarity]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_conv_kernel.hpp [moved from inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_conv_kernel.hpp with 82% similarity]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_convolution.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_convolution.hpp [moved from inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_convolution.hpp with 60% similarity]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_generator.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_primitive_conf.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_conv_kernel_f32.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_conv_kernel_f32.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_convolution.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_1x1_convolution.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_conv_kernel_f32.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_conv_kernel_f32.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_sse42_convolution.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_1x1_conv_utils.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_batch_normalization.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_batch_normalization.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_depthwise.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_conv_kernel_f32.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_conv_kernel_f32.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_convolution.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_dw_convolution.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_eltwise.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_i8i8_pooling.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_i8i8_pooling.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_lrn_kernel_f32.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_lrn_kernel_f32.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_pool_kernel_f32.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_pool_kernel_f32.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_pooling.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_reorder.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_reorder_utils.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_convolution.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_convolution.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_conv_kernel.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_conv_kernel.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_convolution.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_convolution.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_convolution.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_convolution.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/nchw_pooling.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ncsp_batch_normalization.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ncsp_batch_normalization.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/nspc_batch_normalization.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/nspc_batch_normalization.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_convolution.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_convolution.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_deconvolution.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_depthwise.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_eltwise.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_inner_product.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_lrn.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_lrn.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_pooling.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_pooling.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_rnn.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_rnn.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_shuffle.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/ref_shuffle.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/src/cpu/simple_concat.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/simple_concat.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/simple_q10n.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/simple_reorder.hpp
inference-engine/thirdparty/mkl-dnn/src/cpu/wino_reorder.hpp
inference-engine/thirdparty/mkl-dnn/tests/CMakeLists.txt
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/CMakeLists.txt
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/README.md
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/benchdnn.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/bnorm/bench_bnorm.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/bnorm/bnorm.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/common.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/common.hpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/bench_conv.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/bench_deconv.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/cfg.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv_aux.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/conv_common.hpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/deconv.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/conv/ref_conv.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/dnn_types.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/dnn_types.hpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/bnorm/test_bnorm_regressions
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/bnorm/test_bnorm_regressions_large [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_1d [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_1d_wavenet [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_all
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_all_topo
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_dilated
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_regression_small_spatial
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_tails
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/ip/test_ip_all
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_gru [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_gru_small [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_inference [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_small [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_training [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/test_rnn_inference [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/test_rnn_small [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/test_rnn_training [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/shuffle/test_shuffle [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/shuffle/test_shuffle_axis [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_all
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_attrs
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_dilated
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_regression_general
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_conv_tails
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/test_deconv_all
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/ip/bench_ip.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/ip/ip.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/mkldnn_debug.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/reorder/bench_reorder.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/reorder/reorder.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/bench_rnn.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/cfg.cpp [moved from inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/input_rnn.hpp with 55% similarity]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/ref_rnn.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn.hpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/rnn/rnn_aux.cpp
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/bench_shuffle.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/perf_report.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/ref_shuffle.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/shuffle.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/shuffle.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/shuffle_aux.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/gtests/CMakeLists.txt
inference-engine/thirdparty/mkl-dnn/tests/gtests/convolution_common.h
inference-engine/thirdparty/mkl-dnn/tests/gtests/gtest/src/gtest-death-test.cc
inference-engine/thirdparty/mkl-dnn/tests/gtests/gtest/src/gtest-port.cc
inference-engine/thirdparty/mkl-dnn/tests/gtests/gtest/src/gtest-test-part.cc
inference-engine/thirdparty/mkl-dnn/tests/gtests/gtest/src/gtest.cc
inference-engine/thirdparty/mkl-dnn/tests/gtests/in/convolution_simple_small.h
inference-engine/thirdparty/mkl-dnn/tests/gtests/in/gemm_in.h [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/gtests/mkldnn_test_common.hpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_common.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_common.hpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_u8s8fp.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_forward_u8s8s32.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm.cpp [deleted file]
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_common.hpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_f32.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_s8s8s32.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_s8u8s32.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_iface_pd_iter.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_lrn_backward.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_lrn_forward.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_memory.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_mkldnn_threading.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_pooling_backward.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_pooling_forward.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_shuffle.cpp [new file with mode: 0644]
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_softmax_forward.cpp
inference-engine/thirdparty/mkl-dnn/tests/gtests/test_sum.cpp
inference-engine/thirdparty/mkldnn.cmake
inference-engine/thirdparty/ocv/opencv_hal_sse.hpp [new file with mode: 0644]
model-optimizer/ModelOptimizer [deleted file]
model-optimizer/extensions/back/ConvolutionReshaper.py [new file with mode: 0644]
model-optimizer/extensions/back/PermuteForReshape.py [new file with mode: 0644]
model-optimizer/extensions/back/TileReshaper.py [new file with mode: 0644]
model-optimizer/extensions/back/disable_unsupported_ND_operations.py
model-optimizer/extensions/back/kaldi_remove_memory_output.py
model-optimizer/extensions/back/remove_last_softmax_pattern.py
model-optimizer/extensions/front/Pack.py [moved from model-optimizer/extensions/front/tf/Pack.py with 89% similarity]
model-optimizer/extensions/front/caffe/axpy.py [new file with mode: 0644]
model-optimizer/extensions/front/caffe/bn.py [new file with mode: 0644]
model-optimizer/extensions/front/caffe/detection_output.py
model-optimizer/extensions/front/caffe/flatten_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/caffe/interp_ext.py
model-optimizer/extensions/front/caffe/pooling_ext.py
model-optimizer/extensions/front/caffe/priorbox_ext.py
model-optimizer/extensions/front/caffe/shufflechannel_ext.py [moved from model-optimizer/mo/front/kaldi/extractors/inner_product_ext.py with 69% similarity]
model-optimizer/extensions/front/caffe/softmax_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/caffe/split_to_identity.py [new file with mode: 0644]
model-optimizer/extensions/front/instance_normalization.py [moved from model-optimizer/extensions/front/onnx/instance_normalization.py with 100% similarity]
model-optimizer/extensions/front/kaldi/add_reshape_around_convolution.py [new file with mode: 0644]
model-optimizer/extensions/front/kaldi/add_reshape_around_pooling.py [new file with mode: 0644]
model-optimizer/extensions/front/kaldi/add_reshape_for_conv.py [deleted file]
model-optimizer/extensions/front/kaldi/add_reshape_for_pooling.py [deleted file]
model-optimizer/extensions/front/kaldi/eliminate_redundant_reshape.py
model-optimizer/extensions/front/kaldi/fuse_repeated_reshape.py
model-optimizer/extensions/front/kaldi/replace_lstm_node_pattern.py
model-optimizer/extensions/front/kaldi/replace_splice_node_pattern.py [new file with mode: 0644]
model-optimizer/extensions/front/mxnet/RNN_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/mxnet/block_grad_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/mxnet/conv_ext.py
model-optimizer/extensions/front/mxnet/copy_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/mxnet/dropout_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/mxnet/flatten_ext.py [moved from model-optimizer/mo/front/kaldi/extractors/clamp_ext.py with 74% similarity]
model-optimizer/extensions/front/mxnet/instance_norm_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/mxnet/max_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/mxnet/maximum_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/mxnet/minimum_ext.py [moved from model-optimizer/mo/front/kaldi/extractors/activation_ext.py with 76% similarity]
model-optimizer/extensions/front/mxnet/pooling_ext.py
model-optimizer/extensions/front/mxnet/reshape_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/mxnet/rnn_param_concat.py [new file with mode: 0644]
model-optimizer/extensions/front/mxnet/slice_channel_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/mxnet/softmax.py
model-optimizer/extensions/front/mxnet/softmax_activation_ext.py
model-optimizer/extensions/front/mxnet/softmax_ext.py
model-optimizer/extensions/front/mxnet/ssd_pattern_flatten_softmax_activation.py
model-optimizer/extensions/front/mxnet/ssd_pattern_remove_flatten.py
model-optimizer/extensions/front/mxnet/ssd_pattern_remove_reshape.py
model-optimizer/extensions/front/mxnet/ssd_pattern_remove_transpose.py
model-optimizer/extensions/front/mxnet/stack_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/mxnet/swapaxes_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/mxnet/up_sampling_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/mxnet/zeros_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/onnx/conv_ext.py
model-optimizer/extensions/front/onnx/flatten_ext.py
model-optimizer/extensions/front/onnx/gather_ext.py
model-optimizer/extensions/front/onnx/lstm_ext.py
model-optimizer/extensions/front/onnx/matmul_ext.py
model-optimizer/extensions/front/onnx/neg_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/onnx/pad_ext.py
model-optimizer/extensions/front/onnx/pooling_ext.py
model-optimizer/extensions/front/onnx/reduce_sum_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/onnx/softmax_ext.py [moved from model-optimizer/mo/front/kaldi/extractors/scale_shift.py with 69% similarity]
model-optimizer/extensions/front/onnx/split_ext.py [moved from model-optimizer/mo/front/kaldi/extractors/split.py with 62% similarity]
model-optimizer/extensions/front/tf/BlockLSTM.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/BlockLSTM_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/CTCGreedyDecoder.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/CTCGreedyDecoder_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/ObjectDetectionAPI.py
model-optimizer/extensions/front/tf/RetinaNetFilteredDetectionsReplacement.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/SSDToolboxDetectionOutput.py
model-optimizer/extensions/front/tf/assign_elimination.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/basic_lstm_cell.py
model-optimizer/extensions/front/tf/deconv_ext.py
model-optimizer/extensions/front/tf/fake_const.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/fifo_queue_v2_ext.py
model-optimizer/extensions/front/tf/fifo_replacer.py
model-optimizer/extensions/front/tf/gather_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/max_ext.py [moved from model-optimizer/mo/front/kaldi/extractors/memory_ext.py with 71% similarity]
model-optimizer/extensions/front/tf/mvn_unrolled.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/nearest_neighbor_upsampling.py
model-optimizer/extensions/front/tf/next_iteration_ext.py
model-optimizer/extensions/front/tf/pad_ext.py
model-optimizer/extensions/front/tf/pooling_ext.py
model-optimizer/extensions/front/tf/rank_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/retinanet.json [new file with mode: 0644]
model-optimizer/extensions/front/tf/reverse_sequence.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/reverse_v2.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/softmax_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/sqrt_ext.py [moved from model-optimizer/mo/front/kaldi/extractors/eltwise_ext.py with 75% similarity]
model-optimizer/extensions/front/tf/square_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/stop_gradient_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/tf/variable_ext.py [new file with mode: 0644]
model-optimizer/extensions/middle/AddIsCyclicAttribute.py [moved from model-optimizer/mo/front/mxnet/extractors/flatten.py with 64% similarity]
model-optimizer/extensions/middle/BlockLSTMtoLSTMSequence.py [new file with mode: 0644]
model-optimizer/extensions/middle/ConstSwitchResolver.py
model-optimizer/extensions/middle/ConvertGroupedStridedSlice.py
model-optimizer/extensions/middle/ConvertLayoutDependentOperations.py
model-optimizer/extensions/middle/EltwiseInputReshape.py
model-optimizer/extensions/middle/FusePermutesSequence.py
model-optimizer/extensions/middle/GemmResolver.py [new file with mode: 0644]
model-optimizer/extensions/middle/NormalizePad.py [new file with mode: 0644]
model-optimizer/extensions/middle/PadToPoolingMiddleReplacer.py [deleted file]
model-optimizer/extensions/middle/PixelLinkReshape.py
model-optimizer/extensions/middle/Reduce.py
model-optimizer/extensions/middle/ShuffleChannel.py [new file with mode: 0644]
model-optimizer/extensions/middle/ShufflenetReshape.py
model-optimizer/extensions/middle/SliceConverter.py
model-optimizer/extensions/middle/SwapAxesMiddleReplacer.py [new file with mode: 0644]
model-optimizer/extensions/middle/TF_lstm_cell_to_generic.py
model-optimizer/extensions/middle/TensorIteratorCondition.py
model-optimizer/extensions/middle/TensorIteratorConditionChecker.py
model-optimizer/extensions/middle/TensorIteratorInput.py
model-optimizer/extensions/middle/TensorIteratorMerge.py
model-optimizer/extensions/middle/UselessMerge.py
model-optimizer/extensions/middle/UselessStridedSlice.py
model-optimizer/extensions/middle/decompose_bi_lstm.py [new file with mode: 0644]
model-optimizer/extensions/middle/lstm_sequence_normalize.py
model-optimizer/extensions/middle/lstm_sequence_tensor_iterator.py
model-optimizer/extensions/middle/lstm_tensor_iterator_to_lstm_sequence.py
model-optimizer/extensions/middle/mxnet_lstm_sequence_normalize.py [new file with mode: 0644]
model-optimizer/extensions/middle/permute_tensor_iterator.py
model-optimizer/extensions/middle/reverse_tensor_iterator.py [new file with mode: 0644]
model-optimizer/extensions/ops/BlockLSTM.py [new file with mode: 0644]
model-optimizer/extensions/ops/DetectionOutput.py
model-optimizer/extensions/ops/TensorArrayGather.py
model-optimizer/extensions/ops/TensorArrayScatter.py
model-optimizer/extensions/ops/TensorArrayWrite.py
model-optimizer/extensions/ops/axpy.py [new file with mode: 0644]
model-optimizer/extensions/ops/bn.py [new file with mode: 0644]
model-optimizer/extensions/ops/constant_fill.py
model-optimizer/extensions/ops/gather.py [new file with mode: 0644]
model-optimizer/extensions/ops/identity.py [new file with mode: 0644]
model-optimizer/extensions/ops/interp.py
model-optimizer/extensions/ops/lstm_sequence.py
model-optimizer/extensions/ops/merge.py
model-optimizer/extensions/ops/pack.py [moved from model-optimizer/mo/front/common/partial_infer/up_sampling.py with 57% similarity]
model-optimizer/extensions/ops/priorbox.py
model-optimizer/extensions/ops/rank.py [new file with mode: 0644]
model-optimizer/extensions/ops/reverse_sequence.py [moved from model-optimizer/extensions/ops/take.py with 53% similarity]
model-optimizer/extensions/ops/shufflechannel.py [moved from model-optimizer/mo/front/tf/extractors/softmax.py with 60% similarity]
model-optimizer/extensions/ops/splice.py [moved from model-optimizer/mo/front/caffe/extractors/softmax.py with 66% similarity]
model-optimizer/extensions/ops/splitv.py
model-optimizer/extensions/ops/stop_gradient.py [new file with mode: 0644]
model-optimizer/extensions/ops/swapaxes.py [new file with mode: 0644]
model-optimizer/install_prerequisites/install_prerequisites.sh
model-optimizer/mo/front/caffe/extractor.py
model-optimizer/mo/front/caffe/extractors/elu.py
model-optimizer/mo/front/caffe/extractors/flatten.py [deleted file]
model-optimizer/mo/front/caffe/extractors/reshape.py
model-optimizer/mo/front/caffe/extractors/scale.py
model-optimizer/mo/front/caffe/extractors/slice.py
model-optimizer/mo/front/caffe/loader.py
model-optimizer/mo/front/caffe/proto/caffe_pb2.py
model-optimizer/mo/front/caffe/proto/generate_caffe_pb2.py
model-optimizer/mo/front/caffe/proto/mo_caffe.proto
model-optimizer/mo/front/common/layout.py
model-optimizer/mo/front/common/partial_infer/elemental.py
model-optimizer/mo/front/common/partial_infer/expand_dims.py
model-optimizer/mo/front/common/partial_infer/flatten.py [deleted file]
model-optimizer/mo/front/common/partial_infer/inner_product.py
model-optimizer/mo/front/common/partial_infer/matmul.py
model-optimizer/mo/front/common/partial_infer/reduce.py
model-optimizer/mo/front/common/partial_infer/reshape.py
model-optimizer/mo/front/common/partial_infer/slice.py
model-optimizer/mo/front/common/partial_infer/split.py
model-optimizer/mo/front/common/partial_infer/squeeze.py
model-optimizer/mo/front/common/partial_infer/utils.py
model-optimizer/mo/front/common/register_custom_ops.py
model-optimizer/mo/front/extractor.py
model-optimizer/mo/front/kaldi/extractor.py
model-optimizer/mo/front/kaldi/extractors/add_shift_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/extractors/affine_component_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/extractors/affine_component_preconditioned_online_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/extractors/affine_transform_ext.py
model-optimizer/mo/front/kaldi/extractors/concat_ext.py
model-optimizer/mo/front/kaldi/extractors/convolution_ext.py [deleted file]
model-optimizer/mo/front/kaldi/extractors/convolutional_1d_component_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/extractors/convolutional_component_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/extractors/copy_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/extractors/fixed_affine_component_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/extractors/lstm_projected_streams_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/extractors/max_pooling_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/extractors/normalize_component_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/extractors/pooling_ext.py [deleted file]
model-optimizer/mo/front/kaldi/extractors/rectified_linear_component_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/extractors/rescale_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/extractors/reshape.py [deleted file]
model-optimizer/mo/front/kaldi/extractors/sigmoid_ext.py
model-optimizer/mo/front/kaldi/extractors/slice_ext.py
model-optimizer/mo/front/kaldi/extractors/softmax_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/extractors/splice_component_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/extractors/tanh_component_ext.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/loader.py [deleted file]
model-optimizer/mo/front/kaldi/loader/__init__.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/loader/loader.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/loader/utils.py [new file with mode: 0644]
model-optimizer/mo/front/kaldi/register_custom_ops.py
model-optimizer/mo/front/kaldi/utils.py
model-optimizer/mo/front/mxnet/extractor.py
model-optimizer/mo/front/mxnet/extractors/reshape.py [deleted file]
model-optimizer/mo/front/mxnet/extractors/up_sampling.py [deleted file]
model-optimizer/mo/front/mxnet/extractors/utils.py
model-optimizer/mo/front/onnx/extractor.py
model-optimizer/mo/front/tf/change_placeholder_type.py
model-optimizer/mo/front/tf/extractor.py
model-optimizer/mo/front/tf/extractors/utils.py
model-optimizer/mo/front/tf/graph_utils.py
model-optimizer/mo/front/tf/loader.py
model-optimizer/mo/graph/graph.py
model-optimizer/mo/middle/passes/conv.py
model-optimizer/mo/middle/passes/eliminate.py
model-optimizer/mo/middle/passes/fusing/decomposition.py
model-optimizer/mo/middle/passes/infer.py
model-optimizer/mo/middle/passes/pool.py
model-optimizer/mo/middle/passes/shape.py
model-optimizer/mo/ops/convolution.py
model-optimizer/mo/ops/deconvolution.py
model-optimizer/mo/ops/eltwise.py
model-optimizer/mo/ops/expand_dims.py
model-optimizer/mo/ops/flatten.py [new file with mode: 0644]
model-optimizer/mo/ops/flatten_onnx.py
model-optimizer/mo/ops/lin_op.py
model-optimizer/mo/ops/memory.py
model-optimizer/mo/ops/op.py
model-optimizer/mo/ops/pad.py
model-optimizer/mo/ops/pooling.py
model-optimizer/mo/ops/power.py
model-optimizer/mo/ops/reduce.py
model-optimizer/mo/ops/reshape.py
model-optimizer/mo/ops/slice.py
model-optimizer/mo/ops/softmax.py
model-optimizer/mo/ops/split.py
model-optimizer/mo/ops/squeeze.py
model-optimizer/mo/ops/unsqueeze.py
model-optimizer/mo/pipeline/caffe.py
model-optimizer/mo/pipeline/kaldi.py
model-optimizer/mo/pipeline/mx.py
model-optimizer/mo/pipeline/onnx.py
model-optimizer/mo/pipeline/tf.py
model-optimizer/mo/utils/class_registration.py
model-optimizer/mo/utils/cli_parser.py
model-optimizer/mo/utils/graph.py
model-optimizer/mo/utils/guess_framework.py
model-optimizer/mo/utils/simple_proto_parser.py
model-optimizer/mo/utils/summarize_graph.py
model-optimizer/mo/utils/tensorboard.py
model-optimizer/mo/utils/utils.py
model-optimizer/mo/utils/versions_checker.py
model-optimizer/requirements.txt
model-optimizer/requirements_caffe.txt
model-optimizer/requirements_mxnet.txt
model-optimizer/version.txt

index 2f561d9..46f821d 100644 (file)
@@ -1,6 +1,7 @@
 # Copyright (C) 2018 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
+
 cmake_minimum_required (VERSION 3.3)
 
 project(InferenceEngine)
@@ -18,7 +19,9 @@ endif()
 
 option (OS_FOLDER "create OS dedicated folder in output" OFF)
 
-if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
+    set (ARCH_FOLDER armv7l)
+elseif("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
     set (ARCH_FOLDER intel64)
 else()
     set (ARCH_FOLDER  ia32)
@@ -46,7 +49,6 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "")
     debug_message(STATUS "CMAKE_BUILD_TYPE not defined, 'Release' will be used")
     set(CMAKE_BUILD_TYPE "Release")
 endif()
-
 message(STATUS "BUILD_CONFIGURATION: ${CMAKE_BUILD_TYPE}")
 
 if(COVERAGE)
@@ -55,17 +57,38 @@ endif()
 
 if (UNIX)
     SET(LIB_DL ${CMAKE_DL_LIBS})
-else()
 endif()
 
 set (OUTPUT_ROOT ${IE_MAIN_SOURCE_DIR})
 
-if(NOT(UNIX))
-       if (WIN32)
-               #set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
-               #set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
-       endif()
+include(os_flags)
 
+#resolving dependencies for the project
+include (dependencies)
+
+set(CMAKE_DEBUG_POSTFIX ${IE_DEBUG_POSTFIX})
+set(CMAKE_RELEASE_POSTFIX ${IE_RELEASE_POSTFIX})
+
+if (WIN32)
+    # Support CMake multi-configuration builds for Visual Studio
+    set(IE_BUILD_POSTFIX $<$<CONFIG:Debug>:${IE_DEBUG_POSTFIX}>$<$<CONFIG:Release>:${IE_RELEASE_POSTFIX}>)
+    set(IE_BUILD_CONFIGURATION $<CONFIG>)
+else ()
+    if (${CMAKE_BUILD_TYPE} STREQUAL "Debug" )
+        set(IE_BUILD_POSTFIX ${IE_DEBUG_POSTFIX})
+    else()
+        set(IE_BUILD_POSTFIX ${IE_RELEASE_POSTFIX})
+    endif()
+    set(IE_BUILD_CONFIGURATION ${CMAKE_BUILD_TYPE})
+endif()
+
+add_definitions(-DIE_BUILD_POSTFIX=\"${IE_BUILD_POSTFIX}\")
+
+if(NOT(UNIX))
+    if (WIN32)
+        #set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
+        #set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
+    endif()
     set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER})
     set (CMAKE_LIBRARY_PATH ${OUTPUT_ROOT}/${BIN_FOLDER})
     set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER})
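
For reference, a minimal sketch of how the postfix machinery above resolves, assuming the Windows defaults from features.cmake in this same change (`IE_DEBUG_POSTFIX_WIN` is "d"); the `example_lib` target is illustrative, not part of the diff:

```cmake
# With a multi-config generator, IE_BUILD_POSTFIX expands per configuration,
# so the core library is produced as:
#   Debug   -> inference_engined.dll
#   Release -> inference_engine.dll
add_library(example_lib SHARED example.cpp)
# Per-target equivalent of the global CMAKE_DEBUG_POSTFIX setting above:
set_target_properties(example_lib PROPERTIES DEBUG_POSTFIX "${IE_DEBUG_POSTFIX}")
```
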
@@ -75,20 +98,15 @@ if(NOT(UNIX))
     set (LIBRARY_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER})
     set (LIBRARY_OUTPUT_PATH ${LIBRARY_OUTPUT_DIRECTORY}) # compatibility issue: linux uses LIBRARY_OUTPUT_PATH, windows uses LIBRARY_OUTPUT_DIRECTORY
 else ()
-    set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/lib)
-    set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/lib)
-    set (CMAKE_COMPILE_PDB_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE})
-    set (CMAKE_PDB_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE})
-    set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE})
-    set (LIBRARY_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/lib)
+    set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER}/${IE_BUILD_CONFIGURATION}/lib)
+    set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER}/${IE_BUILD_CONFIGURATION}/lib)
+    set (CMAKE_COMPILE_PDB_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER}/${IE_BUILD_CONFIGURATION})
+    set (CMAKE_PDB_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER}/${IE_BUILD_CONFIGURATION})
+    set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER}/${IE_BUILD_CONFIGURATION})
+    set (LIBRARY_OUTPUT_DIRECTORY ${OUTPUT_ROOT}/${BIN_FOLDER}/${IE_BUILD_CONFIGURATION}/lib)
     set (LIBRARY_OUTPUT_PATH ${LIBRARY_OUTPUT_DIRECTORY}/lib)
 endif()
 
-include(os_flags)
-
-#resolving rependencies for the project
-include (dependencies)
-
 if (APPLE)
     set(CMAKE_MACOSX_RPATH 1)
 endif(APPLE)
@@ -108,9 +126,8 @@ message (STATUS "IE_MAIN_SOURCE_DIR .................... " ${IE_MAIN_SOURCE_DIR}
 message (STATUS "CMAKE_GENERATOR ....................... " ${CMAKE_GENERATOR})
 message (STATUS "CMAKE_C_COMPILER_ID ................... " ${CMAKE_C_COMPILER_ID})
 
-if("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
-    include(sdl)
-endif()
+include(sdl)
+
 set (CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 include (sanitizer)
@@ -131,6 +148,10 @@ if (ENABLE_SAMPLES_CORE)
     set(InferenceEngine_DIR "${CMAKE_BINARY_DIR}")
 
     #to be able to link
-    set (LIB_FOLDER ${IE_MAIN_SOURCE_DIR}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/lib)
+    set (LIB_FOLDER ${IE_MAIN_SOURCE_DIR}/${BIN_FOLDER}/${IE_BUILD_CONFIGURATION}/lib)
     add_subdirectory(samples)
 endif()
+
+if (ENABLE_PYTHON)
+    add_subdirectory(ie_bridges/python)
+endif()
\ No newline at end of file
index cfe61ff..c122e00 100644 (file)
@@ -8,6 +8,7 @@ The software was validated on:
 ### Software Requirements
 - [CMake\*](https://cmake.org/download/) 3.9 or higher
 - GCC\* 4.8 or higher to build the Inference Engine
+- Python 2.7 or higher for the Inference Engine Python API wrapper
 
 ### Build Steps
 1. Clone submodules:
@@ -29,6 +30,11 @@ You can use the following additional build options:
 - Internal JIT GEMM implementation is used by default.
 - To switch to the OpenBLAS\* implementation, use the `GEMM=OPENBLAS` option and the `BLAS_INCLUDE_DIRS` and `BLAS_LIBRARIES` cmake options to specify the path to the OpenBLAS headers and library. For example, use the following options on CentOS\*: `-DGEMM=OPENBLAS -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DBLAS_LIBRARIES=/usr/lib64/libopenblas.so.0`
 - To switch to the optimized MKL-ML\* GEMM implementation, use the `GEMM=MKL` and `MKLROOT` cmake options to specify the path to the unpacked MKL-ML package with `include` and `lib` folders. For example, use the following options: `-DGEMM=MKL -DMKLROOT=<path_to_MKL>`. The MKL-ML\* package can be downloaded [here](https://github.com/intel/mkl-dnn/releases/download/v0.17/mklml_lnx_2019.0.1.20180928.tgz)
+
+- OpenMP threading is used by default. To build the Inference Engine with TBB threading, set the `-DTHREADING=TBB` option.
+
+- To build the Python API wrapper, use the `-DENABLE_PYTHON=ON` option. To specify an exact Python version, use the following options: `-DPYTHON_EXECUTABLE=$(which python3.6) -DPYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython3.6m.so -DPYTHON_INCLUDE_DIR=/usr/include/python3.6`.
+
 - To switch on/off the CPU and GPU plugins, use `cmake` options `-DENABLE_MKL_DNN=ON/OFF` and `-DENABLE_CLDNN=ON/OFF`.
 
 ## Build on Windows\* Systems:
@@ -41,6 +47,7 @@ The software was validated on:
 - [CMake\*](https://cmake.org/download/) 3.9 or higher
 - [OpenBLAS\*](https://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int64.zip/download) and [mingw64\* runtime dependencies](https://sourceforge.net/projects/openblas/files/v0.2.14/mingw64_dll.zip/download).
 - [Intel® C++ Compiler](https://software.intel.com/en-us/intel-parallel-studio-xe) 18.0 to build the Inference Engine on Windows.
+- Python 3.4 or higher for the Inference Engine Python API wrapper
 
 ### Build Steps
 1. Clone submodules:
@@ -64,11 +71,26 @@ cmake -G "Visual Studio 15 2017 Win64" -T "Intel C++ Compiler 18.0" ^
     -DICCLIB="C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\compiler\lib" ..
 ```
 
+- Internal JIT GEMM implementation is used by default.
 - To switch to the OpenBLAS GEMM implementation, use the `-DGEMM=OPENBLAS` cmake option and specify the path to OpenBLAS using the `-DBLAS_INCLUDE_DIRS=<OPENBLAS_DIR>\include` and `-DBLAS_LIBRARIES=<OPENBLAS_DIR>\lib\libopenblas.dll.a` options. A prebuilt OpenBLAS\* package can be downloaded [here](https://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int64.zip/download), mingw64\* runtime dependencies [here](https://sourceforge.net/projects/openblas/files/v0.2.14/mingw64_dll.zip/download)
 - To switch to the optimized MKL-ML GEMM implementation, use the `GEMM=MKL` and `MKLROOT` cmake options to specify the path to the unpacked MKL-ML package with `include` and `lib` folders. For example, use the following options: `-DGEMM=MKL -DMKLROOT=<path_to_MKL>`. The MKL-ML\* package can be downloaded [here](https://github.com/intel/mkl-dnn/releases/download/v0.17/mklml_win_2019.0.1.20180928.zip)
 
+- OpenMP threading is used by default. To build the Inference Engine with TBB threading, set the `-DTHREADING=TBB` option.
+
+- To build the Python API wrapper, use the `-DENABLE_PYTHON=ON` option. To specify an exact Python version, use the following options: `-DPYTHON_EXECUTABLE="C:\Program Files\Python36\python.exe" -DPYTHON_INCLUDE_DIR="C:\Program Files\Python36\include" -DPYTHON_LIBRARY="C:\Program Files\Python36\libs\python36.lib"`.
+
 6. Build the generated solution in Visual Studio 2017 or run `cmake --build . --config Release` to build from the command line.
 
+### Building Inference Engine with Ninja
+
+```sh
+call "C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\bin\ipsxe-comp-vars.bat" intel64 vs2017
+set CXX=icl
+set CC=icl
+cmake -G Ninja -Wno-dev -DCMAKE_BUILD_TYPE=Release ..
+cmake --build . --config Release
+```
+
 Before running the samples on Microsoft\* Windows\*, please add the path to the OpenMP library (`<dldt_repo>/inference-engine/temp/omp/lib`) and the OpenCV libraries (`<dldt_repo>/inference-engine/temp/opencv_4.0.0/bin`) to the `%PATH%` environment variable.
 
 ---
diff --git a/inference-engine/cmake/FindlibGNA.cmake b/inference-engine/cmake/FindlibGNA.cmake
new file mode 100644 (file)
index 0000000..eeb8480
--- /dev/null
@@ -0,0 +1,39 @@
+# Copyright (C) 2018 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+#module to locate GNA libraries
+
+cmake_minimum_required(VERSION 2.8)
+
+if (WIN32)
+    set(GNA_PLATFORM_DIR win64)
+    set(GNA_LIB_DIR x64)
+    set(GNA_LIB gna)
+elseif (UNIX)
+    set(GNA_PLATFORM_DIR linux)
+    set(GNA_LIB_DIR lib)
+    set(GNA_LIB gna_api)
+    set(GNA_KERNEL_LIB gna_kernel)
+else ()
+    message(FATAL_ERROR "GNA not supported on this platform, only linux, and windows")
+endif ()
+
+find_library(GNA_API_LIBRARY
+        ${GNA_LIB}
+        HINTS
+        ${GNA}/${GNA_PLATFORM_DIR}/${GNA_LIB_DIR})
+
+set(libGNA_INCLUDE_DIRS ${GNA}/${GNA_PLATFORM_DIR}/include)
+set(libGNA_LIBRARY ${GNA_API_LIBRARY})
+
+if (UNIX)
+    #message("Searching for libgna_kernel.so in: ${GNA}/${GNA_PLATFORM_DIR}/${GNA_KERNEL_LIB}")
+    find_library(GNA_KERNEL_LIBRARY
+            ${GNA_KERNEL_LIB}
+            HINTS
+            ${GNA}/${GNA_PLATFORM_DIR}/${GNA_LIB_DIR})
+endif ()
+
+set(libGNA_LIBRARIES ${libGNA_LIBRARY} ${GNA_KERNEL_LIBRARY})
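
A hedged consumer sketch for the new find module above; `GNA` is the package root that dependencies.cmake resolves via RESOLVE_DEPENDENCY later in this change, and the `gna_plugin` target is hypothetical:

```cmake
list(APPEND CMAKE_MODULE_PATH "${IE_MAIN_SOURCE_DIR}/cmake")
find_package(libGNA)  # loads FindlibGNA.cmake from the module path
if (libGNA_LIBRARIES)
    # Headers live under ${GNA}/${GNA_PLATFORM_DIR}/include per the module
    target_include_directories(gna_plugin PRIVATE ${libGNA_INCLUDE_DIRS})
    target_link_libraries(gna_plugin PRIVATE ${libGNA_LIBRARIES})
endif()
```
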
diff --git a/inference-engine/cmake/arm.toolchain.cmake b/inference-engine/cmake/arm.toolchain.cmake
new file mode 100644 (file)
index 0000000..2890f1a
--- /dev/null
@@ -0,0 +1,10 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR armv7l)
+
+set(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc)
+set(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++)
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
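
The toolchain file is consumed at configure time; a sketch of the intended flow, with an illustrative invocation and a hypothetical sanity check:

```cmake
# Configure a cross build from an empty build directory:
#   cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/arm.toolchain.cmake ..
# CMAKE_SYSTEM_PROCESSOR is then "armv7l", which selects the armv7l
# ARCH_FOLDER in the top-level CMakeLists.txt change above.
if(CMAKE_CROSSCOMPILING AND NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
    message(FATAL_ERROR "Unexpected cross target: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
```
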
index 47c2681..88ff23f 100644 (file)
@@ -2,11 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 #
+
 include("features")
 include("mode")
-if (THREADING STREQUAL "OMP")
-    include("omp")
-endif()
 include("itt")
 
 #64 bits platform
@@ -28,17 +26,15 @@ else()
     SET(ENABLE_MKL_DNN OFF)
 endif()
 
-
 #apple specific
 if (APPLE)
+    set(ENABLE_GNA OFF)
     set(ENABLE_CLDNN OFF)
 endif()
 
 
 #MinGW specific - under Wine there is no support for downloading files and applying them using git
 if (WIN32)
-    enable_omp()
-
     if (MINGW)
         SET(ENABLE_CLDNN OFF) # dont have mingw dll for linking
         set(ENABLE_SAMPLES OFF)
@@ -61,7 +57,7 @@ if (LINUX)
 endif ()
 
 if (NOT ENABLE_MKL_DNN)
-    set(GEMM OPENBLAS)
+    set(ENABLE_MKL OFF)
 endif()
 
 #next section sets defines to be accessible in C++/C code for certain features
@@ -93,6 +89,10 @@ if (ENABLE_OBJECT_DETECTION_TESTS)
     add_definitions(-DENABLE_OBJECT_DETECTION_TESTS=1)
 endif()
 
+if (ENABLE_GNA)
+    add_definitions(-DENABLE_GNA)
+endif()
+
 if (DEVELOPMENT_PLUGIN_MODE)
     message (STATUS "Enabled development plugin mode")
 
@@ -112,9 +112,5 @@ if (VERBOSE_BUILD)
     set(CMAKE_VERBOSE_MAKEFILE  ON)
 endif()
 
-if (THREADING STREQUAL "TBB" OR THREADING STREQUAL "SEQ")
-    set(ENABLE_INTEL_OMP OFF)
-    message(STATUS "ENABLE_INTEL_OMP should be disabled if THREADING is TBB or Sequential. ENABLE_INTEL_OMP option is " ${ENABLE_INTEL_OMP})
-endif()
 
-print_enabled_features()
\ No newline at end of file
+print_enabled_features()
index a17d6da..ed3c880 100644 (file)
@@ -1,6 +1,8 @@
 # Copyright (C) 2018 Intel Corporation
+#
 # SPDX-License-Identifier: Apache-2.0
 #
+
 if(DEFINED IE_MAIN_SOURCE_DIR AND TARGET inference_engine)
     set(InferenceEngine_INCLUDE_DIRS ${IE_MAIN_SOURCE_DIR}/include)
     set(InferenceEngine_LIBRARIES inference_engine)
index 4a8edf0..8d5ad84 100644 (file)
@@ -67,3 +67,8 @@ function (log_rpath component lib_path)
   log_rpath_remove_top(${component} TRUE ${lib_path} TRUE)
 endfunction()
 
+# A thin wrapper around the original message() function to make this macro known during the IE build.
+# This macro is redefined (with additional checks) within the InferenceEngineConfig.cmake file.
+macro(ext_message TRACE_LEVEL)
+    message(${TRACE_LEVEL} "${ARGN}")
+endmacro()
\ No newline at end of file
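
A small usage sketch for the wrapper above; the calls mirror the ones ie_parallel.cmake makes later in this change:

```cmake
# During the IE build these are plain pass-throughs to message():
ext_message(STATUS "TBB Release lib: ${TBB_LIBRARIES_RELEASE}")
ext_message(WARNING "TBB not found. TBB support will be disabled.")
```
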
index dc2d89d..cc027bf 100644 (file)
@@ -59,10 +59,12 @@ if (GEMM STREQUAL "MKL")
 if(NOT MKLROOT)
     message(FATAL_ERROR "MKLROOT not found: install MKL and set -DMKLROOT=<path_to_MKL>")
 endif()
+set(MKL ${MKLROOT})
 debug_message(STATUS "mkl_ml=" ${MKLROOT})
 endif ()
 
-if (ENABLE_INTEL_OMP)
+## Intel OMP package
+if (THREADING STREQUAL "OMP")
 if (WIN32)
     RESOLVE_DEPENDENCY(OMP
             ARCHIVE_WIN "iomp.zip"
@@ -80,36 +82,29 @@ log_rpath_from_dir(OMP "${OMP}/lib")
 debug_message(STATUS "intel_omp=" ${OMP})
 endif ()
 
-#TBB package
+#TBB package
 if (THREADING STREQUAL "TBB")
 if (WIN32)
     #TODO: add target_path to be platform specific as well, to avoid following if
     RESOLVE_DEPENDENCY(TBB
-            ARCHIVE_WIN "tbb2018_20180618_win.zip" #TODO: windows zip archive created incorrectly using old name for folder
+            ARCHIVE_WIN "tbb2019_20181010_win.zip" #TODO: windows zip archive created incorrectly using old name for folder
             TARGET_PATH "${TEMP}/tbb"
             ENVIRONMENT "TBBROOT"
             VERSION_REGEX ".*_([a-z]*_([a-z0-9]+\\.)*[0-9]+).*")
 elseif(LINUX)
     RESOLVE_DEPENDENCY(TBB
-            ARCHIVE_LIN "tbb2018_20180618_lin.tgz"
+            ARCHIVE_LIN "tbb2019_20181010_lin.tgz"
             TARGET_PATH "${TEMP}/tbb"
             ENVIRONMENT "TBBROOT")
 endif()
-set(TBB_INCLUDE_DIRS "${TBB}/include")
-find_path(TBB_INCLUDE_DIRS tbb/tbb.h)
-find_library(TBB_LIBRARIES_RELEASE tbb HINTS "${TBB}/lib")
-if (TBB_INCLUDE_DIRS AND TBB_LIBRARIES_RELEASE)
-    log_rpath_from_dir(TBB "${TBB}/lib")
-else()
-    message("FATAL_ERROR" "TBB is unset")
-endif()
+log_rpath_from_dir(TBB "${TBB}/lib")
 debug_message(STATUS "tbb=" ${TBB})
 endif ()
 
 if (ENABLE_OPENCV)
 if (WIN32)
     RESOLVE_DEPENDENCY(OPENCV
-            ARCHIVE_WIN "opencv_4.0.0-0256.zip"
+            ARCHIVE_WIN "opencv_4.0.1-0353.zip"
             TARGET_PATH "${TEMP}/opencv_4.0.0"
             ENVIRONMENT "OpenCV_DIR"
             VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+).*")
@@ -118,14 +113,21 @@ if (WIN32)
 elseif(LINUX)
 if (${LINUX_OS_NAME} STREQUAL "Ubuntu 16.04")
     RESOLVE_DEPENDENCY(OPENCV
-            ARCHIVE_LIN "opencv_4.0.0-0256_ubuntu16.tgz"
+            ARCHIVE_LIN "opencv_4.0.0-0305_ubuntu16.tgz"
             TARGET_PATH "${TEMP}/opencv_4.0.0_ubuntu"
             ENVIRONMENT "OpenCV_DIR"
             VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+).*")
     log_rpath_from_dir(OPENCV "opencv_4.0.0_ubuntu/lib")
+elseif (${LINUX_OS_NAME} STREQUAL "Ubuntu 18.04")
+    RESOLVE_DEPENDENCY(OPENCV
+            ARCHIVE_LIN "opencv_4.0.0-0305_ubuntu18.tgz"
+            TARGET_PATH "${TEMP}/opencv_4.0.0_ubuntu18"
+            ENVIRONMENT "OpenCV_DIR"
+            VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+).*")
+    log_rpath_from_dir(OPENCV "opencv_4.0.0_ubuntu18/lib")
 elseif (${LINUX_OS_NAME} STREQUAL "CentOS 7")
     RESOLVE_DEPENDENCY(OPENCV
-            ARCHIVE_LIN "opencv_4.0.0-0256_centos.tgz"
+            ARCHIVE_LIN "opencv_4.0.0-0305_centos.tgz"
             TARGET_PATH "${TEMP}/opencv_4.0.0_centos"
             ENVIRONMENT "OpenCV_DIR"
             VERSION_REGEX ".*_([0-9]+.[0-9]+.[0-9]+).*")
@@ -136,6 +138,26 @@ endif()
 debug_message(STATUS "opencv=" ${OPENCV})
 endif()
 
-if (THREADING STREQUAL "OMP")
-    include(omp)
-endif ()
+
+include(ie_parallel)
+
+if (ENABLE_GNA)
+    RESOLVE_DEPENDENCY(GNA
+            ARCHIVE_UNIFIED "gna_20181120.zip"
+            TARGET_PATH "${TEMP}/gna")
+endif()
+
+configure_file(
+        "${CMAKE_SOURCE_DIR}/cmake/share/InferenceEngineConfig.cmake.in"
+        "${CMAKE_BINARY_DIR}/share/InferenceEngineConfig.cmake"
+        @ONLY)
+
+configure_file(
+        "${CMAKE_SOURCE_DIR}/cmake/share/InferenceEngineConfig-version.cmake.in"
+        "${CMAKE_BINARY_DIR}/share/InferenceEngineConfig-version.cmake"
+        COPYONLY)
+
+configure_file(
+        "${CMAKE_SOURCE_DIR}/cmake/ie_parallel.cmake"
+        "${CMAKE_BINARY_DIR}/share/ie_parallel.cmake"
+        COPYONLY)
index b030b8f..513de81 100644 (file)
@@ -144,7 +144,7 @@ function (CheckOrDownloadAndExtract component RELATIVE_URL archive_name unpacked
   set (status "ON")
   set (on_master FALSE)
 
-  set (URL  "https://download.01.org/openvinotoolkit/2018_R4/dldt/inference_engine/${RELATIVE_URL}")
+  set (URL  "https://download.01.org/openvinotoolkit/2018_R5/dldt/inference_engine/${RELATIVE_URL}")
 
   #no message on recursive calls
   if (${use_alternatives})
index 021ba8b..d9ff98b 100644 (file)
@@ -11,6 +11,8 @@ include ("options")
 
 #backed targets
 
+ie_option (ENABLE_GNA "GNA support for inference engine" ON)
+
 ie_option (ENABLE_MKL_DNN "MKL-DNN plugin for inference engine" ON)
 
 ie_option (ENABLE_CLDNN "clDnn based plugin for inference engine" ON)
@@ -22,23 +24,45 @@ ie_option (ENABLE_PROFILING_RAW "Raw counters profiling (just values, no start/s
 #
 
 # "MKL-DNN library might use MKL-ML or OpenBLAS for gemm tasks: MKL|OPENBLAS|JIT"
-if (NOT GEMM STREQUAL "MKL" AND NOT GEMM STREQUAL "OPENBLAS" AND NOT GEMM STREQUAL "JIT")
+if (NOT GEMM STREQUAL "MKL"
+        AND NOT GEMM STREQUAL "OPENBLAS"
+        AND NOT GEMM STREQUAL "JIT")
     set (GEMM "JIT")
-    message(STATUS "GEMM should be set to MKL|OPENBLAS|JIT. Default option is " ${GEMM})
+    message(STATUS "GEMM should be set to MKL, OPENBLAS or JIT. Default option is " ${GEMM})
 endif()
 list (APPEND IE_OPTIONS GEMM)
 
 # "MKL-DNN library based on OMP or TBB or Sequential implementation: TBB|OMP|SEQ"
-if (NOT THREADING STREQUAL "TBB" AND NOT THREADING STREQUAL "OMP" AND NOT THREADING STREQUAL "SEQ")
+if (NOT THREADING STREQUAL "TBB"
+        AND NOT THREADING STREQUAL "OMP"
+        AND NOT THREADING STREQUAL "SEQ")
     set (THREADING "OMP")
-    message(STATUS "THREADING should be set to TBB|OMP|SEQ. Default option is " ${THREADING})
+    message(STATUS "THREADING should be set to TBB, OMP or SEQ. Default option is " ${THREADING})
 endif()
 list (APPEND IE_OPTIONS THREADING)
 
-ie_option (ENABLE_INTEL_OMP "MKL-DNN library based on Intel OMP implementation" ON)
+# Enable postfixes for Debug/Release builds
+set (IE_DEBUG_POSTFIX_WIN "d")
+set (IE_RELEASE_POSTFIX_WIN "")
+set (IE_DEBUG_POSTFIX_LIN "")
+set (IE_RELEASE_POSTFIX_LIN "")
+if (WIN32)
+    set (IE_DEBUG_POSTFIX ${IE_DEBUG_POSTFIX_WIN})
+    set (IE_RELEASE_POSTFIX ${IE_RELEASE_POSTFIX_WIN})
+else()
+    set (IE_DEBUG_POSTFIX ${IE_DEBUG_POSTFIX_LIN})
+    set (IE_RELEASE_POSTFIX ${IE_RELEASE_POSTFIX_LIN})
+endif()
+list (APPEND IE_OPTIONS IE_DEBUG_POSTFIX)
+list (APPEND IE_OPTIONS IE_RELEASE_POSTFIX)
 
 ie_option (ENABLE_TESTS "unit and functional tests" OFF)
 
+ie_option (ENABLE_GAPI_TESTS "unit tests for GAPI kernels" OFF)
+
+ie_option (GAPI_TEST_PERF "whether GAPI unit tests should measure performance" OFF)
+
+
 ie_option (ENABLE_SAMPLES_CORE "console samples core library" ON)
 
 ie_option (ENABLE_SANITIZER "enable checking memory errors via AddressSanitizer" OFF)
@@ -63,6 +87,12 @@ ie_option (OS_FOLDER "create OS dedicated folder in output" OFF)
 
 ie_option (ENABLE_PLUGIN_RPATH "enables rpath information to be present in plugins binary, and in corresponding test_applications" ON)
 
+ie_option (ENABLE_AFFINITY_GENERATOR "enables affinity generator build" OFF)
+
+ie_option (ENABLE_DEBUG_SYMBOLS "generates symbols for debugging" OFF)
+
+ie_option (ENABLE_PYTHON "enables ie python bridge build" OFF)
+
 #environment variables used
 
 #name of the environment variable that stores the path to the temp directory
diff --git a/inference-engine/cmake/ie_parallel.cmake b/inference-engine/cmake/ie_parallel.cmake
new file mode 100644 (file)
index 0000000..7c183b5
--- /dev/null
@@ -0,0 +1,100 @@
+# Copyright (C) 2018 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+function(set_ie_threading_interface_for TARGET_NAME)
+    set(IE_THREAD_DEFINE "IE_THREAD_SEQ")
+
+    if (THREADING STREQUAL "TBB")
+        if (NOT (IE_MAIN_SOURCE_DIR))
+            set(incl_path ${IE_EXTERNAL_DIR}/tbb/include)
+            if (WIN32)
+                set(lib_rel_path ${IE_LIB_REL_DIR})
+                set(lib_dbg_path ${IE_LIB_DBG_DIR})
+            else ()
+                set(lib_rel_path ${IE_EXTERNAL_DIR}/tbb/lib)
+                set(lib_dbg_path ${lib_rel_path})
+            endif ()
+        else ()
+            set(incl_path ${TBB}/include)
+            set(lib_rel_path ${TBB}/lib)
+            set(lib_dbg_path ${lib_rel_path})
+        endif ()
+
+        if (NOT TBB_INCLUDE_DIRS OR NOT TBB_LIBRARIES_RELEASE OR NOT TBB_LIBRARIES_DEBUG)
+            find_path(TBB_INCLUDE_DIRS tbb/tbb.h ${incl_path} NO_DEFAULT_PATH)
+            find_library(TBB_LIBRARIES_RELEASE tbb ${lib_rel_path} NO_DEFAULT_PATH)
+            find_library(TBB_LIBRARIES_DEBUG tbb_debug ${lib_dbg_path} NO_DEFAULT_PATH)
+            ext_message(STATUS "TBB include: ${TBB_INCLUDE_DIRS}")
+            ext_message(STATUS "TBB Release lib: ${TBB_LIBRARIES_RELEASE}")
+            ext_message(STATUS "TBB Debug lib: ${TBB_LIBRARIES_DEBUG}")
+        endif ()
+
+        if (NOT TBB_INCLUDE_DIRS OR NOT TBB_LIBRARIES_RELEASE OR NOT TBB_LIBRARIES_DEBUG)
+            ext_message(WARNING "TBB not found. TBB support will be disabled. ${IE_THREAD_DEFINE} is defined")
+        else ()
+            set(IE_THREAD_DEFINE "IE_THREAD_TBB")
+            target_include_directories(${TARGET_NAME} PUBLIC ${TBB_INCLUDE_DIRS})
+            if (WIN32)
+                target_link_libraries(${TARGET_NAME} PUBLIC "-nodefaultlib:vcomp")
+                target_link_libraries(${TARGET_NAME} PUBLIC "$<$<CONFIG:DEBUG>:${TBB_LIBRARIES_DEBUG}>;$<$<NOT:$<CONFIG:DEBUG>>:${TBB_LIBRARIES_RELEASE}>")
+            else()
+                if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+                    target_link_libraries(${TARGET_NAME} PUBLIC ${TBB_LIBRARIES_DEBUG})
+                else()
+                    target_link_libraries(${TARGET_NAME} PUBLIC ${TBB_LIBRARIES_RELEASE})
+                endif ()
+            endif ()
+        endif ()
+    elseif (THREADING STREQUAL "OMP")
+        if (WIN32)
+            set(omp_lib_name libiomp5md)
+        else ()
+            set(omp_lib_name iomp5)
+        endif ()
+
+        if (NOT(IE_MAIN_SOURCE_DIR))
+            if (WIN32)
+                set(lib_rel_path ${IE_LIB_REL_DIR})
+                set(lib_dbg_path ${IE_LIB_DBG_DIR})
+            else ()
+                set(lib_rel_path ${IE_EXTERNAL_DIR}/omp/lib)
+                set(lib_dbg_path ${lib_rel_path})
+            endif ()
+        else ()
+            set(lib_rel_path ${OMP}/lib)
+            set(lib_dbg_path ${lib_rel_path})
+        endif ()
+
+        if (NOT OMP_LIBRARIES_RELEASE OR NOT OMP_LIBRARIES_DEBUG)
+            find_library(OMP_LIBRARIES_RELEASE ${omp_lib_name} ${lib_rel_path} NO_DEFAULT_PATH)
+            find_library(OMP_LIBRARIES_DEBUG ${omp_lib_name} ${lib_dbg_path} NO_DEFAULT_PATH)
+            ext_message(STATUS "OMP Release lib: ${OMP_LIBRARIES_RELEASE}")
+            ext_message(STATUS "OMP Debug lib: ${OMP_LIBRARIES_DEBUG}")
+        endif ()
+
+        if (NOT OMP_LIBRARIES_RELEASE OR NOT OMP_LIBRARIES_DEBUG)
+            ext_message(WARNING "Intel OpenMP not found. Intel OpenMP support will be disabled. ${IE_THREAD_DEFINE} is defined")
+        else ()
+            set(IE_THREAD_DEFINE "IE_THREAD_OMP")
+            
+            if (WIN32)
+                target_compile_options(${TARGET_NAME} PUBLIC ${OpenMP_CXX_FLAGS} /openmp)
+                target_compile_options(${TARGET_NAME} PUBLIC ${OpenMP_CXX_FLAGS} /Qopenmp)
+
+                target_link_libraries(${TARGET_NAME} PUBLIC "-nodefaultlib:vcomp")
+                target_link_libraries(${TARGET_NAME} PUBLIC "$<$<CONFIG:DEBUG>:${OMP_LIBRARIES_DEBUG}>;$<$<NOT:$<CONFIG:DEBUG>>:${OMP_LIBRARIES_RELEASE}>")
+            else()
+                target_compile_options(${TARGET_NAME} PUBLIC ${OpenMP_CXX_FLAGS} -fopenmp)
+                if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+                    target_link_libraries(${TARGET_NAME} PUBLIC ${OMP_LIBRARIES_DEBUG})
+                else()
+                    target_link_libraries(${TARGET_NAME} PUBLIC ${OMP_LIBRARIES_RELEASE})
+                endif ()
+            endif ()
+        endif ()
+    endif ()
+
+    target_compile_definitions(${TARGET_NAME} PUBLIC -DIE_THREAD=${IE_THREAD_DEFINE})
+endfunction(set_ie_threading_interface_for)
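
A hedged sketch of how a plugin would consume the helper above; `my_plugin` and its source file are illustrative:

```cmake
add_library(my_plugin SHARED my_plugin.cpp)
set_ie_threading_interface_for(my_plugin)
# The target now compiles with -DIE_THREAD=IE_THREAD_TBB, IE_THREAD_OMP or
# IE_THREAD_SEQ and links the matching runtime, so C++ sources can branch
# on the IE_THREAD define at compile time.
```
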
diff --git a/inference-engine/cmake/omp.cmake b/inference-engine/cmake/omp.cmake
deleted file mode 100644 (file)
index ab9886a..0000000
+++ /dev/null
@@ -1,59 +0,0 @@
-# Copyright (C) 2018 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-
-cmake_policy(SET CMP0054 NEW)
-
-if (APPLE OR WIN32)
-
-    find_path(OMP_INC omp.h)
-    find_library(OMP_LIB iomp5
-        PATHS   ${OMP}/lib)
-
-    if (OMP_INC AND OMP_LIB)
-        set(HAVE_OMP TRUE)
-        get_filename_component(OMP_LIB_DIR "${OMP_LIB}" PATH)
-    else()
-        if (THREADING STREQUAL "OMP")
-            find_package(OpenMP)
-            if (NOT OPENMP_FOUND)    
-                message(WARNING "OpenMP not found. OpenMP support will be disabled.")
-            endif()
-        endif()
-    endif()
-endif()
-
-
-macro(enable_omp)
-    if (APPLE) ## MacOS
-        if (HAVE_OMP)
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp=libiomp5")
-            set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${OMP_LIB_DIR}")
-        else()
-            message(WARNING "Was trying to enable OMP for some target. However OpenMP was not detected on system.")
-        endif()
-    elseif(UNIX) # Linux
-        add_definitions(-fopenmp)
-    elseif(WIN32) # Windows
-        if (THREADING STREQUAL "OMP")
-            set(OPENMP_FLAGS "/Qopenmp /openmp")
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_CCXX_FLAGS} ${OPENMP_FLAGS}")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CCXX_FLAGS} ${OPENMP_FLAGS}")
-        endif()
-    endif()
-
-    if (ENABLE_INTEL_OMP)
-        if (WIN32)
-            find_library(intel_omp_lib
-                libiomp5md
-                PATHS ${OMP}/lib ${ICCLIB})
-            set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /nodefaultlib:vcomp")
-            set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /nodefaultlib:vcomp")
-        else()
-            find_library(intel_omp_lib
-                    iomp5
-                    PATHS ${OMP}/lib)
-        endif()
-    endif()
-endmacro(enable_omp)
index b408bea..1f44f87 100644 (file)
@@ -2,8 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 #
-
 # Usage: ie_option(<option_variable> "description" <initial value or boolean expression> [IF <condition>])
+
 function (ie_option variable description value)
     option(${variable} "${description}" ${value})
     list (APPEND IE_OPTIONS "${variable}")
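
An illustrative call matching the usage line above, taken from features.cmake in this same change:

```cmake
ie_option (ENABLE_PYTHON "enables ie python bridge build" OFF)
```
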
index 7069847..cb7c6b1 100644 (file)
@@ -7,12 +7,33 @@ if (WIN32)
     set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS _CRT_SECURE_NO_WARNINGS)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_SCL_SECURE_NO_WARNINGS")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc") #no asynchronous structured exception handling
-       set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /LARGEADDRESSAWARE")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /LARGEADDRESSAWARE")
+    
+    if(ENABLE_DEBUG_SYMBOLS)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi")
+
+        set(DEBUG_SYMBOLS_LINKER_FLAGS "/DEBUG")
+        if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
+            # Keep default /OPT values. See /DEBUG reference for details.
+            set(DEBUG_SYMBOLS_LINKER_FLAGS "${DEBUG_SYMBOLS_LINKER_FLAGS} /OPT:REF /OPT:ICF")
+        endif()
+
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${DEBUG_SYMBOLS_LINKER_FLAGS}")
+        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${DEBUG_SYMBOLS_LINKER_FLAGS}")
+        set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${DEBUG_SYMBOLS_LINKER_FLAGS}")
+    endif()
+
 else()
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Werror=return-type ")
     if (APPLE)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-command-line-argument")
     elseif(UNIX)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wuninitialized -Winit-self -Wmaybe-uninitialized")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wuninitialized -Winit-self")
+        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-switch")
+        else()
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wmaybe-uninitialized")
+        endif()
     endif()
 endif()
index 17af7db..cdbe108 100644 (file)
@@ -3,9 +3,18 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
+include(CheckCXXCompilerFlag)
+
 if (ENABLE_SANITIZER)
-    set(CMAKE_CCXX_FLAGS "${CMAKE_CCXX_FLAGS} -fsanitize=address -fuse-ld=gold")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fuse-ld=gold")
-    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fsanitize=address")
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address")
-endif()
\ No newline at end of file
+    set(SANITIZER_COMPILER_FLAGS "-fsanitize=address")
+    CHECK_CXX_COMPILER_FLAG("-fsanitize-recover=address" SANITIZE_RECOVER_SUPPORTED)
+    if (SANITIZE_RECOVER_SUPPORTED)
+        set(SANITIZER_COMPILER_FLAGS "${SANITIZER_COMPILER_FLAGS} -fsanitize-recover=address")
+    endif()
+    set(SANITIZER_LINKER_FLAGS "-fsanitize=address -fuse-ld=gold")
+
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SANITIZER_COMPILER_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SANITIZER_COMPILER_FLAGS}")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${SANITIZER_LINKER_FLAGS}")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${SANITIZER_LINKER_FLAGS}")
+endif()
index f5a4fb9..26618c6 100644 (file)
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
-if (UNIX OR APPLE)
+if ((UNIX OR APPLE) AND ${CMAKE_BUILD_TYPE} STREQUAL "Release")
     set(CMAKE_CCXX_FLAGS "${CMAKE_CCXX_FLAGS} -fPIE -fPIC -Wformat -Wformat-security")
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2")
     set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -D_FORTIFY_SOURCE=2")
@@ -16,21 +16,24 @@ if (UNIX OR APPLE)
         else()
             set(CMAKE_CCXX_FLAGS "${CMAKE_CCXX_FLAGS} -fstack-protector-strong")
         endif()
+        set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -s -fvisibility=hidden")
+        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s -fvisibility=hidden")
     elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
         set(CMAKE_CCXX_FLAGS "${CMAKE_CCXX_FLAGS} -fstack-protector-all")
+        set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -fvisibility=hidden")
+        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fvisibility=hidden")
     elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector")
         set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -z noexecstack -z relro -z now")
         set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -z noexecstack -z relro -z now")
+        set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Wl,--strip-all -fvisibility=hidden")
+        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wl,--strip-all -fvisibility=hidden")
     endif()
 
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_CCXX_FLAGS}")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CCXX_FLAGS}")
-
-
 elseif (WIN32)
-    elseif (${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /sdl")
-
+    if (${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC)
+        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MP /sdl")
+    endif()
 endif()
-
@@ -1,7 +1,9 @@
 # Copyright (C) 2018 Intel Corporation
+#
 # SPDX-License-Identifier: Apache-2.0
 #
-set(InferenceEngine_VERSION 1.4.0)
+
+set(InferenceEngine_VERSION 1.5.0)
 set(PACKAGE_VERSION ${InferenceEngine_VERSION})
 
 set(PACKAGE_VERSION_EXACT False)
@@ -1,6 +1,8 @@
 # Copyright (C) 2018 Intel Corporation
+#
 # SPDX-License-Identifier: Apache-2.0
 #
+#
 # FindIE
 # ------
 #
 #   IE::inference_engine    - The Inference Engine library
 #
 
+macro(ext_message TRACE_LEVEL)
+    if (${TRACE_LEVEL} STREQUAL FATAL_ERROR)
+        if(InferenceEngine_FIND_REQUIRED)
+            message(FATAL_ERROR "${ARGN}")
+        elseif(NOT InferenceEngine_FIND_QUIETLY)
+            message(WARNING "${ARGN}")
+        endif()
+        return()
+    elseif(NOT InferenceEngine_FIND_QUIETLY)
+        message(${TRACE_LEVEL} "${ARGN}")
+    endif ()
+endmacro()
 
 set(InferenceEngine_FOUND FALSE)
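
A behavior sketch for the redefined macro, assuming a consuming project (the calls below are illustrative):

```cmake
find_package(InferenceEngine)          # FATAL_ERROR is downgraded to WARNING; the module returns
find_package(InferenceEngine QUIET)    # all ext_message output is suppressed
find_package(InferenceEngine REQUIRED) # FATAL_ERROR aborts the configure step
```
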
 
@@ -28,13 +42,17 @@ else()
     if (WIN32)
         set(_ARCH intel64)
     else()
-        if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
+        if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
+            set(_ARCH armv7l)
+        elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
             set(_ARCH intel64)
         elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "i386")
             set(_ARCH ia32)
         endif()
     endif()
 
+    set(THREADING "@THREADING@")
+
     # check whether setvars.sh is sourced
     if(NOT IE_ROOT_DIR AND (DEFINED ENV{InferenceEngine_DIR} OR InferenceEngine_DIR OR DEFINED ENV{INTEL_CVSDK_DIR}))
         if (EXISTS "${InferenceEngine_DIR}")
@@ -57,7 +75,7 @@ else()
             set(_OS_PATH "")
         else()
            if (NOT EXISTS "/etc/lsb-release")
-                execute_process(COMMAND find /etc/ -maxdepth 1 -type f -name *-release -exec cat {} \;
+                execute_process(COMMAND find -L /etc/ -maxdepth 1 -type f -name *-release -exec cat {} \;
                             OUTPUT_VARIABLE release_data RESULT_VARIABLE result)
                 set(name_regex "NAME=\"([^ \"\n]*).*\"\n")
                 set(version_regex "VERSION=\"([0-9]+(\\.[0-9]+)?)[^\n]*\"")
@@ -75,12 +93,7 @@ else()
             set(os_name "${os_name} ${CMAKE_MATCH_1}")
 
             if (NOT os_name)
-                if(InferenceEngine_FIND_REQUIRED)
-                    message(FATAL_ERROR "Cannot detect OS via reading /etc/*-release:\n ${release_data}")
-                elseif(NOT InferenceEngine_FIND_QUIETLY)
-                    message(WARNING "Cannot detect OS via reading /etc/*-release:\n ${release_data}")
-                endif()
-                return()
+                ext_message(FATAL_ERROR "Cannot detect OS via reading /etc/*-release:\n ${release_data}")
             endif()
 
             if (NOT InferenceEngine_FIND_QUIETLY)
@@ -91,17 +104,18 @@ else()
                 set(_OS_PATH "ubuntu_14.04/")
             elseif (${os_name} STREQUAL "Ubuntu 16.04")
                 set(_OS_PATH "ubuntu_16.04/")
+            elseif (${os_name} STREQUAL "Ubuntu 18.04")
+                set(_OS_PATH "ubuntu_18.04/")
             elseif (${os_name} STREQUAL "CentOS 7")
                 set(_OS_PATH "centos_7.4/")
             elseif (${os_name} STREQUAL "poky 2.0")
                 set(_OS_PATH "ubuntu_16.04/")
+            elseif (${os_name} STREQUAL "poky 2.5")
+                set(_OS_PATH "ubuntu_18.04/")
+            elseif (${os_name} STREQUAL "Raspbian 9")
+                set(_OS_PATH "raspbian_9/")
             else()
-                if(InferenceEngine_FIND_REQUIRED)
-                    message(FATAL_ERROR "${os_name} is not supported. List of supported OS: Ubuntu 14.04, Ubuntu 16.04, CentOS 7")
-                elseif(NOT InferenceEngine_FIND_QUIETLY)
-                    message(WARNING "${os_name} is not supported. List of supported OS: Ubuntu 14.04, Ubuntu 16.04, CentOS 7")
-                endif()
-                return()
+                ext_message(FATAL_ERROR "${os_name} is not supported. List of supported OS: Ubuntu 14.04, Ubuntu 16.04, Ubuntu 18.04, CentOS 7, poky 2.0, poky 2.5, Raspbian 9")
             endif()
         endif()
     endif()
@@ -125,21 +139,31 @@ else()
     find_path(IE_INCLUDE_DIR inference_engine.hpp "${_IE_ROOT_INCLUDE_DIR}")
     find_path(IE_SRC_DIR extension "${_IE_ROOT_SRC_DIR}")
 
+    set(IE_LIB_DIR "${_IE_ROOT_LIBRARY}")
+    set(IE_LIB_REL_DIR "${IE_LIB_DIR}/Release")
+    set(IE_LIB_DBG_DIR "${IE_LIB_DIR}/Debug")
+    set(IE_EXTERNAL_DIR "${IE_ROOT_DIR}/external")
+
     include(FindPackageHandleStandardArgs)
 
     if (WIN32)
-        find_library(IE_RELEASE_LIBRARY inference_engine "${_IE_ROOT_LIBRARY}/Release")
-        find_library(IE_DEBUG_LIBRARY inference_engine "${_IE_ROOT_LIBRARY}/Debug")
-        find_package_handle_standard_args(  IE
+        find_library(IE_RELEASE_LIBRARY inference_engine@IE_RELEASE_POSTFIX_WIN@ "${IE_LIB_REL_DIR}")
+        find_library(IE_DEBUG_LIBRARY inference_engine@IE_DEBUG_POSTFIX_WIN@ "${IE_LIB_DBG_DIR}")
+        find_package_handle_standard_args(  InferenceEngine
+                                            FOUND_VAR INFERENCEENGINE_FOUND
                                             REQUIRED_VARS IE_RELEASE_LIBRARY IE_DEBUG_LIBRARY IE_INCLUDE_DIR
                                             FAIL_MESSAGE "Inference Engine cannot be found at ${_IE_ROOT_LIBRARY}. Please consult InferenceEgnineConfig.cmake module's help page.")
     else()
-        find_library(IE_LIBRARY inference_engine "${_IE_ROOT_LIBRARY}")
-        find_package_handle_standard_args(  IE
+        find_library(IE_LIBRARY inference_engine@IE_RELEASE_POSTFIX_LIN@ "${IE_LIB_DIR}")
+        find_package_handle_standard_args(  InferenceEngine
+                                            FOUND_VAR INFERENCEENGINE_FOUND
                                             REQUIRED_VARS IE_LIBRARY IE_INCLUDE_DIR
                                             FAIL_MESSAGE "Inference Engine cannot be found at ${_IE_ROOT_LIBRARY}. Please consult InferenceEgnineConfig.cmake module's help page.")
     endif()
-    if(IE_FOUND)
+    if(INFERENCEENGINE_FOUND)
+        # keep this line for successful execution in CMake 2.8
+        set(InferenceEngine_FOUND TRUE)
+
         add_library(IE::inference_engine SHARED IMPORTED GLOBAL)
 
         if (WIN32)
@@ -162,10 +186,10 @@ else()
 
         set(InferenceEngine_INCLUDE_DIRS ${IE_INCLUDE_DIR})
         set(InferenceEngine_LIBRARIES IE::inference_engine)
-        set(InferenceEngine_FOUND TRUE)
+
+        include("${IE_ROOT_DIR}/share/ie_parallel.cmake")
 
         add_subdirectory(${IE_SRC_DIR}/extension EXCLUDE_FROM_ALL ie_cpu_extension)
         add_library(IE::ie_cpu_extension ALIAS ie_cpu_extension)
     endif()
 endif()
-
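
A minimal consumer sketch, assuming `InferenceEngine_DIR` points at the share/ folder generated by dependencies.cmake above; the `ie_sample` target and main.cpp are hypothetical:

```cmake
cmake_minimum_required(VERSION 3.3)
project(ie_sample)
find_package(InferenceEngine REQUIRED)
add_executable(ie_sample main.cpp)
target_include_directories(ie_sample PRIVATE ${InferenceEngine_INCLUDE_DIRS})
target_link_libraries(ie_sample PRIVATE ${InferenceEngine_LIBRARIES})
set_ie_threading_interface_for(ie_sample)  # provided by the bundled ie_parallel.cmake
```
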
index 0fed229..2ce462b 100644 (file)
@@ -1,42 +1,49 @@
-# Copyright (C) 2018 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-#
 # Defines the CMake commands/policies
-cmake_minimum_required( VERSION 2.8.5 )
+cmake_minimum_required (VERSION 3.3)
 
 # Set the project name
-project( INFERENCE_ENGINE_DRIVER )
-
-option(COPY_IE_LIBS "Copy Inference Engine libs to package directory" ${WIN32})
-
-set (IE_DEFAULT_PATH computer_vision_sdk/deployment_tools/inference_engine/share)
-
-find_package(InferenceEngine REQUIRED PATHS /opt/intel/${IE_DEFAULT_PATH} $ENV{HOME}/intel/${IE_DEFAULT_PATH})
-
-# Make the scripts available in the 'cmake' directory available for the
-# 'include()' command, 'find_package()' command.
-set( CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/cmake )
-
-# Include the CMake script UseCython.cmake.  This defines add_cython_module().
-# Instruction for use can be found at the top of cmake/UseCython.cmake.
-include( UseCython )
-
-# With CMake, a clean separation can be made between the source tree and the
-# build tree.  When all source is compiled, as with pure C/C++, the source is
-# no-longer needed in the build tree.  However, with pure *.py source, the
-# source is processed directly.  To handle this, we reproduce the availability
-# of the source files in the build tree.
-add_custom_target( ReplicatePythonSourceTree ALL ${CMAKE_COMMAND} -P
-  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/ReplicatePythonSourceTree.cmake
-  ${CMAKE_CURRENT_BINARY_DIR}
-  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} )
-
-add_custom_target( CopyIeLibs ${CMAKE_COMMAND} -P
-  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/CopyIeLibs.cmake
-  ${IE_ROOT_DIR}/bin/${_ARCH}/Release ${_IE_ROOT_LIBRARY}
-  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ie_driver )
-
-include_directories( IE::inference_engine )
-
-# Process the CMakeLists.txt in the 'src' and 'bin' directory.
-add_subdirectory( inference_engine )
+project (ie_python_api)
+set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}/cmake)
+
+if (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
+    set (ARCH armv7l)
+elseif ("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+    set (ARCH intel64)
+else()
+    set (ARCH ia32)
+endif()
+
+
+# in case of an independent Python API build (outside of the Inference Engine root CMake)
+if (NOT(IE_MAIN_SOURCE_DIR))
+    if("${CMAKE_BUILD_TYPE}" STREQUAL "")
+        message(STATUS "CMAKE_BUILD_TYPE not defined, 'Release' will be used")
+        set(CMAKE_BUILD_TYPE "Release")
+    endif()
+    message(STATUS "BUILD_CONFIGURATION: ${CMAKE_BUILD_TYPE}")
+
+    set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/bin/${ARCH})
+    if(NOT(WIN32))
+        set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/${CMAKE_BUILD_TYPE})
+    endif()
+endif()
+
+include (UseCython)
+
+if (PYTHONINTERP_FOUND)
+    set (PYTHON_VERSION python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR})
+else()
+    message(FATAL_ERROR "Python Interpretator was not found!")
+endif()
+
+if(WIN32)
+    set (PYTHON_BRIDGE_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/$<CONFIG>/python_api/${PYTHON_VERSION}/openvino)
+else()
+    set (PYTHON_BRIDGE_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/python_api/${PYTHON_VERSION}/openvino)
+endif()
+
+find_package (InferenceEngine REQUIRED)
+
+set (PYTHON_BRIDGE_SRC_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
+add_subdirectory (src/openvino/inference_engine)
+add_subdirectory (src/openvino/inference_engine/dnn_builder)
\ No newline at end of file
index b9704fa..6dbe6a0 100644 (file)
@@ -7,40 +7,53 @@
 
 ## Prerequisites
 
-Install the following Python modules:
-- opencv-python
-- numpy
-- cython
+Install the Inference Engine Python API dependencies:
+```bash
+pip3 install -r requirements.txt
+```
+
+## Building on Linux
+
+Build the Inference Engine Python API alongside the Inference Engine build.
+You need to run the Inference Engine build with the following flags:
+
+```shellscript
+  cd <IE_ROOT>
+  mkdir -p build
+  cd build
+  cmake -DENABLE_PYTHON=ON -DPYTHON_EXECUTABLE=`which python3.6` \
+       -DPYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython3.6m.so \
+       -DPYTHON_INCLUDE_DIR=/usr/include/python3.6 ..
+  make -j16
+```
 
 ## Building on Windows
+
+You need to run the Inference Engine build with the following flags:
+
 ```shellscript
+       cd <IE_ROOT>
        mkdir build
        cd build
        set PATH=C:\Program Files\Python36\Scripts;%PATH%
-       cmake -G "Visual Studio 14 2015 Win64" -DInferenceEngine_DIR=..\..\..\build ^
+       cmake -G "Visual Studio 15 2017 Win64" -T "Intel C++ Compiler 18.0" ^
+               -DENABLE_PYTHON=ON ^
                -DPYTHON_EXECUTABLE="C:\Program Files\Python36\python.exe" ^
                -DPYTHON_INCLUDE_DIR="C:\Program Files\Python36\include" ^
                -DPYTHON_LIBRARY="C:\Program Files\Python36\libs\python36.lib" ..
 ```
 
-Then build generated solution INFERENCE_ENGINE_DRIVER.sln using Microsoft\* Visual Studio.
+Then build the generated solution `ie_python_api.sln` using Microsoft\* Visual Studio or run `cmake --build . --config Release` to build from the command line.
 
-## Building on Linux
 
-```shellscript
-  mkdir -p build
-  cd build
-  cmake -DInferenceEngine_DIR=`realpath ../../../build` -DPYTHON_EXECUTABLE=`which python3.6` \
-       -DPYTHON_LIBRARY=/usr/lib/x86_64-linux-gnu/libpython3.6m.so \
-       -DPYTHON_INCLUDE_DIR=/usr/include/python3.6 ..
-  make -j16
-```
-
-Note: `-DInferenceEngine_DIR` parameter is needed to specify the folder with generated make files or Visual Studio solution used to build Inference Engine (see readme file in the inference-engine root folder).
+## Running samples
 
-Before running the Python samples, please manually replicate OpenVINO folders structure with Python modules:
-- create an empty folder `openvino/inference_engine`
-- move built `ie_api.so` and `__init__.py` files from the `<build_folder>/inference_engine` to `openvino/inference_engine` folder
-- create an empty `__init__.py` file in the `openvino` folder
-- add the root folder where `openvino` folder is located to the PYTHONPATH environment variable.
+Before running the Python samples:
+- add the folder with the built `openvino` Python module (located at `inference-engine/bin/intel64/Release/lib/python_api/python3.6`) to the `PYTHONPATH` environment variable.
 - add the folder with the Inference Engine libraries to the `LD_LIBRARY_PATH` variable on Linux (or `PATH` on Windows).
+
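+As a quick sanity check (a hypothetical one-liner, not part of the build steps), you can verify that the module is importable:
+
+```bash
+python3 -c "from openvino.inference_engine import IENetwork, IEPlugin; print('OK')"
+```
+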
+Example command line to run the classification sample:
+
+```bash
+python3 sample/classification_sample.py -m <path/to/xml> -i <path/to/input/image> -d CPU 
+```
diff --git a/inference-engine/ie_bridges/python/cmake/CopyIeLibs.cmake b/inference-engine/ie_bridges/python/cmake/CopyIeLibs.cmake
deleted file mode 100644 (file)
index 2e96fcf..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-set(IE_WIN_LIBS ${CMAKE_ARGV3})
-set(IE_LIBS ${CMAKE_ARGV4})
-
-if (WIN32)
-    file( GLOB IE_LIBS "${IE_WIN_LIBS}/*.dll")
-    file( COPY ${IE_LIBS} DESTINATION ${CMAKE_CURRENT_SOURCE_DIR})
-else()
-    file( GLOB IE_LIBS "${IE_LIBS}/*.so")
-    file( COPY ${IE_LIBS} DESTINATION ${CMAKE_CURRENT_SOURCE_DIR})
-endif()
index 30e1f03..3070950 100644 (file)
@@ -1,10 +1,19 @@
-# Find the Cython compiler.
+# Copyright (c) 2016 Intel Corporation
 #
-# This code sets the following variables:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#  CYTHON_EXECUTABLE
+#      http://www.apache.org/licenses/LICENSE-2.0
 #
-# See also UseCython.cmake
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# The following changes were made on top of the original file:
+# Added CYTHON_EXECUTABLE searching hints at lines 50 and 51
 
 #=============================================================================
 # Copyright 2011 Kitware, Inc.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #=============================================================================
-
+# Find the Cython compiler.
+#
+# This code sets the following variables:
+#
+#  CYTHON_EXECUTABLE
+#
+# See also UseCython.cmake
 # Use the Cython executable that lives next to the Python executable
 # if it is a local installation.
 find_package( PythonInterp )
diff --git a/inference-engine/ie_bridges/python/cmake/ReplicatePythonSourceTree.cmake b/inference-engine/ie_bridges/python/cmake/ReplicatePythonSourceTree.cmake
deleted file mode 100644 (file)
index 4316d6e..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-# Note: when executed in the build dir, then CMAKE_CURRENT_SOURCE_DIR is the
-# build dir.
-
-file( COPY setup.py inference_engine tests DESTINATION "${CMAKE_ARGV3}"
-  FILES_MATCHING PATTERN "*.py" )
-
-file( COPY requirements.txt DESTINATION "${CMAKE_ARGV3}" )
index ee631b7..1b9a0a2 100644 (file)
 #
 # See also FindCython.cmake
 
+# Copyright (c) 2016 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# The following changes were made on top of the original file:
+# Added PRIVATE linking mode for target_link_libraries call at lines 298 and 336
+
 #=============================================================================
 # Copyright 2011 Kitware, Inc.
 #
index 9d87791..3a182ec 100644 (file)
@@ -35,12 +35,15 @@ This class stores main information about the layer and allow to modify some laye
 * `name` - Name of the layer 
 * `type`- Layer type
 * `precision` - Layer base operating precision. Provides getter and setter interfaces.
+* `layout` - Returns the layout of the layer shape.
+* `shape` - Returns the shape of the layer as a list.
+* `parents` - Returns a list of names of the layers preceding this layer.
+* `children` - Returns a list of names of the layers following this layer.
 * `affinity` - Layer affinity set by user or a default affinity set by the `IEPlugin.set_initial_affinity()` method.             
                The affinity attribute provides getter and setter interfaces, so the layer affinity can be modified directly.
-               For example: 
-                        
+               For example:                          
 ```py
->>> net = IENetwork.from_ir(model=path_to_xml_file, weights=path_to_bin_file)
+>>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
 >>> plugin = IEPlugin(device="HETERO:FPGA,CPU")
 >>> plugin.set_config({"TARGET_FALLBACK": "HETERO:FPGA,CPU"})
 >>> plugin.set_initial_affinity(net) 
@@ -82,7 +85,12 @@ layers affinity and output layers.
 
 ### Class Constructor
 
-There is no explicit class constructor. Use `from_ir` class method to read the Intermediate Representation (IR) and initialize a correct instance of the `IENetwork` class.
+* `__init__(model: str, weights: str)`
+
+    * Parameters:
+        
+        * model - Path to the `.xml` file of the IR
+        * weights - Path to the `.bin` file of the IR
 
 ### Class attributes:
 
@@ -91,7 +99,7 @@ There is no explicit class constructor. Use `from_ir` class method to read the I
              For example, to get a shape of the input layer:
 
 ```py
->>> net = IENetwork.from_ir(model=path_to_xml_file, weights=path_to_bin_file)
+>>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
 >>> net.inputs
 {'data': <inference_engine.ie_api.InputInfo object at 0x7efe042dedd8>}
 >>> net.inputs['data'].shape
@@ -102,7 +110,7 @@ There is no explicit class constructor. Use `from_ir` class method to read the I
               For example, to get a shape of the output layer:
     
 ```py
->>> net = IENetwork.from_ir(model=path_to_xml_file, weights=path_to_bin_file)
+>>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
 >>> net.outputs
 {'prob': <inference_engine.ie_api.OutputInfo object at 0x7efe03ab95d0>}
 >>> net.outputs['prob'].shape
@@ -113,7 +121,7 @@ There is no explicit class constructor. Use `from_ir` class method to read the I
                  network batch size. For example:
     
 ```py
->>> net = IENetwork.from_ir(model=path_to_xml_file, weights=path_to_bin_file)
+>>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
 >>> net.batch_size
 1
 >>> net.batch_size = 4
@@ -124,20 +132,37 @@ There is no explicit class constructor. Use `from_ir` class method to read the I
 ```
     
 * `layers` - Returns a dictionary that maps network layer names to <a name="ienetlayer-class"></a>`IENetLayer` 
-             objects containing layer properties. For example, to list all network layers:
+             objects containing layer properties in topological order. For example, to list all network layers:
              
 ```py
- >>> net = IENetwork.from_ir(model=path_to_xml_file, weights=path_to_bin_file)
+ >>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
  >>> net.layers
  {'conv0': <inference_engine.ie_api.IENetLayer object at 0x7f3a4c102370>
  ...
  }
  ```
  
+ * `stats` - Returns a `LayersStatsMap` object containing a dictionary that maps network layer names to calibration statistics
+            represented by <a name="layerstats-class"></a> `LayerStats` objects.
+            The `LayersStatsMap` class inherits from the built-in Python `dict` and overrides the default `update()` method to allow
+            setting or modifying layer calibration statistics.
+```py
+>>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
+>>> net.stats.update({
+        "conv1_2d" : LayserStats(min=(-25, -1, 0), max=(63, 124, 70)),
+        "conv2_2d" : LayserStats(min=(-5, -1, 0, 1, -7, 2), max=(63, 124, 70, 174, 99, 106)),
+    })
+```
+For more details about low-precision inference, please refer to the "Low-Precision 8-bit Integer Inference"
+section in the Inference Engine Developer Guide documentation.
+
 ### Class Methods
 
 * `from_ir(model: str, weights: str)` 
 
+**Note:** This method is deprecated. Please use the `IENetwork` class constructor to create a valid instance of `IENetwork`.
+
     * Description:
             
         The class method serves to read the model from the `.xml` and `.bin` files of the IR.
@@ -154,7 +179,7 @@ There is no explicit class constructor. Use `from_ir` class method to read the I
     * Usage example:
     
 ```py
->>> net = IENetwork.from_ir(model=path_to_xml_file, weights=path_to_bin_file)
+>>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
 >>> net
 <inference_engine.ie_api.IENetwork object at 0x7fd7dbce54b0>
 ```
@@ -179,7 +204,7 @@ There is no explicit class constructor. Use `from_ir` class method to read the I
     * Usage example:
     
 ```py
->>> net = IENetwork.from_ir(model=path_to_xml_file, weights=path_to_bin_file)
+>>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
 >>> net.add_outputs(["conv5_1/dwise", "conv2_1/expand"])
 >>> net.outputs
 ['prob', 'conv5_1/dwise', 'conv2_1/expand']
@@ -213,12 +238,44 @@ outputs.
     * Usage example:
     
 ```py
->>> net = IENetwork.from_ir(model=path_to_xml_file, weights=path_to_bin_file)
+>>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
 >>> input_layer = next(iter(net.inputs))
 >>> n, c, h, w = net.inputs[input_layer].shape
 >>> net.reshape({input_layer: (n, c, h*2, w*2)})
 ``` 
 
+* `serialize(path_to_xml, path_to_bin)`:
+    
+    * Description: 
+        
+        The method serializes the network and stores it in files. 
+        
+    * Parameters:
+    
+        * `path_to_xml` - path to the file where the serialized model will be stored.
+        * `path_to_bin` - path to the file where the serialized weights will be stored.
+
+    * Return value:
+    
+        None
+            
+    * Usage example:
+    
+```py
+>>> net = IENetwork(model=path_to_model, weights=path_to_weights)
+>>> net.serialize(path_to_xml, path_to_bin)
+``` 
+## <a name="layerstats-class"></a>LayerStats
+Layer calibration statistics container.
+### Class Constructor
+
+* `__init__(min: tuple = (), max: tuple = ())`
+
+    * Parameters:
+        
+        * min - Tuple with per-channel minimum layer activation values 
+        * max - Tuple with per-channel maximum layer activation values
+
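+A hypothetical usage example (the statistics values are illustrative only):
+
+```py
+>>> stats = LayerStats(min=(-25, -1, 0), max=(63, 124, 70))
+>>> net.stats.update({"conv1_2d": stats})
+```
+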
 ## <a name="inputinfo-class"></a>InputInfo 
 
 This class contains the information about the network input layers
@@ -283,7 +340,7 @@ This class is the main plugin interface and serves to initialize and configure t
     
     * Parameters:
        
-        * `network` - A valid IENetwork instance created by `IENetwork.from_ir()` method
+        * `network` - A valid `IENetwork` instance
         * `num_requests` - A positive integer value for the number of infer requests to be created. The number of infer requests may be limited
         by device capabilities.
         * `config` - A dictionary of plugin configuration keys and their values
@@ -295,7 +352,7 @@ This class is the main plugin interface and serves to initialize and configure t
     * Usage example:
     
 ```py
->>> net = IENetwork.from_ir(model=path_to_xml_file, weights=path_to_bin_file)
+>>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
 >>> plugin = IEPlugin(device="CPU")
 >>> exec_net = plugin.load(network=net, num_requests=2)
 >>> exec_net
@@ -396,7 +453,7 @@ There is no explicit class constructor. To make a valid instance of `ExecutableN
     * Usage example:
         
 ```py
->>> net = IENetwork.from_ir(model=path_to_xml_file, weights=path_to_bin_file)
+>>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
 >>> plugin = IEPlugin(device="CPU")
 >>> exec_net = plugin.load(network=net, num_requests=3)
 >>> exec_net.requests
@@ -424,7 +481,7 @@ There is no explicit class constructor. To make a valid instance of `ExecutableN
     * Usage example:
     
 ```py
->>> net = IENetwork.from_ir(model=path_to_xml_file, weights=path_to_bin_file)
+>>> net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
 >>> plugin = IEPlugin(device="CPU")
 >>> exec_net = plugin.load(network=net, num_requests=2)
 >>> res = exec_net.infer({'data': img})
@@ -609,3 +666,22 @@ array([4.85416055e-01, 1.70385033e-01, 1.21873841e-01, 1.18894853e-01,
 ...
 }
 ```
+
+* `set_batch(size)`
+    * Description:
+       Sets a new batch size for this infer request. Dynamic batching should be enabled in the executable network that created the request.
+
+       **Note:** Support of dynamic batch size depends on the target plugin.
+
+    * Parameters:
+        * `size` - new batch size to be used by all the following inference calls for this request.
+        
+    * Usage example:
+```py
+>>> plugin.set_config({"DYN_BATCH_ENABLED": "YES"})
+>>> exec_net = plugin.load(network=net)
+>>> exec_net.requests[0].set_batch(inputs_count)
+```
+Please refer to `dynamic_batch_demo.py` to see the full usage example.
+
+
diff --git a/inference-engine/ie_bridges/python/inference_engine/CMakeLists.txt b/inference-engine/ie_bridges/python/inference_engine/CMakeLists.txt
deleted file mode 100644 (file)
index 2835e04..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (C) 2018 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-#
-# If the pyx file is a C++ file, we should specify that here.
-
-set(CMAKE_INCLUDE_CURRENT_DIR ON)
-
-if (COPY_IE_LIBS)
-  if (UNIX)
-    SET(CMAKE_SKIP_BUILD_RPATH  FALSE)
-    SET(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
-    SET(CMAKE_INSTALL_RPATH "$ORIGIN")
-    SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
-  endif (UNIX)
-endif()
-
-set_source_files_properties(
-    ie_api_impl_defs.pxd
-    ie_api_impl.hpp
-    ie_api_impl.cpp
-    ie_api.pyx
-    ie_api.pxd
-
-  PROPERTIES CYTHON_IS_CXX TRUE
-)
-
-cython_add_module(
-    ie_api
-
-    ie_api_impl_defs.pxd
-    ie_api_impl.hpp
-    ie_api_impl.cpp
-    ie_api.pyx
-)
-
-target_link_libraries(ie_api PRIVATE IE::inference_engine)
-set_target_properties(ie_api PROPERTIES CXX_STANDARD 11 LINKER_LANGUAGE CXX)
-
-#if (NOT UNIX AND ${PYTHON_VERSION_STRING} MATCHES "^1.4")
-#  set(python_subdir "python2.7")
-#else()
-#  set(python_subdir "python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}")
-#endif()
-#
-#
-# Copy required build artifacts to structure which will be used in final package
-#add_custom_command(TARGET ie_api POST_BUILD
-#                   
-#                   COMMAND ${CMAKE_COMMAND} -E make_directory
-#                   ${CMAKE_SOURCE_DIR}/bin/${python_subdir}/openvino/inference_engine/
-#
-#                   COMMAND ${CMAKE_COMMAND} -E touch
-#                   ${CMAKE_SOURCE_DIR}/bin/${python_subdir}/openvino/__init__.py)
-#
-#if (${WIN32})
-#add_custom_command(TARGET ie_api POST_BUILD
-#                   COMMAND ${CMAKE_COMMAND} -E copy
-#                   ${CMAKE_CURRENT_BINARY_DIR}/Release/ie_api.pyd ${CMAKE_SOURCE_DIR}/bin/${python_subdir}/openvino/inference_engine/
-#
-#                   COMMAND ${CMAKE_COMMAND} -E copy
-#                   ${CMAKE_CURRENT_BINARY_DIR}/__init__.py ${CMAKE_SOURCE_DIR}/bin/${python_subdir}/openvino/inference_engine/)
-#else()
-#add_custom_command(TARGET ie_api POST_BUILD
-#                   COMMAND ${CMAKE_COMMAND} -E copy
-#                   ${CMAKE_CURRENT_BINARY_DIR}/ie_api.so ${CMAKE_SOURCE_DIR}/bin/${python_subdir}/openvino/inference_engine/
-#
-#                   COMMAND ${CMAKE_COMMAND} -E copy
-#                   ${CMAKE_CURRENT_BINARY_DIR}/__init__.py ${CMAKE_SOURCE_DIR}/bin/${python_subdir}/openvino/inference_engine/)
-#endif()
diff --git a/inference-engine/ie_bridges/python/inference_engine/__init__.py b/inference-engine/ie_bridges/python/inference_engine/__init__.py
deleted file mode 100644 (file)
index 07e2717..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-from .ie_api import *
-__version__ = get_version()
-__all__ = ['IENetwork', "IEPlugin", "IENetReader"]
diff --git a/inference-engine/ie_bridges/python/inference_engine/ie_api_impl.hpp b/inference-engine/ie_bridges/python/inference_engine/ie_api_impl.hpp
deleted file mode 100644 (file)
index f226524..0000000
+++ /dev/null
@@ -1,129 +0,0 @@
-// Copyright (C) 2018 Intel Corporation
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#ifndef INFERENCE_ENGINE_DRIVER_IE_API_IMPL_HPP
-#define INFERENCE_ENGINE_DRIVER_IE_API_IMPL_HPP
-
-#include <string>
-#include <inference_engine.hpp>
-#include <iterator>
-#include <iostream>
-#include <algorithm>
-#include <sstream>
-#include "ie_extension.h"
-
-
-namespace InferenceEnginePython {
-struct IENetLayer {
-    InferenceEngine::CNNLayerPtr layer_ptr;
-    std::string name;
-    std::string type;
-    std::string precision;
-    std::string affinity;
-    std::map<std::string, std::string> params;
-    void setAffinity(const std::string & target_affinity);
-    void setParams(const std::map<std::string, std::string> & params_map);
-    std::map<std::string, InferenceEngine::Blob::Ptr> getWeights();
-    void setPrecision(std::string precision);
-};
-struct InputInfo{
-    InferenceEngine::InputInfo actual;
-    std::vector<size_t> dims;
-    std::string precision;
-    std::string layout;
-    void setPrecision(std::string precision);
-    void setLayout(std::string layout);
-};
-struct OutputInfo{
-    InferenceEngine::DataPtr actual;
-    std::vector<size_t> dims;
-    std::string precision;
-    std::string layout;
-    void setPrecision(std::string precision);
-};
-struct ProfileInfo {
-    std::string status;
-    std::string exec_type;
-    std::string layer_type;
-    long long real_time;
-    long long cpu_time;
-    unsigned execution_index;
-};
-struct IENetwork {
-    InferenceEngine::CNNNetwork actual;
-    std::string name;
-    std::size_t batch_size;
-    void setBatch(const size_t size);
-    void addOutputs(const std::vector<std::string> &out_layers, const std::string &precision);
-    std::map<std::string, InferenceEnginePython::IENetLayer> getLayers();
-    std::map<std::string, InferenceEnginePython::InputInfo> getInputs();
-    std::map<std::string, InferenceEnginePython::OutputInfo> getOutputs();
-    void reshape(const std::map<std::string, std::vector<size_t>> & input_shapes);
-};
-
-struct IENetReader {
-    static IENetwork read(std::string const &model, std::string const &weights);
-    std::vector<std::pair<std::string, std::string>> getLayers();
-};
-
-struct InferRequestWrap {
-    InferenceEngine::IInferRequest::Ptr request_ptr;
-    InferenceEngine::BlobMap inputs;
-    InferenceEngine::BlobMap outputs;
-
-    void infer();
-    void infer_async();
-    int  wait(int64_t timeout);
-    InferenceEngine::Blob::Ptr &getInputBlob(const std::string &blob_name);
-    InferenceEngine::Blob::Ptr &getOutputBlob(const std::string &blob_name);
-    std::vector<std::string> getInputsList();
-    std::vector<std::string> getOutputsList();
-    std::map<std::string, InferenceEnginePython::ProfileInfo> getPerformanceCounts();
-};
-
-
-struct IEExecNetwork {
-    InferenceEngine::IExecutableNetwork::Ptr actual;
-    std::vector<InferRequestWrap> infer_requests;
-    IEExecNetwork(const std::string &name, size_t num_requests);
-
-    std::string name;
-    int next_req_index = 0;
-    bool async;
-    void infer();
-};
-
-
-struct IEPlugin {
-    std::unique_ptr<InferenceEnginePython::IEExecNetwork> load(InferenceEnginePython::IENetwork &net,
-                                                                   int num_requests,
-                                                                   const std::map<std::string,std::string> &config);
-    std::string device_name;
-    std::string version;
-    void setConfig(const std::map<std::string, std::string> &);
-    void addCpuExtension(const std::string &extension_path);
-    void setInitialAffinity(InferenceEnginePython::IENetwork &net);
-    IEPlugin(const std::string &device, const std::vector<std::string> &plugin_dirs);
-    IEPlugin() = default;
-    std::set<std::string> queryNetwork(InferenceEnginePython::IENetwork &net);
-    InferenceEngine::InferenceEnginePluginPtr actual;
-
-};
-
-template<class T>
-T* get_buffer(InferenceEngine::Blob& blob) {
-    return blob.buffer().as<T *>();
-}
-
-template<class T, class... Args>
-std::unique_ptr<T> make_unique(Args&&... args)
-{
-    return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
-}
-
-std::string get_version();
-}; // InferenceEnginePython
-
-#endif //INFERENCE_ENGINE_DRIVER_IE_API_IMPL_HPP
diff --git a/inference-engine/ie_bridges/python/sample/benchmark_app/README.md b/inference-engine/ie_bridges/python/sample/benchmark_app/README.md
new file mode 100644 (file)
index 0000000..7a9a526
--- /dev/null
@@ -0,0 +1,81 @@
+# Benchmark Application Demo
+
+This topic demonstrates how to run the Benchmark Application demo, which performs inference using convolutional networks.
+
+## How It Works
+
+> **NOTE:** To achieve benchmark results similar to the official published results, set CPU frequency to 2.9 GHz and GPU frequency to 1 GHz.
+
+Upon start-up, the application reads command-line parameters and loads a network and images to the Inference Engine plugin. The number of infer requests and the execution approach depend on the mode defined with the `-api` command-line parameter.
+
+
+### Synchronous API
+For synchronous mode, the primary metric is latency. The application creates one infer request and executes the `Infer` method. The number of executions is defined by one of two values:
+* Number of iterations defined with the `-niter` command-line argument
+* Predefined duration if `-niter` is skipped. The predefined duration value depends on the device.
+
+During the execution, the application collects two types of metrics:
+* Latency for each infer request executed with `Infer` method
+* Duration of all executions
+
+The reported latency value is calculated as the median of all collected latencies. The reported throughput value is derived from the reported latency and additionally depends on batch size.
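+
+For reference, a minimal sketch of how these metrics are derived, mirroring the logic in `benchmark.py` (the latency values below are illustrative only):
+
+```py
+from statistics import median
+
+times = [0.0151, 0.0149, 0.0163]  # per-request latencies in seconds (illustrative)
+batch_size = 1
+
+times.sort()
+latency = median(times)      # reported latency, in seconds
+fps = batch_size / latency   # throughput derived from latency and batch size
+print("Latency is {:.4f} msec".format(latency * 1e3))
+print("Throughput is {:.4f} FPS".format(fps))
+```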
+
+### Asynchronous API
+For asynchronous mode, the primary metric is throughput in frames per second (FPS). The application creates a certain number of infer requests and executes the `StartAsync` method. The number of infer requests is specified with the `-nireq` command-line parameter. The number of executions is defined by one of two values:
+* Number of iterations defined with the `-niter` command-line argument
+* Predefined duration if `-niter` is skipped. The predefined duration value depends on the device.
+
+The infer requests are executed asynchronously. The `Wait` method is used to wait for a previous execution to complete. The application measures all infer request executions and reports the throughput metric based on batch size and total execution duration.
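+
+A simplified, non-runnable sketch of this rotation, following the variable names used in `benchmark.py` (`nireq` stands for the number of infer requests; `exe_network`, `infer_requests`, `steps_count`, and `input_images` are assumed to be set up as in the script):
+
+```py
+current = 0
+previous = 1 - nireq                 # stays negative until the pipeline is full
+for step in range(steps_count):
+    exe_network.start_async(current, input_images)  # kick off the next request
+    if previous >= 0:
+        infer_requests[previous].wait()             # drain the oldest in-flight request
+    current = (current + 1) % nireq
+    previous += 1
+    if previous >= nireq:
+        previous = 0
+```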
+
+## Running
+
+Running the application with the `-h` or `--help` option yields the following usage message:
+```
+python3 benchmark_app.py -h
+
+benchmark_app [OPTION]
+Options:
+
+    -h, --help                                       Print a usage message
+    -i, --path_to_images "<path>"                    Required. Path to a folder with images or to image files.
+    -m, --path_to_model "<path>"                     Required. Path to an .xml file with a trained model.
+    -pp "<path>"                                     Path to a plugin folder.
+    -api, --api_type "<sync/async>"                  Required. Enable using sync/async API.
+    -d, --target_device "<device>"                   Specify a target device to infer on: CPU, GPU, FPGA or MYRIAD. Use "-d HETERO:<comma separated devices list>" format to specify HETERO plugin. The application looks for a suitable plugin for the specified device.
+    -niter, --number_iterations "<integer>"          Optional. Number of iterations. If not specified, the number of iterations is calculated depending on a device.
+    -nireq, --number_infer_requests "<integer>"      Optional. Number of infer requests (default value is 2).
+    -l, --path_to_extension "<absolute_path>"        Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.
+          Or
+    -c, --path_to_cldnn_config "<absolute_path>"     Required for GPU custom kernels. Absolute path to an .xml file with the kernels description.
+    -b, --batch_size "<integer>"                     Optional. Batch size value. If not specified, the batch size value is determined from IR.
+    -nthreads, --number_threads "<integer>"          Number of threads to use for inference on the CPU (including Hetero cases).
+    -pin {YES,NO}, --infer_threads_pinning {YES,NO}  Optional. Enable ("YES" is default value) or disable ("NO") CPU threads pinning for CPU-involved inference.
+```
+
+Running the application with an empty list of options yields the usage message given above and an error message.
+
+To run the demo, you can use one-layer public models or one-layer pre-trained and optimized models delivered with the package that support images as input.
+
+For example, to do inference on an image using a trained network with multiple outputs on CPU, run the following command:
+
+```
+python3 benchmark_app.py -i <path_to_image>/inputImage.bmp -m <path_to_model>/multiple-output.xml -d CPU
+```
+
+> **NOTE**: Public models should be first converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
+## Demo Output
+
+The application output depends on the used API. For the synchronous API, the application outputs latency and throughput:
+```
+[ INFO ] Start inference synchronously (10 s duration)
+[BENCHMARK RESULT] Latency is 15.5520 msec
+[BENCHMARK RESULT] Throughput is 1286.0082 FPS
+```
+
+For the asynchronous API, the application outputs only throughput:
+```
+[ INFO ] Start inference asynchronously (10 s duration, 8 inference requests in parallel)
+[BENCHMARK RESULT] Throughput is 1444.2591 FPS
+```
+
+## See Also
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
diff --git a/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark.py b/inference-engine/ie_bridges/python/sample/benchmark_app/benchmark.py
new file mode 100644 (file)
index 0000000..761b63e
--- /dev/null
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from statistics import median
+from openvino.inference_engine import IENetwork, IEPlugin
+
+from utils.benchmark_utils import *
+
+def main(args=None):
+    try:
+        if args is None:
+            args = parse_args()
+
+        validate_args(args)
+
+        # --------------------------------- 1. Load Plugin for inference engine ---------------------------------
+        logger.info("Loading plugin")
+        plugin = IEPlugin(args.target_device)
+
+        config = dict()
+        if CPU_DEVICE_NAME in args.target_device:
+            if args.path_to_extension:
+                plugin.add_cpu_extension(args.path_to_extension)
+            # limit threading for CPU portion of inference
+            if args.number_threads is not None:
+                config.update({'CPU_THREADS_NUM': str(args.number_threads)})
+            # pin threads for CPU portion of inference
+            config.update({'CPU_BIND_THREAD': args.infer_threads_pinning})
+            # for pure CPU execution, more throughput-oriented execution via streams
+            if args.api_type == 'async' and CPU_DEVICE_NAME in args.target_device:
+                config.update({'CPU_THROUGHPUT_STREAMS': str(args.number_infer_requests)})
+        elif GPU_DEVICE_NAME in args.target_device:
+            if args.path_to_cldnn_config:
+                config.update({'CONFIG_FILE': args.path_to_cldnn_config})
+                logger.info("GPU extensions is loaded {}".format(args.path_to_cldnn_config))
+        elif MYRIAD_DEVICE_NAME in args.target_device:
+            config.update({'LOG_LEVEL': 'LOG_INFO'})
+            config.update({'VPU_LOG_LEVEL': 'LOG_INFO'})
+
+        plugin.set_config(config)
+
+        logger.info("Device is {}".format(plugin.device))
+        logger.info("Plugin version is {}".format(plugin.version))
+
+        # --------------------- 2. Read IR Generated by ModelOptimizer (.xml and .bin files) ---------------------
+        logger.info("Loading network files")
+
+        xml_filename = os.path.abspath(args.path_to_model)
+        head, tail = os.path.splitext(xml_filename)
+        bin_filename = os.path.abspath(head + BIN_EXTENSION)
+
+        ie_network = IENetwork(xml_filename, bin_filename)
+
+        input_info = ie_network.inputs
+
+        if len(input_info) == 0:
+            raise AttributeError('No input info is provided')
+        elif len(input_info) != 1:
+            raise AttributeError("Only networks with one input layer are supported")
+
+        # -------------------------------------- 3. Change network batch_size  -------------------------------------
+        batch_size = ie_network.batch_size
+        key = list(input_info.keys()).pop()
+        precision = input_info[key].precision
+
+        if args.batch_size and args.batch_size != ie_network.batch_size:
+            # take the current input shape and update its batch dimension
+            shape = input_info[key].shape
+            # We support models having only one input layer
+            if input_info[key].layout != LAYOUT_TYPE:
+                raise Exception('Unsupported model for batch size changing in automatic mode')
+            shape[BATCH_SIZE_ELEM] = args.batch_size
+            ie_network.reshape({key: shape})
+
+            input_info = ie_network.inputs
+
+            batch_size = args.batch_size
+
+
+        logger_message = "Network batch size was changed to: " if args.batch_size is not None else "Network batch size: "
+        logger_message += "{}, precision: {}".format(batch_size, precision)
+        logger.info(logger_message)
+
+        # ------------------------------------- 4. Loading model to the plugin -------------------------------------
+        logger.info("Loading model to the plugin")
+        exe_network = plugin.load(ie_network, args.number_infer_requests)
+
+        # ------------------------------------ 5. Performance measurements stuff -----------------------------------
+        inputs = get_images(os.path.abspath(args.path_to_images), batch_size)
+
+        if batch_size < len(inputs):
+            logger.warn("Network batch size {} is less then images count  {}"
+                        ", some input files will be ignored".format(batch_size, len(inputs)))
+
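+        # the same pre-filled batch of images is reused for every inference request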
+        input_images = {key: fill_blob_with_image(inputs, input_info[key].shape)}
+
+        times = list()
+        duration = 0
+
+        if args.number_iterations is None:
+            duration = get_duration_in_secs(args.target_device)
+
+        if args.api_type == 'sync':
+
+            # warming up - out of scope
+            exe_network.infer(input_images)
+
+            if args.number_iterations is not None:
+                logger.info(
+                    "Start inference synchronously ({}) sync inference executions".format(args.number_iterations))
+                for iteration in range(args.number_iterations):
+                    sync_infer_request(exe_network, times, input_images)
+
+            else:
+                logger.info("Start inference synchronously ({} s duration)".format(duration))
+                start_time = datetime.now()
+                current_time = start_time
+                while (current_time - start_time).total_seconds() < duration:
+                    current_time = sync_infer_request(exe_network, times, input_images)
+
+            times.sort()
+            latency = median(times)
+            fps = batch_size / latency
+
+            print("[BENCHMARK RESULT] Latency is {:.4f} msec".format(latency * 1e3))
+            print("[BENCHMARK RESULT] Throughput is {:.4f} FPS".format(fps))
+        else:
+            infer_requests = exe_network.requests
+
+            if args.number_iterations is not None:
+                logger.info("Start inference asynchronously ({}"
+                            " async inference executions, {} "
+                            " inference requests in parallel".format(args.number_iterations,
+                                                                       args.number_infer_requests))
+            else:
+                logger.info("Start inference asynchronously ({} s duration, "
+                            "{} inference requests in parallel)".format(duration, args.number_infer_requests))
+
+            current_inference = 0
+            required_inference_requests_were_executed = False
+            previous_inference = 1 - args.number_infer_requests
+            step = 0
+            steps_count = args.number_infer_requests - 1
+            if args.number_iterations is not None:
+                steps_count += args.number_iterations
+
+            # warming up - out of scope
+            infer_requests[0].async_infer(input_images)
+            infer_requests[0].wait()
+
+            start_time = datetime.now()
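+            # round-robin over the request pool: start the next request and
+            # wait on the oldest in-flight one until all iterations complete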
+            while not required_inference_requests_were_executed or step < steps_count or \
+                    args.number_iterations is None and (datetime.now() - start_time).total_seconds() < duration:
+                exe_network.start_async(current_inference, input_images)
+
+                if previous_inference >= 0:
+                    status = infer_requests[previous_inference].wait()
+                    if status != 0:
+                        raise Exception("Infer request not completed successfully")
+
+                current_inference += 1
+                if current_inference >= args.number_infer_requests:
+                    current_inference = 0
+                    required_inference_requests_were_executed = True
+
+                previous_inference += 1
+                if previous_inference >= args.number_infer_requests:
+                    previous_inference = 0
+
+                step += 1
+
+            # wait for the remaining inference executions to complete
+            for not_completed_index in range(args.number_infer_requests):
+                if infer_requests[not_completed_index].wait(0) != 0:
+                    infer_requests[not_completed_index].wait()
+
+            total_duration = (datetime.now() - start_time).total_seconds()
+            fps = batch_size * step / total_duration
+
+            print("[BENCHMARK RESULT] Throughput is {:.4f} FPS".format(fps))
+
+        del exe_network
+        del plugin
+
+    except Exception as e:
+        logging.exception(e)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/inference-engine/ie_bridges/python/sample/benchmark_app/utils/benchmark_utils.py b/inference-engine/ie_bridges/python/sample/benchmark_app/utils/benchmark_utils.py
new file mode 100644 (file)
index 0000000..4267614
--- /dev/null
@@ -0,0 +1,122 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import logging
+import argparse
+import os
+import cv2
+import numpy as np
+import sys
+
+from glob import glob
+from random import choice
+from datetime import datetime
+from fnmatch import fnmatch
+
+from .constants import *
+
+logging.basicConfig(format="[ %(levelname)s ] %(message)s", level=logging.INFO, stream=sys.stdout)
+logger = logging.getLogger('BenchmarkApp')
+
+
+def validate_args(args):
+    if args.number_iterations is not None and args.number_iterations < 0:
+        raise Exception("Number of iterations should be positive (invalid -niter option value)")
+    if args.number_infer_requests < 0:
+        raise Exception("Number of inference requests should be positive (invalid -nireq option value)")
+    if not fnmatch(args.path_to_model, XML_EXTENSION_PATTERN):
+        raise Exception('Path {} is not an .xml file'.format(args.path_to_model))
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i', '--path_to_images', type=str, required=True, help=HELP_MESSAGES['IMAGE_MESSAGE'])
+    parser.add_argument('-m', '--path_to_model', type=str, required=True, help=HELP_MESSAGES['MODEL_MESSAGE'])
+    parser.add_argument('-c', '--path_to_cldnn_config', type=str, required=False,
+                        help=HELP_MESSAGES['CUSTOM_GPU_LIBRARY_MESSAGE'])
+    parser.add_argument('-l', '--path_to_extension', type=str, required=False, default=None,
+                        help=HELP_MESSAGES['CUSTOM_CPU_LIBRARY_MESSAGE'])
+    parser.add_argument('-api', '--api_type', type=str, required=False, default='async', choices=['sync', 'async'],
+                        help=HELP_MESSAGES['API_MESSAGE'])
+    parser.add_argument('-d', '--target_device', type=str, required=False, default="CPU",
+                        help=HELP_MESSAGES['TARGET_DEVICE_MESSAGE'])
+    parser.add_argument('-niter', '--number_iterations', type=int, required=False, default=None,
+                        help=HELP_MESSAGES['ITERATIONS_COUNT_MESSAGE'])
+    parser.add_argument('-nireq', '--number_infer_requests', type=int, required=False, default=2,
+                        help=HELP_MESSAGES['INFER_REQUESTS_COUNT_MESSAGE'])
+    parser.add_argument('-nthreads', '--number_threads', type=int, required=False, default=None,
+                        help=HELP_MESSAGES['INFER_NUM_THREADS_MESSAGE'])
+    parser.add_argument('-b', '--batch_size', type=int, required=False, default=None,
+                        help=HELP_MESSAGES['BATCH_SIZE_MESSAGE'])
+    parser.add_argument('-pin', '--infer_threads_pinning', type=str, required=False, default='YES',
+                        choices=['YES', 'NO'], help=HELP_MESSAGES['INFER_THREADS_PINNING_MESSAGE'])
+    return parser.parse_args()
+
+
+def get_images(path_to_images, batch_size):
+    images = list()
+    if os.path.isfile(path_to_images):
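+        # replicate the single image path until the batch is filled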
+        while len(images) != batch_size:
+            images.append(path_to_images)
+    else:
+        path = os.path.join(path_to_images, '*')
+        files = glob(path, recursive=True)
+        for file in files:
+            file_extension = file.rsplit('.').pop().upper()
+            if file_extension in IMAGE_EXTENSIONS:
+                images.append(file)
+        if len(images) == 0:
+            raise Exception("No images found in {}".format(path_to_images))
+        if len(images) < batch_size:
+            while len(images) != batch_size:
+                images.append(choice(images))
+    return images
+
+
+def get_duration_in_secs(target_device):
+    duration = 0
+    for device in DEVICE_DURATION_IN_SECS:
+        if device in target_device:
+            duration = max(duration, DEVICE_DURATION_IN_SECS[device])
+
+    if duration == 0:
+        duration = DEVICE_DURATION_IN_SECS[UNKNOWN_DEVICE_TYPE]
+        logger.warn("Default duration {} seconds for unknown device {} is used".format(duration, target_device))
+
+    return duration
+
+
+def fill_blob_with_image(images_path, shape):
+    images = np.ndarray(shape)
+    for item in range(shape[0]):
+        image = cv2.imread(images_path[item])
+
+        new_im_size = tuple(shape[2:])
+        if image.shape[:-1] != new_im_size:
+            logger.warn("Image {} is resize from ({}) to ({})".format(images_path[item], image.shape[:-1], new_im_size))
+            image = cv2.resize(image, new_im_size)
+
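+        # convert from HWC (OpenCV) layout to CHW layout expected by the network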
+        image = image.transpose((2, 0, 1))
+        images[item] = image
+    return images
+
+
+def sync_infer_request(exe_network, times, images):
+    iteration_start_time = datetime.now()
+    exe_network.infer(images)
+    current_time = datetime.now()
+    times.append((current_time - iteration_start_time).total_seconds())
+    return current_time
diff --git a/inference-engine/ie_bridges/python/sample/benchmark_app/utils/constants.py b/inference-engine/ie_bridges/python/sample/benchmark_app/utils/constants.py
new file mode 100644 (file)
index 0000000..f68919e
--- /dev/null
@@ -0,0 +1,63 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+HELP_MESSAGES = {
+    'IMAGE_MESSAGE': "Path to a folder with images or to image files.",
+    'MULTI_INPUT_MESSAGE': "Path to a multi-input file.",
+    'MODEL_MESSAGE': "Path to an .xml file with a trained model.",
+    'PLUGIN_PATH_MESSAGE': "Path to a plugin folder.",
+    'API_MESSAGE': "Enable using sync/async API. Default value is async",
+    'TARGET_DEVICE_MESSAGE': "Specify a target device to infer on: CPU, GPU, FPGA or MYRIAD. "
+                           "Use \"-d HETERO:<comma separated devices list>\" format to specify HETERO plugin. "
+    "The application looks for a suitable plugin for the specified device.",
+    'ITERATIONS_COUNT_MESSAGE': "Number of iterations. "
+    "If not specified, the number of iterations is calculated depending on a device.",
+    'INFER_REQUESTS_COUNT_MESSAGE': "Number of infer requests (default value is 2).",
+    'INFER_NUM_THREADS_MESSAGE': "Number of threads to use for inference on the CPU "
+                                 "(including Hetero cases).",
+    'CUSTOM_CPU_LIBRARY_MESSAGE': "Required for CPU custom layers. "
+                                  "Absolute path to a shared library with the kernels implementations.",
+    'CUSTOM_GPU_LIBRARY_MESSAGE': "Required for GPU custom kernels. Absolute path to an .xml file with the kernels description.",
+    'BATCH_SIZE_MESSAGE': "Optional. Batch size value. If not specified, the batch size value is determined from IR",
+    'INFER_THREADS_PINNING_MESSAGE': "Optional. Enable (\"YES\" is default value) or disable (\"NO\") "
+                                     "CPU threads pinning for CPU-involved inference."
+}
+
+DEVICE_DURATION_IN_SECS = {
+    "CPU": 60,
+    "GPU": 60,
+    "VPU": 60,
+    "MYRIAD": 60,
+    "FPGA": 120,
+    "HDDL": 60,
+    "UNKNOWN": 120
+}
+
+IMAGE_EXTENSIONS = ['JPEG', 'JPG', 'PNG', 'BMP']
+
+MYRIAD_DEVICE_NAME = "MYRIAD"
+CPU_DEVICE_NAME = "CPU"
+GPU_DEVICE_NAME = "GPU"
+UNKNOWN_DEVICE_TYPE = "UNKNOWN"
+
+BATCH_SIZE_ELEM = 0
+
+LAYOUT_TYPE = 'NCHW'
+
+XML_EXTENSION = ".xml"
+BIN_EXTENSION = ".bin"
+
+XML_EXTENSION_PATTERN = '*' + XML_EXTENSION
index 082a84d..f02459f 100644 (file)
@@ -58,7 +58,7 @@ def main():
         plugin.add_cpu_extension(args.cpu_extension)
     # Read IR
     log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin))
-    net = IENetwork.from_ir(model=model_xml, weights=model_bin)
+    net = IENetwork(model=model_xml, weights=model_bin)
 
     if plugin.device == "CPU":
         supported_layers = plugin.get_supported_layers(net)
@@ -108,8 +108,8 @@ def main():
         log.info("Performance counters:")
         print("{:<70} {:<15} {:<15} {:<15} {:<10}".format('name', 'layer_type', 'exet_type', 'status', 'real_time, us'))
         for layer, stats in perf_counts.items():
-            print ("{:<70} {:<15} {:<15} {:<15} {:<10}".format(layer, stats['layer_type'], stats['exec_type'],
-                                                               stats['status'], stats['real_time']))
+            print("{:<70} {:<15} {:<15} {:<15} {:<10}".format(layer, stats['layer_type'], stats['exec_type'],
+                                                              stats['status'], stats['real_time']))
 
     # Processing output blob
     log.info("Processing output blob")
index 9cf7d23..ae86555 100644 (file)
@@ -58,7 +58,7 @@ def main():
         plugin.add_cpu_extension(args.cpu_extension)
     # Read IR
     log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin))
-    net = IENetwork.from_ir(model=model_xml, weights=model_bin)
+    net = IENetwork(model=model_xml, weights=model_bin)
 
     if plugin.device == "CPU":
         supported_layers = plugin.get_supported_layers(net)
@@ -106,10 +106,10 @@ def main():
     if args.perf_counts:
         perf_counts = infer_request_handle.get_perf_counts()
         log.info("Performance counters:")
-        print ("{:<70} {:<15} {:<15} {:<15} {:<10}".format('name', 'layer_type', 'exet_type', 'status', 'real_time, us'))
+        print("{:<70} {:<15} {:<15} {:<15} {:<10}".format('name', 'layer_type', 'exet_type', 'status', 'real_time, us'))
         for layer, stats in perf_counts.items():
-            print ("{:<70} {:<15} {:<15} {:<15} {:<10}".format(layer, stats['layer_type'], stats['exec_type'],
-                                                               stats['status'], stats['real_time']))
+            print("{:<70} {:<15} {:<15} {:<15} {:<10}".format(layer, stats['layer_type'], stats['exec_type'],
+                                                              stats['status'], stats['real_time']))
     # Processing output blob
     log.info("Processing output blob")
     res = infer_request_handle.outputs[out_blob]
index 91563b3..193c5a5 100644 (file)
@@ -1,7 +1,7 @@
 """
 BSD 3-clause "New" or "Revised" license
 
-Copyright (C) 2018 Intel Coporation.
+Copyright (C) 2018 Intel Corporation.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
@@ -38,7 +38,7 @@ import boto3
 import timeit
 import datetime
 import json
-from collections import OrderedDict 
+from collections import OrderedDict
 
 from openvino.inference_engine import IENetwork, IEPlugin
 
@@ -82,6 +82,7 @@ PARAM_LABELMAP_FILE = os.environ.get("PARAM_LABELMAP_FILE")
 PARAM_TOPIC_NAME = os.environ.get("PARAM_TOPIC_NAME", "intel/faas/classification")
 PARAM_NUM_TOP_RESULTS = int(os.environ.get("PARAM_NUM_TOP_RESULTS", "10"))
 
+
 def report(res_json, frame):
     now = datetime.datetime.now()
     date_prefix = str(now).replace(" ", "_")
@@ -89,17 +90,18 @@ def report(res_json, frame):
         data = json.dumps(res_json)
         client.publish(topic=PARAM_TOPIC_NAME, payload=data)
     if enable_kinesis_output:
-        kinesis_client.put_record(StreamName=kinesis_stream_name, Data=json.dumps(res_json), PartitionKey=kinesis_partition_key)
+        kinesis_client.put_record(StreamName=kinesis_stream_name, Data=json.dumps(res_json),
+                                  PartitionKey=kinesis_partition_key)
     if enable_s3_jpeg_output:
         temp_image = os.path.join(PARAM_OUTPUT_DIRECTORY, "inference_result.jpeg")
         cv2.imwrite(temp_image, frame)
         with open(temp_image) as file:
             image_contents = file.read()
-            s3_client.put_object(Body=image_contents, Bucket=s3_bucket_name, Key=date_prefix + ".jpeg") 
+            s3_client.put_object(Body=image_contents, Bucket=s3_bucket_name, Key=date_prefix + ".jpeg")
     if enable_local_jpeg_output:
         cv2.imwrite(os.path.join(PARAM_OUTPUT_DIRECTORY, date_prefix + ".jpeg"), frame)
 
-    
+
 def greengrass_classification_sample_run():
     client.publish(topic=PARAM_TOPIC_NAME, payload="OpenVINO: Initializing...")
     model_bin = os.path.splitext(PARAM_MODEL_XML)[0] + ".bin"
@@ -109,7 +111,7 @@ def greengrass_classification_sample_run():
     if "CPU" in PARAM_DEVICE:
         plugin.add_cpu_extension(PARAM_CPU_EXTENSION_PATH)
     # Read IR
-    net = IENetwork.from_ir(model=PARAM_MODEL_XML, weights=model_bin)
+    net = IENetwork(model=PARAM_MODEL_XML, weights=model_bin)
     assert len(net.inputs.keys()) == 1, "Sample supports only single input topologies"
     assert len(net.outputs) == 1, "Sample supports only single output topologies"
     input_blob = next(iter(net.inputs))
@@ -126,9 +128,9 @@ def greengrass_classification_sample_run():
     res_json = []
     labeldata = None
     if PARAM_LABELMAP_FILE is not None:
-       with open(PARAM_LABELMAP_FILE) as labelmap_file:
+        with open(PARAM_LABELMAP_FILE) as labelmap_file:
             labeldata = json.load(labelmap_file)
-    
+
     while (cap.isOpened()):
         ret, frame = cap.read()
         if not ret:
@@ -148,17 +150,17 @@ def greengrass_classification_sample_run():
         res_json = OrderedDict()
         res_json["Candidates"] = OrderedDict()
         frame_timestamp = datetime.datetime.now()
-            
+
         for i in top_ind:
             classlabel = labeldata[str(i)] if labeldata else str(i)
             res_json["Candidates"][classlabel] = round(res[out_blob][0, i], 2)
-            
+
         frame_count += 1
         # Measure elapsed seconds since the last report
         seconds_elapsed = timeit.default_timer() - start_time
         if seconds_elapsed >= reporting_interval:
             res_json["timestamp"] = frame_timestamp.isoformat()
-            res_json["frame_id"] = int(frameid)   
+            res_json["frame_id"] = int(frameid)
             res_json["inference_fps"] = frame_count / inf_seconds
             start_time = timeit.default_timer()
             report(res_json, frame)
@@ -169,8 +171,10 @@ def greengrass_classification_sample_run():
     del exec_net
     del plugin
 
+
 greengrass_classification_sample_run()
 
+
 def function_handler(event, context):
     client.publish(topic=PARAM_TOPIC_NAME, payload='HANDLER_CALLED!')
     return
index 55c2f0f..e6898be 100644 (file)
@@ -1,7 +1,7 @@
 """
 BSD 3-clause "New" or "Revised" license
 
-Copyright (C) 2018 Intel Coporation.
+Copyright (C) 2018 Intel Corporation.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
@@ -38,7 +38,7 @@ import boto3
 import timeit
 import datetime
 import json
-from collections import OrderedDict 
+from collections import OrderedDict
 
 from openvino.inference_engine import IENetwork, IEPlugin
 
@@ -81,6 +81,7 @@ PARAM_CPU_EXTENSION_PATH = os.environ.get("PARAM_CPU_EXTENSION_PATH")
 PARAM_LABELMAP_FILE = os.environ.get("PARAM_LABELMAP_FILE")
 PARAM_TOPIC_NAME = os.environ.get("PARAM_TOPIC_NAME", "intel/faas/ssd")
 
+
 def report(res_json, frame):
     now = datetime.datetime.now()
     date_prefix = str(now).replace(" ", "_")
@@ -88,17 +89,18 @@ def report(res_json, frame):
         data = json.dumps(res_json)
         client.publish(topic=PARAM_TOPIC_NAME, payload=data)
     if enable_kinesis_output:
-        kinesis_client.put_record(StreamName=kinesis_stream_name, Data=json.dumps(res_json), PartitionKey=kinesis_partition_key)
+        kinesis_client.put_record(StreamName=kinesis_stream_name, Data=json.dumps(res_json),
+                                  PartitionKey=kinesis_partition_key)
     if enable_s3_jpeg_output:
         temp_image = os.path.join(PARAM_OUTPUT_DIRECTORY, "inference_result.jpeg")
         cv2.imwrite(temp_image, frame)
         with open(temp_image) as file:
             image_contents = file.read()
-            s3_client.put_object(Body=image_contents, Bucket=s3_bucket_name, Key=date_prefix + ".jpeg") 
+            s3_client.put_object(Body=image_contents, Bucket=s3_bucket_name, Key=date_prefix + ".jpeg")
     if enable_local_jpeg_output:
         cv2.imwrite(os.path.join(PARAM_OUTPUT_DIRECTORY, date_prefix + ".jpeg"), frame)
 
-    
+
 def greengrass_object_detection_sample_ssd_run():
     client.publish(topic=PARAM_TOPIC_NAME, payload="OpenVINO: Initializing...")
     model_bin = os.path.splitext(PARAM_MODEL_XML)[0] + ".bin"
@@ -108,7 +110,7 @@ def greengrass_object_detection_sample_ssd_run():
     if "CPU" in PARAM_DEVICE:
         plugin.add_cpu_extension(PARAM_CPU_EXTENSION_PATH)
     # Read IR
-    net = IENetwork.from_ir(model=PARAM_MODEL_XML, weights=model_bin)
+    net = IENetwork(model=PARAM_MODEL_XML, weights=model_bin)
     assert len(net.inputs.keys()) == 1, "Sample supports only single input topologies"
     assert len(net.outputs) == 1, "Sample supports only single output topologies"
     input_blob = next(iter(net.inputs))
@@ -124,9 +126,9 @@ def greengrass_object_detection_sample_ssd_run():
     frame_count = 0
     labeldata = None
     if PARAM_LABELMAP_FILE is not None:
-       with open(PARAM_LABELMAP_FILE) as labelmap_file:
+        with open(PARAM_LABELMAP_FILE) as labelmap_file:
             labeldata = json.load(labelmap_file)
-    
+
     while (cap.isOpened()):
         ret, frame = cap.read()
         if not ret:
@@ -142,26 +144,27 @@ def greengrass_object_detection_sample_ssd_run():
         res = exec_net.infer(inputs={input_blob: in_frame})
         inf_seconds += timeit.default_timer() - inf_start_time
         # Parse detection results of the current request
-        res_json = OrderedDict() 
-        frame_timestamp = datetime.datetime.now()    
+        res_json = OrderedDict()
+        frame_timestamp = datetime.datetime.now()
         object_id = 0
         for obj in res[out_blob][0][0]:
-             if obj[2] > 0.5:
-                 xmin = int(obj[3] * initial_w)
-                 ymin = int(obj[4] * initial_h)
-                 xmax = int(obj[5] * initial_w)
-                 ymax = int(obj[6] * initial_h)
-                 cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (255, 165, 20), 4)
-                 obj_id = "Object" + str(object_id)
-                 classlabel = labeldata[str(int(obj[1]))] if labeldata else ""
-                 res_json[obj_id] = {"label": int(obj[1]), "class": classlabel, "confidence": round(obj[2], 2), "xmin": round(obj[3], 2), "ymin": round(obj[4], 2), "xmax": round(obj[5], 2), "ymax": round(obj[6], 2)}
-                 object_id += 1
+            if obj[2] > 0.5:
+                xmin = int(obj[3] * initial_w)
+                ymin = int(obj[4] * initial_h)
+                xmax = int(obj[5] * initial_w)
+                ymax = int(obj[6] * initial_h)
+                cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (255, 165, 20), 4)
+                obj_id = "Object" + str(object_id)
+                classlabel = labeldata[str(int(obj[1]))] if labeldata else ""
+                res_json[obj_id] = {"label": int(obj[1]), "class": classlabel, "confidence": round(obj[2], 2), "xmin": round(
+                    obj[3], 2), "ymin": round(obj[4], 2), "xmax": round(obj[5], 2), "ymax": round(obj[6], 2)}
+                object_id += 1
         frame_count += 1
         # Measure elapsed seconds since the last report
         seconds_elapsed = timeit.default_timer() - start_time
         if seconds_elapsed >= reporting_interval:
             res_json["timestamp"] = frame_timestamp.isoformat()
-            res_json["frame_id"] = int(frameid)   
+            res_json["frame_id"] = int(frameid)
             res_json["inference_fps"] = frame_count / inf_seconds
             start_time = timeit.default_timer()
             report(res_json, frame)
@@ -172,8 +175,10 @@ def greengrass_object_detection_sample_ssd_run():
     del exec_net
     del plugin
 
+
 greengrass_object_detection_sample_ssd_run()
 
+
 def function_handler(event, context):
     client.publish(topic=PARAM_TOPIC_NAME, payload='HANDLER_CALLED!')
     return
diff --git a/inference-engine/ie_bridges/python/sample/jupyter_notebooks/classification_demo/classification_demo.ipynb b/inference-engine/ie_bridges/python/sample/jupyter_notebooks/classification_demo/classification_demo.ipynb
new file mode 100644 (file)
index 0000000..632672f
--- /dev/null
@@ -0,0 +1,463 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook demonstrates the worklflow of a simple image classification task.\n",
+    "We will go through all the pipeline steps: downloading the model, generating the Intermediate Representation (IR) using the Model Optimizer, running inference in Python, and parsing and interpretating the output results.\n",
+    "\n",
+    "To demonstrate the scenario, we will use the pre-trained SquezeNet V1.1 Caffe\\* model. SqueezeNet is a pretty accurate and at the same time lightweight network. For more information about the model, please visit <a href=\"https://github.com/DeepScale/SqueezeNet/\">GitHub</a> page and refer to original <a href=\"https://arxiv.org/abs/1602.07360\">SqueezeNet paper</a>.\n",
+    "\n",
+    "Follow the steps to perform image classification with the SquezeNet V1.1 model:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**1. Download the model files:** "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "echo \"Downloading deploy.prototxt ...\"\n",
+    "if [ -f deploy.prototxt ]; then\n",
+    "    echo \"deploy.prototxt file already exists. Download skipped\"\n",
+    "else\n",
+    "    wget https://raw.githubusercontent.com/DeepScale/SqueezeNet/a47b6f13d30985279789d08053d37013d67d131b/SqueezeNet_v1.1/deploy.prototxt -q\n",
+    "    echo \"Finished!\"\n",
+    "fi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "echo \"Downloading squeezenet_v1.1.caffemodel ...\"\n",
+    "if [ -f squeezenet_v1.1.caffemodel ]; then\n",
+    "    echo \"squeezenet_v1.1.caffemodel file already exists. Download skipped\"\n",
+    "else\n",
+    "    wget https://github.com/DeepScale/SqueezeNet/raw/a47b6f13d30985279789d08053d37013d67d131b/SqueezeNet_v1.1/squeezenet_v1.1.caffemodel -q\n",
+    "    echo \"Finished!\"\n",
+    "fi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Run the following command to see the model files:**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!ls -la"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* `deploy.prototxt` contains the network topology description in text format.\n",
+    "* `squeezenet_v1.1.caffemodel` contains the weights for all network layers."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**2. Optimize and convert the model from the initial Caffe representation to the IR representation, which is required for scoring the model with the Inference Engine. To convert and optimize the model, use the Model Optimizer command-line tool.**\n",
+    "\n",
+    "To locate the Model Optimizer scripts, specify the path to the Model Optimizer root directory in the `MO_ROOT` variable in the cell below and then run it (if you use the installed OpenVINO&trade; package, you can find the Model Optimizer in `<INSTALLATION_ROOT_DIR>/deployment_tools/model_optimizer`)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "MO_ROOT=/localdisk/repos/model-optimizer-tensorflow/\n",
+    "echo $MO_ROOT\n",
+    "python3 $MO_ROOT/mo.py --input_model squeezenet_v1.1.caffemodel --input_proto deploy.prototxt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**3. Now, you have the SqueezeNet model converted to the IR, and you can infer it.**\n",
+    "\n",
+    "a. First, import required modules:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openvino.inference_engine import IENetwork, IEPlugin\n",
+    "import numpy as np\n",
+    "import cv2\n",
+    "import logging as log\n",
+    "from time import time\n",
+    "import sys\n",
+    "import glob\n",
+    "import os\n",
+    "from matplotlib import pyplot as plt\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "b. Initialize required constants:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Configure logging format\n",
+    "log.basicConfig(format=\"[ %(levelname)s ] %(message)s\", level=log.INFO, stream=sys.stdout)\n",
+    "\n",
+    "# Path to IR model files\n",
+    "MODEL_XML = \"./squeezenet_v1.1.xml\"\n",
+    "MODEL_BIN = \"./squeezenet_v1.1.bin\"\n",
+    "\n",
+    "# Target device to run inference\n",
+    "TARGET_DEVICE = \"CPU\"\n",
+    "\n",
+    "# Folder with input images for the model\n",
+    "IMAGES_FOLDER = \"./images\"\n",
+    "\n",
+    "# File containing information about class names\n",
+    "LABELS_FILE = \"./image_net_synset.txt\"\n",
+    "\n",
+    "# Number of top prediction results to parse\n",
+    "NTOP = 5\n",
+    "\n",
+    "# Required batch size - number of images which will be processed in parallel\n",
+    "BATCH = 4"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "c. Create a plugin instance for the specified target device.  \n",
+    "d. Read the IR files and create an `IENetwork` instance:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plugin = IEPlugin(TARGET_DEVICE)\n",
+    "net = IENetwork(model=MODEL_XML, weights=MODEL_BIN)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "e. Set the network batch size to the constant specified above.\n",
+    "\n",
+    "Batch size is the amount of input data that will be inferred in parallel. In this case, it is the number of images that will be classified in parallel.\n",
+    "\n",
+    "You can set the network batch size using one of the following options:\n",
+    "1. At the IR generation stage, run the Model Optimizer with the `-b` command line option. For example, to generate the IR with a batch size of 4, add `-b 4` to the Model Optimizer command line. By default, the batch size is taken from the original network in the framework representation (usually it is equal to 1, but in this case the original Caffe model is provided with a batch size of 10).\n",
+    "2. Set the batch size with the Inference Engine after reading the IR. We will use this option.\n",
+    "\n",
+    "To set the batch size with the Inference Engine:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "log.info(\"Current network batch size is {}, will be changed to {}\".format(net.batch_size, BATCH))\n",
+    "net.batch_size = BATCH"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "f. After setting the batch size, you can get the required information about the network input layers.\n",
+    "To preprocess the input images, you need to know the input layer shape.\n",
+    "\n",
+    "The `inputs` property of `IENetwork` returns a dictionary with the input layer names and `InputInfo` objects, which contain information about an input layer, including its shape.\n",
+    "\n",
+    "SqueezeNet is a single-input topology, so to get the input layer name and its shape, you can take the first item from the `inputs` dictionary:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_layer = next(iter(net.inputs))\n",
+    "n,c,h,w = net.inputs[input_layer].shape\n",
+    "layout = net.inputs[input_layer].layout\n",
+    "log.info(\"Network input layer {} has shape {} and layout {}\".format(input_layer, (n,c,h,w), layout))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "So what do the shape and layout mean?  \n",
+    "The layout helps to interpret the meaning of the shape dimensions.\n",
+    "\n",
+    "The `NCHW` input layer layout means:\n",
+    "* the first dimension of the input data is a batch of **N** images processed in parallel\n",
+    "* the second dimension is the number of **C**hannels expected in the input images\n",
+    "* the third and the fourth are the spatial dimensions - the **H**eight and **W**idth of an input image\n",
+    "\n",
+    "Our shape means that the network expects four 3-channel 227x227 images processed in parallel."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "g. Read and preprocess the input images.\n",
+    "\n",
+    "To do this, go to `IMAGES_FOLDER`, find all `.bmp` files, and take four images for inference:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "search_pattern = os.path.join(IMAGES_FOLDER, \"*.bmp\")\n",
+    "images = glob.glob(search_pattern)[:BATCH]\n",
+    "log.info(\"Input images:\\n {}\".format(\"\\n\".join(images)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now you can read and preprocess the image files and create an array with the input blob data.\n",
+    "\n",
+    "For preprocessing, you must do the following:\n",
+    "1. Resize the images to fit the HxW input dimensions.\n",
+    "2. Transpose the data from the HWC layout to CHW.\n",
+    "\n",
+    "Transposing is tricky and not really obvious.\n",
+    "As you already saw above, the network has the `NCHW` layout, so each input image should be in the `CHW` format. But by default, OpenCV\\* reads images in the `HWC` format. That is why you have to swap the axes using the `numpy.transpose()` function:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_data = np.ndarray(shape=(n, c, h, w))\n",
+    "orig_images = [] # Will be used to show image in notebook\n",
+    "for i, img in enumerate(images):\n",
+    "    image = cv2.imread(img)\n",
+    "    orig_images.append(image)\n",
+    "    if image.shape[:-1] != (h, w):\n",
+    "        log.warning(\"Image {} is resized from {} to {}\".format(img, image.shape[:-1], (h, w)))\n",
+    "        image = cv2.resize(image, (w, h))\n",
+    "    image = image.transpose((2, 0, 1))  # Change data layout from HWC to CHW\n",
+    "    input_data[i] = image"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "i. Infer the model to classify the input images:\n",
+    "\n",
+    "1. Load the `IENetwork` object to the plugin to create an `ExecutableNetwork` object.    \n",
+    "2. Start inference using the `infer()` function, specifying a dictionary with the input layer name and the prepared data as the argument.     \n",
+    "3. Measure the inference time in milliseconds and calculate the throughput metric in frames per second (FPS)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "exec_net = plugin.load(net)\n",
+    "t0 = time()\n",
+    "res_map = exec_net.infer({input_layer: input_data})\n",
+    "inf_time = (time() - t0) * 1000\n",
+    "fps = BATCH * 1000 / inf_time\n",
+    "log.info(\"Inference time: {} ms.\".format(inf_time))\n",
+    "log.info(\"Throughput: {} fps.\".format(fps))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**4. After the inference, you need to parse and interpret the inference results.**\n",
+    "\n",
+    "First, look at the shape of the network output layer. It can be done in a similar way as for the inputs, but here you need to use the `outputs` property of the `IENetwork` object:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_layer = next(iter(net.outputs))\n",
+    "n,c,h,w = net.outputs[output_layer].shape\n",
+    "layout = net.outputs[output_layer].layout\n",
+    "log.info(\"Network output layer {} has shape {} and layout {}\".format(output_layer, (n,c,h,w), layout))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "It is not common for classification networks to have an output layer with the *NCHW* layout. Usually, it is just *NC*. However, in this case, the last two dimensions are just a feature of the network and do not carry much meaning. Ignore them, as you will remove them at the final parsing stage.\n",
+    "\n",
+    "What are the first and second dimensions of the output layer?    \n",
+    "* The first dimension is the batch. We processed four images, and the prediction result for a particular image is stored in the corresponding position of the first dimension of the output array. For example, the prediction results for the third image are in `res[2]` (since numbering starts from 0).\n",
+    "* The second dimension is an array of normalized probabilities (from 0 to 1) for each class. The network is trained on the <a href=\"http://image-net.org/index\">ImageNet</a> dataset with 1000 classes. The `n`-th value in the output data for a certain image represents the probability of the image belonging to the `n`-th class."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To parse the output results:\n",
+    "\n",
+    "a. Read the `LABELS_FILE`, which maps the class ID to human-readable class names:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(LABELS_FILE, 'r') as f:\n",
+    "    labels_map = [x.split(sep=' ', maxsplit=1)[-1].strip() for x in f]\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "b. Parse the output array with the prediction results. The parsing algorithm is the following:\n",
+    "1. Squeeze the last two \"extra\" dimensions of the output data.\n",
+    "2. Iterate over all batches.\n",
+    "3. Sort the probability vector in descending order to get the `NTOP` classes with the highest probabilities (by default, `numpy.argsort` sorts the data in ascending order, but using the array slicing `[::-1]`, you can reverse the order).\n",
+    "4. Map the `NTOP` probabilities to the corresponding labels in `labels_map`.\n",
+    "\n",
+    "For visualization, you also need to store the top-1 class and probability."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "top1_res = [] # will be used for the visualization\n",
+    "res = np.squeeze(res_map[output_layer])\n",
+    "log.info(\"Top {} results: \".format(NTOP))\n",
+    "for i, probs in enumerate(res):\n",
+    "    top_ind = np.argsort(probs)[-NTOP:][::-1]\n",
+    "    print(\"Image {}\".format(images[i]))\n",
+    "    top1_ind = top_ind[0]\n",
+    "    top1_res.append((labels_map[top1_ind], probs[top1_ind]))\n",
+    "    for id in top_ind:\n",
+    "        print(\"label: {}   probability: {:.2f}% \".format(labels_map[id], probs[id] * 100))\n",
+    "    print(\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The code above prints the results as plain text.   \n",
+    "You can also use OpenCV\\* to visualize the results using the `orig_images` and `top1_res` variables, which you created during image reading and result parsing:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.clf()\n",
+    "for i, img in enumerate(orig_images):\n",
+    "    label_str = \"{}\".format(top1_res[i][0].split(',')[0])\n",
+    "    prob_str = \"{:.2f}%\".format(top1_res[i][1] * 100)\n",
+    "    cv2.putText(img, label_str, (5, 15), cv2.FONT_HERSHEY_COMPLEX, 0.6, (220,100,10), 1)\n",
+    "    cv2.putText(img, prob_str, (5, 35), cv2.FONT_HERSHEY_COMPLEX, 0.6, (220,100,10), 1)\n",
+    "    plt.figure()\n",
+    "    plt.axis(\"off\")\n",
+    "    \n",
+    "    # We have to convert colors, because matplotlib expects an image in the RGB color format,\n",
+    "    # but by default OpenCV reads images in the BGR format\n",
+    "    im_to_show = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
+    "    plt.imshow(im_to_show)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
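
The new notebook walks through the pipeline cell by cell; condensed into one script, the same steps look roughly as follows. This is a sketch under the notebook's own assumptions — the SqueezeNet IR files and a `./images` folder with `.bmp` files exist in the working directory:

```python
import glob
import os

import cv2
import numpy as np
from openvino.inference_engine import IENetwork, IEPlugin

MODEL_XML, MODEL_BIN = "./squeezenet_v1.1.xml", "./squeezenet_v1.1.bin"
BATCH = 4

plugin = IEPlugin("CPU")
net = IENetwork(model=MODEL_XML, weights=MODEL_BIN)
net.batch_size = BATCH

input_layer = next(iter(net.inputs))
output_layer = next(iter(net.outputs))
n, c, h, w = net.inputs[input_layer].shape

# Read BATCH images and convert them from HWC (OpenCV default) to CHW layout
input_data = np.ndarray(shape=(n, c, h, w))
for i, path in enumerate(glob.glob(os.path.join("./images", "*.bmp"))[:BATCH]):
    image = cv2.resize(cv2.imread(path), (w, h))
    input_data[i] = image.transpose((2, 0, 1))

exec_net = plugin.load(net)
res = np.squeeze(exec_net.infer({input_layer: input_data})[output_layer])

# Top-5 class indices per image, highest probability first
for i, probs in enumerate(res):
    print("image {}: {}".format(i, np.argsort(probs)[-5:][::-1]))
```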
diff --git a/inference-engine/ie_bridges/python/sample/jupyter_notebooks/classification_demo/image_net_synset.txt b/inference-engine/ie_bridges/python/sample/jupyter_notebooks/classification_demo/image_net_synset.txt
new file mode 100644 (file)
index 0000000..a9e8c7f
--- /dev/null
@@ -0,0 +1,1000 @@
+n01440764 tench, Tinca tinca
+n01443537 goldfish, Carassius auratus
+n01484850 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias
+n01491361 tiger shark, Galeocerdo cuvieri
+n01494475 hammerhead, hammerhead shark
+n01496331 electric ray, crampfish, numbfish, torpedo
+n01498041 stingray
+n01514668 cock
+n01514859 hen
+n01518878 ostrich, Struthio camelus
+n01530575 brambling, Fringilla montifringilla
+n01531178 goldfinch, Carduelis carduelis
+n01532829 house finch, linnet, Carpodacus mexicanus
+n01534433 junco, snowbird
+n01537544 indigo bunting, indigo finch, indigo bird, Passerina cyanea
+n01558993 robin, American robin, Turdus migratorius
+n01560419 bulbul
+n01580077 jay
+n01582220 magpie
+n01592084 chickadee
+n01601694 water ouzel, dipper
+n01608432 kite
+n01614925 bald eagle, American eagle, Haliaeetus leucocephalus
+n01616318 vulture
+n01622779 great grey owl, great gray owl, Strix nebulosa
+n01629819 European fire salamander, Salamandra salamandra
+n01630670 common newt, Triturus vulgaris
+n01631663 eft
+n01632458 spotted salamander, Ambystoma maculatum
+n01632777 axolotl, mud puppy, Ambystoma mexicanum
+n01641577 bullfrog, Rana catesbeiana
+n01644373 tree frog, tree-frog
+n01644900 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui
+n01664065 loggerhead, loggerhead turtle, Caretta caretta
+n01665541 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea
+n01667114 mud turtle
+n01667778 terrapin
+n01669191 box turtle, box tortoise
+n01675722 banded gecko
+n01677366 common iguana, iguana, Iguana iguana
+n01682714 American chameleon, anole, Anolis carolinensis
+n01685808 whiptail, whiptail lizard
+n01687978 agama
+n01688243 frilled lizard, Chlamydosaurus kingi
+n01689811 alligator lizard
+n01692333 Gila monster, Heloderma suspectum
+n01693334 green lizard, Lacerta viridis
+n01694178 African chameleon, Chamaeleo chamaeleon
+n01695060 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis
+n01697457 African crocodile, Nile crocodile, Crocodylus niloticus
+n01698640 American alligator, Alligator mississipiensis
+n01704323 triceratops
+n01728572 thunder snake, worm snake, Carphophis amoenus
+n01728920 ringneck snake, ring-necked snake, ring snake
+n01729322 hognose snake, puff adder, sand viper
+n01729977 green snake, grass snake
+n01734418 king snake, kingsnake
+n01735189 garter snake, grass snake
+n01737021 water snake
+n01739381 vine snake
+n01740131 night snake, Hypsiglena torquata
+n01742172 boa constrictor, Constrictor constrictor
+n01744401 rock python, rock snake, Python sebae
+n01748264 Indian cobra, Naja naja
+n01749939 green mamba
+n01751748 sea snake
+n01753488 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus
+n01755581 diamondback, diamondback rattlesnake, Crotalus adamanteus
+n01756291 sidewinder, horned rattlesnake, Crotalus cerastes
+n01768244 trilobite
+n01770081 harvestman, daddy longlegs, Phalangium opilio
+n01770393 scorpion
+n01773157 black and gold garden spider, Argiope aurantia
+n01773549 barn spider, Araneus cavaticus
+n01773797 garden spider, Aranea diademata
+n01774384 black widow, Latrodectus mactans
+n01774750 tarantula
+n01775062 wolf spider, hunting spider
+n01776313 tick
+n01784675 centipede
+n01795545 black grouse
+n01796340 ptarmigan
+n01797886 ruffed grouse, partridge, Bonasa umbellus
+n01798484 prairie chicken, prairie grouse, prairie fowl
+n01806143 peacock
+n01806567 quail
+n01807496 partridge
+n01817953 African grey, African gray, Psittacus erithacus
+n01818515 macaw
+n01819313 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita
+n01820546 lorikeet
+n01824575 coucal
+n01828970 bee eater
+n01829413 hornbill
+n01833805 hummingbird
+n01843065 jacamar
+n01843383 toucan
+n01847000 drake
+n01855032 red-breasted merganser, Mergus serrator
+n01855672 goose
+n01860187 black swan, Cygnus atratus
+n01871265 tusker
+n01872401 echidna, spiny anteater, anteater
+n01873310 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus
+n01877812 wallaby, brush kangaroo
+n01882714 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus
+n01883070 wombat
+n01910747 jellyfish
+n01914609 sea anemone, anemone
+n01917289 brain coral
+n01924916 flatworm, platyhelminth
+n01930112 nematode, nematode worm, roundworm
+n01943899 conch
+n01944390 snail
+n01945685 slug
+n01950731 sea slug, nudibranch
+n01955084 chiton, coat-of-mail shell, sea cradle, polyplacophore
+n01968897 chambered nautilus, pearly nautilus, nautilus
+n01978287 Dungeness crab, Cancer magister
+n01978455 rock crab, Cancer irroratus
+n01980166 fiddler crab
+n01981276 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica
+n01983481 American lobster, Northern lobster, Maine lobster, Homarus americanus
+n01984695 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish
+n01985128 crayfish, crawfish, crawdad, crawdaddy
+n01986214 hermit crab
+n01990800 isopod
+n02002556 white stork, Ciconia ciconia
+n02002724 black stork, Ciconia nigra
+n02006656 spoonbill
+n02007558 flamingo
+n02009229 little blue heron, Egretta caerulea
+n02009912 American egret, great white heron, Egretta albus
+n02011460 bittern
+n02012849 crane
+n02013706 limpkin, Aramus pictus
+n02017213 European gallinule, Porphyrio porphyrio
+n02018207 American coot, marsh hen, mud hen, water hen, Fulica americana
+n02018795 bustard
+n02025239 ruddy turnstone, Arenaria interpres
+n02027492 red-backed sandpiper, dunlin, Erolia alpina
+n02028035 redshank, Tringa totanus
+n02033041 dowitcher
+n02037110 oystercatcher, oyster catcher
+n02051845 pelican
+n02056570 king penguin, Aptenodytes patagonica
+n02058221 albatross, mollymawk
+n02066245 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus
+n02071294 killer whale, killer, orca, grampus, sea wolf, Orcinus orca
+n02074367 dugong, Dugong dugon
+n02077923 sea lion
+n02085620 Chihuahua
+n02085782 Japanese spaniel
+n02085936 Maltese dog, Maltese terrier, Maltese
+n02086079 Pekinese, Pekingese, Peke
+n02086240 Shih-Tzu
+n02086646 Blenheim spaniel
+n02086910 papillon
+n02087046 toy terrier
+n02087394 Rhodesian ridgeback
+n02088094 Afghan hound, Afghan
+n02088238 basset, basset hound
+n02088364 beagle
+n02088466 bloodhound, sleuthhound
+n02088632 bluetick
+n02089078 black-and-tan coonhound
+n02089867 Walker hound, Walker foxhound
+n02089973 English foxhound
+n02090379 redbone
+n02090622 borzoi, Russian wolfhound
+n02090721 Irish wolfhound
+n02091032 Italian greyhound
+n02091134 whippet
+n02091244 Ibizan hound, Ibizan Podenco
+n02091467 Norwegian elkhound, elkhound
+n02091635 otterhound, otter hound
+n02091831 Saluki, gazelle hound
+n02092002 Scottish deerhound, deerhound
+n02092339 Weimaraner
+n02093256 Staffordshire bullterrier, Staffordshire bull terrier
+n02093428 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier
+n02093647 Bedlington terrier
+n02093754 Border terrier
+n02093859 Kerry blue terrier
+n02093991 Irish terrier
+n02094114 Norfolk terrier
+n02094258 Norwich terrier
+n02094433 Yorkshire terrier
+n02095314 wire-haired fox terrier
+n02095570 Lakeland terrier
+n02095889 Sealyham terrier, Sealyham
+n02096051 Airedale, Airedale terrier
+n02096177 cairn, cairn terrier
+n02096294 Australian terrier
+n02096437 Dandie Dinmont, Dandie Dinmont terrier
+n02096585 Boston bull, Boston terrier
+n02097047 miniature schnauzer
+n02097130 giant schnauzer
+n02097209 standard schnauzer
+n02097298 Scotch terrier, Scottish terrier, Scottie
+n02097474 Tibetan terrier, chrysanthemum dog
+n02097658 silky terrier, Sydney silky
+n02098105 soft-coated wheaten terrier
+n02098286 West Highland white terrier
+n02098413 Lhasa, Lhasa apso
+n02099267 flat-coated retriever
+n02099429 curly-coated retriever
+n02099601 golden retriever
+n02099712 Labrador retriever
+n02099849 Chesapeake Bay retriever
+n02100236 German short-haired pointer
+n02100583 vizsla, Hungarian pointer
+n02100735 English setter
+n02100877 Irish setter, red setter
+n02101006 Gordon setter
+n02101388 Brittany spaniel
+n02101556 clumber, clumber spaniel
+n02102040 English springer, English springer spaniel
+n02102177 Welsh springer spaniel
+n02102318 cocker spaniel, English cocker spaniel, cocker
+n02102480 Sussex spaniel
+n02102973 Irish water spaniel
+n02104029 kuvasz
+n02104365 schipperke
+n02105056 groenendael
+n02105162 malinois
+n02105251 briard
+n02105412 kelpie
+n02105505 komondor
+n02105641 Old English sheepdog, bobtail
+n02105855 Shetland sheepdog, Shetland sheep dog, Shetland
+n02106030 collie
+n02106166 Border collie
+n02106382 Bouvier des Flandres, Bouviers des Flandres
+n02106550 Rottweiler
+n02106662 German shepherd, German shepherd dog, German police dog, alsatian
+n02107142 Doberman, Doberman pinscher
+n02107312 miniature pinscher
+n02107574 Greater Swiss Mountain dog
+n02107683 Bernese mountain dog
+n02107908 Appenzeller
+n02108000 EntleBucher
+n02108089 boxer
+n02108422 bull mastiff
+n02108551 Tibetan mastiff
+n02108915 French bulldog
+n02109047 Great Dane
+n02109525 Saint Bernard, St Bernard
+n02109961 Eskimo dog, husky
+n02110063 malamute, malemute, Alaskan malamute
+n02110185 Siberian husky
+n02110341 dalmatian, coach dog, carriage dog
+n02110627 affenpinscher, monkey pinscher, monkey dog
+n02110806 basenji
+n02110958 pug, pug-dog
+n02111129 Leonberg
+n02111277 Newfoundland, Newfoundland dog
+n02111500 Great Pyrenees
+n02111889 Samoyed, Samoyede
+n02112018 Pomeranian
+n02112137 chow, chow chow
+n02112350 keeshond
+n02112706 Brabancon griffon
+n02113023 Pembroke, Pembroke Welsh corgi
+n02113186 Cardigan, Cardigan Welsh corgi
+n02113624 toy poodle
+n02113712 miniature poodle
+n02113799 standard poodle
+n02113978 Mexican hairless
+n02114367 timber wolf, grey wolf, gray wolf, Canis lupus
+n02114548 white wolf, Arctic wolf, Canis lupus tundrarum
+n02114712 red wolf, maned wolf, Canis rufus, Canis niger
+n02114855 coyote, prairie wolf, brush wolf, Canis latrans
+n02115641 dingo, warrigal, warragal, Canis dingo
+n02115913 dhole, Cuon alpinus
+n02116738 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus
+n02117135 hyena, hyaena
+n02119022 red fox, Vulpes vulpes
+n02119789 kit fox, Vulpes macrotis
+n02120079 Arctic fox, white fox, Alopex lagopus
+n02120505 grey fox, gray fox, Urocyon cinereoargenteus
+n02123045 tabby, tabby cat
+n02123159 tiger cat
+n02123394 Persian cat
+n02123597 Siamese cat, Siamese
+n02124075 Egyptian cat
+n02125311 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor
+n02127052 lynx, catamount
+n02128385 leopard, Panthera pardus
+n02128757 snow leopard, ounce, Panthera uncia
+n02128925 jaguar, panther, Panthera onca, Felis onca
+n02129165 lion, king of beasts, Panthera leo
+n02129604 tiger, Panthera tigris
+n02130308 cheetah, chetah, Acinonyx jubatus
+n02132136 brown bear, bruin, Ursus arctos
+n02133161 American black bear, black bear, Ursus americanus, Euarctos americanus
+n02134084 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus
+n02134418 sloth bear, Melursus ursinus, Ursus ursinus
+n02137549 mongoose
+n02138441 meerkat, mierkat
+n02165105 tiger beetle
+n02165456 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle
+n02167151 ground beetle, carabid beetle
+n02168699 long-horned beetle, longicorn, longicorn beetle
+n02169497 leaf beetle, chrysomelid
+n02172182 dung beetle
+n02174001 rhinoceros beetle
+n02177972 weevil
+n02190166 fly
+n02206856 bee
+n02219486 ant, emmet, pismire
+n02226429 grasshopper, hopper
+n02229544 cricket
+n02231487 walking stick, walkingstick, stick insect
+n02233338 cockroach, roach
+n02236044 mantis, mantid
+n02256656 cicada, cicala
+n02259212 leafhopper
+n02264363 lacewing, lacewing fly
+n02268443 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk
+n02268853 damselfly
+n02276258 admiral
+n02277742 ringlet, ringlet butterfly
+n02279972 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus
+n02280649 cabbage butterfly
+n02281406 sulphur butterfly, sulfur butterfly
+n02281787 lycaenid, lycaenid butterfly
+n02317335 starfish, sea star
+n02319095 sea urchin
+n02321529 sea cucumber, holothurian
+n02325366 wood rabbit, cottontail, cottontail rabbit
+n02326432 hare
+n02328150 Angora, Angora rabbit
+n02342885 hamster
+n02346627 porcupine, hedgehog
+n02356798 fox squirrel, eastern fox squirrel, Sciurus niger
+n02361337 marmot
+n02363005 beaver
+n02364673 guinea pig, Cavia cobaya
+n02389026 sorrel
+n02391049 zebra
+n02395406 hog, pig, grunter, squealer, Sus scrofa
+n02396427 wild boar, boar, Sus scrofa
+n02397096 warthog
+n02398521 hippopotamus, hippo, river horse, Hippopotamus amphibius
+n02403003 ox
+n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis
+n02410509 bison
+n02412080 ram, tup
+n02415577 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis
+n02417914 ibex, Capra ibex
+n02422106 hartebeest
+n02422699 impala, Aepyceros melampus
+n02423022 gazelle
+n02437312 Arabian camel, dromedary, Camelus dromedarius
+n02437616 llama
+n02441942 weasel
+n02442845 mink
+n02443114 polecat, fitch, foulmart, foumart, Mustela putorius
+n02443484 black-footed ferret, ferret, Mustela nigripes
+n02444819 otter
+n02445715 skunk, polecat, wood pussy
+n02447366 badger
+n02454379 armadillo
+n02457408 three-toed sloth, ai, Bradypus tridactylus
+n02480495 orangutan, orang, orangutang, Pongo pygmaeus
+n02480855 gorilla, Gorilla gorilla
+n02481823 chimpanzee, chimp, Pan troglodytes
+n02483362 gibbon, Hylobates lar
+n02483708 siamang, Hylobates syndactylus, Symphalangus syndactylus
+n02484975 guenon, guenon monkey
+n02486261 patas, hussar monkey, Erythrocebus patas
+n02486410 baboon
+n02487347 macaque
+n02488291 langur
+n02488702 colobus, colobus monkey
+n02489166 proboscis monkey, Nasalis larvatus
+n02490219 marmoset
+n02492035 capuchin, ringtail, Cebus capucinus
+n02492660 howler monkey, howler
+n02493509 titi, titi monkey
+n02493793 spider monkey, Ateles geoffroyi
+n02494079 squirrel monkey, Saimiri sciureus
+n02497673 Madagascar cat, ring-tailed lemur, Lemur catta
+n02500267 indri, indris, Indri indri, Indri brevicaudatus
+n02504013 Indian elephant, Elephas maximus
+n02504458 African elephant, Loxodonta africana
+n02509815 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens
+n02510455 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca
+n02514041 barracouta, snoek
+n02526121 eel
+n02536864 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch
+n02606052 rock beauty, Holocanthus tricolor
+n02607072 anemone fish
+n02640242 sturgeon
+n02641379 gar, garfish, garpike, billfish, Lepisosteus osseus
+n02643566 lionfish
+n02655020 puffer, pufferfish, blowfish, globefish
+n02666196 abacus
+n02667093 abaya
+n02669723 academic gown, academic robe, judge's robe
+n02672831 accordion, piano accordion, squeeze box
+n02676566 acoustic guitar
+n02687172 aircraft carrier, carrier, flattop, attack aircraft carrier
+n02690373 airliner
+n02692877 airship, dirigible
+n02699494 altar
+n02701002 ambulance
+n02704792 amphibian, amphibious vehicle
+n02708093 analog clock
+n02727426 apiary, bee house
+n02730930 apron
+n02747177 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin
+n02749479 assault rifle, assault gun
+n02769748 backpack, back pack, knapsack, packsack, rucksack, haversack
+n02776631 bakery, bakeshop, bakehouse
+n02777292 balance beam, beam
+n02782093 balloon
+n02783161 ballpoint, ballpoint pen, ballpen, Biro
+n02786058 Band Aid
+n02787622 banjo
+n02788148 bannister, banister, balustrade, balusters, handrail
+n02790996 barbell
+n02791124 barber chair
+n02791270 barbershop
+n02793495 barn
+n02794156 barometer
+n02795169 barrel, cask
+n02797295 barrow, garden cart, lawn cart, wheelbarrow
+n02799071 baseball
+n02802426 basketball
+n02804414 bassinet
+n02804610 bassoon
+n02807133 bathing cap, swimming cap
+n02808304 bath towel
+n02808440 bathtub, bathing tub, bath, tub
+n02814533 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon
+n02814860 beacon, lighthouse, beacon light, pharos
+n02815834 beaker
+n02817516 bearskin, busby, shako
+n02823428 beer bottle
+n02823750 beer glass
+n02825657 bell cote, bell cot
+n02834397 bib
+n02835271 bicycle-built-for-two, tandem bicycle, tandem
+n02837789 bikini, two-piece
+n02840245 binder, ring-binder
+n02841315 binoculars, field glasses, opera glasses
+n02843684 birdhouse
+n02859443 boathouse
+n02860847 bobsled, bobsleigh, bob
+n02865351 bolo tie, bolo, bola tie, bola
+n02869837 bonnet, poke bonnet
+n02870880 bookcase
+n02871525 bookshop, bookstore, bookstall
+n02877765 bottlecap
+n02879718 bow
+n02883205 bow tie, bow-tie, bowtie
+n02892201 brass, memorial tablet, plaque
+n02892767 brassiere, bra, bandeau
+n02894605 breakwater, groin, groyne, mole, bulwark, seawall, jetty
+n02895154 breastplate, aegis, egis
+n02906734 broom
+n02909870 bucket, pail
+n02910353 buckle
+n02916936 bulletproof vest
+n02917067 bullet train, bullet
+n02927161 butcher shop, meat market
+n02930766 cab, hack, taxi, taxicab
+n02939185 caldron, cauldron
+n02948072 candle, taper, wax light
+n02950826 cannon
+n02951358 canoe
+n02951585 can opener, tin opener
+n02963159 cardigan
+n02965783 car mirror
+n02966193 carousel, carrousel, merry-go-round, roundabout, whirligig
+n02966687 carpenter's kit, tool kit
+n02971356 carton
+n02974003 car wheel
+n02977058 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM
+n02978881 cassette
+n02979186 cassette player
+n02980441 castle
+n02981792 catamaran
+n02988304 CD player
+n02992211 cello, violoncello
+n02992529 cellular telephone, cellular phone, cellphone, cell, mobile phone
+n02999410 chain
+n03000134 chainlink fence
+n03000247 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour
+n03000684 chain saw, chainsaw
+n03014705 chest
+n03016953 chiffonier, commode
+n03017168 chime, bell, gong
+n03018349 china cabinet, china closet
+n03026506 Christmas stocking
+n03028079 church, church building
+n03032252 cinema, movie theater, movie theatre, movie house, picture palace
+n03041632 cleaver, meat cleaver, chopper
+n03042490 cliff dwelling
+n03045698 cloak
+n03047690 clog, geta, patten, sabot
+n03062245 cocktail shaker
+n03063599 coffee mug
+n03063689 coffeepot
+n03065424 coil, spiral, volute, whorl, helix
+n03075370 combination lock
+n03085013 computer keyboard, keypad
+n03089624 confectionery, confectionary, candy store
+n03095699 container ship, containership, container vessel
+n03100240 convertible
+n03109150 corkscrew, bottle screw
+n03110669 cornet, horn, trumpet, trump
+n03124043 cowboy boot
+n03124170 cowboy hat, ten-gallon hat
+n03125729 cradle
+n03126707 crane
+n03127747 crash helmet
+n03127925 crate
+n03131574 crib, cot
+n03133878 Crock Pot
+n03134739 croquet ball
+n03141823 crutch
+n03146219 cuirass
+n03160309 dam, dike, dyke
+n03179701 desk
+n03180011 desktop computer
+n03187595 dial telephone, dial phone
+n03188531 diaper, nappy, napkin
+n03196217 digital clock
+n03197337 digital watch
+n03201208 dining table, board
+n03207743 dishrag, dishcloth
+n03207941 dishwasher, dish washer, dishwashing machine
+n03208938 disk brake, disc brake
+n03216828 dock, dockage, docking facility
+n03218198 dogsled, dog sled, dog sleigh
+n03220513 dome
+n03223299 doormat, welcome mat
+n03240683 drilling platform, offshore rig
+n03249569 drum, membranophone, tympan
+n03250847 drumstick
+n03255030 dumbbell
+n03259280 Dutch oven
+n03271574 electric fan, blower
+n03272010 electric guitar
+n03272562 electric locomotive
+n03290653 entertainment center
+n03291819 envelope
+n03297495 espresso maker
+n03314780 face powder
+n03325584 feather boa, boa
+n03337140 file, file cabinet, filing cabinet
+n03344393 fireboat
+n03345487 fire engine, fire truck
+n03347037 fire screen, fireguard
+n03355925 flagpole, flagstaff
+n03372029 flute, transverse flute
+n03376595 folding chair
+n03379051 football helmet
+n03384352 forklift
+n03388043 fountain
+n03388183 fountain pen
+n03388549 four-poster
+n03393912 freight car
+n03394916 French horn, horn
+n03400231 frying pan, frypan, skillet
+n03404251 fur coat
+n03417042 garbage truck, dustcart
+n03424325 gasmask, respirator, gas helmet
+n03425413 gas pump, gasoline pump, petrol pump, island dispenser
+n03443371 goblet
+n03444034 go-kart
+n03445777 golf ball
+n03445924 golfcart, golf cart
+n03447447 gondola
+n03447721 gong, tam-tam
+n03450230 gown
+n03452741 grand piano, grand
+n03457902 greenhouse, nursery, glasshouse
+n03459775 grille, radiator grille
+n03461385 grocery store, grocery, food market, market
+n03467068 guillotine
+n03476684 hair slide
+n03476991 hair spray
+n03478589 half track
+n03481172 hammer
+n03482405 hamper
+n03483316 hand blower, blow dryer, blow drier, hair dryer, hair drier
+n03485407 hand-held computer, hand-held microcomputer
+n03485794 handkerchief, hankie, hanky, hankey
+n03492542 hard disc, hard disk, fixed disk
+n03494278 harmonica, mouth organ, harp, mouth harp
+n03495258 harp
+n03496892 harvester, reaper
+n03498962 hatchet
+n03527444 holster
+n03529860 home theater, home theatre
+n03530642 honeycomb
+n03532672 hook, claw
+n03534580 hoopskirt, crinoline
+n03535780 horizontal bar, high bar
+n03538406 horse cart, horse-cart
+n03544143 hourglass
+n03584254 iPod
+n03584829 iron, smoothing iron
+n03590841 jack-o'-lantern
+n03594734 jean, blue jean, denim
+n03594945 jeep, landrover
+n03595614 jersey, T-shirt, tee shirt
+n03598930 jigsaw puzzle
+n03599486 jinrikisha, ricksha, rickshaw
+n03602883 joystick
+n03617480 kimono
+n03623198 knee pad
+n03627232 knot
+n03630383 lab coat, laboratory coat
+n03633091 ladle
+n03637318 lampshade, lamp shade
+n03642806 laptop, laptop computer
+n03649909 lawn mower, mower
+n03657121 lens cap, lens cover
+n03658185 letter opener, paper knife, paperknife
+n03661043 library
+n03662601 lifeboat
+n03666591 lighter, light, igniter, ignitor
+n03670208 limousine, limo
+n03673027 liner, ocean liner
+n03676483 lipstick, lip rouge
+n03680355 Loafer
+n03690938 lotion
+n03691459 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system
+n03692522 loupe, jeweler's loupe
+n03697007 lumbermill, sawmill
+n03706229 magnetic compass
+n03709823 mailbag, postbag
+n03710193 mailbox, letter box
+n03710637 maillot
+n03710721 maillot, tank suit
+n03717622 manhole cover
+n03720891 maraca
+n03721384 marimba, xylophone
+n03724870 mask
+n03729826 matchstick
+n03733131 maypole
+n03733281 maze, labyrinth
+n03733805 measuring cup
+n03742115 medicine chest, medicine cabinet
+n03743016 megalith, megalithic structure
+n03759954 microphone, mike
+n03761084 microwave, microwave oven
+n03763968 military uniform
+n03764736 milk can
+n03769881 minibus
+n03770439 miniskirt, mini
+n03770679 minivan
+n03773504 missile
+n03775071 mitten
+n03775546 mixing bowl
+n03776460 mobile home, manufactured home
+n03777568 Model T
+n03777754 modem
+n03781244 monastery
+n03782006 monitor
+n03785016 moped
+n03786901 mortar
+n03787032 mortarboard
+n03788195 mosque
+n03788365 mosquito net
+n03791053 motor scooter, scooter
+n03792782 mountain bike, all-terrain bike, off-roader
+n03792972 mountain tent
+n03793489 mouse, computer mouse
+n03794056 mousetrap
+n03796401 moving van
+n03803284 muzzle
+n03804744 nail
+n03814639 neck brace
+n03814906 necklace
+n03825788 nipple
+n03832673 notebook, notebook computer
+n03837869 obelisk
+n03838899 oboe, hautboy, hautbois
+n03840681 ocarina, sweet potato
+n03841143 odometer, hodometer, mileometer, milometer
+n03843555 oil filter
+n03854065 organ, pipe organ
+n03857828 oscilloscope, scope, cathode-ray oscilloscope, CRO
+n03866082 overskirt
+n03868242 oxcart
+n03868863 oxygen mask
+n03871628 packet
+n03873416 paddle, boat paddle
+n03874293 paddlewheel, paddle wheel
+n03874599 padlock
+n03876231 paintbrush
+n03877472 pajama, pyjama, pj's, jammies
+n03877845 palace
+n03884397 panpipe, pandean pipe, syrinx
+n03887697 paper towel
+n03888257 parachute, chute
+n03888605 parallel bars, bars
+n03891251 park bench
+n03891332 parking meter
+n03895866 passenger car, coach, carriage
+n03899768 patio, terrace
+n03902125 pay-phone, pay-station
+n03903868 pedestal, plinth, footstall
+n03908618 pencil box, pencil case
+n03908714 pencil sharpener
+n03916031 perfume, essence
+n03920288 Petri dish
+n03924679 photocopier
+n03929660 pick, plectrum, plectron
+n03929855 pickelhaube
+n03930313 picket fence, paling
+n03930630 pickup, pickup truck
+n03933933 pier
+n03935335 piggy bank, penny bank
+n03937543 pill bottle
+n03938244 pillow
+n03942813 ping-pong ball
+n03944341 pinwheel
+n03947888 pirate, pirate ship
+n03950228 pitcher, ewer
+n03954731 plane, carpenter's plane, woodworking plane
+n03956157 planetarium
+n03958227 plastic bag
+n03961711 plate rack
+n03967562 plow, plough
+n03970156 plunger, plumber's helper
+n03976467 Polaroid camera, Polaroid Land camera
+n03976657 pole
+n03977966 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria
+n03980874 poncho
+n03982430 pool table, billiard table, snooker table
+n03983396 pop bottle, soda bottle
+n03991062 pot, flowerpot
+n03992509 potter's wheel
+n03995372 power drill
+n03998194 prayer rug, prayer mat
+n04004767 printer
+n04005630 prison, prison house
+n04008634 projectile, missile
+n04009552 projector
+n04019541 puck, hockey puck
+n04023962 punching bag, punch bag, punching ball, punchball
+n04026417 purse
+n04033901 quill, quill pen
+n04033995 quilt, comforter, comfort, puff
+n04037443 racer, race car, racing car
+n04039381 racket, racquet
+n04040759 radiator
+n04041544 radio, wireless
+n04044716 radio telescope, radio reflector
+n04049303 rain barrel
+n04065272 recreational vehicle, RV, R.V.
+n04067472 reel
+n04069434 reflex camera
+n04070727 refrigerator, icebox
+n04074963 remote control, remote
+n04081281 restaurant, eating house, eating place, eatery
+n04086273 revolver, six-gun, six-shooter
+n04090263 rifle
+n04099969 rocking chair, rocker
+n04111531 rotisserie
+n04116512 rubber eraser, rubber, pencil eraser
+n04118538 rugby ball
+n04118776 rule, ruler
+n04120489 running shoe
+n04125021 safe
+n04127249 safety pin
+n04131690 saltshaker, salt shaker
+n04133789 sandal
+n04136333 sarong
+n04141076 sax, saxophone
+n04141327 scabbard
+n04141975 scale, weighing machine
+n04146614 school bus
+n04147183 schooner
+n04149813 scoreboard
+n04152593 screen, CRT screen
+n04153751 screw
+n04154565 screwdriver
+n04162706 seat belt, seatbelt
+n04179913 sewing machine
+n04192698 shield, buckler
+n04200800 shoe shop, shoe-shop, shoe store
+n04201297 shoji
+n04204238 shopping basket
+n04204347 shopping cart
+n04208210 shovel
+n04209133 shower cap
+n04209239 shower curtain
+n04228054 ski
+n04229816 ski mask
+n04235860 sleeping bag
+n04238763 slide rule, slipstick
+n04239074 sliding door
+n04243546 slot, one-armed bandit
+n04251144 snorkel
+n04252077 snowmobile
+n04252225 snowplow, snowplough
+n04254120 soap dispenser
+n04254680 soccer ball
+n04254777 sock
+n04258138 solar dish, solar collector, solar furnace
+n04259630 sombrero
+n04263257 soup bowl
+n04264628 space bar
+n04265275 space heater
+n04266014 space shuttle
+n04270147 spatula
+n04273569 speedboat
+n04275548 spider web, spider's web
+n04277352 spindle
+n04285008 sports car, sport car
+n04286575 spotlight, spot
+n04296562 stage
+n04310018 steam locomotive
+n04311004 steel arch bridge
+n04311174 steel drum
+n04317175 stethoscope
+n04325704 stole
+n04326547 stone wall
+n04328186 stopwatch, stop watch
+n04330267 stove
+n04332243 strainer
+n04335435 streetcar, tram, tramcar, trolley, trolley car
+n04336792 stretcher
+n04344873 studio couch, day bed
+n04346328 stupa, tope
+n04347754 submarine, pigboat, sub, U-boat
+n04350905 suit, suit of clothes
+n04355338 sundial
+n04355933 sunglass
+n04356056 sunglasses, dark glasses, shades
+n04357314 sunscreen, sunblock, sun blocker
+n04366367 suspension bridge
+n04367480 swab, swob, mop
+n04370456 sweatshirt
+n04371430 swimming trunks, bathing trunks
+n04371774 swing
+n04372370 switch, electric switch, electrical switch
+n04376876 syringe
+n04380533 table lamp
+n04389033 tank, army tank, armored combat vehicle, armoured combat vehicle
+n04392985 tape player
+n04398044 teapot
+n04399382 teddy, teddy bear
+n04404412 television, television system
+n04409515 tennis ball
+n04417672 thatch, thatched roof
+n04418357 theater curtain, theatre curtain
+n04423845 thimble
+n04428191 thresher, thrasher, threshing machine
+n04429376 throne
+n04435653 tile roof
+n04442312 toaster
+n04443257 tobacco shop, tobacconist shop, tobacconist
+n04447861 toilet seat
+n04456115 torch
+n04458633 totem pole
+n04461696 tow truck, tow car, wrecker
+n04462240 toyshop
+n04465501 tractor
+n04467665 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi
+n04476259 tray
+n04479046 trench coat
+n04482393 tricycle, trike, velocipede
+n04483307 trimaran
+n04485082 tripod
+n04486054 triumphal arch
+n04487081 trolleybus, trolley coach, trackless trolley
+n04487394 trombone
+n04493381 tub, vat
+n04501370 turnstile
+n04505470 typewriter keyboard
+n04507155 umbrella
+n04509417 unicycle, monocycle
+n04515003 upright, upright piano
+n04517823 vacuum, vacuum cleaner
+n04522168 vase
+n04523525 vault
+n04525038 velvet
+n04525305 vending machine
+n04532106 vestment
+n04532670 viaduct
+n04536866 violin, fiddle
+n04540053 volleyball
+n04542943 waffle iron
+n04548280 wall clock
+n04548362 wallet, billfold, notecase, pocketbook
+n04550184 wardrobe, closet, press
+n04552348 warplane, military plane
+n04553703 washbasin, handbasin, washbowl, lavabo, wash-hand basin
+n04554684 washer, automatic washer, washing machine
+n04557648 water bottle
+n04560804 water jug
+n04562935 water tower
+n04579145 whiskey jug
+n04579432 whistle
+n04584207 wig
+n04589890 window screen
+n04590129 window shade
+n04591157 Windsor tie
+n04591713 wine bottle
+n04592741 wing
+n04596742 wok
+n04597913 wooden spoon
+n04599235 wool, woolen, woollen
+n04604644 worm fence, snake fence, snake-rail fence, Virginia fence
+n04606251 wreck
+n04612504 yawl
+n04613696 yurt
+n06359193 web site, website, internet site, site
+n06596364 comic book
+n06785654 crossword puzzle, crossword
+n06794110 street sign
+n06874185 traffic light, traffic signal, stoplight
+n07248320 book jacket, dust cover, dust jacket, dust wrapper
+n07565083 menu
+n07579787 plate
+n07583066 guacamole
+n07584110 consomme
+n07590611 hot pot, hotpot
+n07613480 trifle
+n07614500 ice cream, icecream
+n07615774 ice lolly, lolly, lollipop, popsicle
+n07684084 French loaf
+n07693725 bagel, beigel
+n07695742 pretzel
+n07697313 cheeseburger
+n07697537 hotdog, hot dog, red hot
+n07711569 mashed potato
+n07714571 head cabbage
+n07714990 broccoli
+n07715103 cauliflower
+n07716358 zucchini, courgette
+n07716906 spaghetti squash
+n07717410 acorn squash
+n07717556 butternut squash
+n07718472 cucumber, cuke
+n07718747 artichoke, globe artichoke
+n07720875 bell pepper
+n07730033 cardoon
+n07734744 mushroom
+n07742313 Granny Smith
+n07745940 strawberry
+n07747607 orange
+n07749582 lemon
+n07753113 fig
+n07753275 pineapple, ananas
+n07753592 banana
+n07754684 jackfruit, jak, jack
+n07760859 custard apple
+n07768694 pomegranate
+n07802026 hay
+n07831146 carbonara
+n07836838 chocolate sauce, chocolate syrup
+n07860988 dough
+n07871810 meat loaf, meatloaf
+n07873807 pizza, pizza pie
+n07875152 potpie
+n07880968 burrito
+n07892512 red wine
+n07920052 espresso
+n07930864 cup
+n07932039 eggnog
+n09193705 alp
+n09229709 bubble
+n09246464 cliff, drop, drop-off
+n09256479 coral reef
+n09288635 geyser
+n09332890 lakeside, lakeshore
+n09399592 promontory, headland, head, foreland
+n09421951 sandbar, sand bar
+n09428293 seashore, coast, seacoast, sea-coast
+n09468604 valley, vale
+n09472597 volcano
+n09835506 ballplayer, baseball player
+n10148035 groom, bridegroom
+n10565667 scuba diver
+n11879895 rapeseed
+n11939491 daisy
+n12057211 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum
+n12144580 corn
+n12267677 acorn
+n12620546 hip, rose hip, rosehip
+n12768682 buckeye, horse chestnut, conker
+n12985857 coral fungus
+n12998815 agaric
+n13037406 gyromitra
+n13040303 stinkhorn, carrion fungus
+n13044778 earthstar
+n13052670 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa
+n13054560 bolete
+n13133613 ear, spike, capitulum
+n15075141 toilet tissue, toilet paper, bathroom tissue
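
Each line of the new labels file pairs a WordNet synset ID with a comma-separated list of human-readable names. The notebook keeps only the readable part; the same one-liner works standalone (a small sketch, with the expected outputs shown as comments):

```python
# Each line is "<wnid> <name>[, <synonym>...]"; drop the WordNet ID,
# keep the readable label (same parsing as in the notebook above)
with open("image_net_synset.txt") as f:
    labels_map = [line.split(sep=' ', maxsplit=1)[-1].strip() for line in f]

print(labels_map[0])    # -> "tench, Tinca tinca"
print(len(labels_map))  # -> 1000
```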
diff --git a/inference-engine/ie_bridges/python/sample/segmentation_sample.py b/inference-engine/ie_bridges/python/sample/segmentation_sample.py
deleted file mode 100644 (file)
index ad66050..0000000
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/usr/bin/env python
-"""
- Copyright (c) 2018 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-from __future__ import print_function
-import sys
-import os
-from argparse import ArgumentParser
-import cv2
-import numpy as np
-import logging as log
-from time import time
-from openvino.inference_engine import IENetwork, IEPlugin
-
-classes_color_map = [
-    (150, 150, 150),
-    (58, 55, 169),
-    (211, 51, 17),
-    (157, 80, 44),
-    (23, 95, 189),
-    (210, 133, 34),
-    (76, 226, 202),
-    (101, 138, 127),
-    (223, 91, 182),
-    (80, 128, 113),
-    (235, 155, 55),
-    (44, 151, 243),
-    (159, 80, 170),
-    (239, 208, 44),
-    (128, 50, 51),
-    (82, 141, 193),
-    (9, 107, 10),
-    (223, 90, 142),
-    (50, 248, 83),
-    (178, 101, 130),
-    (71, 30, 204)
-]
-
-
-def build_argparser():
-    parser = ArgumentParser()
-    parser.add_argument("-m", "--model", help="Path to an .xml file with a trained model.", required=True, type=str)
-    parser.add_argument("-i", "--input", help="Path to a folder with images or path to an image files", required=True,
-                        type=str, nargs="+")
-    parser.add_argument("-l", "--cpu_extension",
-                        help="MKLDNN (CPU)-targeted custom layers.Absolute path to a shared library with the kernels "
-                             "impl.", type=str, default=None)
-    parser.add_argument("-pp", "--plugin_dir", help="Path to a plugin folder", type=str, default=None)
-    parser.add_argument("-d", "--device",
-                        help="Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. Sample "
-                             "will look for a suitable plugin for device specified (CPU by default)", default="CPU",
-                        type=str)
-    parser.add_argument("-nt", "--number_top", help="Number of top results", default=10, type=int)
-    parser.add_argument("-ni", "--number_iter", help="Number of inference iterations", default=1, type=int)
-    parser.add_argument("-pc", "--perf_counts", help="Report performance counters", default=False, action="store_true")
-    return parser
-
-
-def main():
-    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
-    args = build_argparser().parse_args()
-    model_xml = args.model
-    model_bin = os.path.splitext(model_xml)[0] + ".bin"
-
-    # Plugin initialization for specified device and load extensions library if specified
-    plugin = IEPlugin(device=args.device, plugin_dirs=args.plugin_dir)
-    if args.cpu_extension and 'CPU' in args.device:
-        plugin.add_cpu_extension(args.cpu_extension)
-    # Read IR
-    log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin))
-    net = IENetwork.from_ir(model=model_xml, weights=model_bin)
-
-    if plugin.device == "CPU":
-        supported_layers = plugin.get_supported_layers(net)
-        not_supported_layers = [l for l in net.layers.keys() if l not in supported_layers]
-        if len(not_supported_layers) != 0:
-            log.error("Following layers are not supported by the plugin for specified device {}:\n {}".
-                      format(plugin.device, ', '.join(not_supported_layers)))
-            log.error("Please try to specify cpu extensions library path in sample's command line parameters using -l "
-                      "or --cpu_extension command line argument")
-            sys.exit(1)
-    assert len(net.inputs.keys()) == 1, "Sample supports only single input topologies"
-    assert len(net.outputs) == 1, "Sample supports only single output topologies"
-
-    log.info("Preparing input blobs")
-    input_blob = next(iter(net.inputs))
-    out_blob = next(iter(net.outputs))
-    net.batch_size = len(args.input)
-
-    # Read and pre-process input images
-    n, c, h, w = net.inputs[input_blob].shape
-    images = np.ndarray(shape=(n, c, h, w))
-    for i in range(n):
-        image = cv2.imread(args.input[i])
-        if image.shape[:-1] != (h, w):
-            log.warning("Image {} is resized from {} to {}".format(args.input[i], image.shape[:-1], (h, w)))
-            image = cv2.resize(image, (w, h))
-        image = image.transpose((2, 0, 1))  # Change data layout from HWC to CHW
-        images[i] = image
-    log.info("Batch size is {}".format(n))
-
-    # Loading model to the plugin
-    log.info("Loading model to the plugin")
-    exec_net = plugin.load(network=net)
-    del net
-
-    # Start sync inference
-    log.info("Starting inference ({} iterations)".format(args.number_iter))
-    infer_time = []
-    for i in range(args.number_iter):
-        t0 = time()
-        res = exec_net.infer(inputs={input_blob: images})
-        infer_time.append((time() - t0) * 1000)
-    log.info("Average running time of one iteration: {} ms".format(np.average(np.asarray(infer_time))))
-    if args.perf_counts:
-        perf_counts = exec_net.requests[0].get_perf_counts()
-        log.info("Performance counters:")
-        print("{:<70} {:<15} {:<15} {:<15} {:<10}".format('name', 'layer_type', 'exet_type', 'status', 'real_time, us'))
-        for layer, stats in perf_counts.items():
-            print ("{:<70} {:<15} {:<15} {:<15} {:<10}".format(layer, stats['layer_type'], stats['exec_type'],
-                                                               stats['status'], stats['real_time']))
-    # Processing output blob
-    log.info("Processing output blob")
-    res = res[out_blob]
-    for batch, data in enumerate(res):
-        classes_map = np.zeros(shape=(h, w, c), dtype=np.int)
-        for i in range(h):
-            for j in range(w):
-                if len(data[:, i, j]) == 1:
-                    pixel_class = int(data[:, i, j])
-                else:
-                    pixel_class = np.argmax(data[:, i, j])
-                classes_map[i, j, :] = classes_color_map[min(pixel_class, 20)]
-        out_img = os.path.join(os.path.dirname(__file__), "out_{}.bmp".format(batch))
-        cv2.imwrite(out_img, classes_map)
-        log.info("Result image was saved to {}".format(out_img))
-    del exec_net
-    del plugin
-
-
-if __name__ == '__main__':
-    sys.exit(main() or 0)
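
The removed sample colorized the segmentation output with nested per-pixel Python loops over `h` and `w`. For reference, the same mapping can be expressed with a vectorized per-pixel argmax and a palette lookup; this sketch is illustrative and not part of the patch:

```python
import numpy as np

def colorize(data, color_map):
    """data: (C, H, W) per-class scores, or (1, H, W) class-index map."""
    if data.shape[0] == 1:
        classes = data[0].astype(np.int64)   # Already a class-index map
    else:
        classes = np.argmax(data, axis=0)    # Per-pixel argmax over classes
    classes = np.minimum(classes, len(color_map) - 1)  # Cap at last color
    palette = np.asarray(color_map, dtype=np.uint8)
    return palette[classes]                  # (H, W, 3) BGR image

# colorize(res[batch], classes_color_map) replaces the nested i/j loops
```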
index fc471a5..76fcada 100644 (file)
@@ -51,7 +51,6 @@ def build_argparser():
                         type=float)
     parser.add_argument("-pc", "--perf_counts", help="Report performance counters", default=False, action="store_true")
 
-
     return parser
 
 
@@ -67,7 +66,7 @@ def main():
         plugin.add_cpu_extension(args.cpu_extension)
     # Read IR
     log.info("Loading network files:\n\t{}\n\t{}".format(model_xml, model_bin))
-    net = IENetwork.from_ir(model=model_xml, weights=model_bin)
+    net = IENetwork(model=model_xml, weights=model_bin)
 
     if plugin.device == "CPU":
         supported_layers = plugin.get_supported_layers(net)
@@ -117,8 +116,8 @@ def main():
         log.info("Performance counters:")
         print("{:<70} {:<15} {:<15} {:<15} {:<10}".format('name', 'layer_type', 'exet_type', 'status', 'real_time, us'))
         for layer, stats in perf_counts.items():
-            print ("{:<70} {:<15} {:<15} {:<15} {:<10}".format(layer, stats['layer_type'], stats['exec_type'],
-                                                               stats['status'], stats['real_time']))
+            print("{:<70} {:<15} {:<15} {:<15} {:<10}".format(layer, stats['layer_type'], stats['exec_type'],
+                                                              stats['status'], stats['real_time']))
     # Processing output blob
     log.info("Processing output blob")
     res = res[out_blob]
index 41f8772..bb9df0e 100644 (file)
@@ -51,8 +51,8 @@ def parse_command_line_options(cls):
         base_init_options(self)
 
     def run(self):
-        global  INFERENCE_ENGINE_DIR
-        global  BUNDLE_INFERENCE_ENGINE
+        global INFERENCE_ENGINE_DIR
+        global BUNDLE_INFERENCE_ENGINE
 
         if self.copy_ie_libs:
             BUNDLE_INFERENCE_ENGINE = True
@@ -187,16 +187,14 @@ cmdclass = {
 }
 
 setup(
-    name="inference_engine",
-    version='0.1.1',
+    name="src",
+    version='1.0',
     description='Python inference for Inference Engine',
     packages=find_packages(exclude=['tests']),
     package_data={PACKAGE_NAME: ['*.so', '*.dll', '*dylib*', '*.pyd']},
     include_package_data=True,
     ext_modules=extensions,
     cmdclass=cmdclass,
-    author='', author_email='',
-    tests_require=['pytest'],
     install_requires=list(requirements),
     zip_safe=False,
 )
diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/CMakeLists.txt b/inference-engine/ie_bridges/python/src/openvino/inference_engine/CMakeLists.txt
new file mode 100644 (file)
index 0000000..aa8ac74
--- /dev/null
@@ -0,0 +1,36 @@
+# If the pyx file is a C++ file, we should specify that here.
+set (CMAKE_INCLUDE_CURRENT_DIR ON)
+set (TARGET_NAME "ie_api")
+
+set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/inference_engine)
+set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
+
+set_source_files_properties(
+    ie_api_impl_defs.pxd
+    ie_api_impl.hpp
+    ie_api_impl.cpp
+    ie_api.pyx
+    ie_api.pxd
+
+  PROPERTIES CYTHON_IS_CXX TRUE
+)
+
+cython_add_module (
+    ${TARGET_NAME}
+
+    ie_api_impl_defs.pxd
+    ie_api_impl.hpp
+    ie_api_impl.cpp
+    ie_api.pyx
+)
+
+set_target_properties (${TARGET_NAME} PROPERTIES CXX_STANDARD 11 LINKER_LANGUAGE CXX)
+target_link_libraries (${TARGET_NAME} PRIVATE ${InferenceEngine_LIBRARIES})
+
+# perform copy
+ADD_CUSTOM_COMMAND (TARGET ${TARGET_NAME}
+    POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/src/openvino/inference_engine/__init__.py ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/__init__.py
+    COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/requirements.txt ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/../../requirements.txt
+    COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/src/openvino/__init__.py ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/../__init__.py
+)
\ No newline at end of file
diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/__init__.py b/inference-engine/ie_bridges/python/src/openvino/inference_engine/__init__.py
new file mode 100644 (file)
index 0000000..ff435b3
--- /dev/null
@@ -0,0 +1,3 @@
+from .ie_api import *
+__version__ = get_version()
+__all__ = ['IENetwork', 'IEPlugin']
\ No newline at end of file
diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/CMakeLists.txt b/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/CMakeLists.txt
new file mode 100644 (file)
index 0000000..1b25c3e
--- /dev/null
@@ -0,0 +1,37 @@
+# If the pyx file is a C++ file, we should specify that here.
+set(CMAKE_INCLUDE_CURRENT_DIR ON)
+
+set(TARGET_NAME "dnn_builder")
+
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/inference_engine/${TARGET_NAME})
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
+
+set_source_files_properties(
+    dnn_builder_impl_defs.pxd
+    dnn_builder_impl.hpp
+    dnn_builder_impl.cpp
+    dnn_builder.pyx
+    dnn_builder.pxd
+
+  PROPERTIES CYTHON_IS_CXX TRUE
+)
+
+cython_add_module(
+    ${TARGET_NAME}
+
+    dnn_builder_impl_defs.pxd
+    dnn_builder_impl.hpp
+    dnn_builder_impl.cpp
+    dnn_builder.pyx
+)
+
+set_target_properties (${TARGET_NAME} PROPERTIES CXX_STANDARD 11 LINKER_LANGUAGE CXX)
+add_dependencies (${TARGET_NAME} ie_api)
+target_include_directories (${TARGET_NAME} PRIVATE ${PYTHON_BRIDGE_SRC_ROOT}/src/openvino/inference_engine )
+target_link_libraries (${TARGET_NAME} PRIVATE ${InferenceEngine_LIBRARIES})
+
+# perform copy
+ADD_CUSTOM_COMMAND (TARGET ${TARGET_NAME}
+    POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/src/openvino/inference_engine/${TARGET_NAME}/__init__.py ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
+)
\ No newline at end of file
diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/__init__.py b/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/__init__.py
new file mode 100644 (file)
index 0000000..79744ab
--- /dev/null
@@ -0,0 +1,2 @@
+from .dnn_builder import *
+__all__ = ["NetworkBuilder", "LayerBuilder"]
diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder.pxd b/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder.pxd
new file mode 100644 (file)
index 0000000..9a56215
--- /dev/null
@@ -0,0 +1,26 @@
+from .cimport dnn_builder_impl_defs as C
+from libcpp.memory cimport shared_ptr
+
+cdef class NetworkBuilder:
+    cdef C.NetworkBuilder impl
+
+cdef class INetwork:
+    cdef C.INetwork impl
+
+cdef class ILayer:
+    cdef C.ILayer impl
+
+cdef class Port:
+    cdef C.Port impl
+
+cdef class PortInfo:
+    cdef C.PortInfo impl
+
+cdef class Connection:
+    cdef C.Connection impl
+
+cdef class LayerBuilder:
+    cdef C.LayerBuilder impl
+
+cdef class LayerConstantData(dict):
+    cdef shared_ptr[C.LayerBuilder] impl
\ No newline at end of file
diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder.pyx b/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder.pyx
new file mode 100644 (file)
index 0000000..b0754cb
--- /dev/null
@@ -0,0 +1,423 @@
+#distutils: language=c++
+#from cython.operator cimport dereference as deref
+from libcpp.vector cimport vector
+from libcpp.map cimport map
+from libcpp.string cimport string
+from ..ie_api cimport IENetwork, BlobBuffer
+from .cimport dnn_builder_impl_defs as C
+from .dnn_builder_impl_defs cimport Blob
+import numpy as np
+
+
+np_precision_map = {
+    "float32": "FP32",
+    "float16": "FP16",
+    "int32": "I32",
+    "int16": "I16",
+    "uint16": "U16",
+    "int8": "I8",
+    "uint8": "U8",
+}
+
+
+cdef class NetworkBuilder:
+    def __cinit__(self, name=None, IENetwork ie_net=None):
+        if name is not None and ie_net is not None:
+            raise AttributeError("Both name and ie_net arguments are defined")
+        elif name is not None:
+            self.impl = C.NetworkBuilder(name.encode())
+        elif ie_net is not None:
+            self.impl = C.NetworkBuilder().from_ie_network(ie_net.impl)
+
+    def build(self):
+        cdef INetwork i_net = INetwork()
+        i_net.impl = self.impl.build()
+        return i_net
+
+    def get_layer(self, id: int):
+        cdef LayerBuilder py_layer = LayerBuilder()
+        py_layer.impl = self.impl.getLayer(id)
+        return py_layer
+
+    @property
+    def layers(self):
+        cdef vector[C.LayerBuilder] c_layers = self.impl.getLayers()
+        cdef LayerBuilder py_layer
+        py_layers = {}
+        for l in c_layers:
+            py_layer = LayerBuilder()
+            py_layer.impl = l
+            py_layers[l.getName().decode()] = py_layer
+        return py_layers
+
+    def remove_layer(self, LayerBuilder layer):
+        self.impl.removeLayer(layer.impl)
+
+    def get_layer_connection(self, LayerBuilder layer):
+        cdef vector[C.Connection] c_connections = self.impl.getLayerConnections(layer.impl)
+        cdef Connection connection
+        connections = []
+        for con in c_connections:
+            connection = Connection()
+            connection.impl = con
+            connections.append(connection)
+        return connections
+
+    def disconnect(self, Connection connection):
+        self.impl.disconnect(connection.impl)
+
+    def connect(self, PortInfo input, PortInfo output):
+        self.impl.connect(input.impl, output.impl)
+
+    def add_layer(self, LayerBuilder layer, input_ports: list = None):
+        cdef vector[C.PortInfo] c_ports
+        cdef PortInfo c_port
+        if not input_ports:
+            return self.impl.addLayer(layer.impl)
+        else:
+            for p in input_ports:
+                c_port = PortInfo(p.layer_id, p.port_id)
+                c_ports.push_back(c_port.impl)
+            return self.impl.addAndConnectLayer(c_ports, layer.impl)
+
+cdef class INetwork:
+    def __iter__(self):
+        cdef ILayer layer
+        layers = []
+        cdef vector[C.ILayer] c_layers = self.impl.layers
+        for l in c_layers:
+            layer = ILayer()
+            layer.impl = l
+            layers.append(layer)
+        return iter(layers)
+
+    @property
+    def layers(self):
+        cdef ILayer layer
+        layers = {}
+        cdef vector[C.ILayer] c_layers = self.impl.layers
+        for l in c_layers:
+            layer = ILayer()
+            layer.impl = l
+            layers[l.name.decode()] = layer
+        return layers
+
+    @property
+    def inputs(self):
+        cdef ILayer layer
+        layers = {}
+        cdef vector[C.ILayer] c_layers = self.impl.inputs
+        for l in c_layers:
+            layer = ILayer()
+            layer.impl = l
+            layers[l.name.decode()] = layer
+        return layers
+
+    @property
+    def outputs(self):
+        cdef ILayer layer
+        layers = {}
+        cdef vector[C.ILayer] c_layers = self.impl.outputs
+        for l in c_layers:
+            layer = ILayer()
+            layer.impl = l
+            layers[l.name.decode()] = layer
+        return layers
+
+    @property
+    def name(self):
+        return self.impl.name.decode()
+
+    @property
+    def size(self):
+        return self.impl.size
+
+    def get_layer_connection(self, layer: ILayer):
+        cdef Connection connection
+        connections = []
+        cdef vector[C.Connection] c_connections = self.impl.getLayerConnections(layer.id)
+        for con in c_connections:
+            connection = Connection()
+            connection.impl = con
+            connections.append(connection)
+        return connections
+
+    def to_ie_network(self):
+        cdef IENetwork net = IENetwork()
+        net.impl = self.impl.to_ie_network()
+        return net
+
+cdef class ILayer:
+    @property
+    def name(self):
+        return self.impl.name.decode()
+
+    @property
+    def id(self):
+        return self.impl.id
+
+    @property
+    def type(self):
+        return self.impl.type.decode()
+
+    @property
+    def params(self):
+        return {k.decode(): v.decode() for k, v in self.impl.parameters}
+
+    @property
+    def input_ports(self):
+        cdef Port port
+        cdef vector[C.Port] c_ports = self.impl.in_ports
+        ports = []
+        for p in c_ports:
+            port = Port()
+            port.impl = p
+            ports.append(port)
+        return ports
+
+    @property
+    def output_ports(self):
+        cdef Port port
+        cdef vector[C.Port] c_ports = self.impl.out_ports
+        ports = []
+        for p in c_ports:
+            port = Port()
+            port.impl = p
+            ports.append(port)
+        return ports
+
+    @property
+    def constant_data(self):
+        cdef map[string, Blob.Ptr] c_constant_data
+        c_constant_data = self.impl.constant_data
+        constant_data = {}
+        cdef BlobBuffer weights_buffer
+        for weights in c_constant_data:
+            weights_buffer = BlobBuffer()
+            weights_buffer.reset(weights.second)
+            constant_data[weights.first.decode()] = weights_buffer.to_numpy()
+        return constant_data
+
+
+cdef class Port:
+    def __cinit__(self, shape: list=[]):
+        cdef vector[size_t] c_shape
+        for d in shape:
+            c_shape.push_back(d)
+        self.impl = C.Port(c_shape)
+    @property
+    def shape(self):
+        return self.impl.shape
+
+cdef class PortInfo:
+    def __cinit__(self, layer_id: int = -1, port_id: int = -1):
+        if layer_id != -1 and port_id != -1:
+            self.impl = C.PortInfo(layer_id, port_id)
+        else:
+            self.impl = C.PortInfo()
+    @property
+    def layer_id(self):
+        return self.impl.layer_id
+
+    @property
+    def port_id(self):
+        return self.impl.port_id
+
+    def __eq__(self, other):
+        return self.layer_id == other.layer_id and self.port_id == other.port_id
+
+    def __ne__(self, other):
+        # Negation of __eq__: unequal if either id differs.
+        return self.layer_id != other.layer_id or self.port_id != other.port_id
+
+cdef class Connection:
+    def __cinit__(self, PortInfo input = None, PortInfo output = None):
+        if input and output:
+            self.impl = C.Connection(input.impl, output.impl)
+        else:
+            self.impl = C.Connection()
+    @property
+    def _from(self):
+        cdef PortInfo port_info = PortInfo()
+        port_info.impl = self.impl._from
+        return port_info
+
+    @property
+    def to(self):
+        cdef PortInfo port_info = PortInfo()
+        port_info.impl = self.impl.to
+        return port_info
+
+    def __eq__(self, other):
+        return self._from == other._from and self.to == other.to
+
+    def __ne__(self, other):
+        # Negation of __eq__: unequal if either endpoint differs.
+        return self._from != other._from or self.to != other.to
+
+
+def check_constant_data(data):
+    for k, v in data.items():
+        if not all([isinstance(x, type(v[0])) for x in v]):
+            raise TypeError("Elements of list for key {} have different data types! "
+                            "Please specify list of 'int' or 'float' values.".format(k))
+        if isinstance(v, list):
+            if isinstance(v[0], float):
+                dtype = np.float32
+            elif isinstance(v[0], int):
+                dtype = np.int32
+            else:
+                raise TypeError("Unsupported precision of the data for key {}! Given {} but 'float  or 'int' precision expected".
+                              format(k, str(v.dtype)))
+            data[k] = np.asanyarray(v, dtype=dtype)
+        elif isinstance(v, np.ndarray):
+            pass
+        else:
+            raise TypeError("Unsupported data type for key '{}'. {} given but 'list' or 'numpy.ndarray' expected".
+                            format(k, type(v)))
+    return data
+
+
+# TODO: Fix LayerBuilder object copying - pass by reference
+# cdef class LayerConstantData(dict):
+#     def update(self, other=None, **kwargs):
+#         if other:
+#             other = check_constant_data(other)
+#         cdef vector[size_t] dims
+#         cdef Blob.Ptr blob_ptr
+#         cdef BlobBuffer buffer
+#         for k, v in other.items():
+#             if k in self.keys() and (v.shape == self[k].shape and v.dtype == self[k].dtype):
+#                 print("Reuse blob for {}\n".format(k))
+#                 self[k][:] = v
+#             else:
+#                 for dim in v.shape:
+#                     dims.push_back(dim)
+#                 ie_precision = np_precision_map.get(str(v.dtype), None)
+#                 if not ie_precision:
+#                     raise BufferError("Unsupported precision of the data for key {}! Given {} but one of the {} precisions expected".
+#                                       format(k, str(v.dtype), ", ".join(np_precision_map.keys())))
+#                 blob_ptr = deref(self.impl).allocateBlob(dims, ie_precision.encode())
+#                 buffer = BlobBuffer()
+#                 buffer.reset(blob_ptr)
+#                 np_buffer = buffer.to_numpy()
+#                 np_buffer[:] = v
+#                 deref(self.impl).addConstantData(k.encode(), blob_ptr)
+
+cdef class LayerBuilder:
+
+    def __cinit__(self, type: str=None, name: str=None):
+        if name and type:
+            self.impl = C.LayerBuilder(name.encode(), type.encode())
+        else:
+            self.impl = C.LayerBuilder()
+
+    @property
+    def id(self):
+        return self.impl.id
+    @property
+    def name(self):
+        return self.impl.getName().decode()
+    @name.setter
+    def name(self, name: str):
+        self.impl.setName(name.encode())
+
+    @property
+    def type(self):
+        return self.impl.getType().decode()
+    @type.setter
+    def type(self, type: str):
+        self.impl.setType(type.encode())
+
+    @property
+    def input_ports(self):
+        cdef Port port
+        cdef vector[C.Port] c_ports = self.impl.getInputPorts()
+        py_ports = []
+        for p in c_ports:
+            port = Port()
+            port.impl = p
+            py_ports.append(port)
+        return py_ports
+
+    @input_ports.setter
+    def input_ports(self, ports: list):
+        cdef vector[C.Port] c_ports
+        cdef Port c_port
+        for p in ports:
+            c_port = Port(p.shape)
+            c_ports.push_back(c_port.impl)
+        self.impl.setInputPorts(c_ports)
+
+    @property
+    def output_ports(self):
+        cdef Port port
+        cdef vector[C.Port] c_ports = self.impl.getOutputPorts()
+        py_ports = []
+        for p in c_ports:
+            port = Port()
+            port.impl = p
+            py_ports.append(port)
+        return py_ports
+
+    @output_ports.setter
+    def output_ports(self, ports: list):
+        cdef vector[C.Port] c_ports
+        cdef Port c_port
+        for p in ports:
+            c_port = Port(p.shape)
+            c_ports.push_back(c_port.impl)
+        self.impl.setOutputPorts(c_ports)
+
+    @property
+    def params(self):
+        return {k.decode(): v.decode() for k, v in self.impl.getParameters()}
+
+    @params.setter
+    def params(self, params_map: dict):
+        cdef map[string, string] c_params_map
+        for k, v in params_map.items():
+            c_params_map[k.encode()] = str(v).encode()
+        self.impl.setParameters(c_params_map)
+
+    def build(self):
+        cdef ILayer layer = ILayer()
+        layer.impl = self.impl.build()
+        return layer
+
+    @property
+    def constant_data(self):
+        cdef map[string, Blob.Ptr] c_constant_data
+        c_constant_data = self.impl.getConstantData()
+        constant_data = {}
+        # TODO: Fix LayerBuilder object copying - pass by reference
+        # constant_data = LayerConstantData()
+        # constant_data.impl = make_shared[C.LayerBuilder](self.impl)
+        cdef BlobBuffer weights_buffer
+        for weights in c_constant_data:
+            weights_buffer = BlobBuffer()
+            weights_buffer.reset(weights.second)
+            constant_data[weights.first.decode()] = weights_buffer.to_numpy()
+        return constant_data
+
+    @constant_data.setter
+    def constant_data(self, data: dict):
+        cdef vector[size_t] dims
+        cdef map[string, Blob.Ptr] c_constant_data
+        cdef Blob.Ptr blob_ptr
+        cdef BlobBuffer buffer
+        data = check_constant_data(data)
+        for k, v in data.items():
+            for dim in v.shape:
+                dims.push_back(dim)
+            ie_precision = np_precision_map.get(str(v.dtype), None)
+            if not ie_precision:
+                raise BufferError("Unsupported precision of the data for key {}! Given {} but one of the {} precisions expected".
+                                  format(k, str(v.dtype), ", ".join(np_precision_map.keys())))
+            blob_ptr = self.impl.allocateBlob(dims, ie_precision.encode())
+            buffer = BlobBuffer()
+            buffer.reset(blob_ptr)
+            np_buffer = buffer.to_numpy()
+            np_buffer[:] = v
+            c_constant_data[k.encode()] = blob_ptr
+
+        self.impl.setConstantData(c_constant_data)
+
+    # TODO: Implement get/setGraph when it is supported
\ No newline at end of file
diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl.cpp b/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl.cpp
new file mode 100644 (file)
index 0000000..fc9ab4e
--- /dev/null
@@ -0,0 +1,330 @@
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//        http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dnn_builder_impl.hpp"
+
+// using namespace InferenceEnginePython;
+// using namespace std;
+
+std::map<std::string, InferenceEngine::Precision> precision_map = {{"FP32", InferenceEngine::Precision::FP32},
+                                                                   {"FP16", InferenceEngine::Precision::FP16},
+                                                                   {"Q78",  InferenceEngine::Precision::Q78},
+                                                                   {"I32",  InferenceEngine::Precision::I32},
+                                                                   {"I16",  InferenceEngine::Precision::I16},
+                                                                   {"I8",   InferenceEngine::Precision::I8},
+                                                                   {"U16",  InferenceEngine::Precision::U16},
+                                                                   {"U8",   InferenceEngine::Precision::U8}};
+
+InferenceEnginePython::ILayer buildILayer(InferenceEngine::ILayer::CPtr it) {
+    std::vector<InferenceEnginePython::Port> in_ports;
+    std::vector<InferenceEnginePython::Port> out_ports;
+    for (const auto &port : it->getInputPorts()) {
+        in_ports.push_back(InferenceEnginePython::Port(port.shape()));
+    }
+    for (const auto &port : it->getOutputPorts()) {
+        out_ports.push_back(InferenceEnginePython::Port(port.shape()));
+    }
+
+    std::map<std::string, std::string> params_map;
+    for (const auto &params : it->getParameters()->getParameters()) {
+        params_map.emplace(params.first, params.second);
+    }
+    std::map<std::string, InferenceEngine::Blob::Ptr> data_map;
+    for (const auto &data : it->getParameters()->getConstantData()) {
+        data_map.emplace(data.first, std::const_pointer_cast<InferenceEngine::Blob>(data.second));
+    }
+    return {it,
+            it->getName(),
+            it->getId(),
+            it->getType(),
+            params_map,
+            data_map,
+            in_ports,
+            out_ports,
+    };
+}
+
+// NetworkBuilder
+InferenceEnginePython::NetworkBuilder::NetworkBuilder(const std::string &name) {
+    // TODO(  ): std::move or allocate on the heap? Check the other call sites as well.
+    InferenceEngine::Builder::Network network(name);
+    network_ptr = std::make_shared<InferenceEngine::Builder::Network>(network);
+}
+
+InferenceEnginePython::NetworkBuilder InferenceEnginePython::NetworkBuilder::from_ie_network(
+        const InferenceEnginePython::IENetwork &icnn_net) {
+    InferenceEngine::Builder::Network network((InferenceEngine::ICNNNetwork &) icnn_net.actual);
+    NetworkBuilder net_builder = NetworkBuilder();
+    net_builder.network_ptr = std::make_shared<InferenceEngine::Builder::Network>(network);
+    return net_builder;
+}
+
+InferenceEnginePython::INetwork InferenceEnginePython::NetworkBuilder::build() {
+    InferenceEngine::INetwork::Ptr i_net = network_ptr->build();
+    std::vector<ILayer> layers;
+    for (const auto &it : *i_net) {
+        layers.push_back(buildILayer(it));
+    }
+    std::vector<ILayer> inputs;
+    for (const auto &it : i_net->getInputs()) {
+        inputs.push_back(buildILayer(it));
+    }
+    std::vector<ILayer> outputs;
+    for (const auto &it : i_net->getInputs()) {
+        outputs.push_back(buildILayer(it));
+    }
+    return {i_net,             // INetwork ptr
+            i_net->getName(),  // name
+            i_net->size(),     // Number of layers
+            layers,
+            inputs,
+            outputs
+    };
+}
+
+std::vector<InferenceEnginePython::LayerBuilder> InferenceEnginePython::NetworkBuilder::getLayers() {
+    std::vector<LayerBuilder> layers;
+    for (const auto &it : network_ptr->getLayers()) {
+        LayerBuilder layer;
+        layer.actual = it;
+        layer.id = it.getId();
+        layers.push_back(layer);
+    }
+    return layers;
+}
+
+InferenceEnginePython::LayerBuilder InferenceEnginePython::NetworkBuilder::getLayer(size_t layer_id) {
+    LayerBuilder layer;
+    InferenceEngine::Builder::Layer ie_layer = network_ptr->getLayer(layer_id);
+    layer.actual = ie_layer;
+    layer.id = ie_layer.getId();
+    return layer;
+}
+
+void InferenceEnginePython::NetworkBuilder::removeLayer(const LayerBuilder &layer) {
+    network_ptr->removeLayer(layer.id);
+}
+
+const std::vector<InferenceEnginePython::Connection> InferenceEnginePython::NetworkBuilder::getLayerConnections(
+        const LayerBuilder &layer) {
+    std::vector<InferenceEngine::Connection> ie_connections = network_ptr->getLayerConnections(layer.id);
+    std::vector<Connection> connections;
+    for (auto const &it : ie_connections) {
+        PortInfo input(it.from().layerId(), it.from().portId());
+        PortInfo output(it.to().layerId(), it.to().portId());
+        connections.push_back(Connection(input, output));
+    }
+    return connections;
+}
+
+void InferenceEnginePython::NetworkBuilder::disconnect(const Connection &connection) {
+    network_ptr->disconnect(connection.actual);
+}
+
+void InferenceEnginePython::NetworkBuilder::connect(const PortInfo &input, const PortInfo &output) {
+    network_ptr->connect(input.actual, output.actual);
+}
+
+size_t InferenceEnginePython::NetworkBuilder::addLayer(const LayerBuilder &layer) {
+    return network_ptr->addLayer(layer.actual);
+}
+
+size_t InferenceEnginePython::NetworkBuilder::addAndConnectLayer(const std::vector<PortInfo> &input,
+                                                                 const LayerBuilder &layer) {
+    std::vector<InferenceEngine::PortInfo> ie_ports;
+    for (const auto &it : input) {
+        ie_ports.push_back(it.actual);
+    }
+    return network_ptr->addLayer(ie_ports, layer.actual);
+}
+// NetworkBuilder end
+
+// Port
+InferenceEnginePython::Port::Port(const std::vector<size_t> &shapes) {
+    actual = InferenceEngine::Port(shapes);
+    shape = actual.shape();
+}
+
+InferenceEnginePython::PortInfo::PortInfo(size_t layer_id, size_t port_id) : PortInfo() {
+    this->actual = InferenceEngine::PortInfo(layer_id, port_id);
+    this->layer_id = layer_id;
+    this->port_id = port_id;
+}
+// Port end
+
+// INetwork
+std::vector<InferenceEnginePython::Connection> InferenceEnginePython::INetwork::getLayerConnections(size_t layer_id) {
+    std::vector<Connection> connections;
+    for (const auto &it : actual->getLayerConnections(layer_id)) {
+        PortInfo input = PortInfo(it.from().layerId(), it.from().portId());
+        PortInfo output = PortInfo(it.to().layerId(), it.to().portId());
+        connections.push_back(Connection(input, output));
+    }
+    return connections;
+}
+
+InferenceEnginePython::IENetwork InferenceEnginePython::INetwork::to_ie_network() {
+    std::shared_ptr<InferenceEngine::ICNNNetwork> icnn_net = InferenceEngine::Builder::convertToICNNNetwork(actual);
+    InferenceEngine::CNNNetwork cnn_net(icnn_net);
+    IENetwork ie_net = IENetwork();
+    ie_net.actual = cnn_net;
+    ie_net.name = name;
+    ie_net.batch_size = cnn_net.getBatchSize();
+    return ie_net;
+}
+// INetwork end
+
+// Connection
+InferenceEnginePython::Connection::Connection(PortInfo input, PortInfo output) : Connection() {
+    this->actual = InferenceEngine::Connection(InferenceEngine::PortInfo(input.layer_id, input.port_id),
+                                               InferenceEngine::PortInfo(output.layer_id, output.port_id));
+    this->_from = PortInfo(actual.from().layerId(), actual.from().portId());
+    this->to = PortInfo(actual.to().layerId(), actual.to().portId());
+}
+// Connection end
+
+// LayerBuilder
+InferenceEnginePython::LayerBuilder::LayerBuilder(const std::string &type, const std::string &name) : LayerBuilder() {
+    InferenceEngine::Builder::Layer layer(type, name);
+    this->actual = layer;
+    this->id = layer.getId();
+}
+
+const std::string &InferenceEnginePython::LayerBuilder::getName() {
+    return actual.getName();
+}
+
+const std::string &InferenceEnginePython::LayerBuilder::getType() {
+    return actual.getType();
+}
+
+std::vector<InferenceEnginePython::Port> InferenceEnginePython::LayerBuilder::getInputPorts() {
+    std::vector<Port> ports;
+    for (const auto &it : actual.getInputPorts()) {
+        ports.push_back(Port(it.shape()));
+    }
+    return ports;
+}
+
+std::vector<InferenceEnginePython::Port> InferenceEnginePython::LayerBuilder::getOutputPorts() {
+    std::vector<Port> ports;
+    for (const auto &it : actual.getOutputPorts()) {
+        ports.push_back(Port(it.shape()));
+    }
+    return ports;
+}
+
+std::map<std::string, std::string> InferenceEnginePython::LayerBuilder::getParameters() {
+    std::map<std::string, std::string> params_map;
+    for (const auto &it : actual.getParameters()) {
+        params_map.emplace(it.first, it.second);
+    }
+    return params_map;
+}
+
+void InferenceEnginePython::LayerBuilder::setParameters(std::map<std::string, std::string> params_map) {
+    std::map<std::string, InferenceEngine::Parameter> ie_params_map;
+    for (const auto &it : params_map) {
+        InferenceEngine::Parameter ie_param((it.second));
+        ie_params_map.emplace(it.first, ie_param);
+    }
+    actual = actual.setParameters(ie_params_map);
+}
+
+void InferenceEnginePython::LayerBuilder::setName(const std::string &name) {
+    actual = actual.setName(name);
+}
+
+void InferenceEnginePython::LayerBuilder::setType(const std::string &type) {
+    actual = actual.setType(type);
+}
+
+void InferenceEnginePython::LayerBuilder::setInputPorts(const std::vector<Port> ports) {
+    std::vector<InferenceEngine::Port> ie_ports;
+    for (const auto &it : ports) {
+        ie_ports.push_back(it.actual);
+    }
+    actual = actual.setInputPorts(ie_ports);
+}
+
+void InferenceEnginePython::LayerBuilder::setOutputPorts(const std::vector<Port> ports) {
+    std::vector<InferenceEngine::Port> ie_ports;
+    for (const auto &it : ports) {
+        ie_ports.push_back(it.actual);
+    }
+    actual = actual.setOutputPorts(ie_ports);
+}
+
+InferenceEnginePython::ILayer InferenceEnginePython::LayerBuilder::build() {
+    return buildILayer(actual.build());
+}
+
+std::map<std::string, InferenceEngine::Blob::Ptr> InferenceEnginePython::LayerBuilder::getConstantData() {
+    std::map<std::string, InferenceEngine::Blob::Ptr> data_map;
+    for (const auto &it : actual.getConstantData()) {
+        data_map.emplace(it.first, std::const_pointer_cast<InferenceEngine::Blob>(it.second));
+    }
+    return data_map;
+}
+
+InferenceEngine::Blob::Ptr InferenceEnginePython::LayerBuilder::allocateBlob(std::vector<size_t> dims,
+                                                                             const std::string &precision) {
+    InferenceEngine::Layout ie_layout;
+    ie_layout = InferenceEngine::TensorDesc::getLayoutByDims(dims);
+    InferenceEngine::Precision ie_precision = precision_map.at(precision);
+    const InferenceEngine::TensorDesc &tdesc = InferenceEngine::TensorDesc(ie_precision, dims, ie_layout);
+    InferenceEngine::Blob::Ptr blob;
+    switch (ie_precision) {
+        case InferenceEngine::Precision::FP32:
+            blob = InferenceEngine::make_shared_blob<float>(tdesc);
+            break;
+        case InferenceEngine::Precision::FP16:
+            blob = InferenceEngine::make_shared_blob<short>(tdesc);  // 16-bit storage for FP16
+            break;
+        case InferenceEngine::Precision::I16:
+            blob = InferenceEngine::make_shared_blob<short>(tdesc);
+            break;
+        case InferenceEngine::Precision::U16:
+            blob = InferenceEngine::make_shared_blob<unsigned short>(tdesc);
+            break;
+        case InferenceEngine::Precision::U8:
+            blob = InferenceEngine::make_shared_blob<unsigned char>(tdesc);
+            break;
+        case InferenceEngine::Precision::I8:
+            blob = InferenceEngine::make_shared_blob<signed char>(tdesc);
+            break;
+        case InferenceEngine::Precision::I32:
+            blob = InferenceEngine::make_shared_blob<signed int>(tdesc);
+            break;
+        default:
+            blob = InferenceEngine::make_shared_blob<float>(tdesc);
+            break;
+    }
+
+    blob->allocate();
+    return blob;
+}
+
+void InferenceEnginePython::LayerBuilder::setConstantData(const std::map<std::string,
+                                                          InferenceEngine::Blob::Ptr> &const_data) {
+    actual.setConstantData(const_data);
+}
+// TODO(  ): Fix LayerBuilder object copying - pass by reference
+// void LayerBuilder::addConstantData(const std::string & name, InferenceEngine::Blob::Ptr data){
+//     InferenceEngine::Blob::CPtr c_data = const_pointer_cast<const InferenceEngine::Blob>(data);
+//     actual.addConstantData(name, c_data);
+// }
+
+// LayerBuilder end
diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl.hpp b/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl.hpp
new file mode 100644 (file)
index 0000000..b58994a
--- /dev/null
@@ -0,0 +1,161 @@
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//        http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <ie_blob.h>
+
+#include <iterator>
+
+#include <string>
+#include <iostream>
+#include <algorithm>
+#include <vector>
+#include <map>
+
+#include <sstream>
+#include <ie_builders.hpp>
+#include <inference_engine.hpp>
+
+#include <ie_api_impl.hpp>
+
+
+// namespace IE Python
+namespace InferenceEnginePython {
+struct LayerBuilder;
+
+struct Port {
+    Port() = default;
+
+    explicit Port(const std::vector<size_t> &shapes);
+
+    InferenceEngine::Port actual;
+    std::vector<size_t> shape;
+};
+
+struct ILayer {
+    InferenceEngine::ILayer::CPtr layer_ptr;
+    std::string name;
+    size_t id;
+    std::string type;
+    std::map<std::string, std::string> parameters;
+    std::map<std::string, InferenceEngine::Blob::Ptr> constant_data;
+    std::vector<Port> in_ports;
+    std::vector<Port> out_ports;
+};
+
+struct PortInfo {
+    PortInfo(size_t layer_id, size_t port_id);
+
+    PortInfo() : actual(0, 0) {}
+
+    InferenceEngine::PortInfo actual;
+    size_t layer_id;
+    size_t port_id;
+};
+
+struct Connection {
+    Connection() : actual(InferenceEngine::PortInfo(0), InferenceEngine::PortInfo(0)) {}
+
+    Connection(PortInfo input, PortInfo output);
+
+    InferenceEngine::Connection actual;
+    PortInfo _from;
+    PortInfo to;
+};
+
+struct INetwork {
+    InferenceEngine::INetwork::Ptr actual;
+    std::string name;
+    size_t size;
+    std::vector<ILayer> layers;
+    std::vector<ILayer> inputs;
+    std::vector<ILayer> outputs;
+
+    std::vector<Connection> getLayerConnections(size_t layer_id);
+
+    IENetwork to_ie_network();
+};
+
+struct NetworkBuilder {
+    InferenceEngine::Builder::Network::Ptr network_ptr;
+
+    explicit NetworkBuilder(const std::string &name);
+
+    NetworkBuilder() = default;
+
+    NetworkBuilder from_ie_network(const InferenceEnginePython::IENetwork &icnn_net);
+
+    INetwork build();
+
+    std::vector<LayerBuilder> getLayers();
+
+    LayerBuilder getLayer(size_t layer_id);
+
+    void removeLayer(const LayerBuilder &layer);
+
+    size_t addLayer(const LayerBuilder &layer);
+
+    size_t addAndConnectLayer(const std::vector<PortInfo> &input, const LayerBuilder &layer);
+
+    const std::vector<Connection> getLayerConnections(const LayerBuilder &layer);
+
+    void disconnect(const Connection &connection);
+
+    void connect(const PortInfo &input, const PortInfo &output);
+};
+
+struct LayerBuilder {
+    InferenceEngine::Builder::Layer actual;
+    size_t id;
+
+    LayerBuilder(const std::string &type, const std::string &name);
+
+    LayerBuilder() : actual("", "") {}
+
+    LayerBuilder from_ilayer(const ILayer &ilayer);
+
+    const std::string &getName();
+
+    void setName(const std::string &name);
+
+    const std::string &getType();
+
+    void setType(const std::string &type);
+
+    std::vector<Port> getInputPorts();
+
+    void setInputPorts(const std::vector<Port> ports);
+
+    std::vector<Port> getOutputPorts();
+
+    void setOutputPorts(const std::vector<Port> ports);
+
+
+    std::map<std::string, std::string> getParameters();
+
+    void setParameters(std::map<std::string, std::string> params_map);
+
+    ILayer build();
+
+    std::map<std::string, InferenceEngine::Blob::Ptr> getConstantData();
+
+    InferenceEngine::Blob::Ptr allocateBlob(std::vector<size_t> dims, const std::string &precision);
+
+    void setConstantData(const std::map<std::string, InferenceEngine::Blob::Ptr> &const_data);
+
+// TODO(  ): Fix LayerBuilder object copying - pass by reference
+//    void addConstantData(const std::string & name, InferenceEngine::Blob::Ptr data);
+};
+}  // namespace InferenceEnginePython
diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl_defs.pxd b/inference-engine/ie_bridges/python/src/openvino/inference_engine/dnn_builder/dnn_builder_impl_defs.pxd
new file mode 100644 (file)
index 0000000..29795f2
--- /dev/null
@@ -0,0 +1,97 @@
+from libcpp.string cimport string
+from libcpp.vector cimport vector
+from libc.stddef cimport size_t
+from libcpp.memory cimport shared_ptr
+from libcpp.map cimport map
+from ..ie_api_impl_defs cimport IENetwork
+
+cdef extern from "<inference_engine.hpp>" namespace "InferenceEngine":
+    ctypedef vector[size_t] SizeVector
+
+    cdef cppclass TensorDesc:
+        SizeVector& getDims()
+        const Precision& getPrecision() const
+
+    cdef cppclass Blob:
+        ctypedef shared_ptr[Blob] Ptr
+        const TensorDesc& getTensorDesc() const
+        size_t element_size()  const
+
+    cdef cppclass Precision:
+        const char*name() const
+
+cdef extern from "dnn_builder_impl.hpp" namespace "InferenceEnginePython":
+    cdef cppclass ILayer:
+        const string name
+        size_t id
+        string type
+        map[string, string] parameters
+        vector[Port] in_ports
+        vector[Port] out_ports
+        map[string, Blob.Ptr] constant_data;
+
+
+    cdef cppclass INetwork:
+        string name
+        size_t size
+        vector[ILayer] layers
+        vector[ILayer] inputs
+        vector[ILayer] outputs
+        vector[Port] in_ports;
+        vector[Port] out_ports;
+        vector[Connection] getLayerConnections(size_t layer_id);
+        IENetwork to_ie_network();
+
+    cdef cppclass NetworkBuilder:
+        NetworkBuilder() except +
+        NetworkBuilder(string name) except +
+        NetworkBuilder from_ie_network(IENetwork &icnn_net) except +
+        INetwork build() except +
+        vector[LayerBuilder] getLayers() except +
+        LayerBuilder getLayer(size_t layer_id) except +
+        void removeLayer(const LayerBuilder& layer) except +
+        const vector[Connection] getLayerConnections(const LayerBuilder& layer) except +
+        void disconnect(const Connection& connection) except +
+        void connect(const PortInfo& input, const PortInfo& output) except +
+        size_t addLayer(const LayerBuilder& layer) except +
+        size_t addAndConnectLayer(const vector[PortInfo]& input, const LayerBuilder& layer);
+
+    cdef cppclass Port:
+        Port() except +
+        Port(const vector[size_t] & shapes) except +
+        const vector[size_t] shape
+
+
+    cdef cppclass PortInfo:
+        PortInfo(size_t layer_id, size_t port_id) except +
+        PortInfo() except +
+        size_t layer_id
+        size_t port_id
+
+    cdef cppclass Connection:
+        Connection(PortInfo input, PortInfo output) except +
+        Connection() except +
+        PortInfo _from
+        PortInfo to
+
+    cdef cppclass LayerBuilder:
+        LayerBuilder()
+        LayerBuilder(const string& type, const string& name ) except +
+        size_t id
+        LayerBuilder from_ilayer(const ILayer& ilayer) except +
+        string getName() except +
+        string getType() except +
+        vector[Port] getInputPorts() except +
+        vector[Port] getOutputPorts() except +
+        map[string, string] getParameters() except +
+        void setParameters(map[string, string] params_map) except +
+        void setName(const string & name) except +
+        void setType(const string & type) except +
+        void setInputPorts(const vector[Port] ports) except +
+        void setOutputPorts(const vector[Port] ports) except +
+        ILayer build() except +
+        map[string, Blob.Ptr] getConstantData()
+        void setConstantData(map[string, Blob.Ptr] &const_data)
+        # TODO: Fix LayerBuilder object copying - pass by reference
+        # void addConstantData(const string & name, Blob.Ptr data)
+        Blob.Ptr allocateBlob(vector[size_t] dims, const string & precision)
@@ -1,8 +1,3 @@
-# Copyright (C) 2018 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-
 from .cimport ie_api_impl_defs as C
 from .ie_api_impl_defs cimport Blob, TensorDesc
 
@@ -24,24 +19,22 @@ cdef class BlobBuffer:
 cdef class InferRequest:
     cdef C.InferRequestWrap *impl
 
-    cpdef BlobBuffer _get_input_buffer(self, const string & blob_name)
-    cpdef BlobBuffer _get_output_buffer(self, const string & blob_name)
+    cpdef BlobBuffer _get_blob_buffer(self, const string & blob_name)
 
     cpdef infer(self, inputs = ?)
     cpdef async_infer(self, inputs = ?)
     cpdef wait(self, timeout = ?)
     cpdef get_perf_counts(self)
     cdef public:
-        _inputs, _outputs
+        _inputs_list, _outputs_list
 
 cdef class IENetwork:
     cdef C.IENetwork impl
 
-
 cdef class ExecutableNetwork:
     cdef unique_ptr[C.IEExecNetwork] impl
     cdef public:
-        _requests, async, _request_iterator
+        _requests, inputs, outputs
 
 cdef class IEPlugin:
     cdef C.IEPlugin impl
@@ -51,9 +44,6 @@ cdef class IEPlugin:
     cpdef void set_initial_affinity(self, IENetwork network) except *
     cpdef set get_supported_layers(self, IENetwork net)
 
-cdef class IENetReader:
-    cdef C.IENetReader impl
-
 cdef class IENetLayer:
     cdef C.IENetLayer impl
 
@@ -61,4 +51,7 @@ cdef class InputInfo:
     cdef C.InputInfo impl
 
 cdef class OutputInfo:
-    cdef C.OutputInfo impl
\ No newline at end of file
+    cdef C.OutputInfo impl
+
+cdef class LayersStatsMap(dict):
+    cdef C.IENetwork net_impl
@@ -1,20 +1,18 @@
-# Copyright (C) 2018 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-
 #distutils: language=c++
 from cython.operator cimport dereference as deref
 from .cimport ie_api_impl_defs as C
 from .ie_api_impl_defs cimport Blob, TensorDesc, SizeVector, Precision
 from libcpp.string cimport string
 from libcpp.vector cimport vector
+from libcpp.pair cimport pair
 from libcpp.map cimport map
 from libcpp.memory cimport unique_ptr
 from libc.stdint cimport int64_t
 import os
 import numpy as np
 from copy import deepcopy
+import warnings
+from collections import OrderedDict
 
 cdef extern from "<utility>" namespace "std" nogil:
     cdef unique_ptr[C.IEExecNetwork] move(unique_ptr[C.IEExecNetwork])
@@ -35,7 +33,7 @@ cdef dict_to_c_map(py_dict):
 
 supported_precisions = ["FP32", "FP16", "Q78", "I32", "I16", "I8", "U32", "U16"]
 supported_layouts = ["NCHW", "NHWC", "OIHW", "C", "CHW", "HW", "NC", "CN", "BLOCKED"]
-known_plugins = ['CPU', 'GPU', 'FPGA', 'MYRIAD', 'HETERO']
+known_plugins = ['CPU', 'GPU', 'FPGA', 'MYRIAD', 'HETERO', 'HDDL']
 
 def get_version():
     return C.get_version().decode()
@@ -68,7 +66,23 @@ cdef class IENetLayer:
     @property
     def params(self):
         return {k.decode(): v.decode() for k, v in self.impl.params}
-
+    @property
+    def parents(self):
+        cdef vector[string] c_parents = self.impl.parents
+        return [parent.decode() for parent in c_parents]
+
+    @property
+    def children(self):
+        cdef vector[string] c_children = self.impl.children
+        return [child.decode() for child in c_children]
+
+    @property
+    def shape(self):
+        string_shape = self.impl.shape.decode()
+        return [int(i) for i in string_shape.split(' ')]
+
+    @property
+    def layout(self):
+        return self.impl.layout.decode()
+
     @affinity.setter
     def affinity(self, target_affinity):
         self.impl.setAffinity(target_affinity.encode())
@@ -80,7 +94,6 @@ cdef class IENetLayer:
     def precision(self, precision: str):
         self.impl.setPrecision(precision.upper().encode())
 
-
 cdef class InputInfo:
     @property
     def precision(self):
@@ -105,7 +118,6 @@ cdef class InputInfo:
                 "Unsupported layout {}! List of supported layouts: {}".format(layout, supported_layouts))
         self.impl.setLayout(layout.encode())
 
-
 cdef class OutputInfo:
     @property
     def precision(self):
@@ -122,20 +134,18 @@ cdef class OutputInfo:
             raise AttributeError(
                 "Unsupported precision {}! List of supported precisions: {}".format(precision, supported_precisions))
         self.impl.setPrecision(precision.encode())
-    # @layout.setter
-    # def layout(self, layout):
-    #     self.impl.setLayout(layout.encode())
 
 cdef class ExecutableNetwork:
     def __init__(self):
         self._requests = []
+        self.inputs = []
+        self.outputs = []
 
     def infer(self, inputs=None):
         current_request = self.requests[0]
         current_request.infer(inputs)
         return deepcopy(current_request.outputs)
 
-
     def start_async(self, request_id, inputs=None):
         if request_id not in list(range(len(self.requests))):
             raise ValueError("Incorrect request_id specified!")
@@ -145,21 +155,25 @@ cdef class ExecutableNetwork:
 
     @property
     def requests(self):
-        return self._requests
+        requests = []
+        for i in range(deref(self.impl).infer_requests.size()):
+            infer_request = InferRequest()
+            infer_request.impl = &(deref(self.impl).infer_requests[i])
+            infer_request._inputs_list = self.inputs
+            infer_request._outputs_list = self.outputs
+            requests.append(infer_request)
+        return requests
 
 cdef class InferRequest:
     def __init__(self):
-        self._inputs = {}
-        self._outputs = {}
-
-    cpdef BlobBuffer _get_input_buffer(self, const string & blob_name):
-        cdef BlobBuffer buffer = BlobBuffer()
-        buffer.reset(deref(self.impl).getInputBlob(blob_name))
-        return buffer
+        self._inputs_list = []
+        self._outputs_list = []
 
-    cpdef BlobBuffer _get_output_buffer(self, const string & blob_name):
+    cpdef BlobBuffer _get_blob_buffer(self, const string & blob_name):
         cdef BlobBuffer buffer = BlobBuffer()
-        buffer.reset(deref(self.impl).getOutputBlob(blob_name))
+        cdef Blob.Ptr blob_ptr
+        deref(self.impl).getBlobPtr(blob_name, blob_ptr)
+        buffer.reset(blob_ptr)
         return buffer
 
     cpdef infer(self, inputs=None):
@@ -192,17 +206,66 @@ cdef class InferRequest:
 
     @property
     def inputs(self):
-        return self._inputs
+        inputs = {}
+        for input in self._inputs_list:
+            inputs[input] = self._get_blob_buffer(input.encode()).to_numpy()
+        return inputs
 
     @property
     def outputs(self):
-        return self._outputs
+        outputs = {}
+        for output in self._outputs_list:
+            outputs[output] = self._get_blob_buffer(output.encode()).to_numpy()
+        return deepcopy(outputs)
+
+    def set_batch(self, size):
+        if size <= 0:
+            raise ValueError("Batch size should be positive integer number but {} specified".format(size))
+        deref(self.impl).setBatch(size)
 
     def _fill_inputs(self, inputs):
         for k, v in inputs.items():
-            self._inputs[k][:] = v
+            self.inputs[k][:] = v
+
+
+class LayerStats:
+    def __init__(self, min: tuple = (), max: tuple = ()):
+        self._min = min
+        self._max = max
+
+    @property
+    def min(self):
+        return self._min
+    @property
+    def max(self):
+        return self._max
+
+
+cdef class LayersStatsMap(dict):
+    def update(self, other=None, **kwargs):
+        super(LayersStatsMap, self).update(other, **kwargs)
+        cdef map[string, map[string, vector[float]]] c_stats_map
+        cdef map[string, vector[float]] c_node_stats
+        for k, v in self.items():
+            c_node_stats["min".encode()] = v.min
+            c_node_stats["max".encode()] = v.max
+            c_stats_map[k.encode()] = c_node_stats
+        self.net_impl.setStats(c_stats_map)
 
 cdef class IENetwork:
+    def __cinit__(self, model: str="", weights: str=""):
+        cdef string model_
+        cdef string weights_
+        if model and weights:
+            if not os.path.isfile(model):
+                raise Exception("Path to the model {} doesn't exists or it's a directory".format(model))
+            if not os.path.isfile(weights):
+                raise Exception("Path to the weights {} doesn't exists or it's a directory".format(weights))
+            model_ = model.encode()
+            weights_ = weights.encode()
+            self.impl = C.IENetwork(model_, weights_)
+        else:
+            self.impl = C.IENetwork()
     @property
     def name(self):
         name = bytes(self.impl.name)
@@ -213,7 +276,7 @@ cdef class IENetwork:
         cdef map[string, C.InputInfo] c_inputs = self.impl.getInputs()
         inputs = {}
         cdef InputInfo in_info
-        for input in  c_inputs:
+        for input in c_inputs:
             in_info = InputInfo()
             in_info.impl = input.second
             inputs[input.first.decode()] = in_info
@@ -224,7 +287,7 @@ cdef class IENetwork:
         cdef map[string, C.OutputInfo] c_outputs = self.impl.getOutputs()
         outputs = {}
         cdef OutputInfo out_info
-        for out in  c_outputs:
+        for out in c_outputs:
             out_info = OutputInfo()
             out_info.impl = out.second
             outputs[out.first.decode()] = out_info
@@ -243,23 +306,37 @@ cdef class IENetwork:
 
     @property
     def layers(self):
-        cdef map[string, C.IENetLayer] c_layers = <map[string, C.IENetLayer]> self.impl.getLayers()
-        layers = {}
+        cdef vector[pair[string, C.IENetLayer]] c_layers = self.impl.getLayers()
+        layers = OrderedDict()
         cdef IENetLayer net_l = IENetLayer()
         for l in c_layers:
             net_l = IENetLayer()
             net_l.impl = l.second
             layers[l.first.decode()] = net_l
         return layers
+    @property
+    def stats(self):
+        cdef map[string, map[string, vector[float]]] c_stats_map = self.impl.getStats()
+        py_stats_map = LayersStatsMap()
+        py_stats_map.net_impl = self.impl
+        for it in c_stats_map:
+            py_stats_map[it.first.decode()] = LayerStats(min=tuple(it.second["min".encode()]),
+                                                         max=tuple(it.second["max".encode()]))
+        return py_stats_map
 
     @classmethod
     def from_ir(cls, model: str, weights: str):
+        warnings.filterwarnings("always",category=DeprecationWarning)
+        warnings.warn("from_ir() method of IENetwork is deprecated. "
+                      "Please use IENetwork class constructor to create valid IENetwork instance",
+                      DeprecationWarning)
         if not os.path.isfile(model):
             raise Exception("Path to the model {} doesn't exists or it's a directory".format(model))
         if not os.path.isfile(weights):
             raise Exception("Path to the weights {} doesn't exists or it's a directory".format(weights))
-        net_reader = IENetReader()
-        return net_reader.read(model, weights)
+        cdef IENetwork net = IENetwork(model, weights)
+        return net
 
     # TODO: Use an enum for the precision type instead of a string parameter once Python 2 support is no longer required.
     def add_outputs(self, outputs, precision="FP32"):
@@ -273,6 +350,8 @@ cdef class IENetwork:
             _outputs.push_back(l.encode())
         self.impl.addOutputs(_outputs, precision.upper().encode())
 
+    def serialize(self, path_to_xml, path_to_bin):
+        self.impl.serialize(path_to_xml.encode(), path_to_bin.encode())
     def reshape(self, input_shapes: dict):
         cdef map[string, vector[size_t]] c_input_shapes;
         cdef vector[size_t] c_shape
@@ -312,8 +391,6 @@ cdef class IEPlugin:
             raise ValueError(
                 "Incorrect number of requests specified: {}. Expected positive integer number.".format(num_requests))
         cdef ExecutableNetwork exec_net = ExecutableNetwork()
-        cdef vector[string] inputs_list
-        cdef vector[string] outputs_list
         cdef map[string, string] c_config
 
         if config:
@@ -321,27 +398,8 @@ cdef class IEPlugin:
                 c_config[to_std_string(k)] = to_std_string(v)
 
         exec_net.impl = move(self.impl.load(network.impl, num_requests, c_config))
-
-        requests = []
-        for i in range(deref(exec_net.impl).infer_requests.size()):
-            infer_request = InferRequest()
-            infer_request.impl = &(deref(exec_net.impl).infer_requests[i])
-
-            inputs_list = infer_request.impl.getInputsList()
-            outputs_list = infer_request.impl.getOutputsList()
-
-            for input_b in inputs_list:
-                input_s = input_b.decode()
-                infer_request._inputs[input_s] = infer_request._get_input_buffer(input_b).to_numpy()
-
-            for output_b in outputs_list:
-                output_s = output_b.decode()
-                infer_request._outputs[output_s] = infer_request._get_output_buffer(output_b).to_numpy()
-
-            # create blob buffers
-            requests.append(infer_request)
-        exec_net._requests = tuple(requests)
-
+        exec_net.inputs = network.inputs.keys()
+        exec_net.outputs = list(network.outputs.keys())
         return exec_net
 
     cpdef void set_initial_affinity(self, IENetwork net) except *:
@@ -374,11 +432,6 @@ cdef class IEPlugin:
             c_config[to_std_string(k)] = to_std_string(v)
         self.impl.setConfig(c_config)
 
-cdef class IENetReader:
-    def read(self, model: str, weights: str) -> IENetwork:
-        cdef IENetwork net = IENetwork()
-        net.impl = self.impl.read(model.encode(), weights.encode())
-        return net
 
 cdef class BlobBuffer:
     """Copy-less accessor for Inference Engine Blob"""
@@ -1,31 +1,42 @@
-// Copyright (C) 2018 Intel Corporation
+// Copyright (c) 2018 Intel Corporation
 //
-// SPDX-License-Identifier: Apache-2.0
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
 //
+//        http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
 #include "ie_api_impl.hpp"
 #include "hetero/hetero_plugin_config.hpp"
 #include "ie_iinfer_request.hpp"
-std::map <std::string,InferenceEngine::Precision> precision_map = {{"FP32", InferenceEngine::Precision::FP32},
-                                                                       {"FP16", InferenceEngine::Precision::FP16},
-                                                                       {"Q78", InferenceEngine::Precision::Q78},
-                                                                       {"I32",  InferenceEngine::Precision::I32},
-                                                                       {"I16",  InferenceEngine::Precision::I16},
-                                                                       {"I8",  InferenceEngine::Precision::I8},
-                                                                       {"U16",  InferenceEngine::Precision::U16},
-                                                                       {"U8",  InferenceEngine::Precision::U8}};
-
-std::map <std::string,InferenceEngine::Layout> layout_map = {{"ANY", InferenceEngine::Layout::ANY},
-                                                                {"NCHW", InferenceEngine::Layout::NCHW},
-                                                                {"NHWC", InferenceEngine::Layout::NHWC},
-                                                                {"OIHW", InferenceEngine::Layout::OIHW},
-                                                                {"C", InferenceEngine::Layout::C},
-                                                                {"CHW", InferenceEngine::Layout::CHW},
-                                                                {"HW", InferenceEngine::Layout::HW},
-                                                                {"NC", InferenceEngine::Layout::NC},
-                                                                {"CN", InferenceEngine::Layout::CN},
-                                                                {"BLOCKED", InferenceEngine::Layout::BLOCKED}};
-#define stringify( name ) # name
+#include "details/ie_cnn_network_tools.h"
+
+std::map<std::string, InferenceEngine::Precision> precision_map = {{"FP32", InferenceEngine::Precision::FP32},
+                                                                   {"FP16", InferenceEngine::Precision::FP16},
+                                                                   {"Q78",  InferenceEngine::Precision::Q78},
+                                                                   {"I32",  InferenceEngine::Precision::I32},
+                                                                   {"I16",  InferenceEngine::Precision::I16},
+                                                                   {"I8",   InferenceEngine::Precision::I8},
+                                                                   {"U16",  InferenceEngine::Precision::U16},
+                                                                   {"U8",   InferenceEngine::Precision::U8}};
+
+std::map<std::string, InferenceEngine::Layout> layout_map = {{"ANY",     InferenceEngine::Layout::ANY},
+                                                             {"NCHW",    InferenceEngine::Layout::NCHW},
+                                                             {"NHWC",    InferenceEngine::Layout::NHWC},
+                                                             {"OIHW",    InferenceEngine::Layout::OIHW},
+                                                             {"C",       InferenceEngine::Layout::C},
+                                                             {"CHW",     InferenceEngine::Layout::CHW},
+                                                             {"HW",      InferenceEngine::Layout::HW},
+                                                             {"NC",      InferenceEngine::Layout::NC},
+                                                             {"CN",      InferenceEngine::Layout::CN},
+                                                             {"BLOCKED", InferenceEngine::Layout::BLOCKED}};
+#define stringify(name) # name
 #define IE_CHECK_CALL(expr) {                       \
     auto ret = (expr);                              \
     if (ret != InferenceEngine::StatusCode::OK) {   \
@@ -34,119 +45,121 @@ std::map <std::string,InferenceEngine::Layout> layout_map = {{"ANY", InferenceEn
 }                                                   \
 
 
-
-InferenceEnginePython::IENetwork InferenceEnginePython::IENetReader::read(std::string const &model,
-                                                                     std::string const &weights)
-{
+InferenceEnginePython::IENetwork::IENetwork(const std::string &model, const std::string &weights) {
     InferenceEngine::CNNNetReader net_reader;
     net_reader.ReadNetwork(model);
     net_reader.ReadWeights(weights);
-    const std::string &net_name = net_reader.getName();
-    InferenceEngine::CNNNetwork network = net_reader.getNetwork();
-    std::size_t batch_size = network.getBatchSize();
-    return {network, net_name, batch_size};
+    name = net_reader.getName();
+    actual = net_reader.getNetwork();
+    batch_size = actual.getBatchSize();
 }
 
-std::map<std::string, InferenceEnginePython::IENetLayer> InferenceEnginePython::IENetwork::getLayers()
-{
-    std::map<std::string, InferenceEnginePython::IENetLayer> result;
-    std::unordered_set<std::string> visisted;
-    const InferenceEngine::InputsDataMap &networkInputs = actual.getInputsInfo();
+void InferenceEnginePython::IENetwork::serialize(const std::string &path_to_xml, const std::string &path_to_bin) {
+    actual.serialize(path_to_xml, path_to_bin);
+}
 
-    using CNNLayerPtrCref = const InferenceEngine::CNNLayerPtr &;
-    std::function<void(CNNLayerPtrCref)> DFS = [&](CNNLayerPtrCref layer) {
+const std::vector<std::pair<std::string, InferenceEnginePython::IENetLayer>>
+InferenceEnginePython::IENetwork::getLayers() {
+    std::vector<std::pair<std::string, InferenceEnginePython::IENetLayer>> result;
+    std::vector<InferenceEngine::CNNLayerPtr> sorted_layers = InferenceEngine::details::CNNNetSortTopologically(actual);
+    for (const auto &layer : sorted_layers) {
         InferenceEnginePython::IENetLayer layer_info;
-        /* Assumes no cycles in graph */
-        for (auto &od : layer->outData)
-        {
-            for (auto nl : od->getInputTo())
-            {
-                auto i = visisted.find(nl.second->name);
-                if (i != visisted.end())
-                {
-                    continue;
-                }
-                DFS(nl.second);
-            }
-        }
-        visisted.emplace(layer->name);
+
         layer_info.layer_ptr = layer;
+        layer_info.network_ptr = actual;
         layer_info.name = layer->name;
         layer_info.type = layer->type;
-        std::string precision = layer->precision.name();
-        layer_info.precision = precision;
+        layer_info.precision = layer->precision.name();
         layer_info.params = layer->params;
         layer_info.affinity = layer->affinity;
-        result[layer->name] = layer_info;
-    };
-
-    std::set<InferenceEngine::CNNLayerPtr> inputs;
-    for (auto input : networkInputs) {
-        for (auto l : input.second->getInputData()->inputTo) {
-            inputs.insert(l.second);
+        std::vector<std::string> parents;
+        for (const auto &i : layer->insData) {
+            auto data = i.lock();
+            if (data) {
+                parents.emplace_back(data->getName());
+            }
         }
+        layer_info.parents = parents;
+        std::vector<std::string> children;
+        for (const auto &data : layer->outData) {
+            auto inputTo = data->getInputTo();
+            for (auto layer_iter : inputTo) {
+                InferenceEngine::CNNLayerPtr layer_in_data = layer_iter.second;
+                if (!layer_in_data) {
+                    THROW_IE_EXCEPTION << "Layer which takes data " << data->name << " is nullptr";
+                }
+                children.emplace_back(layer_in_data->name);
+            }
+        }
+        layer_info.children = children;
+        const InferenceEngine::TensorDesc &inputTensorDesc = layer->outData[0]->getTensorDesc();
+        for (const auto &it : layout_map) {
+            if (it.second == inputTensorDesc.getLayout()) {
+                layer_info.layout = it.first;
+            }
+        }
+        auto dims = inputTensorDesc.getDims();
+        std::string string_dims = "";
+        for (const auto &it : dims) {
+            string_dims += std::to_string(it) + " ";
+        }
+        string_dims = string_dims.substr(0, string_dims.size() - 1);
+        layer_info.shape = string_dims;
+        result.emplace_back(std::make_pair(layer->name, layer_info));
     }
-
-    for (auto &layer : inputs)
-    {
-        DFS(layer);
-    }
-
     return result;
-
 }
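+// Consumption sketch (illustrative, not part of this change): the returned
+// vector of (name, IENetLayer) pairs preserves the topological order produced
+// by CNNNetSortTopologically, so producers are visited before consumers:
+//
+//     InferenceEnginePython::IENetwork net("model.xml", "model.bin");  // placeholder paths
+//     for (const auto &entry : net.getLayers()) {
+//         const InferenceEnginePython::IENetLayer &layer = entry.second;
+//         // layer.parents / layer.children give the graph edges by layer name
+//     }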
-std::map<std::string, InferenceEnginePython::InputInfo> InferenceEnginePython::IENetwork::getInputs(){
+
+const std::map<std::string, InferenceEnginePython::InputInfo> InferenceEnginePython::IENetwork::getInputs() {
     std::map<std::string, InferenceEnginePython::InputInfo> inputs;
     const InferenceEngine::InputsDataMap &inputsInfo = actual.getInputsInfo();
-    for (auto & in : inputsInfo){
+    for (auto &in : inputsInfo) {
         InferenceEnginePython::InputInfo info;
         info.actual = *in.second;
         const InferenceEngine::TensorDesc &inputTensorDesc = in.second->getTensorDesc();
         info.dims = inputTensorDesc.getDims();
-        for (auto it : precision_map )
+        for (auto it : precision_map)
             if (it.second == in.second->getPrecision())
-                info.precision =  it.first;
-        for (auto it : layout_map )
+                info.precision = it.first;
+        for (auto it : layout_map)
             if (it.second == in.second->getLayout())
-                info.layout =  it.first;
+                info.layout = it.first;
         inputs[in.first] = info;
     }
     return inputs;
 }
 
-std::map<std::string, InferenceEnginePython::OutputInfo> InferenceEnginePython::IENetwork::getOutputs(){
+const std::map<std::string, InferenceEnginePython::OutputInfo> InferenceEnginePython::IENetwork::getOutputs() {
     std::map<std::string, InferenceEnginePython::OutputInfo> outputs;
     const InferenceEngine::OutputsDataMap &outputsInfo = actual.getOutputsInfo();
-    for (auto & out : outputsInfo){
+    for (auto &out : outputsInfo) {
         InferenceEnginePython::OutputInfo info;
         info.actual = out.second;
         const InferenceEngine::TensorDesc &inputTensorDesc = out.second->getTensorDesc();
         info.dims = inputTensorDesc.getDims();
-        for (auto it : precision_map )
+        for (auto it : precision_map)
             if (it.second == out.second->getPrecision())
-                info.precision =  it.first;
-        for (auto it : layout_map )
+                info.precision = it.first;
+        for (auto it : layout_map)
             if (it.second == out.second->getLayout())
-                info.layout =  it.first;
+                info.layout = it.first;
         outputs[out.first] = info;
     }
     return outputs;
 }
 
-void InferenceEnginePython::IENetwork::addOutputs(const std::vector<std::string> & out_layers, const std::string &precision)
-{
-
-    for (auto && l : out_layers)
-    {
+void
+InferenceEnginePython::IENetwork::addOutputs(const std::vector<std::string> &out_layers, const std::string &precision) {
+    for (auto &&l : out_layers) {
         InferenceEngine::OutputsDataMap outputsDataMap = actual.getOutputsInfo();
-        if (outputsDataMap.find(l) != outputsDataMap.end())
-        {
+        if (outputsDataMap.find(l) != outputsDataMap.end()) {
             continue;
         }
         InferenceEngine::CNNLayerPtr cnnLayer = actual.getLayerByName(l.c_str());
         std::vector<InferenceEngine::DataPtr> outData = cnnLayer->outData;
         if (outData.size() != 1) {
-            std::cout << "Layer " << l << " has " << outData.size() << " output blobs and can not be set as output." << std::endl;
+            std::cout << "Layer " << l << " has " << outData.size() << " output blobs and can not be set as output."
+                      << std::endl;
             continue;
         }
         actual.addOutput(l);
@@ -155,29 +168,59 @@ void InferenceEnginePython::IENetwork::addOutputs(const std::vector<std::string>
     }
 }
 
-void InferenceEnginePython::IENetwork::setBatch(const size_t size)
-{
+void InferenceEnginePython::IENetwork::setBatch(const size_t size) {
     actual.setBatchSize(size);
 }
-void InferenceEnginePython::IENetwork::reshape(const std::map<std::string, std::vector<size_t>> & input_shapes){
+
+void InferenceEnginePython::IENetwork::reshape(const std::map<std::string, std::vector<size_t>> &input_shapes) {
     actual.reshape(input_shapes);
 }
 
-void InferenceEnginePython::InputInfo::setPrecision(std::string precision){
+const std::map<std::string, std::map<std::string, std::vector<float>>> InferenceEnginePython::IENetwork::getStats() {
+    InferenceEngine::ICNNNetworkStats *pstats = nullptr;
+    InferenceEngine::ResponseDesc response;
+    IE_CHECK_CALL(((InferenceEngine::ICNNNetwork &) actual).getStats(&pstats, &response));
+    auto statsMap = pstats->getNodesStats();
+    std::map<std::string, std::map<std::string, std::vector<float>>> map;
+    for (const auto &it : statsMap) {
+        std::map<std::string, std::vector<float>> stats;
+        stats.emplace("min", it.second->_minOutputs);
+        stats.emplace("max", it.second->_maxOutputs);
+        map.emplace(it.first, stats);
+    }
+    return map;
+}
+
+void
+InferenceEnginePython::IENetwork::setStats(
+        const std::map<std::string, std::map<std::string, std::vector<float>>> &stats) {
+    InferenceEngine::ICNNNetworkStats *pstats = nullptr;
+    InferenceEngine::ResponseDesc response;
+    IE_CHECK_CALL(((InferenceEngine::ICNNNetwork &) actual).getStats(&pstats, &response));
+    std::map<std::string, InferenceEngine::NetworkNodeStatsPtr> newNetNodesStats;
+    for (const auto &it : stats) {
+        InferenceEngine::NetworkNodeStatsPtr nodeStats = InferenceEngine::NetworkNodeStatsPtr(
+                new InferenceEngine::NetworkNodeStats());
+        newNetNodesStats.emplace(it.first, nodeStats);
+        nodeStats->_minOutputs = it.second.at("min");
+        nodeStats->_maxOutputs = it.second.at("max");
+    }
+    pstats->setNodesStats(newNetNodesStats);
+}
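+// Roundtrip sketch (illustrative): for an IENetwork `net`, getStats()/setStats()
+// expose per-node min/max output statistics keyed by "min"/"max", which is the
+// data INT8 calibration works with ("conv1" is a placeholder layer name):
+//
+//     auto stats = net.getStats();
+//     stats["conv1"]["min"] = {-1.0f};
+//     stats["conv1"]["max"] = {1.0f};
+//     net.setStats(stats);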
+
+void InferenceEnginePython::InputInfo::setPrecision(std::string precision) {
     actual.setPrecision(precision_map[precision]);
 }
 
-void InferenceEnginePython::InputInfo::setLayout(std::string layout){
+void InferenceEnginePython::InputInfo::setLayout(std::string layout) {
     actual.setLayout(layout_map[layout]);
 }
 
-void InferenceEnginePython::OutputInfo::setPrecision(std::string precision){
+void InferenceEnginePython::OutputInfo::setPrecision(std::string precision) {
     actual->setPrecision(precision_map[precision]);
 }
 
-InferenceEnginePython::IEPlugin::IEPlugin(const std::string &device, const std::vector<std::string> &plugin_dirs)
-{
-
+InferenceEnginePython::IEPlugin::IEPlugin(const std::string &device, const std::vector<std::string> &plugin_dirs) {
     InferenceEngine::PluginDispatcher dispatcher{plugin_dirs};
     actual = dispatcher.getPluginByDevice(device);
     const InferenceEngine::Version *pluginVersion;
@@ -188,65 +231,63 @@ InferenceEnginePython::IEPlugin::IEPlugin(const std::string &device, const std::
     device_name = device;
 }
 
-void InferenceEnginePython::IEPlugin::setInitialAffinity(InferenceEnginePython::IENetwork &net)
-{
+void InferenceEnginePython::IEPlugin::setInitialAffinity(const InferenceEnginePython::IENetwork &net) {
     InferenceEngine::HeteroPluginPtr hetero_plugin(actual);
     InferenceEngine::ResponseDesc response;
     auto &network = net.actual;
     IE_CHECK_CALL(hetero_plugin->SetAffinity(network, {}, &response));
 }
-std::set<std::string> InferenceEnginePython::IEPlugin::queryNetwork(InferenceEnginePython::IENetwork &net)
-{
-    InferenceEngine::CNNNetwork &network = net.actual;
+
+std::set<std::string> InferenceEnginePython::IEPlugin::queryNetwork(const InferenceEnginePython::IENetwork &net) {
+    const InferenceEngine::CNNNetwork &network = net.actual;
     InferenceEngine::QueryNetworkResult queryRes;
     actual->QueryNetwork(network, queryRes);
     return queryRes.supportedLayers;
 }
 
 
-void InferenceEnginePython::IENetLayer::setAffinity(const std::string & target_affinity){
+void InferenceEnginePython::IENetLayer::setAffinity(const std::string &target_affinity) {
     layer_ptr->affinity = target_affinity;
 }
 
-void InferenceEnginePython::IENetLayer::setParams(const std::map<std::string, std::string> & params_map){
+void InferenceEnginePython::IENetLayer::setParams(const std::map<std::string, std::string> &params_map) {
     layer_ptr->params = params_map;
 }
 
-std::map<std::string, InferenceEngine::Blob::Ptr> InferenceEnginePython::IENetLayer::getWeights(){
+std::map<std::string, InferenceEngine::Blob::Ptr> InferenceEnginePython::IENetLayer::getWeights() {
     auto w_layer = std::dynamic_pointer_cast<InferenceEngine::WeightableLayer>(layer_ptr);
     // If the current layer is weightable, gather weights and biases from the casted WeightableLayer;
     // all other blobs are considered custom and are gathered from the blobs field of CNNLayer.
     std::map<std::string, InferenceEngine::Blob::Ptr> weights;
-    if (w_layer != nullptr){
-        if (w_layer->_weights != nullptr){
+    if (w_layer != nullptr) {
+        if (w_layer->_weights != nullptr) {
             weights["weights"] = w_layer->_weights;
         }
-        if (w_layer->_biases != nullptr){
+        if (w_layer->_biases != nullptr) {
             weights["biases"] = w_layer->_biases;
         }
-        for (auto it : w_layer->blobs){
-            if (it.first == "weights" || it.first == "biases"){
+        for (auto it : w_layer->blobs) {
+            if (it.first == "weights" || it.first == "biases") {
                 continue;
             }
             weights[it.first] = it.second;
         }
-    }
-    // Otherwise all layer's blobs are considered as custom and gathered from CNNLayer
-    else {
+    } else {
+        // Otherwise all layer's blobs are considered as custom and gathered from CNNLayer
         std::map<std::string, InferenceEngine::Blob::Ptr> map_placeholder;
-        weights = map_placeholder; // If layer has no blobs it should not be missed from weights map
-        for (auto it : layer_ptr->blobs){
+        weights = map_placeholder;  // Even if the layer has no blobs, it must still appear in the weights map
+        for (auto it : layer_ptr->blobs) {
             weights[it.first] = it.second;
         }
     }
     return weights;
 }
 
-void InferenceEnginePython::IENetLayer::setPrecision(std::string precision){
+void InferenceEnginePython::IENetLayer::setPrecision(std::string precision) {
     layer_ptr->precision = precision_map[precision];
 }
-void InferenceEnginePython::IEPlugin::addCpuExtension(const std::string &extension_path)
-{
+
+void InferenceEnginePython::IEPlugin::addCpuExtension(const std::string &extension_path) {
     InferenceEngine::ResponseDesc response;
     auto extension_ptr = InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(extension_path);
     auto extension = std::dynamic_pointer_cast<InferenceEngine::IExtension>(extension_ptr);
@@ -254,78 +295,49 @@ void InferenceEnginePython::IEPlugin::addCpuExtension(const std::string &extensi
 }
 
 std::unique_ptr<InferenceEnginePython::IEExecNetwork>
-InferenceEnginePython::IEPlugin::load(InferenceEnginePython::IENetwork &net,
+InferenceEnginePython::IEPlugin::load(const InferenceEnginePython::IENetwork &net,
                                       int num_requests,
-                                      const std::map<std::string, std::string> &config)
-{
+                                      const std::map<std::string, std::string> &config) {
     InferenceEngine::ResponseDesc response;
-    auto exec_network = InferenceEnginePython::make_unique<InferenceEnginePython::IEExecNetwork>(net.name, num_requests);
+    auto exec_network = InferenceEnginePython::make_unique<InferenceEnginePython::IEExecNetwork>(net.name,
+                                                                                                 num_requests);
 
     IE_CHECK_CALL(actual->LoadNetwork(exec_network->actual, net.actual, config, &response))
-    const InferenceEngine::InputsDataMap &inputs_info = net.actual.getInputsInfo();
-    const InferenceEngine::OutputsDataMap &outputs_info = net.actual.getOutputsInfo();
-
 
     for (size_t i = 0; i < num_requests; ++i) {
         InferRequestWrap &infer_request = exec_network->infer_requests[i];
         IE_CHECK_CALL(exec_network->actual->CreateInferRequest(infer_request.request_ptr, &response))
-
-        for (const auto& input : inputs_info) {
-            infer_request.inputs[input.first] = nullptr;
-            infer_request.request_ptr->GetBlob(input.first.c_str(), infer_request.inputs[input.first], &response);
-        }
-        for (const auto& output : outputs_info) {
-            infer_request.request_ptr->GetBlob(output.first.c_str(), infer_request.outputs[output.first], &response);
-        }
     }
 
     return exec_network;
 }
 
-void InferenceEnginePython::IEPlugin::setConfig(const std::map<std::string, std::string> & config) {
+void InferenceEnginePython::IEPlugin::setConfig(const std::map<std::string, std::string> &config) {
     InferenceEngine::ResponseDesc response;
     IE_CHECK_CALL(actual->SetConfig(config, &response))
 }
 
 InferenceEnginePython::IEExecNetwork::IEExecNetwork(const std::string &name, size_t num_requests) :
-    infer_requests(num_requests), name(name)
-{
+        infer_requests(num_requests), name(name) {
 }
 
-void InferenceEnginePython::IEExecNetwork::infer()
-{
+void InferenceEnginePython::IEExecNetwork::infer() {
     InferenceEngine::ResponseDesc response;
     InferRequestWrap &request = infer_requests[0];
     request.request_ptr->Infer(&response);
 }
 
 
-InferenceEngine::Blob::Ptr &InferenceEnginePython::InferRequestWrap::getInputBlob(const std::string &blob_name)
+void InferenceEnginePython::InferRequestWrap::getBlobPtr(const std::string &blob_name, InferenceEngine::Blob::Ptr &blob_ptr)
 {
-    return inputs.at(blob_name);
-}
-
-InferenceEngine::Blob::Ptr &InferenceEnginePython::InferRequestWrap::getOutputBlob(const std::string &blob_name)
-{
-    return outputs.at(blob_name);
+    InferenceEngine::ResponseDesc response;
+    IE_CHECK_CALL(request_ptr->GetBlob(blob_name.c_str(), blob_ptr, &response));
 }
 
-std::vector<std::string> InferenceEnginePython::InferRequestWrap::getInputsList() {
-    std::vector<std::string> inputs_list;
-    inputs_list.reserve(inputs.size());
-    std::transform(inputs.begin(), inputs.end(), std::back_inserter(inputs_list), [] (InferenceEngine::BlobMap::value_type it) -> std::string {
-        return it.first;
-    });
-    return inputs_list;
-}
 
-std::vector<std::string> InferenceEnginePython::InferRequestWrap::getOutputsList() {
-    std::vector<std::string> outputs_list;
-    outputs_list.reserve(inputs.size());
-    std::transform(outputs.begin(), outputs.end(), std::back_inserter(outputs_list), [] (InferenceEngine::BlobMap::value_type it) -> std::string {
-        return it.first;
-    });
-    return outputs_list;
+void InferenceEnginePython::InferRequestWrap::setBatch(int size) {
+    InferenceEngine::ResponseDesc response;
+    IE_CHECK_CALL(request_ptr->SetBatch(size, &response));
 }
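+// Access sketch (illustrative): for an InferRequestWrap `request`, getBlobPtr()
+// fetches a blob by name and get_buffer<T>() from ie_api_impl.hpp exposes its
+// raw data ("data" is a placeholder input name):
+//
+//     InferenceEngine::Blob::Ptr blob;
+//     request.getBlobPtr("data", blob);
+//     float *raw = InferenceEnginePython::get_buffer<float>(*blob);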
 
 void InferenceEnginePython::InferRequestWrap::infer() {
@@ -344,13 +356,14 @@ int InferenceEnginePython::InferRequestWrap::wait(int64_t timeout) {
     return static_cast<int >(code);
 }
 
-std::map<std::string, InferenceEnginePython::ProfileInfo> InferenceEnginePython::InferRequestWrap::getPerformanceCounts(){
+std::map<std::string, InferenceEnginePython::ProfileInfo>
+InferenceEnginePython::InferRequestWrap::getPerformanceCounts() {
     std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> perf_counts;
     InferenceEngine::ResponseDesc response;
     request_ptr->GetPerformanceCounts(perf_counts, &response);
     std::map<std::string, InferenceEnginePython::ProfileInfo> perf_map;
 
-    for (auto it : perf_counts){
+    for (auto it : perf_counts) {
         InferenceEnginePython::ProfileInfo profile_info;
         switch (it.second.status) {
             case InferenceEngine::InferenceEngineProfileInfo::EXECUTED:
diff --git a/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.hpp b/inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api_impl.hpp
new file mode 100644 (file)
index 0000000..7bb2dd3
--- /dev/null
@@ -0,0 +1,174 @@
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//        http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <ie_extension.h>
+#include <iterator>
+
+#include <string>
+#include <utility>
+#include <map>
+#include <vector>
+#include <set>
+
+#include <iostream>
+#include <algorithm>
+
+#include <sstream>
+#include <inference_engine.hpp>
+
+namespace InferenceEnginePython {
+struct IENetLayer {
+    InferenceEngine::CNNLayerPtr layer_ptr;
+    InferenceEngine::CNNNetwork network_ptr;
+    std::string name;
+    std::string type;
+    std::string precision;
+    std::string shape;
+    std::string layout;
+    std::vector<std::string> children;
+    std::vector<std::string> parents;
+    std::string affinity;
+    std::map<std::string, std::string> params;
+
+    void setAffinity(const std::string &target_affinity);
+
+    void setParams(const std::map<std::string, std::string> &params_map);
+
+    std::map<std::string, InferenceEngine::Blob::Ptr> getWeights();
+
+    void setPrecision(std::string precision);
+};
+
+struct InputInfo {
+    InferenceEngine::InputInfo actual;
+    std::vector<size_t> dims;
+    std::string precision;
+    std::string layout;
+
+    void setPrecision(std::string precision);
+
+    void setLayout(std::string layout);
+};
+
+struct OutputInfo {
+    InferenceEngine::DataPtr actual;
+    std::vector<size_t> dims;
+    std::string precision;
+    std::string layout;
+
+    void setPrecision(std::string precision);
+};
+
+struct ProfileInfo {
+    std::string status;
+    std::string exec_type;
+    std::string layer_type;
+    int64_t real_time;
+    int64_t cpu_time;
+    unsigned execution_index;
+};
+
+struct IENetwork {
+    InferenceEngine::CNNNetwork actual;
+    std::string name;
+    std::size_t batch_size;
+
+    void setBatch(const size_t size);
+
+    void addOutputs(const std::vector<std::string> &out_layers, const std::string &precision);
+
+    const std::vector<std::pair<std::string, InferenceEnginePython::IENetLayer>> getLayers();
+
+    const std::map<std::string, InferenceEnginePython::InputInfo> getInputs();
+
+    const std::map<std::string, InferenceEnginePython::OutputInfo> getOutputs();
+
+    void reshape(const std::map<std::string, std::vector<size_t>> &input_shapes);
+
+    void serialize(const std::string &path_to_xml, const std::string &path_to_bin);
+
+    void setStats(const std::map<std::string, std::map<std::string, std::vector<float>>> &stats);
+
+    const std::map<std::string, std::map<std::string, std::vector<float>>> getStats();
+
+    IENetwork(const std::string &model, const std::string &weights);
+
+    IENetwork() = default;
+};
+
+struct InferRequestWrap {
+    InferenceEngine::IInferRequest::Ptr request_ptr;
+
+    void infer();
+
+    void infer_async();
+
+    int wait(int64_t timeout);
+
+    void getBlobPtr(const std::string &blob_name, InferenceEngine::Blob::Ptr &blob_ptr);
+
+    void setBatch(int size);
+
+    std::map<std::string, InferenceEnginePython::ProfileInfo> getPerformanceCounts();
+};
+
+
+struct IEExecNetwork {
+    InferenceEngine::IExecutableNetwork::Ptr actual;
+    std::vector<InferRequestWrap> infer_requests;
+    std::string name;
+
+    IEExecNetwork(const std::string &name, size_t num_requests);
+
+    void infer();
+};
+
+
+struct IEPlugin {
+    std::unique_ptr<InferenceEnginePython::IEExecNetwork> load(const InferenceEnginePython::IENetwork &net,
+                                                               int num_requests,
+                                                               const std::map<std::string, std::string> &config);
+
+    std::string device_name;
+    std::string version;
+
+    void setConfig(const std::map<std::string, std::string> &);
+
+    void addCpuExtension(const std::string &extension_path);
+
+    void setInitialAffinity(const InferenceEnginePython::IENetwork &net);
+
+    IEPlugin(const std::string &device, const std::vector<std::string> &plugin_dirs);
+
+    IEPlugin() = default;
+
+    std::set<std::string> queryNetwork(const InferenceEnginePython::IENetwork &net);
+
+    InferenceEngine::InferenceEnginePluginPtr actual;
+};
+
+template<class T>
+T *get_buffer(InferenceEngine::Blob &blob) {
+    return blob.buffer().as<T *>();
+}
+
+template<class T, class... Args>
+std::unique_ptr<T> make_unique(Args &&... args) {
+    return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+std::string get_version();
+};  // namespace InferenceEnginePython
@@ -1,8 +1,3 @@
-# Copyright (C) 2018 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-
 from libc.stddef cimport size_t
 from libcpp.string cimport string
 from libcpp.vector cimport vector
@@ -10,7 +5,6 @@ from libcpp.map cimport map
 from libcpp.set cimport set
 from libcpp.pair cimport pair
 from libcpp.memory cimport unique_ptr, shared_ptr
-from libcpp cimport bool
 from libc.stdint cimport int64_t
 
 
@@ -28,7 +22,7 @@ cdef extern from "<inference_engine.hpp>" namespace "InferenceEngine":
         size_t element_size()  const
 
     cdef cppclass Precision:
-        const char* name() const
+        const char*name() const
 
 
 cdef extern from "ie_api_impl.hpp" namespace "InferenceEnginePython":
@@ -37,9 +31,11 @@ cdef extern from "ie_api_impl.hpp" namespace "InferenceEnginePython":
         string type
         string precision
         string affinity
+        string shape
+        string layout
+        vector[string] children
+        vector[string] parents
         map[string, string] params
-        # map[string, BlobInfo] blob_info
-        # map[string, Blob.Ptr] weights;
         void setAffinity(const string & target_affinity) except +
         void setParams(const map[string, string] & params_map) except +
         map[string, Blob.Ptr] getWeights() except +
@@ -58,7 +54,6 @@ cdef extern from "ie_api_impl.hpp" namespace "InferenceEnginePython":
         string layout
         void setPrecision(string precision)
 
-
     cdef cppclass ProfileInfo:
         string status
         string exec_type
@@ -68,51 +63,50 @@ cdef extern from "ie_api_impl.hpp" namespace "InferenceEnginePython":
         unsigned int execution_index
 
     cdef cppclass WeightsInfo:
-        Blob.Ptr &weights;
-        Blob.Ptr &biases;
+        Blob.Ptr & weights;
+        Blob.Ptr & biases;
         map[string, Blob.Ptr] custom_blobs;
 
-
     cdef cppclass IEExecNetwork:
         vector[InferRequestWrap] infer_requests
 
     cdef cppclass IENetwork:
+        IENetwork() except +
+        IENetwork(const string &, const string &) except +
         string name
         size_t batch_size
         map[string, vector[size_t]] inputs
-        map[string, IENetLayer] getLayers() except +
+        const vector[pair[string, IENetLayer]] getLayers() except +
         map[string, InputInfo] getInputs() except +
         map[string, OutputInfo] getOutputs() except +
         void addOutputs(vector[string] &, string &) except +
-        void setAffinity(map[string, string] &types_affinity_map, map[string, string] &layers_affinity_map) except +
+        void setAffinity(map[string, string] & types_affinity_map, map[string, string] & layers_affinity_map) except +
         void setBatch(size_t size) except +
         void setLayerParams(map[string, map[string, string]] params_map) except +
+        void serialize(const string& path_to_xml, const string& path_to_bin) except +
         void reshape(map[string, vector[size_t]] input_shapes) except +
+        void setStats(map[string, map[string, vector[float]]] & stats) except +
+        map[string, map[string, vector[float]]] getStats() except +
 
     cdef cppclass IEPlugin:
         IEPlugin() except +
         IEPlugin(const string &, const vector[string] &) except +
         unique_ptr[IEExecNetwork] load(IENetwork & net, int num_requests, const map[string, string]& config) except +
         void addCpuExtension(const string &) except +
-        void setConfig(const map[string, string]&) except +
+        void setConfig(const map[string, string] &) except +
         void setInitialAffinity(IENetwork & net) except +
-        set[string] queryNetwork(const IENetwork &net) except +
+        set[string] queryNetwork(const IENetwork & net) except +
         string device_name
         string version
 
-    cdef cppclass IENetReader:
-        IENetwork read(const string &, const string &) except +
-
     cdef cppclass InferRequestWrap:
-        vector[string] getInputsList() except +
-        vector[string] getOutputsList() except +
-        Blob.Ptr& getOutputBlob(const string &blob_name) except +
-        Blob.Ptr& getInputBlob(const string &blob_name) except +
+        void getBlobPtr(const string &blob_name, Blob.Ptr &blob_ptr)
         map[string, ProfileInfo] getPerformanceCounts() except +
         void infer() except +
         void infer_async() except +
         int wait(int64_t timeout) except +
+        void setBatch(int size) except +
 
-    cdef T* get_buffer[T](Blob &)
+    cdef T*get_buffer[T](Blob &)
 
     cdef string get_version()
diff --git a/inference-engine/include/builders/ie_argmax_layer.hpp b/inference-engine/include/builders/ie_argmax_layer.hpp
new file mode 100644 (file)
index 0000000..9ac1b5d
--- /dev/null
@@ -0,0 +1,83 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for ArgMax layer
+ */
+class INFERENCE_ENGINE_API_CLASS(ArgMaxLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit ArgMaxLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit ArgMaxLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    ArgMaxLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    ArgMaxLayer& setPort(const Port& port);
+    /**
+     * @brief Returns axis
+     * @return Axis
+     */
+    int getAxis() const;
+    /**
+     * @brief Sets axis
+     * @param axis Axis
+     * @return reference to layer builder
+     */
+    ArgMaxLayer& setAxis(int axis);
+    /**
+     * @brief Returns top K
+     * @return Top K
+     */
+    size_t getTopK() const;
+    /**
+     * @brief Sets top K
+     * @param topK Top K
+     * @return reference to layer builder
+     */
+    ArgMaxLayer& setTopK(size_t topK);
+    /**
+     * @brief Returns output maximum value
+     * @return Output maximum value
+     */
+    size_t getOutMaxVal() const;
+    /**
+     * @brief Sets output maximum value
+     * @param size Maximum value
+     * @return reference to layer builder
+     */
+    ArgMaxLayer& setOutMaxVal(size_t size);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
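+
+// Usage sketch (illustrative): each setter returns the builder itself, so an
+// ArgMax layer can be configured fluently before being added to a network
+// ("argmax1" and the parameter values are placeholders):
+//
+//     InferenceEngine::Builder::ArgMaxLayer argMax("argmax1");
+//     argMax.setAxis(1).setTopK(5).setOutMaxVal(0);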
diff --git a/inference-engine/include/builders/ie_batch_normalization_layer.hpp b/inference-engine/include/builders/ie_batch_normalization_layer.hpp
new file mode 100644 (file)
index 0000000..dbdf538
--- /dev/null
@@ -0,0 +1,81 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for BatchNormalization layer
+ */
+class INFERENCE_ENGINE_API_CLASS(BatchNormalizationLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit BatchNormalizationLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit BatchNormalizationLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    BatchNormalizationLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    BatchNormalizationLayer& setPort(const Port &port);
+
+    /**
+     * @brief Sets weights for layer
+     * @param weights Constant blob with weights
+     * @return reference to layer builder
+     */
+    BatchNormalizationLayer& setWeights(const Blob::CPtr& weights);
+    /**
+     * @brief Sets biases for layer
+     * @param biases Constant blob with biases
+     * @return reference to layer builder
+     */
+    BatchNormalizationLayer& setBiases(const Blob::CPtr& biases);
+
+    /**
+     * @brief Returns epsilon
+     * @return Epsilon
+     */
+    float getEpsilon() const;
+    /**
+     * @brief Sets epsilon
+     * @param eps Epsilon
+     * @return reference to layer builder
+     */
+    BatchNormalizationLayer& setEpsilon(float eps);
+
+    /**
+     * @brief Validates layer before creation
+     * @param layer generic layer builder
+     */
+    static void validate(const Layer& layer);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_clamp_layer.hpp b/inference-engine/include/builders/ie_clamp_layer.hpp
new file mode 100644 (file)
index 0000000..a575962
--- /dev/null
@@ -0,0 +1,72 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Clamp layer
+ */
+class INFERENCE_ENGINE_API_CLASS(ClampLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit ClampLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit ClampLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    ClampLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    ClampLayer& setPort(const Port& port);
+    /**
+     * @brief Returns minimum value
+     * @return minimum value
+     */
+    float getMinValue() const;
+    /**
+     * @brief Sets minimum value
+     * @param minValue Minimum value
+     * @return reference to layer builder
+     */
+    ClampLayer& setMinValue(float minValue);
+    /**
+     * @brief Returns maximum value
+     * @return Maximum value
+     */
+    float getMaxValue() const;
+    /**
+     * @brief Sets maximum value
+     * @param maxValue Maximum value
+     * @return reference to layer builder
+     */
+    ClampLayer& setMaxValue(float maxValue);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_concat_layer.hpp b/inference-engine/include/builders/ie_concat_layer.hpp
new file mode 100644 (file)
index 0000000..96cd23b
--- /dev/null
@@ -0,0 +1,76 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Concat layer
+ */
+class INFERENCE_ENGINE_API_CLASS(ConcatLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit ConcatLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit ConcatLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    ConcatLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns vector with input ports
+     * @return vector with ports
+     */
+    const std::vector<Port>& getInputPorts() const;
+    /**
+     * @brief Sets input ports
+     * @param ports Vector of input ports
+     * @return reference to layer builder
+     */
+    ConcatLayer& setInputPorts(const std::vector<Port>& ports);
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    ConcatLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns axis
+     * @return Axis
+     */
+    size_t getAxis() const;
+    /**
+     * @brief Sets axis
+     * @param axis Axis
+     * @return reference to layer builder
+     */
+    ConcatLayer& setAxis(size_t axis);
+
+private:
+    size_t axis;
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
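+
+// Usage sketch (illustrative, assuming Port is constructible from a dims
+// vector): Concat takes several input ports and one output port, with the
+// axis selecting the concatenation dimension (name and shapes are placeholders):
+//
+//     InferenceEngine::Builder::ConcatLayer concat("concat1");
+//     concat.setInputPorts({InferenceEngine::Port({1, 32, 28, 28}),
+//                           InferenceEngine::Port({1, 32, 28, 28})})
+//           .setOutputPort(InferenceEngine::Port({1, 64, 28, 28}))
+//           .setAxis(1);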
diff --git a/inference-engine/include/builders/ie_const_layer.hpp b/inference-engine/include/builders/ie_const_layer.hpp
new file mode 100644 (file)
index 0000000..db0b31a
--- /dev/null
@@ -0,0 +1,57 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Const layer
+ */
+class INFERENCE_ENGINE_API_CLASS(ConstLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit ConstLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit ConstLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    ConstLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    ConstLayer& setPort(const Port& port);
+
+    /**
+     * @brief Sets constant data
+     * @param data constant blob with data
+     * @return reference to layer builder
+     */
+    ConstLayer& setData(const Blob::CPtr& data);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_convolution_layer.hpp b/inference-engine/include/builders/ie_convolution_layer.hpp
new file mode 100644 (file)
index 0000000..a577d5e
--- /dev/null
@@ -0,0 +1,163 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <vector>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Convolution layer
+ */
+class INFERENCE_ENGINE_API_CLASS(ConvolutionLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit ConvolutionLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit ConvolutionLayer(Layer& genLayer);
+    /**
+     * @brief Operator creates generic layer builder
+     * @return Generic layer builder
+     */
+    operator Layer() const override;
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    ConvolutionLayer& setName(const std::string& name);
+
+    /**
+     * @brief Sets weights for layer
+     * @param weights Constant blob with weights
+     * @return reference to layer builder
+     */
+    ConvolutionLayer& setWeights(const Blob::CPtr& weights);
+    /**
+     * @brief Sets biases for layer
+     * @param biases Constant blob with biases
+     * @return reference to layer builder
+     */
+    ConvolutionLayer& setBiases(const Blob::CPtr& biases);
+
+    /**
+     * @brief Returns input port
+     * @return Input port
+     */
+    const Port& getInputPort() const;
+    /**
+     * @brief Sets input port
+     * @param port Input port
+     * @return reference to layer builder
+     */
+    ConvolutionLayer& setInputPort(const Port& port);
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    ConvolutionLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns kernel size
+     * @return Kernel size
+     */
+    const std::vector<size_t> getKernel() const;
+    /**
+     * @brief Sets kernel size
+     * @param kernel Kernel size
+     * @return reference to layer builder
+     */
+    ConvolutionLayer& setKernel(const std::vector<size_t>& kernel);
+    /**
+     * @brief Returns vector of strides
+     * @return vector of strides
+     */
+    const std::vector<size_t> getStrides() const;
+    /**
+     * @brief Sets strides
+     * @param strides vector of strides
+     * @return reference to layer builder
+     */
+    ConvolutionLayer& setStrides(const std::vector<size_t>& strides);
+    /**
+     * @brief Returns dilations
+     * @return vector of dilations
+     */
+    const std::vector<size_t> getDilation() const;
+    /**
+     * @brief Sets dilations
+     * @param dilation Vector of dilations
+     * @return reference to layer builder
+     */
+    ConvolutionLayer& setDilation(const std::vector<size_t>& dilation);
+    /**
+     * @brief Returns begin paddings
+     * @return vector of paddings
+     */
+    const std::vector<size_t> getPaddingsBegin() const;
+    /**
+     * @brief Sets begin paddings
+     * @param paddings Vector of paddings
+     * @return reference to layer builder
+     */
+    ConvolutionLayer& setPaddingsBegin(const std::vector<size_t>& paddings);
+    /**
+     * @brief Return end paddings
+     * @return Vector of paddings
+     */
+    const std::vector<size_t> getPaddingsEnd() const;
+    /**
+     * @brief Sets end paddings
+     * @param paddings Vector of paddings
+     * @return reference to layer builder
+     */
+    ConvolutionLayer& setPaddingsEnd(const std::vector<size_t>& paddings);
+    /**
+     * @brief Returns group
+     * @return Group
+     */
+    size_t getGroup() const;
+    /**
+     * @brief Sets group
+     * @param group Group
+     * @return reference to layer builder
+     */
+    ConvolutionLayer& setGroup(size_t group);
+    /**
+     * @brief Return output depth
+     * @return Output depth
+     */
+    size_t getOutDepth() const;
+    /**
+     * @brief Sets output depth
+     * @param outDepth Output depth
+     * @return reference to layer builder
+     */
+    ConvolutionLayer& setOutDepth(size_t outDepth);
+
+    /**
+     * @brief Validates layer before creation
+     * @param layer generic layer builder
+     */
+    static void validate(const Layer& layer);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
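+
+// Usage sketch (illustrative): a 3x3 convolution configured through the fluent
+// setters declared above ("conv1" and all parameter values are placeholders):
+//
+//     InferenceEngine::Builder::ConvolutionLayer conv("conv1");
+//     conv.setKernel({3, 3}).setStrides({1, 1}).setDilation({1, 1})
+//         .setPaddingsBegin({1, 1}).setPaddingsEnd({1, 1})
+//         .setGroup(1).setOutDepth(64);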
diff --git a/inference-engine/include/builders/ie_crop_layer.hpp b/inference-engine/include/builders/ie_crop_layer.hpp
new file mode 100644 (file)
index 0000000..7bfbe94
--- /dev/null
@@ -0,0 +1,90 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Crop layer
+ */
+class INFERENCE_ENGINE_API_CLASS(CropLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit CropLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit CropLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    CropLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns input ports
+     * @return Vector of input ports
+     */
+    const std::vector<Port>& getInputPorts() const;
+    /**
+     * @brief Sets input ports
+     * @param ports Vector of input ports
+     * @return reference to layer builder
+     */
+    CropLayer& setInputPorts(const std::vector<Port>& ports);
+    /**
+     * @brief Return output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    CropLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns axes
+     * @return Vector of axes
+     */
+    const std::vector<size_t> getAxis() const;
+    /**
+     * @brief Sets axes
+     * @param axis Vector of axes
+     * @return reference to layer builder
+     */
+    CropLayer& setAxis(const std::vector<size_t>& axis);
+    /**
+     * @brief Returns offsets
+     * @return Vector of offsets
+     */
+    const std::vector<size_t> getOffset() const;
+    /**
+     * @brief Sets offsets
+     * @param offsets Vector of offsets
+     * @return reference to layer builder
+     */
+    CropLayer& setOffset(const std::vector<size_t>& offsets);
+
+    /**
+     * @brief Validates layer before creation
+     * @param layer generic layer builder
+     */
+    static void validate(const Layer& layer);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_ctc_greedy_decoder_layer.hpp b/inference-engine/include/builders/ie_ctc_greedy_decoder_layer.hpp
new file mode 100644 (file)
index 0000000..78cdbd3
--- /dev/null
@@ -0,0 +1,74 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for CTCGreedyDecoder layer
+ */
+class INFERENCE_ENGINE_API_CLASS(CTCGreedyDecoderLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit CTCGreedyDecoderLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit CTCGreedyDecoderLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    CTCGreedyDecoderLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns input ports
+     * @return Vector of input ports
+     */
+    const std::vector<Port>& getInputPorts() const;
+    /**
+     * @brief Sets input ports
+     * @param ports Vector of input ports
+     * @return reference to layer builder
+     */
+    CTCGreedyDecoderLayer& setInputPorts(const std::vector<Port>& ports);
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    CTCGreedyDecoderLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns CTCMergeRepeated
+     * @return true if merge repeated
+     */
+    bool getCTCMergeRepeated() const;
+    /**
+     * @brief Sets CTCMergeRepeated
+     * @param flag bool value
+     * @return reference to layer builder
+     */
+    CTCGreedyDecoderLayer& setCTCMergeRepeated(bool flag);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
+
diff --git a/inference-engine/include/builders/ie_deconvolution_layer.hpp b/inference-engine/include/builders/ie_deconvolution_layer.hpp
new file mode 100644 (file)
index 0000000..c8d3925
--- /dev/null
@@ -0,0 +1,32 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_convolution_layer.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Deconvolution layer
+ */
+class INFERENCE_ENGINE_API_CLASS(DeconvolutionLayer): public ConvolutionLayer {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit DeconvolutionLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit DeconvolutionLayer(Layer& genLayer);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
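
DeconvolutionLayer adds only constructors, so all configuration happens through setters inherited from ConvolutionLayer. A sketch under the assumption that the base class exposes the usual setKernel/setStrides methods (its header is not part of this excerpt):

```cpp
#include <builders/ie_deconvolution_layer.hpp>

using namespace InferenceEngine;

// A 2x upsampling deconvolution; setKernel/setStrides are assumed to be
// inherited from ConvolutionLayer.
Builder::DeconvolutionLayer makeUpsample() {
    Builder::DeconvolutionLayer deconv("upsample");
    deconv.setKernel({4, 4});
    deconv.setStrides({2, 2});
    return deconv;
}
```
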
diff --git a/inference-engine/include/builders/ie_detection_output_layer.hpp b/inference-engine/include/builders/ie_detection_output_layer.hpp
new file mode 100644 (file)
index 0000000..e4ee542
--- /dev/null
@@ -0,0 +1,183 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for DetectionOutput layer
+ */
+class INFERENCE_ENGINE_API_CLASS(DetectionOutputLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit DetectionOutputLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit DetectionOutputLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    DetectionOutputLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    DetectionOutputLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns input ports
+     * @return Vector of input ports
+     */
+    const std::vector<Port>& getInputPorts() const;
+    /**
+     * @brief Sets input ports
+     * @param ports Vector of input ports
+     * @return reference to layer builder
+     */
+    DetectionOutputLayer& setInputPorts(const std::vector<Port>& ports);
+    /**
+     * @brief Returns number of classes
+     * @return Number of classes
+     */
+    size_t getNumClasses() const;
+    /**
+     * @brief Sets the number of classes to be predicted
+     * @param num Number of classes
+     * @return reference to layer builder
+     */
+    DetectionOutputLayer& setNumClasses(size_t num);
+    /**
+     * @brief Returns background label ID
+     * @return Background ID
+     */
+    int getBackgroudLabelId() const;
+    /**
+     * @brief Sets background label ID
+     * @param labelId Background label ID; set it to -1 if there is no background class.
+     * @return reference to layer builder
+     */
+    DetectionOutputLayer& setBackgroudLabelId(int labelId);
+    /**
+     * @brief Returns maximum number of results to be kept at the NMS stage
+     * @return Top K
+     */
+    int getTopK() const;
+    /**
+     * @brief Sets maximum number of results to be kept at the NMS stage
+     * @param topK Top K
+     * @return reference to layer builder
+     */
+    DetectionOutputLayer& setTopK(int topK);
+    /**
+     * @brief Returns number of total boxes to be kept per image after NMS step
+     * @return Keep top K
+     */
+    int getKeepTopK() const;
+    /**
+     * @brief Sets number of total boxes to be kept per image after NMS step
+     * @param topK Keep top K
+     * @return reference to layer builder
+     */
+    DetectionOutputLayer& setKeepTopK(int topK);
+    /**
+     * @brief Returns number of oriented classes
+     * @return Number of oriented classes
+     */
+    int getNumOrientClasses() const;
+    /**
+     * @brief Sets number of oriented classes
+     * @param numClasses Number of classes
+     * @return reference to layer builder
+     */
+    DetectionOutputLayer& setNumOrientClasses(int numClasses);
+    /**
+     * @brief Returns type of coding method for bounding boxes
+     * @return String with code type
+     */
+    std::string getCodeType() const;
+    /**
+     * @brief Sets type of coding method for bounding boxes
+     * @param type Type
+     * @return reference to layer builder
+     */
+    DetectionOutputLayer& setCodeType(std::string type);
+    /**
+     * @brief Returns interpolate orientation
+     * @return Interpolate orientation
+     */
+    int getInterpolateOrientation() const;
+    /**
+     * @brief Sets interpolate orientation
+     * @param orient Orientation
+     * @return reference to layer builder
+     */
+    DetectionOutputLayer& setInterpolateOrientation(int orient);
+    /**
+     * @brief Returns threshold to be used in the NMS stage
+     * @return Threshold
+     */
+    float getNMSThreshold() const;
+    /**
+     * @brief Sets threshold to be used in the NMS stage
+     * @param threshold NMS threshold
+     * @return reference to layer builder
+     */
+    DetectionOutputLayer& setNMSThreshold(float threshold);
+    /**
+     * @brief Returns confidence threshold
+     * @return Threshold
+     */
+    float getConfidenceThreshold() const;
+    /**
+     * @brief Sets confidence threshold
+     * @param threshold Threshold
+     * @return reference to layer builder
+     */
+    DetectionOutputLayer& setConfidenceThreshold(float threshold);
+    /**
+     * @brief Returns share location
+     * @return true if bounding boxes are shared among different classes
+     */
+    bool getShareLocation() const;
+    /**
+     * @brief Sets share location
+     * @param flag true if bounding boxes are shared among different classes
+     * @return reference to layer builder
+     */
+    DetectionOutputLayer& setShareLocation(bool flag);
+    /**
+     * @brief Returns encoded settings
+     * @return true if variance is encoded in target
+     */
+    bool getVariantEncodedInTarget() const;
+    /**
+     * @brief Sets encoded settings
+     * @param flag true if variance is encoded in target
+     * @return reference to layer builder
+     */
+    DetectionOutputLayer& setVariantEncodedInTarget(bool flag);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
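
A usage sketch with typical SSD-style values; the code-type string follows the common Caffe convention and is an assumption here, not something this header mandates:

```cpp
#include <builders/ie_detection_output_layer.hpp>

using namespace InferenceEngine;

// Typical SSD post-processing: 21 classes with background as class 0,
// NMS at IoU 0.45, keeping 200 boxes per image.
Builder::DetectionOutputLayer makeDetectionOutput() {
    Builder::DetectionOutputLayer out("detection_out");
    out.setNumClasses(21)
       .setBackgroudLabelId(0)  // spelling follows the header
       .setTopK(400)
       .setKeepTopK(200)
       .setCodeType("caffe.PriorBoxParameter.CENTER_SIZE")  // assumed convention
       .setNMSThreshold(0.45f)
       .setConfidenceThreshold(0.01f)
       .setShareLocation(true)
       .setVariantEncodedInTarget(false);
    return out;
}
```
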
diff --git a/inference-engine/include/builders/ie_eltwise_layer.hpp b/inference-engine/include/builders/ie_eltwise_layer.hpp
new file mode 100644 (file)
index 0000000..ffdacba
--- /dev/null
@@ -0,0 +1,96 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Eltwise layer
+ */
+class INFERENCE_ENGINE_API_CLASS(EltwiseLayer): public LayerFragment {
+public:
+    /**
+     * @brief The enum defines all Eltwise types
+     */
+    enum EltwiseType {
+        SUM = 1,
+        MAX,
+        MUL
+    };
+
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit EltwiseLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit EltwiseLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    EltwiseLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns input ports
+     * @return Vector of input ports
+     */
+    const std::vector<Port>& getInputPorts() const;
+    /**
+     * @brief Sets input ports
+     * @param ports Vector of input ports
+     * @return reference to layer builder
+     */
+    EltwiseLayer& setInputPorts(const std::vector<Port>& ports);
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    EltwiseLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns eltwise type
+     * @return Eltwise type
+     */
+    EltwiseType getEltwiseType() const;
+    /**
+     * @brief Sets eltwise type
+     * @param type Eltwise type
+     * @return reference to layer builder
+     */
+    EltwiseLayer& setEltwiseType(EltwiseType type);
+    /**
+     * @brief Returns eltwise scales
+     * @return Vector of scales
+     */
+    const std::vector<float> getScales() const;
+    /**
+     * @brief Sets eltwise scales
+     * @param scales Vector of scales
+     * @return reference to layer builder
+     */
+    EltwiseLayer& setScales(const std::vector<float>& scales);
+
+private:
+    EltwiseType type;
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
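
A sketch of an element-wise sum of two inputs. EltwiseType is an unscoped enum nested in the class, so its values are qualified with the class name:

```cpp
#include <builders/ie_eltwise_layer.hpp>

using namespace InferenceEngine;

// Element-wise weighted sum of two equally shaped inputs.
Builder::EltwiseLayer makeSum(const Port& a, const Port& b) {
    Builder::EltwiseLayer sum("sum");
    sum.setInputPorts({a, b})
       .setOutputPort(a)  // the output shape matches the inputs
       .setEltwiseType(Builder::EltwiseLayer::SUM)
       .setScales({1.0f, 1.0f});
    return sum;
}
```
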
diff --git a/inference-engine/include/builders/ie_elu_layer.hpp b/inference-engine/include/builders/ie_elu_layer.hpp
new file mode 100644 (file)
index 0000000..ad5b3b4
--- /dev/null
@@ -0,0 +1,62 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for ELU layer
+ */
+class INFERENCE_ENGINE_API_CLASS(ELULayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit ELULayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit ELULayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    ELULayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    ELULayer& setPort(const Port& port);
+
+    /**
+     * @brief Returns alpha
+     * @return alpha
+     */
+    float getAlpha() const;
+    /**
+     * @brief Sets alpha
+     * @param alpha Alpha
+     * @return reference to layer builder
+     */
+    ELULayer& setAlpha(float alpha);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_fully_connected_layer.hpp b/inference-engine/include/builders/ie_fully_connected_layer.hpp
new file mode 100644 (file)
index 0000000..9b03f7d
--- /dev/null
@@ -0,0 +1,85 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for FullyConnected layer
+ */
+class INFERENCE_ENGINE_API_CLASS(FullyConnectedLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit FullyConnectedLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit FullyConnectedLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    FullyConnectedLayer& setName(const std::string& name);
+
+    /**
+     * @brief Sets weights for layer
+     * @param weights Constant blob with weights
+     * @return reference to layer builder
+     */
+    FullyConnectedLayer& setWeights(const Blob::CPtr& weights);
+    /**
+     * @brief Sets biases for layer
+     * @param biases Constant blob with biases
+     * @return reference to layer builder
+     */
+    FullyConnectedLayer& setBiases(const Blob::CPtr& biases);
+
+    /**
+     * @brief Returns input port
+     * @return Input port
+     */
+    const Port& getInputPort() const;
+    /**
+     * @brief Sets input port
+     * @param port Input port
+     * @return reference to layer builder
+     */
+    FullyConnectedLayer& setInputPort(const Port& port);
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    FullyConnectedLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns output size
+     * @return Output size
+     */
+    size_t getOutputNum() const;
+    /**
+     * @brief Sets output size
+     * @param outNum Output size
+     * @return reference to layer builder
+     */
+    FullyConnectedLayer& setOutputNum(size_t outNum);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
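
A sketch of a 1024-to-1000 fully connected layer. The weight and bias blobs are taken as parameters; how they are allocated (for example via the blob helpers in ie_blob.h) is outside this header. The Port-from-shape constructor is an assumption based on ie_inetwork.hpp:

```cpp
#include <builders/ie_fully_connected_layer.hpp>
#include <ie_blob.h>

using namespace InferenceEngine;

// 1024 -> 1000 fully connected layer with constant weights and biases.
Builder::FullyConnectedLayer makeFc(const Blob::CPtr& weights,   // 1000x1024 values
                                    const Blob::CPtr& biases) {  // 1000 values
    Builder::FullyConnectedLayer fc("fc1000");
    fc.setInputPort(Port({1, 1024}))   // Port-from-shape ctor assumed
      .setOutputPort(Port({1, 1000}))
      .setOutputNum(1000)
      .setWeights(weights)
      .setBiases(biases);
    return fc;
}
```
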
diff --git a/inference-engine/include/builders/ie_grn_layer.hpp b/inference-engine/include/builders/ie_grn_layer.hpp
new file mode 100644 (file)
index 0000000..f06f903
--- /dev/null
@@ -0,0 +1,61 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for GRN layer
+ */
+class INFERENCE_ENGINE_API_CLASS(GRNLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit GRNLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit GRNLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    GRNLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    GRNLayer& setPort(const Port& port);
+    /**
+     * @brief Returns beta
+     * @return Beta
+     */
+    float getBeta() const;
+    /**
+     * @brief Sets beta
+     * @param beta Beta
+     * @return reference to layer builder
+     */
+    GRNLayer& setBeta(float beta);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_input_layer.hpp b/inference-engine/include/builders/ie_input_layer.hpp
new file mode 100644 (file)
index 0000000..5312fcd
--- /dev/null
@@ -0,0 +1,56 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Input layer
+ */
+class INFERENCE_ENGINE_API_CLASS(InputLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit InputLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit InputLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    InputLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    InputLayer& setPort(const Port &port);
+
+    /**
+     * @brief Validates layer before creation
+     * @param layer generic layer builder
+     */
+    static void validate(const Layer& layer);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
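
A minimal sketch, assuming Port can be constructed from a shape vector as ie_inetwork.hpp suggests; validate() throws if the configuration is invalid:

```cpp
#include <builders/ie_input_layer.hpp>

using namespace InferenceEngine;

// Declares a 4D NCHW network input and runs the static validator on it.
void declareInput() {
    Builder::InputLayer input("data");
    input.setPort(Port({1, 3, 224, 224}));  // Port-from-shape ctor assumed
    Builder::InputLayer::validate(input);   // uses the implicit conversion to Layer
}
```
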
diff --git a/inference-engine/include/builders/ie_layer_builder.hpp b/inference-engine/include/builders/ie_layer_builder.hpp
new file mode 100644 (file)
index 0000000..47620fa
--- /dev/null
@@ -0,0 +1,248 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <details/caseless.hpp>
+#include <ie_parameter.hpp>
+#include <ie_inetwork.hpp>
+#include <ie_blob.h>
+#include <string>
+#include <vector>
+#include <memory>
+#include <functional>
+#include <map>
+
+namespace InferenceEngine {
+namespace Builder {
+
+class Layer;
+
+/**
+ * @brief This structure implements a holder for validators
+ */
+struct ValidatorsHolder {
+    /**
+     * @brief Caseless map connects type with validator
+     */
+    details::caseless_map<std::string, std::function<void(const Layer&)>> validators;
+};
+
+/**
+ * @brief This class implements a builder for IE Layer
+ */
+class INFERENCE_ENGINE_API_CLASS(Layer) {
+public:
+    /**
+     * @brief The constructor creates a Layer builder with layer type and layer name
+     * @param type Layer type
+     * @param name Layer name
+     */
+    explicit Layer(const std::string& type, const std::string& name = "");
+    /**
+     * @brief The constructor creates a Layer builder from shared pointer to ILayer
+     * @param layer shared pointer to ILayer
+     */
+    explicit Layer(const ILayer::Ptr& layer);
+    /**
+     * @brief The constructor creates a Layer builder from shared pointer to constant ILayer
+     * @param layer shared pointer to constant ILayer
+     */
+    explicit Layer(const ILayer::CPtr& layer);
+    /**
+     * @brief The constructor creates a Layer builder with layer ID and layer builder
+     * @param id Layer ID
+     * @param layer layer builder
+     */
+    Layer(idx_t id, const Layer& layer);
+
+    /**
+     * @brief Returns layer builder ID
+     * @return ID
+     */
+    idx_t getId() const;
+
+    /**
+     * @brief Returns a reference to layer type
+     * @return Layer type
+     */
+    std::string& getType();
+    /**
+     * @brief Returns a reference to constant layer type
+     * @return constant layer type
+     */
+    const std::string& getType() const;
+    /**
+     * @brief Sets layer type
+     * @param type Layer type
+     * @return Reference to Layer builder
+     */
+    Layer& setType(const std::string& type);
+
+    /**
+     * @brief Returns a reference to layer name
+     * @return Layer name
+     */
+    std::string& getName();
+    /**
+     * @brief Returns a reference to constant layer name
+     * @return constant layer name
+     */
+    const std::string& getName() const;
+    /**
+     * @brief Sets layer name
+     * @param name Layer name
+     * @return Reference to Layer builder
+     */
+    Layer& setName(const std::string& name);
+
+    /**
+     * @brief Returns layer subgraph
+     * @return shared pointer to INetwork
+     */
+    INetwork::Ptr& getGraph();
+    /**
+     * @brief Returns constant layer subgraph
+     * @return constant shared pointer to INetwork
+     */
+    const INetwork::Ptr& getGraph() const;
+    /**
+     * @brief Sets layer subgraph
+     * @param graph constant shared pointer to INetwork
+     * @return Reference to Layer builder
+     */
+    Layer& setGraph(const INetwork::Ptr& graph);
+
+    /**
+     * @brief Returns map of parameters
+     * @return map of parameters
+     */
+    std::map<std::string, Parameter>& getParameters();
+    /**
+     * @brief Returns constant map of parameters
+     * @return constant map of parameters
+     */
+    const std::map<std::string, Parameter>& getParameters() const;
+    /**
+     * @brief Sets parameters for layer
+     * @param params constant map of parameters
+     * @return Reference to Layer builder
+     */
+    Layer& setParameters(const std::map<std::string, Parameter>& params);
+
+    /**
+     * @brief Returns map of internal blobs
+     * @return map of internal blobs
+     */
+    std::map<std::string, Blob::CPtr>& getConstantData();
+    /**
+     * @brief Returns constant map of internal blobs
+     * @return constant map of internal blobs
+     */
+    const std::map<std::string, Blob::CPtr>& getConstantData() const;
+    /**
+     * @brief Sets constant data for layer
+     * @param constData constant map of shared pointers to blobs
+     * @return Reference to Layer builder
+     */
+    Layer& setConstantData(const std::map<std::string, Blob::Ptr>& constData);
+    /**
+     * @brief Sets constant data for layer
+     * @param constData constant map of shared pointers to constant blobs
+     * @return Reference to Layer builder
+     */
+    Layer& setConstantData(const std::map<std::string, Blob::CPtr>& constData);
+    /**
+     * @brief Adds constant data for layer by name
+     * @param name Name of constant data
+     * @param data shared pointer to constant blob
+     * @return Reference to Layer builder
+     */
+    Layer& addConstantData(const std::string& name, const Blob::CPtr& data);
+
+    /**
+     * @brief Returns vector of input ports
+     * @return Vector of input ports
+     */
+    std::vector<Port>& getInputPorts();
+    /**
+     * @brief Returns constant vector of input ports
+     * @return constant vector of input ports
+     */
+    const std::vector<Port>& getInputPorts() const;
+    /**
+     * @brief Sets input ports
+     * @param ports vector of ports
+     * @return Reference to Layer builder
+     */
+    Layer& setInputPorts(const std::vector<Port> &ports);
+
+    /**
+     * @brief Returns vector of output ports
+     * @return Vector of output ports
+     */
+    std::vector<Port>& getOutputPorts();
+    /**
+     * @brief Returns constant vector of output ports
+     * @return constant vector of output ports
+     */
+    const std::vector<Port>& getOutputPorts() const;
+    /**
+     * @brief Sets output ports
+     * @param ports vector of ports
+     * @return Reference to Layer builder
+     */
+    Layer& setOutputPorts(const std::vector<Port> &ports);
+
+    /**
+     * @brief Validates the current builder and generates ILayer object
+     * @return constant shared pointer to ILayer
+     */
+    const ILayer::Ptr build() const;
+
+    /**
+     * @brief Validates layer builder
+     */
+    void validate() const;
+
+    /**
+     * @brief Registers a new validator for type
+     * @param type Layer type
+     * @param validator Layer validator
+     */
+    static void addValidator(const std::string& type, const std::function<void(const Layer&)>& validator);
+
+private:
+    idx_t id;
+    std::string type;
+    std::string name;
+    INetwork::Ptr graph;
+    std::vector<Port> inPorts;
+    std::vector<Port> outPorts;
+    std::map<std::string, Parameter> params;
+    std::map<std::string, Blob::CPtr> constData;
+
+    static std::shared_ptr<ValidatorsHolder> getValidatorsHolder();
+};
+
+/**
+ * @brief This class registers layer validators
+ */
+class ValidatorRegisterBase {
+public:
+    /**
+     * @brief The constructor registers new layer validator
+     * @param type Layer type
+     * @param validator Layer validator
+     */
+    explicit ValidatorRegisterBase(const std::string& type, const std::function<void(const Layer&)>& validator) {
+        InferenceEngine::Builder::Layer::addValidator(type, validator);
+    }
+};
+
+#define REG_VALIDATOR_FOR(__type, __validator) \
+static InferenceEngine::Builder::ValidatorRegisterBase _reg_##__type(#__type, __validator)
+
+}  // namespace Builder
+}  // namespace InferenceEngine
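
Two small sketches of what this header introduces: a validator registered through the REG_VALIDATOR_FOR macro defined above, and a generic layer configured through its parameter map. The "Swish" type, the Parameter-from-float assignment, and the Port-from-shape constructor are illustrative assumptions:

```cpp
#include <builders/ie_layer_builder.hpp>
#include <details/ie_exception.hpp>  // for THROW_IE_EXCEPTION

using namespace InferenceEngine;

// Registers a validator for a hypothetical "Swish" layer type: exactly one
// input port and one output port are required.
REG_VALIDATOR_FOR(Swish, [](const Builder::Layer& layer) {
    if (layer.getInputPorts().size() != 1 || layer.getOutputPorts().size() != 1)
        THROW_IE_EXCEPTION << "Swish layer must have one input and one output";
});

// A generic builder carries its type, name, parameters and ports explicitly.
Builder::Layer makeSwish() {
    Builder::Layer layer("Swish", "my_swish");
    layer.getParameters()["beta"] = 1.0f;  // Parameter holds arbitrary values
    layer.setInputPorts({Port({1, 64})});  // Port-from-shape ctor assumed
    layer.setOutputPorts({Port({1, 64})});
    return layer;
}
```
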
diff --git a/inference-engine/include/builders/ie_layer_fragment.hpp b/inference-engine/include/builders/ie_layer_fragment.hpp
new file mode 100644 (file)
index 0000000..a9723b3
--- /dev/null
@@ -0,0 +1,76 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_builder.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief This class defines the basic functionality for layer builders
+ */
+class INFERENCE_ENGINE_API_CLASS(LayerFragment) {
+public:
+    /**
+     * @brief The constructor creates layer builders with layer type and layer name
+     * @param type Layer type
+     * @param name Layer name
+     */
+    LayerFragment(const std::string& type, const std::string& name);
+    /**
+     * @brief The constructor creates layer builders from reference to generic layer builder
+     * @param genLayer Generic layer builder
+     */
+    explicit LayerFragment(Layer& genLayer);
+    /**
+     * @brief The copy constructor
+     * @param rval Source builder
+     */
+    explicit LayerFragment(const LayerFragment& rval);
+
+    /**
+     * @brief Copy operator for LayerFragment
+     * @param rval Source builder
+     * @return Layer builder
+     */
+    LayerFragment& operator=(const LayerFragment& rval);
+
+    /**
+     * @brief Virtual destructor
+     */
+    virtual ~LayerFragment() = default;
+
+    /**
+     * @brief The operator creates generic builder
+     * @return Generic builder
+     */
+    virtual operator Layer() const;
+
+    /**
+     * @brief Returns layer type
+     * @return Layer type
+     */
+    const std::string& getType() const;
+    /**
+     * @brief Returns layer name
+     * @return Layer name
+     */
+    const std::string& getName() const;
+
+protected:
+    const std::vector<size_t> uInts2size_t(const std::vector<unsigned int>& vector) const;
+    Layer& getLayer() const;
+
+private:
+    Layer layer;
+    Layer& refLayer;
+};
+
+}  // namespace Builder
+
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_memory_layer.hpp b/inference-engine/include/builders/ie_memory_layer.hpp
new file mode 100644 (file)
index 0000000..b399e95
--- /dev/null
@@ -0,0 +1,94 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Memory layer
+ */
+class INFERENCE_ENGINE_API_CLASS(MemoryLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit MemoryLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit MemoryLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    MemoryLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    MemoryLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns input port
+     * @return Input port
+     */
+    const Port& getInputPort() const;
+    /**
+     * @brief Sets input port
+     * @param port Input port
+     * @return reference to layer builder
+     */
+    MemoryLayer& setInputPort(const Port& port);
+    /**
+     * @brief Returns memory ID
+     * @return String with memory ID
+     */
+    const std::string getId() const;
+    /**
+     * @brief Sets memory ID
+     * @param id Memory ID
+     * @return reference to layer builder
+     */
+    MemoryLayer& setId(const std::string& id);
+    /**
+     * @brief Returns the index of memory layer
+     * @return Index
+     */
+    size_t getIndex() const;
+    /**
+     * @brief Sets the index of memory layer
+     * @param index Index; a value of 0 means this layer is the output one.
+     * @return reference to layer builder
+     */
+    MemoryLayer& setIndex(size_t index);
+    /**
+     * @brief Returns size of the group
+     * @return Size of the group
+     */
+    size_t getSize() const;
+    /**
+     * @brief Sets size of the group
+     * @param size Group size; only a size of 2 (a pair) is supported.
+     * @return reference to layer builder
+     */
+    MemoryLayer& setSize(size_t size);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
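
A sketch of the write side of a memory pair; per the comments above, index 0 marks the output (write) layer, cells are matched by a shared ID, and only groups of size 2 are supported. The read-side counterpart is assumed to use index 1:

```cpp
#include <builders/ie_memory_layer.hpp>

using namespace InferenceEngine;

// Write side of a recurrent state cell; a second MemoryLayer with the same ID
// forms the read side of the pair.
Builder::MemoryLayer makeStateWrite(const Port& state) {
    Builder::MemoryLayer cell("state_write");
    cell.setInputPort(state)
        .setId("rnn_state_0")  // pairs are matched by this ID
        .setIndex(0)           // 0 means this layer is the output one
        .setSize(2);           // only pairs are supported
    return cell;
}
```
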
diff --git a/inference-engine/include/builders/ie_mvn_layer.hpp b/inference-engine/include/builders/ie_mvn_layer.hpp
new file mode 100644 (file)
index 0000000..ef92351
--- /dev/null
@@ -0,0 +1,83 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for MVN layer
+ */
+class INFERENCE_ENGINE_API_CLASS(MVNLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit MVNLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit MVNLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    MVNLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    MVNLayer& setPort(const Port& port);
+    /**
+     * @brief Returns across channels value
+     * @return true if mean values are shared across channels
+     */
+    bool getAcrossChannels() const;
+    /**
+     * @brief Sets across channels
+     * @param flag true if mean values are shared across channels
+     * @return reference to layer builder
+     */
+    MVNLayer& setAcrossChannels(bool flag);
+    /**
+     * @brief Returns normalize variance
+     * @return true if variance normalization is performed
+     */
+    bool getNormalize() const;
+    /**
+     * @brief Sets normalize variance
+     * @param flag true if variance normalization is performed
+     * @return reference to layer builder
+     */
+    MVNLayer& setNormalize(bool flag);
+    /**
+     * @brief Returns epsilon
+     * @return Epsilon
+     */
+    float getEpsilon() const;
+    /**
+     * @brief Sets epsilon
+     * @param eps Epsilon
+     * @return reference to layer builder
+     */
+    MVNLayer& setEpsilon(float eps);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_network_builder.hpp b/inference-engine/include/builders/ie_network_builder.hpp
new file mode 100644 (file)
index 0000000..586a267
--- /dev/null
@@ -0,0 +1,185 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_builder.hpp>
+#include <ie_icnn_network.hpp>
+#include <cpp/ie_cnn_network.h>
+#include <ie_inetwork.hpp>
+#include <ie_context.hpp>
+#include <ie_common.h>
+#include <ie_blob.h>
+#include <utility>
+#include <memory>
+#include <string>
+#include <vector>
+#include <map>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief This class implements a builder for IE Network
+ */
+class INFERENCE_ENGINE_API_CLASS(Network) {
+public:
+    /**
+     * @brief A shared pointer to the Network builder
+     */
+    using Ptr = std::shared_ptr<Network>;
+
+    /**
+     * @brief The constructor creates a builder based on ICNNNetwork
+     *
+     * @param network constant reference to ICNNNetwork object
+     */
+    explicit Network(const ICNNNetwork& network);
+    /**
+     * @brief The constructor creates an empty builder with network name
+     *
+     * @param name Network name
+     */
+    explicit Network(const std::string& name);
+    /**
+     * @brief The constructor creates a builder based on INetwork
+     *
+     * @param network constant reference to INetwork object
+     */
+    explicit Network(const INetwork& network);
+
+    /**
+     * @brief The constructor creates a builder based on ICNNNetwork with custom Context
+     *
+     * @param ieContext constant reference to Context object
+     * @param network constant reference to ICNNNetwork object
+     */
+    Network(const Context& ieContext, const ICNNNetwork& network);
+    /**
+     * @brief The constructor creates an empty builder with network name and custom Context
+     *
+     * @param ieContext constant reference to Context object
+     * @param name Network name
+     */
+    Network(const Context& ieContext, const std::string& name);
+    /**
+     * @brief The constructor creates a builder based on INetwork with custom Context
+     *
+     * @param ieContext constant reference to Context object
+     * @param network constant reference to INetwork object
+     */
+    Network(const Context& ieContext, const INetwork& network);
+
+    /**
+     * @brief Virtual destructor
+     */
+    virtual ~Network() = default;
+
+    /**
+     * @brief Adds new layer and connects it with previous layers
+     *
+     * @param inputs Vector with PortInfo objects from previous layers
+     * @param layer Layer builder for new layer
+     *
+     * @return Id of new builder for the current network
+     */
+    idx_t addLayer(const std::vector<PortInfo>& inputs, const Layer& layer);
+    /**
+     * @brief Adds new layer
+     *
+     * @param layer Layer builder for new layer
+     *
+     * @return Id of new builder for the current network
+     */
+    idx_t addLayer(const Layer& layer);
+    /**
+     * @brief Removes a layer by ID
+     *
+     * @param layerId Layer ID
+     */
+    void removeLayer(idx_t layerId);
+
+    /**
+     * @brief Connects two layers
+     *
+     * @param input PortInfo object from previous layer
+     * @param output PortInfo object from next layer
+     */
+    void connect(const PortInfo& input, const PortInfo& output);
+    /**
+     * @brief Removes connection from the network
+     *
+     * @param connection Connection
+     */
+    void disconnect(const Connection& connection);
+
+    /**
+     * @brief Returns layer builder by ID
+     *
+     * @param layerId Layer ID
+     *
+     * @return Layer builder
+     */
+    Layer& getLayer(idx_t layerId);
+    /**
+     * @brief Returns constant layer builder by ID
+     *
+     * @param layerId Layer ID
+     *
+     * @return constant layer builder
+     */
+    const Layer& getLayer(idx_t layerId) const;
+
+    /**
+     * @brief Returns vector of layer builders
+     *
+     * @return Vector of layer builders
+     */
+    std::vector<Layer>& getLayers();
+    /**
+     * @brief Returns constant vector of layer builders
+     *
+     * @return constant vector of layer builders
+     */
+    const std::vector<Layer>& getLayers() const;
+
+    /**
+     * @brief Returns all connections for layer
+     *
+     * @param layerId Layer ID
+     *
+     * @return Vector of connections for the current layer
+     */
+    const std::vector<Connection> getLayerConnections(idx_t layerId) const noexcept;
+
+    /**
+     * @brief Builds and validates the network
+     *
+     * @return const shared pointer to INetwork
+     */
+    const INetwork::Ptr build() const;
+
+    /**
+     * @brief The operator builds network
+     *
+     * @return const shared pointer to INetwork
+     */
+    explicit operator const INetwork::Ptr() const;
+
+private:
+    const Context ctx;
+    const size_t version;
+    std::string name;
+    std::vector<Layer> layers;
+    std::vector<Connection> connections;
+};
+
+/**
+ * @brief This function converts INetwork to ICNNNetwork
+ *
+ * @param network constant shared pointer to INetwork object
+ * @return constant shared pointer to ICNNNetwork
+ */
+INFERENCE_ENGINE_API_CPP(const std::shared_ptr<ICNNNetwork>) convertToICNNNetwork(const INetwork::Ptr& network);
+
+}  // namespace Builder
+
+}  // namespace InferenceEngine
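
Putting the builders together, an end-to-end sketch. It assumes Port is constructible from a shape vector and PortInfo from a layer ID (both from ie_inetwork.hpp) and relies on the implicit conversion of the typed builders to the generic Layer:

```cpp
#include <builders/ie_network_builder.hpp>
#include <builders/ie_input_layer.hpp>
#include <builders/ie_elu_layer.hpp>
#include <builders/ie_output_layer.hpp>
#include <memory>

using namespace InferenceEngine;

// Builds a trivial Input -> ELU -> Output graph and lowers it to ICNNNetwork.
std::shared_ptr<ICNNNetwork> buildExample() {
    Builder::Network net("example");
    const Port shape({1, 3, 224, 224});  // Port-from-shape ctor assumed

    idx_t in  = net.addLayer(Builder::InputLayer("data").setPort(shape));
    idx_t elu = net.addLayer({PortInfo(in)},  // connect to the input layer
                             Builder::ELULayer("activation").setAlpha(1.0f).setPort(shape));
    net.addLayer({PortInfo(elu)}, Builder::OutputLayer("out").setPort(shape));

    return Builder::convertToICNNNetwork(net.build());  // validates, then converts
}
```
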
diff --git a/inference-engine/include/builders/ie_norm_layer.hpp b/inference-engine/include/builders/ie_norm_layer.hpp
new file mode 100644 (file)
index 0000000..58d972b
--- /dev/null
@@ -0,0 +1,112 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Norm layer
+ */
+class INFERENCE_ENGINE_API_CLASS(NormLayer): public LayerFragment {
+public:
+    /**
+     * @brief The enum defines all Norm types
+     */
+    enum NormType {
+        WITHIN_CHANNEL = 0,
+        ACROSS_CHANNELS = 1
+    };
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit NormLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit NormLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    NormLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    NormLayer& setPort(const Port& port);
+    /**
+     * @brief Returns side length of the region
+     * @return Size
+     */
+    size_t getSize() const;
+    /**
+     * @brief Sets side length of the region
+     * @param size Size
+     * @return reference to layer builder
+     */
+    NormLayer& setSize(size_t size);
+    /**
+     * @brief Returns scaling parameter for the normalizing sum
+     * @return Scaling parameter
+     */
+    float getAlpha() const;
+    /**
+     * @brief Sets scaling parameter for the normalizing sum
+     * @param alpha Scaling parameter
+     * @return reference to layer builder
+     */
+    NormLayer& setAlpha(float alpha);
+    /**
+     * @brief Returns exponent for the normalizing sum
+     * @return Exponent
+     */
+    float getBeta() const;
+    /**
+     * @brief Sets exponent for the normalizing sum
+     * @param beta Exponent
+     * @return reference to layer builder
+     */
+    NormLayer& setBeta(float beta);
+    /**
+     * @brief Returns region type
+     * @return true if normalizing sum is performed over adjacent channels
+     */
+    bool getAcrossMaps() const;
+    /**
+     * @brief Sets region type
+     * @param acrossMap true if normalizing sum is performed over adjacent channels
+     * @return reference to layer builder
+     */
+    NormLayer& setAcrossMaps(bool acrossMap);
+    /**
+     * @brief Returns region type
+     * @return Norm type
+     */
+    NormType getRegion() const;
+    /**
+     * @brief Sets region type
+     * @param type region type
+     * @return reference to layer builder
+     */
+    NormLayer& setRegion(NormType type);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_normalize_layer.hpp b/inference-engine/include/builders/ie_normalize_layer.hpp
new file mode 100644 (file)
index 0000000..bc05381
--- /dev/null
@@ -0,0 +1,85 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Normalize layer
+ */
+class INFERENCE_ENGINE_API_CLASS(NormalizeLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit NormalizeLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit NormalizeLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    NormalizeLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    NormalizeLayer& setPort(const Port& port);
+
+    /**
+     * @brief Returns channel shared flag
+     * @return true if scale parameters are shared across channels
+     */
+    bool getChannelShared() const;
+    /**
+     * @brief Sets channel shared flag
+     * @param acrossMap true if scale parameters are shared across channels
+     * @return reference to layer builder
+     */
+    NormalizeLayer& setChannelShared(bool acrossMap);
+    /**
+     * @brief Returns across maps
+     * @return true if normalization is shared across channels
+     */
+    bool getAcrossMaps() const;
+    /**
+     * @brief Sets across map
+     * @param acrossMap true if normalization is shared across channels
+     * @return reference to layer builder
+     */
+    NormalizeLayer& setAcrossMaps(bool acrossMap);
+
+    /**
+     * @brief Returns epsilon
+     * @return Epsilon
+     */
+    float getEpsilon() const;
+    /**
+     * @brief Sets epsilon
+     * @param eps Epsilon
+     * @return reference to layer builder
+     */
+    NormalizeLayer& setEpsilon(float eps);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_output_layer.hpp b/inference-engine/include/builders/ie_output_layer.hpp
new file mode 100644 (file)
index 0000000..71abd38
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Output layer
+ */
+class INFERENCE_ENGINE_API_CLASS(OutputLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit OutputLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit OutputLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    OutputLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    OutputLayer& setPort(const Port &port);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_permute_layer.hpp b/inference-engine/include/builders/ie_permute_layer.hpp
new file mode 100644 (file)
index 0000000..54cfcf3
--- /dev/null
@@ -0,0 +1,86 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <vector>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Permute layer
+ */
+class INFERENCE_ENGINE_API_CLASS(PermuteLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit PermuteLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit PermuteLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    PermuteLayer& setName(const std::string& name);
+
+    /**
+     * @brief Sets weights for layer
+     * @param weights Constant blob with weights
+     * @return reference to layer builder
+     */
+    PermuteLayer& setWeights(const Blob::CPtr& weights);
+    /**
+     * @brief Sets biases for layer
+     * @param biases Constant blob with biases
+     * @return reference to layer builder
+     */
+    PermuteLayer& setBiases(const Blob::CPtr& biases);
+
+    /**
+     * @brief Returns input port
+     * @return Input port
+     */
+    const Port& getInputPort() const;
+    /**
+     * @brief Sets input port
+     * @param port Input port
+     * @return reference to layer builder
+     */
+    PermuteLayer& setInputPort(const Port& port);
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    PermuteLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns the vector of dimension indexes for the output blob
+     * @return Order of dimensions for output blob
+     */
+    const std::vector<size_t> getOrder() const;
+    /**
+     * @brief Sets the order of dimensions for output blob
+     * @param order dimensions indexes for output blob
+     * @return reference to layer builder
+     */
+    PermuteLayer& setOrder(const std::vector<size_t>& order);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
+
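
A brief sketch; the order vector lists, for each output dimension, the input dimension it is taken from (NCHW to NHWC below):

```cpp
#include <builders/ie_permute_layer.hpp>

using namespace InferenceEngine;

// Permutes an NCHW tensor to NHWC; order holds source dimension indexes.
Builder::PermuteLayer makeToNHWC(const Port& in, const Port& out) {
    Builder::PermuteLayer permute("to_nhwc");
    permute.setInputPort(in)
           .setOutputPort(out)
           .setOrder({0, 2, 3, 1});
    return permute;
}
```
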
diff --git a/inference-engine/include/builders/ie_pooling_layer.hpp b/inference-engine/include/builders/ie_pooling_layer.hpp
new file mode 100644 (file)
index 0000000..80150ae
--- /dev/null
@@ -0,0 +1,170 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <vector>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Pooling layer
+ */
+class INFERENCE_ENGINE_API_CLASS(PoolingLayer): public LayerFragment {
+public:
+    /**
+     * @brief The enum defines available pooling types
+     */
+    enum PoolingType {
+        MAX = 1,
+        AVG = 2
+    };
+
+    /**
+     * @brief The enum defines available rounding types
+     */
+    enum RoundingType {
+        CEIL = 1,
+        FLOOR = 2
+    };
+
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit PoolingLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit PoolingLayer(Layer& genLayer);
+    /**
+     * @brief Operator creates generic layer builder
+     * @return Generic layer builder
+     */
+    operator Layer() const override;
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    PoolingLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns input port
+     * @return Input port
+     */
+    const Port& getInputPort() const;
+    /**
+     * @brief Sets input port
+     * @param port Input port
+     * @return reference to layer builder
+     */
+    PoolingLayer& setInputPort(const Port& port);
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    PoolingLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns kernel size
+     * @return Kernel size
+     */
+    const std::vector<size_t> getKernel() const;
+    /**
+     * @brief Sets kernel size
+     * @param kernel Kernel size
+     * @return reference to layer builder
+     */
+    PoolingLayer& setKernel(const std::vector<size_t>& kernel);
+    /**
+     * @brief Returns vector of strides
+     * @return vector of strides
+     */
+    const std::vector<size_t> getStrides() const;
+    /**
+     * @brief Sets strides
+     * @param strides vector of strides
+     * @return reference to layer builder
+     */
+    PoolingLayer& setStrides(const std::vector<size_t>& strides);
+    /**
+     * @brief Returns begin paddings
+     * @return vector of paddings
+     */
+    const std::vector<size_t> getPaddingsBegin() const;
+    /**
+     * @brief Sets begin paddings
+     * @param paddings Vector of paddings
+     * @return reference to layer builder
+     */
+    PoolingLayer& setPaddingsBegin(const std::vector<size_t>& paddings);
+    /**
+     * @brief Returns end paddings
+     * @return Vector of paddings
+     */
+    const std::vector<size_t> getPaddingsEnd() const;
+    /**
+     * @brief Sets end paddings
+     * @param paddings Vector of paddings
+     * @return reference to layer builder
+     */
+    PoolingLayer& setPaddingsEnd(const std::vector<size_t>& paddings);
+    /**
+     * @brief Returns pooling type
+     * @return Pooling type
+     */
+    PoolingType getPoolingType() const;
+    /**
+     * @brief Sets pooling type
+     * @param type Pooling type
+     * @return reference to layer builder
+     */
+    PoolingLayer& setPoolingType(PoolingType type);
+    /**
+     * @brief Returns rounding type
+     * @return Rounding type
+     */
+    RoundingType getRoundingType() const;
+    /**
+     * @brief Sets rounding types
+     * @param type Rounding type
+     * @return reference to layer builder
+     */
+    PoolingLayer& setRoundingType(RoundingType type);
+    /**
+     * @brief Returns a type of pooling strategy
+     * @return true if zero-values in the padding are not used
+     */
+    bool getExcludePad() const;
+    /**
+     * @brief Sets a type of pooling strategy
+     * @param exclude If true, zero-values in the padding are not used
+     * @return reference to layer builder
+     */
+    PoolingLayer& setExcludePad(bool exclude);
+
+    /**
+     * @brief Validates layer before creation
+     * @param layer generic layer builder
+     */
+    static void validate(const Layer& layer);
+
+private:
+    PoolingType type;
+    RoundingType roundingType;
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
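
A sketch of a standard 2x2, stride-2 max pooling; the nested enums are unscoped, so the values are qualified with the class name:

```cpp
#include <builders/ie_pooling_layer.hpp>

using namespace InferenceEngine;

// 2x2 max pooling with stride 2, no padding, CEIL rounding for odd inputs.
Builder::PoolingLayer makePool() {
    Builder::PoolingLayer pool("pool1");
    pool.setKernel({2, 2})
        .setStrides({2, 2})
        .setPaddingsBegin({0, 0})
        .setPaddingsEnd({0, 0})
        .setPoolingType(Builder::PoolingLayer::MAX)
        .setRoundingType(Builder::PoolingLayer::CEIL)
        .setExcludePad(true);
    return pool;
}
```
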
diff --git a/inference-engine/include/builders/ie_power_layer.hpp b/inference-engine/include/builders/ie_power_layer.hpp
new file mode 100644 (file)
index 0000000..94ef1cc
--- /dev/null
@@ -0,0 +1,83 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Power layer
+ */
+class INFERENCE_ENGINE_API_CLASS(PowerLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit PowerLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit PowerLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    PowerLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    PowerLayer& setPort(const Port& port);
+    /**
+     * @brief Returns power
+     * @return Power parameter
+     */
+    float getPower() const;
+    /**
+     * @brief Sets the power parameter
+     * @param power Power parameter
+     * @return reference to layer builder
+     */
+    PowerLayer& setPower(float power);
+    /**
+     * @brief Returns scaling parameter
+     * @return Scaling
+     */
+    float getScale() const;
+    /**
+     * @brief Sets scaling parameter
+     * @param scale Scaling parameter
+     * @return reference to layer builder
+     */
+    PowerLayer& setScale(float scale);
+    /**
+     * @brief Returns shifting parameter
+     * @return Shift
+     */
+    float getShift() const;
+    /**
+     * @brief Sets shift for the layer
+     * @param shift Shifting parameter
+     * @return reference to layer builder
+     */
+    PowerLayer& setShift(float shift);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_prelu_layer.hpp b/inference-engine/include/builders/ie_prelu_layer.hpp
new file mode 100644 (file)
index 0000000..5e7dedd
--- /dev/null
@@ -0,0 +1,67 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for PReLU layer
+ */
+class INFERENCE_ENGINE_API_CLASS(PReLULayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit PReLULayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit PReLULayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    PReLULayer& setName(const std::string& name);
+
+    /**
+     * @brief Sets weights for layer
+     * @param weights Constant blob with weights
+     * @return reference to layer builder
+     */
+    PReLULayer& setWeights(const Blob::CPtr& weights);
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    PReLULayer& setPort(const Port& port);
+    /**
+     * @brief Returns channel shared flag
+     * @return true if negative slope is shared across channels
+     */
+    bool getChannelShared() const;
+    /**
+     * @brief Sets channel shared flag
+     * @param flag true if negative slope is shared across channels
+     * @return reference to layer builder
+     */
+    PReLULayer& setChannelShared(bool flag);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_prior_box_clustered_layer.hpp b/inference-engine/include/builders/ie_prior_box_clustered_layer.hpp
new file mode 100644 (file)
index 0000000..61d7f16
--- /dev/null
@@ -0,0 +1,161 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for PriorBoxClustered layer
+ */
+class INFERENCE_ENGINE_API_CLASS(PriorBoxClusteredLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit PriorBoxClusteredLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit PriorBoxClusteredLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    PriorBoxClusteredLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    PriorBoxClusteredLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns input ports
+     * @return Vector of input ports
+     */
+    const std::vector<Port>& getInputPorts() const;
+    /**
+     * @brief Sets input ports
+     * @param ports Vector of input ports
+     * @return reference to layer builder
+     */
+    PriorBoxClusteredLayer& setInputPorts(const std::vector<Port>& ports);
+    /**
+     * @brief Returns height and width of input image
+     * @return input image sizes
+     */
+    const std::vector<float> getImgSizes() const;
+    /**
+     * @brief Sets the input image height and width
+     * @param sizes Input image height and width
+     * @return reference to layer builder
+     */
+    PriorBoxClusteredLayer& setImgSizes(const std::vector<float> sizes);
+    /**
+     * @brief Returns distances between box centers for height and width
+     * @return distances
+     */
+    const std::vector<float> getSteps() const;
+    /**
+     * @brief Sets distances between box centers for height and width
+     * @param steps Distances between box centers
+     * @return reference to layer builder
+     */
+    PriorBoxClusteredLayer& setSteps(const std::vector<float> steps);
+    /**
+     * @brief Returns the distance between box centers
+     * @return distance
+     */
+    float getStep() const;
+    /**
+     * @brief Sets a distance between box centers
+     * @param step Distance between box centers
+     * @return reference to layer builder
+     */
+    PriorBoxClusteredLayer& setStep(float step);
+    /**
+     * @brief Returns the shift of the box relative to the top left corner
+     * @return Shift
+     */
+    float getOffset() const;
+    /**
+     * @brief Sets the shift of the box relative to the top left corner
+     * @param offset Shift
+     * @return reference to layer builder
+     */
+    PriorBoxClusteredLayer& setOffset(float offset);
+    /**
+     * @brief Returns a variance of adjusting bounding boxes
+     * @return Variance
+     */
+    float getVariance() const;
+    /**
+     * @brief Sets a variance of adjusting bounding boxes
+     * @param variance Variance
+     * @return reference to layer builder
+     */
+    PriorBoxClusteredLayer& setVariance(float variance);
+    /**
+     * @brief Returns desired boxes width in pixels
+     * @return width of desired boxes
+     */
+    float getWidth() const;
+    /**
+     * @brief Sets desired boxes width in pixels
+     * @param width Width of desired boxes
+     * @return reference to layer builder
+     */
+    PriorBoxClusteredLayer& setWidth(float width);
+    /**
+     * @brief Returns desired boxes height in pixels
+     * @return height of desired boxes
+     */
+    float getHeight() const;
+    /**
+     * @brief Sets desired boxes height in pixels
+     * @param height Height of desired boxes
+     * @return reference to layer builder
+     */
+    PriorBoxClusteredLayer& setHeight(float height);
+    /**
+     * @brief Returns clip flag
+     * @return true if each value in the output blob is within [0,1]
+     */
+    bool getClip() const;
+    /**
+     * @brief Sets clip flag
+     * @param flag true if each value in the output blob is within [0,1]
+     * @return reference to layer builder
+     */
+    PriorBoxClusteredLayer& setClip(bool flag);
+    /**
+     * @brief Returns flip flag
+     * @return true if the list of boxes is augmented with the flipped ones
+     */
+    bool getFlip() const;
+    /**
+     * @brief Sets flip flag
+     * @param flag true if the list of boxes is augmented with the flipped ones
+     * @return reference to layer builder
+     */
+    PriorBoxClusteredLayer& setFlip(bool flag);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
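A usage sketch of the PriorBoxClusteredLayer builder with SSD-style values; every number below is illustrative rather than a default taken from the header.

    #include <builders/ie_prior_box_clustered_layer.hpp>

    void buildPriorBoxClustered() {
        InferenceEngine::Builder::PriorBoxClusteredLayer pbc("pbc1");
        pbc.setImgSizes({300.0f, 300.0f})  // input image height and width
           .setSteps({8.0f, 8.0f})         // distances between box centers
           .setOffset(0.5f)                // shift relative to the top left corner
           .setVariance(0.1f)              // variance for adjusting bounding boxes
           .setClip(true)                  // clamp output values to [0, 1]
           .setFlip(false);                // do not add flipped boxes
    }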
diff --git a/inference-engine/include/builders/ie_prior_box_layer.hpp b/inference-engine/include/builders/ie_prior_box_layer.hpp
new file mode 100644 (file)
index 0000000..8051d6c
--- /dev/null
@@ -0,0 +1,161 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for PriorBox layer
+ */
+class INFERENCE_ENGINE_API_CLASS(PriorBoxLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit PriorBoxLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit PriorBoxLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    PriorBoxLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    PriorBoxLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns input ports
+     * @return Vector of input ports
+     */
+    const std::vector<Port>& getInputPorts() const;
+    /**
+     * @brief Sets input ports
+     * @param ports Vector of input ports
+     * @return reference to layer builder
+     */
+    PriorBoxLayer& setInputPorts(const std::vector<Port>& ports);
+    /**
+     * @brief Returns the minimum box size in pixels
+     * @return Minimum box size
+     */
+    size_t getMinSize() const;
+    /**
+     * @brief Sets the minimum box size in pixels
+     * @param minSize Minimum size
+     * @return reference to layer builder
+     */
+    PriorBoxLayer& setMinSize(size_t minSize);
+    /**
+     * @brief Returns the maximum box size in pixels
+     * @return maximum size
+     */
+    size_t getMaxSize() const;
+    /**
+     * @brief Sets the maximum box size in pixels
+     * @param maxSize Maximum size
+     * @return reference to layer builder
+     */
+    PriorBoxLayer& setMaxSize(size_t maxSize);
+    /**
+     * @brief Returns a distance between box centers
+     * @return Distance
+     */
+    float getStep() const;
+    /**
+     * @brief Sets a distance between box centers
+     * @param step Distance
+     * @return reference to layer builder
+     */
+    PriorBoxLayer& setStep(float step);
+    /**
+     * @brief Returns the shift of the box relative to the top left corner
+     * @return Shift
+     */
+    float getOffset() const;
+    /**
+     * @brief Sets the shift of the box relative to the top left corner
+     * @param offset Shift
+     * @return reference to layer builder
+     */
+    PriorBoxLayer& setOffset(float offset);
+    /**
+     * @brief Returns a variance of adjusting bounding boxes
+     * @return Variance
+     */
+    float getVariance() const;
+    /**
+     * @brief Sets a variance of adjusting bounding boxes
+     * @param variance Variance
+     * @return reference to layer builder
+     */
+    PriorBoxLayer& setVariance(float variance);
+    /**
+     * @brief Returns a flag that denotes whether all box sizes are scaled
+     * @return true if max_size is used
+     */
+    bool getScaleAllSizes() const;
+    /**
+     * @brief Sets a flag that denotes whether all box sizes are scaled
+     * @param flag max_size is used if true
+     * @return reference to layer builder
+     */
+    PriorBoxLayer& setScaleAllSizes(bool flag);
+    /**
+     * @brief Returns clip flag
+     * @return true if each value in the output blob is within [0,1]
+     */
+    bool getClip() const;
+    /**
+     * @brief Sets clip flag
+     * @param flag true if each value in the output blob is within [0,1]
+     * @return reference to layer builder
+     */
+    PriorBoxLayer& setClip(bool flag);
+    /**
+     * @brief Returns flip flag
+     * @return true if the list of boxes is augmented with the flipped ones
+     */
+    bool getFlip() const;
+    /**
+     * @brief Sets flip flag
+     * @param flag true if the list of boxes is augmented with the flipped ones
+     * @return reference to layer builder
+     */
+    PriorBoxLayer& setFlip(bool flag);
+    /**
+     * @brief Returns aspect ratios
+     * @return Vector of aspect ratios
+     */
+    const std::vector<size_t> getAspectRatio() const;
+    /**
+     * @brief Sets aspect ratios
+     * @param aspectRatio Vector of aspect ratios
+     * @return reference to layer builder
+     */
+    PriorBoxLayer& setAspectRatio(const std::vector<size_t>& aspectRatio);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
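The PriorBoxLayer builder follows the same pattern; the sketch below uses illustrative SSD-style values.

    #include <builders/ie_prior_box_layer.hpp>

    void buildPriorBox() {
        InferenceEngine::Builder::PriorBoxLayer pb("priorbox1");
        pb.setMinSize(30)          // minimum box size in pixels
          .setMaxSize(60)          // maximum box size in pixels
          .setStep(16.0f)          // distance between box centers
          .setAspectRatio({2, 3})  // additional box aspect ratios
          .setScaleAllSizes(true)  // max_size is used
          .setClip(true)           // clamp output values to [0, 1]
          .setFlip(true);          // also generate flipped aspect ratios
    }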
diff --git a/inference-engine/include/builders/ie_proposal_layer.hpp b/inference-engine/include/builders/ie_proposal_layer.hpp
new file mode 100644 (file)
index 0000000..e7fcac4
--- /dev/null
@@ -0,0 +1,151 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Proposal layer
+ */
+class INFERENCE_ENGINE_API_CLASS(ProposalLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit ProposalLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit ProposalLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    ProposalLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    ProposalLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns input ports
+     * @return Vector of input ports
+     */
+    const std::vector<Port>& getInputPorts() const;
+    /**
+     * @brief Sets input ports
+     * @param ports Vector of input ports
+     * @return reference to layer builder
+     */
+    ProposalLayer& setInputPorts(const std::vector<Port>& ports);
+    /**
+     * @brief Returns the quantity of bounding boxes after applying NMS
+     * @return Quantity of bounding boxes
+     */
+    size_t getPostNMSTopN() const;
+    /**
+     * @brief Sets the quantity of bounding boxes after applying NMS
+     * @param topN Quantity of bounding boxes
+     * @return reference to layer builder
+     */
+    ProposalLayer& setPostNMSTopN(size_t topN);
+    /**
+     * @brief Returns the quantity of bounding boxes before applying NMS
+     * @return Quantity of bounding boxes
+     */
+    size_t getPreNMSTopN() const;
+    /**
+     * @brief Sets the quantity of bounding boxes before applying NMS
+     * @param topN Quantity of bounding boxes
+     * @return reference to layer builder
+     */
+    ProposalLayer& setPreNMSTopN(size_t topN);
+    /**
+     * @brief Returns minimum value of the proposal to be taken into consideration
+     * @return Threshold
+     */
+    float getNMSThresh() const;
+    /**
+     * @brief Sets minimum value of the proposal to be taken into consideration
+     * @param thresh Threshold
+     * @return reference to layer builder
+     */
+    ProposalLayer& setNMSThresh(float thresh);
+    /**
+     * @brief Returns base size for anchor generation
+     * @return Base size
+     */
+    size_t getBaseSize() const;
+    /**
+     * @brief Sets base size for anchor generation
+     * @param baseSize Base size for anchor generation
+     * @return reference to layer builder
+     */
+    ProposalLayer& setBaseSize(size_t baseSize);
+    /**
+     * @brief Returns minimum size of box to be taken into consideration
+     * @return Minimum size
+     */
+    size_t getMinSize() const;
+    /**
+     * @brief Sets minimum size of box to be taken into consideration
+     * @param minSize Minimum size of the box
+     * @return reference to layer builder
+     */
+    ProposalLayer& setMinSize(size_t minSize);
+    /**
+     * @brief Returns step size to slide over boxes in pixels
+     * @return Step size
+     */
+    size_t getFeatStride() const;
+    /**
+     * @brief Sets step size to slide over boxes in pixels
+     * @param featStride Step size
+     * @return reference to layer builder
+     */
+    ProposalLayer& setFeatStride(size_t featStride);
+    /**
+     * @brief Returns scales for anchor generation
+     * @return Vector of scales
+     */
+    const std::vector<float> getScale() const;
+    /**
+     * @brief Sets scales for anchor generation
+     * @param scales Vector of scales
+     * @return reference to layer builder
+     */
+    ProposalLayer& setScale(const std::vector<float>& scales);
+    /**
+     * @brief Returns ratios for anchor generation
+     * @return Vector of ratios
+     */
+    const std::vector<float> getRatio() const;
+    /**
+     * @brief Sets ratios for anchor generation
+     * @param ratios Vector of ratios
+     * @return reference to layer builder
+     */
+    ProposalLayer& setRatio(const std::vector<float>& ratios);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
+
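A sketch of the ProposalLayer builder with values resembling a typical Faster R-CNN region proposal configuration; all numbers are illustrative.

    #include <builders/ie_proposal_layer.hpp>

    void buildProposal() {
        InferenceEngine::Builder::ProposalLayer proposal("proposal1");
        proposal.setPreNMSTopN(6000)             // boxes kept before NMS
                .setPostNMSTopN(300)             // boxes kept after NMS
                .setNMSThresh(0.7f)              // minimum proposal value considered
                .setBaseSize(16)                 // base size for anchor generation
                .setMinSize(16)                  // minimum box size considered
                .setFeatStride(16)               // step size to slide over boxes, in pixels
                .setScale({8.0f, 16.0f, 32.0f})  // anchor scales
                .setRatio({0.5f, 1.0f, 2.0f});   // anchor aspect ratios
    }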
diff --git a/inference-engine/include/builders/ie_psroi_pooling_layer.hpp b/inference-engine/include/builders/ie_psroi_pooling_layer.hpp
new file mode 100644 (file)
index 0000000..82c9f47
--- /dev/null
@@ -0,0 +1,98 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for PSROIPooling layer
+ */
+class INFERENCE_ENGINE_API_CLASS(PSROIPoolingLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit PSROIPoolingLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit PSROIPoolingLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    PSROIPoolingLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns input ports
+     * @return Vector of input ports
+     */
+    const std::vector<Port>& getInputPorts() const;
+    /**
+     * @brief Sets input ports
+     * @param ports Vector of input ports
+     * @return reference to layer builder
+     */
+    PSROIPoolingLayer& setInputPorts(const std::vector<Port>& ports);
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    PSROIPoolingLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns multiplicative spatial scale factor to translate ROI coordinates
+     * @return Spatial scale factor
+     */
+    float getSpatialScale() const;
+    /**
+     * @brief Sets multiplicative spatial scale factor to translate ROI coordinates
+     * @param spatialScale Spatial scale factor
+     * @return reference to layer builder
+     */
+    PSROIPoolingLayer& setSpatialScale(float spatialScale);
+    /**
+     * @brief Returns pooled output channel number
+     * @return Output channel number
+     */
+    size_t getOutputDim() const;
+    /**
+     * @brief Sets pooled output channel number
+     * @param outDim Output channel number
+     * @return reference to layer builder
+     */
+    PSROIPoolingLayer& setOutputDim(size_t outDim);
+    /**
+     * @brief Returns number of groups to encode position-sensitive score maps
+     * @return Number of groups
+     */
+    size_t getGroupSize() const;
+    /**
+     * @brief Sets number of groups to encode position-sensitive score maps
+     * @param size Number of groups
+     * @return reference to layer builder
+     */
+    PSROIPoolingLayer& setGroupSize(size_t size);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
+
+
+
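A sketch of the PSROIPoolingLayer builder with R-FCN-style values (21 output classes, 7x7 position-sensitive bins); the numbers are illustrative.

    #include <builders/ie_psroi_pooling_layer.hpp>

    void buildPSROIPooling() {
        InferenceEngine::Builder::PSROIPoolingLayer psroi("psroi1");
        psroi.setSpatialScale(1.0f / 16.0f)  // scale to translate ROI coordinates to the feature map
             .setOutputDim(21)               // pooled output channel number
             .setGroupSize(7);               // number of groups for position-sensitive score maps
    }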
diff --git a/inference-engine/include/builders/ie_region_yolo_layer.hpp b/inference-engine/include/builders/ie_region_yolo_layer.hpp
new file mode 100644 (file)
index 0000000..1a2d645
--- /dev/null
@@ -0,0 +1,155 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for RegionYolo layer
+ */
+class INFERENCE_ENGINE_API_CLASS(RegionYoloLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit RegionYoloLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit RegionYoloLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    RegionYoloLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns input port
+     * @return Input port
+     */
+    const Port& getInputPort() const;
+    /**
+     * @brief Sets input port
+     * @param port Input port
+     * @return reference to layer builder
+     */
+    RegionYoloLayer& setInputPort(const Port& port);
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    RegionYoloLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns number of coordinates for each region
+     * @return Number of coordinates
+     */
+    int getCoords() const;
+    /**
+     * @brief Sets number of coordinates for each region
+     * @param coords Number of coordinates
+     * @return reference to layer builder
+     */
+    RegionYoloLayer& setCoords(int coords);
+    /**
+     * @brief Returns number of classes for each region
+     * @return Number of classes
+     */
+    int getClasses() const;
+    /**
+     * @brief Sets number of classes for each region
+     * @param classes number of classes
+     * @return reference to layer builder
+     */
+    RegionYoloLayer& setClasses(int classes);
+    /**
+     * @brief Returns number of regions
+     * @return Number of regions
+     */
+    int getNum() const;
+    /**
+     * @brief Sets number of regions
+     * @param num Number of regions
+     * @return reference to layer builder
+     */
+    RegionYoloLayer& setNum(int num);
+    /**
+     * @brief Returns a flag which specifies whether softmax is performed
+     * @return true if softmax is performed
+     */
+    bool getDoSoftMax() const;
+    /**
+     * @brief Sets a flag which specifies whether softmax is performed
+     * @param flag softmax is performed if true
+     * @return reference to layer builder
+     */
+    RegionYoloLayer& setDoSoftMax(bool flag);
+    /**
+     * @brief Returns anchor coordinates of regions
+     * @return Anchor coordinates
+     */
+    float getAnchors() const;
+    /**
+     * @brief Sets anchor coordinates of regions
+     * @param anchors Anchor coordinates
+     * @return reference to layer builder
+     */
+    RegionYoloLayer& setAnchors(float anchors);
+    /**
+     * @brief Returns mask
+     * @return Mask
+     */
+    int getMask() const;
+    /**
+     * @brief Sets mask
+     * @param mask Specifies which anchors to use
+     * @return reference to layer builder
+     */
+    RegionYoloLayer& setMask(int mask);
+    /**
+     * @brief Returns the number of the dimension from which flattening is performed
+     * @return Axis
+     */
+    size_t getAxis() const;
+    /**
+     * @brief Sets the number of the dimension from which flattening is performed
+     * @param axis Axis
+     * @return reference to layer builder
+     */
+    RegionYoloLayer& setAxis(size_t axis);
+    /**
+     * @brief Returns the number of the dimension on which flattening is ended
+     * @return End axis
+     */
+    size_t getEndAxis() const;
+    /**
+     * @brief Sets the number of the dimension on which flattening is ended
+     * @param axis End axis
+     * @return reference to layer builder
+     */
+    RegionYoloLayer& setEndAxis(size_t axis);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
+
+
+
+
+
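A sketch of the RegionYoloLayer builder with YOLOv2-on-VOC-style values; the numbers are illustrative.

    #include <builders/ie_region_yolo_layer.hpp>

    void buildRegionYolo() {
        InferenceEngine::Builder::RegionYoloLayer region("region1");
        region.setCoords(4)        // x, y, w, h per box
              .setClasses(20)      // classes per region
              .setNum(5)           // regions (anchor boxes) per cell
              .setDoSoftMax(true)  // apply softmax to class scores
              .setAxis(1)          // first dimension to flatten
              .setEndAxis(3);      // last dimension to flatten
    }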
diff --git a/inference-engine/include/builders/ie_relu6_layer.hpp b/inference-engine/include/builders/ie_relu6_layer.hpp
new file mode 100644 (file)
index 0000000..3bc3360
--- /dev/null
@@ -0,0 +1,62 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for ReLU6 layer
+ */
+class INFERENCE_ENGINE_API_CLASS(ReLU6Layer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit ReLU6Layer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit ReLU6Layer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    ReLU6Layer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    ReLU6Layer& setPort(const Port& port);
+
+    /**
+     * @brief Returns N value
+     * @return N
+     */
+    float getN() const;
+    /**
+     * @brief Sets N value
+     * @param n N value (6 by default)
+     * @return reference to layer builder
+     */
+    ReLU6Layer& setN(float n);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_relu_layer.hpp b/inference-engine/include/builders/ie_relu_layer.hpp
new file mode 100644 (file)
index 0000000..9422e19
--- /dev/null
@@ -0,0 +1,62 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for ReLU layer
+ */
+class INFERENCE_ENGINE_API_CLASS(ReLULayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit ReLULayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit ReLULayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    ReLULayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    ReLULayer& setPort(const Port& port);
+
+    /**
+     * @brief Returns negative slope
+     * @return Negative slope
+     */
+    float getNegativeSlope() const;
+    /**
+     * @brief Sets negative slope
+     * @param negativeSlope Negative slope
+     * @return reference to layer builder
+     */
+    ReLULayer& setNegativeSlope(float negativeSlope);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
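A sketch of the ReLULayer builder; the same construct-and-set pattern applies to the other single-port activation builders in this change (ReLU6Layer, SigmoidLayer, TanHLayer).

    #include <builders/ie_relu_layer.hpp>

    void buildReLU() {
        InferenceEngine::Builder::ReLULayer relu("relu1");
        relu.setNegativeSlope(0.1f);  // 0.0f is the standard ReLU; a nonzero slope gives leaky ReLU
    }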
diff --git a/inference-engine/include/builders/ie_reorg_yolo_layer.hpp b/inference-engine/include/builders/ie_reorg_yolo_layer.hpp
new file mode 100644 (file)
index 0000000..4719873
--- /dev/null
@@ -0,0 +1,77 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for ReorgYolo layer
+ */
+class INFERENCE_ENGINE_API_CLASS(ReorgYoloLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit ReorgYoloLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit ReorgYoloLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    ReorgYoloLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns input port
+     * @return Input port
+     */
+    const Port& getInputPort() const;
+    /**
+     * @brief Sets input port
+     * @param port Input port
+     * @return reference to layer builder
+     */
+    ReorgYoloLayer& setInputPort(const Port& port);
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    ReorgYoloLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns the distance between cuts in the output blobs
+     * @return Stride
+     */
+    int getStride() const;
+    /**
+     * @brief Sets the distance between cuts in the output blobs
+     * @param stride Stride
+     * @return reference to layer builder
+     */
+    ReorgYoloLayer& setStride(int stride);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
+
+
+
+
diff --git a/inference-engine/include/builders/ie_reshape_layer.hpp b/inference-engine/include/builders/ie_reshape_layer.hpp
new file mode 100644 (file)
index 0000000..42eacea
--- /dev/null
@@ -0,0 +1,73 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Reshape layer
+ */
+class INFERENCE_ENGINE_API_CLASS(ReshapeLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit ReshapeLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit ReshapeLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    ReshapeLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns input port
+     * @return Input port
+     */
+    const Port& getInputPort() const;
+    /**
+     * @brief Sets input port
+     * @param port Input port
+     * @return reference to layer builder
+     */
+    ReshapeLayer& setInputPort(const Port& port);
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    ReshapeLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns reshape dimensions
+     * @return Dimensions
+     */
+    const std::vector<int> getDims() const;
+    /**
+     * @brief Sets reshape dimensions
+     * @param dims Dimensions
+     * @return reference to layer builder
+     */
+    ReshapeLayer& setDims(const std::vector<int>& dims);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
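A sketch of the ReshapeLayer builder. The dims values below follow the usual Reshape conventions, which this header does not spell out, so treat them as an assumption: 0 copies the corresponding input dimension and -1 is inferred from the remaining ones.

    #include <builders/ie_reshape_layer.hpp>

    void buildReshape() {
        InferenceEngine::Builder::ReshapeLayer reshape("reshape1");
        reshape.setDims({0, -1});  // keep the batch dimension, flatten the rest
    }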
diff --git a/inference-engine/include/builders/ie_roi_pooling_layer.hpp b/inference-engine/include/builders/ie_roi_pooling_layer.hpp
new file mode 100644 (file)
index 0000000..d6bb578
--- /dev/null
@@ -0,0 +1,84 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for ROIPooling layer
+ */
+class INFERENCE_ENGINE_API_CLASS(ROIPoolingLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit ROIPoolingLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit ROIPoolingLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    ROIPoolingLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns input ports
+     * @return Vector of input ports
+     */
+    const std::vector<Port>& getInputPorts() const;
+    /**
+     * @brief Sets input ports
+     * @param ports Vector of input ports
+     * @return reference to layer builder
+     */
+    ROIPoolingLayer& setInputPorts(const std::vector<Port>& ports);
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    ROIPoolingLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns a ratio of the input feature map over the input image size
+     * @return Spatial scale
+     */
+    float getSpatialScale() const;
+    /**
+     * @brief Sets a ratio of the input feature map over the input image size
+     * @param spatialScale Spatial scale
+     * @return reference to layer builder
+     */
+    ROIPoolingLayer& setSpatialScale(float spatialScale);
+    /**
+     * @brief Returns height and width of the ROI output feature map
+     * @return Vector containing height and width
+     */
+    const std::vector<int> getPooled() const;
+    /**
+     * @brief Sets height and width of the ROI output feature map
+     * @param pooled Vector with height and width
+     * @return reference to layer builder
+     */
+    ROIPoolingLayer& setPooled(const std::vector<int>& pooled);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
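A sketch of the ROIPoolingLayer builder; the values are illustrative.

    #include <builders/ie_roi_pooling_layer.hpp>

    void buildROIPooling() {
        InferenceEngine::Builder::ROIPoolingLayer roi("roi1");
        roi.setSpatialScale(0.0625f)  // 1/16: feature map size over input image size
           .setPooled({6, 6});        // output feature map height and width
    }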
diff --git a/inference-engine/include/builders/ie_scale_shift_layer.hpp b/inference-engine/include/builders/ie_scale_shift_layer.hpp
new file mode 100644 (file)
index 0000000..361664e
--- /dev/null
@@ -0,0 +1,63 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for ScaleShift layer
+ */
+class INFERENCE_ENGINE_API_CLASS(ScaleShiftLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit ScaleShiftLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit ScaleShiftLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    ScaleShiftLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    ScaleShiftLayer& setPort(const Port& port);
+
+    /**
+     * @brief Sets weights for layer
+     * @param weights Constant blob with weights
+     * @return reference to layer builder
+     */
+    ScaleShiftLayer& setWeights(const Blob::CPtr& weights);
+    /**
+     * @brief Sets biases for layer
+     * @param biases Constant blob with biases
+     * @return reference to layer builder
+     */
+    ScaleShiftLayer& setBiases(const Blob::CPtr& biases);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
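A sketch of the ScaleShiftLayer builder; as with PReLU, the constant blobs are taken as arguments because blob creation is outside this header.

    #include <builders/ie_scale_shift_layer.hpp>

    void buildScaleShift(const InferenceEngine::Blob::CPtr& weights,
                         const InferenceEngine::Blob::CPtr& biases) {
        InferenceEngine::Builder::ScaleShiftLayer ss("scale_shift1");
        ss.setWeights(weights)  // per-channel multipliers
          .setBiases(biases);   // per-channel additive terms
    }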
diff --git a/inference-engine/include/builders/ie_sigmoid_layer.hpp b/inference-engine/include/builders/ie_sigmoid_layer.hpp
new file mode 100644 (file)
index 0000000..6c48358
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Sigmoid layer
+ */
+class INFERENCE_ENGINE_API_CLASS(SigmoidLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit SigmoidLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit SigmoidLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    SigmoidLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    SigmoidLayer& setPort(const Port& port);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_simpler_nms_layer.hpp b/inference-engine/include/builders/ie_simpler_nms_layer.hpp
new file mode 100644 (file)
index 0000000..28cf6ee
--- /dev/null
@@ -0,0 +1,140 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for SimplerNMS layer
+ */
+class INFERENCE_ENGINE_API_CLASS(SimplerNMSLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit SimplerNMSLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit SimplerNMSLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    SimplerNMSLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns input ports
+     * @return Vector of input ports
+     */
+    const std::vector<Port>& getInputPorts() const;
+    /**
+     * @brief Sets input ports
+     * @param ports Vector of input ports
+     * @return reference to layer builder
+     */
+    SimplerNMSLayer& setInputPorts(const std::vector<Port>& ports);
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    SimplerNMSLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns the quantity of bounding boxes before applying NMS
+     * @return Quantity of bounding boxes
+     */
+    size_t getPreNMSTopN() const;
+    /**
+     * @brief Sets the quantity of bounding boxes before applying NMS
+     * @param topN Quantity of bounding boxes
+     * @return reference to layer builder
+     */
+    SimplerNMSLayer& setPreNMSTopN(size_t topN);
+    /**
+     * @brief Returns the quantity of bounding boxes after applying NMS
+     * @return Quantity of bounding boxes
+     */
+    size_t getPostNMSTopN() const;
+    /**
+     * @brief Sets the quantity of bounding boxes after applying NMS
+     * @param topN Quantity of bounding boxes
+     * @return reference to layer builder
+     */
+    SimplerNMSLayer& setPostNMSTopN(size_t topN);
+    /**
+     * @brief Returns the step size to slide over boxes in pixels
+     * @return Step size
+     */
+    size_t getFeatStride() const;
+    /**
+     * @brief Sets the step size to slide over boxes in pixels
+     * @param featStride Step size
+     * @return reference to layer builder
+     */
+    SimplerNMSLayer& setFeatStride(size_t featStride);
+    /**
+     * @brief Returns the minimum size of box to be taken into consideration
+     * @return Minimum size
+     */
+    size_t getMinBoxSize() const;
+    /**
+     * @brief Sets the minimum size of box to be taken into consideration
+     * @param minSize Minimum size
+     * @return reference to layer builder
+     */
+    SimplerNMSLayer& setMinBoxSize(size_t minSize);
+    /**
+     * @brief Returns the scale for anchor box generation
+     * @return Scale for anchor boxes
+     */
+    size_t getScale() const;
+    /**
+     * @brief Sets the scale for anchor box generation
+     * @param scale Scale for anchor boxes
+     * @return reference to layer builder
+     */
+    SimplerNMSLayer& setScale(size_t scale);
+
+    /**
+     * @brief Returns the minimum value of the proposal to be taken into consideration
+     * @return Threshold
+     */
+    float getCLSThreshold() const;
+    /**
+     * @brief Sets the minimum value of the proposal to be taken into consideration
+     * @param threshold Minimum value
+     * @return reference to layer builder
+     */
+    SimplerNMSLayer& setCLSThreshold(float threshold);
+    /**
+     * @brief Returns the minimum ratio of boxes overlapping to be taken into consideration
+     * @return Threshold
+     */
+    float getIOUThreshold() const;
+    /**
+     * @brief Sets the minimum ratio of boxes overlapping to be taken into consideration
+     * @param threshold Minimum value
+     * @return reference to layer builder
+     */
+    SimplerNMSLayer& setIOUThreshold(float threshold);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
+
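A sketch of the SimplerNMSLayer builder; the values mirror the ProposalLayer example above and are illustrative.

    #include <builders/ie_simpler_nms_layer.hpp>

    void buildSimplerNMS() {
        InferenceEngine::Builder::SimplerNMSLayer nms("nms1");
        nms.setPreNMSTopN(6000)     // boxes kept before NMS
           .setPostNMSTopN(300)     // boxes kept after NMS
           .setFeatStride(16)       // step size to slide over boxes, in pixels
           .setMinBoxSize(16)       // minimum box size considered
           .setScale(16)            // scale for anchor box generation
           .setCLSThreshold(0.5f)   // minimum proposal value considered
           .setIOUThreshold(0.7f);  // overlap ratio threshold used during NMS
    }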
diff --git a/inference-engine/include/builders/ie_softmax_layer.hpp b/inference-engine/include/builders/ie_softmax_layer.hpp
new file mode 100644 (file)
index 0000000..1ce13b8
--- /dev/null
@@ -0,0 +1,61 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for SoftMax layer
+ */
+class INFERENCE_ENGINE_API_CLASS(SoftMaxLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit SoftMaxLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit SoftMaxLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    SoftMaxLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    SoftMaxLayer& setPort(const Port& port);
+    /**
+     * @brief Returns axis
+     * @return Axis
+     */
+    size_t getAxis() const;
+    /**
+     * @brief Sets axis
+     * @param axis Axis
+     * @return reference to layer builder
+     */
+    SoftMaxLayer& setAxis(size_t axis);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
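A sketch of the SoftMaxLayer builder; SplitLayer below exposes setAxis in the same way.

    #include <builders/ie_softmax_layer.hpp>

    void buildSoftMax() {
        InferenceEngine::Builder::SoftMaxLayer softmax("softmax1");
        softmax.setAxis(1);  // normalize along the channel dimension
    }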
diff --git a/inference-engine/include/builders/ie_split_layer.hpp b/inference-engine/include/builders/ie_split_layer.hpp
new file mode 100644 (file)
index 0000000..526ed79
--- /dev/null
@@ -0,0 +1,73 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Split layer
+ */
+class INFERENCE_ENGINE_API_CLASS(SplitLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit SplitLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit SplitLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    SplitLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns output ports
+     * @return Vector of output ports
+     */
+    const std::vector<Port>& getOutputPorts() const;
+    /**
+     * @brief Sets output ports
+     * @param ports Vector of output ports
+     * @return reference to layer builder
+     */
+    SplitLayer& setOutputPorts(const std::vector<Port>& ports);
+    /**
+     * @brief Returns input port
+     * @return Input port
+     */
+    const Port& getInputPort() const;
+    /**
+     * @brief Sets input port
+     * @param port Input port
+     * @return reference to layer builder
+     */
+    SplitLayer& setInputPort(const Port& port);
+    /**
+     * @brief Returns axis
+     * @return Axis
+     */
+    size_t getAxis() const;
+    /**
+     * @brief Sets axis
+     * @param axis Axis
+     * @return reference to layer builder
+     */
+    SplitLayer& setAxis(size_t axis);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_tanh_layer.hpp b/inference-engine/include/builders/ie_tanh_layer.hpp
new file mode 100644 (file)
index 0000000..acb0002
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for TanH layer
+ */
+class INFERENCE_ENGINE_API_CLASS(TanHLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit TanHLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit TanHLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    TanHLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns port with shapes for the layer
+     * @return Port with shapes
+     */
+    const Port& getPort() const;
+    /**
+     * @brief Sets port shapes for the layer
+     * @param port Port with shapes
+     * @return reference to layer builder
+     */
+    TanHLayer& setPort(const Port& port);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
diff --git a/inference-engine/include/builders/ie_tile_layer.hpp b/inference-engine/include/builders/ie_tile_layer.hpp
new file mode 100644 (file)
index 0000000..de03ba2
--- /dev/null
@@ -0,0 +1,89 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_layer_fragment.hpp>
+#include <ie_inetwork.hpp>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace Builder {
+
+/**
+ * @brief The class represents a builder for Tile layer
+ */
+class INFERENCE_ENGINE_API_CLASS(TileLayer): public LayerFragment {
+public:
+    /**
+     * @brief The constructor creates a builder with the name
+     * @param name Layer name
+     */
+    explicit TileLayer(const std::string& name = "");
+    /**
+     * @brief The constructor creates a builder from generic builder
+     * @param genLayer generic builder
+     */
+    explicit TileLayer(Layer& genLayer);
+    /**
+     * @brief Sets the name for the layer
+     * @param name Layer name
+     * @return reference to layer builder
+     */
+    TileLayer& setName(const std::string& name);
+
+    /**
+     * @brief Returns input port
+     * @return Input port
+     */
+    const Port& getInputPort() const;
+    /**
+     * @brief Sets input port
+     * @param port Input port
+     * @return reference to layer builder
+     */
+    TileLayer& setInputPort(const Port& port);
+    /**
+     * @brief Returns output port
+     * @return Output port
+     */
+    const Port& getOutputPort() const;
+    /**
+     * @brief Sets output port
+     * @param port Output port
+     * @return reference to layer builder
+     */
+    TileLayer& setOutputPort(const Port& port);
+    /**
+     * @brief Returns axis
+     * @return Axis
+     */
+    size_t getAxis() const;
+    /**
+     * @brief Sets axis
+     * @param axis Axis
+     * @return reference to layer builder
+     */
+    TileLayer& setAxis(size_t axis);
+    /**
+     * @brief Returns tiles
+     * @return Tiles
+     */
+    size_t getTiles() const;
+    /**
+     * @brief Sets tiles
+     * @param tiles Tiles
+     * @return reference to layer builder
+     */
+    TileLayer& setTiles(size_t tiles);
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
+
+
+
+
+
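A sketch of the TileLayer builder; the values are illustrative.

    #include <builders/ie_tile_layer.hpp>

    void buildTile() {
        InferenceEngine::Builder::TileLayer tile("tile1");
        tile.setAxis(3)    // dimension to repeat
            .setTiles(4);  // number of copies along that dimension
    }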
index 3fac6a2..dc440ba 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 5a9b55a..7bc0b25 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 6805247..82d13cf 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -70,6 +69,11 @@ public:
     }
 
     /**
+     * @brief A destructor
+     */
+    virtual ~CNNNetwork() {}
+
+    /**
      * @brief Wraps original method
      * ICNNNetwork::getPrecision
      */
@@ -215,6 +219,15 @@ public:
         CALL_STATUS_FNC(reshape, inputShapes);
     }
 
+    /**
+     * @brief Serialize network to IR and weights files.
+     * @param xmlPath Path to output IR file.
+     * @param binPath Path to output weights file.
+     */
+    void serialize(const std::string &xmlPath, const std::string &binPath) const {
+        CALL_STATUS_FNC(serialize, xmlPath, binPath);
+    }
+
 protected:
     /**
      * @brief reader extra reference, might be nullptr
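The serialize() wrapper added above can be exercised as in this sketch; the header for CNNNetwork is not named in this listing, so the include is omitted and the paths are illustrative.

    void saveNetwork(const InferenceEngine::CNNNetwork& network) {
        // Writes the topology to model.xml and the weights to model.bin
        network.serialize("model.xml", "model.bin");
    }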
index 6e169fe..10317af 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -137,9 +136,8 @@ public:
     }
 
     /**
-    * @brief Sets data that will contain result of the inference
-    * @param results - a reference to a map of result blobs accessed by output names.
-    *        The type of Blob must correspond to the network output precision and size.
+    * @brief Sets a new batch size when dynamic batching is enabled in the executable network that created this request.
+    * @param batch New batch size to be used by all subsequent inference calls for this request.
     */
     void SetBatch(const int batch) {
         CALL_STATUS_FNC(SetBatch, batch);
index 99d32c8..f9bd90a 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 045819e..5605209 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index b4a4ba2..f3e0d7a 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 85d224f..6b083e1 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 0acff44..514a639 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -141,7 +140,7 @@ private:
     }
 
     template <typename T>
-    std::pair<StatusCode, bool> status_code_assign(const T & arg) {
+    std::pair<StatusCode, bool> status_code_assign(const T &) {
         return {static_cast<StatusCode>(0), false};
     }
 };
diff --git a/inference-engine/include/details/ie_inetwork_iterator.hpp b/inference-engine/include/details/ie_inetwork_iterator.hpp
new file mode 100644 (file)
index 0000000..84f8dee
--- /dev/null
@@ -0,0 +1,137 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief A header file for the INetworkIterator class
+ * @file ie_inetwork_iterator.hpp
+ */
+#pragma once
+#include <utility>
+#include <unordered_map>
+#include <unordered_set>
+#include <list>
+#include <iterator>
+#include <memory>
+#include <vector>
+#include <algorithm>
+
+#include <ie_inetwork.hpp>
+
+namespace InferenceEngine {
+namespace details {
+
+template<class NT, class LT>
+class INetworkIterator: public std::iterator<std::input_iterator_tag, std::shared_ptr<LT>> {
+public:
+    explicit INetworkIterator(NT * network, bool toEnd = false): network(network), currentIdx(0) {
+        if (!network || toEnd)
+            return;
+        const auto& inputs = network->getInputs();
+
+        std::vector<std::shared_ptr<LT>> allInputs;
+        for (const auto& input : inputs) {
+            allInputs.push_back(std::dynamic_pointer_cast<LT>(input));
+        }
+
+        bool res = forestDFS(allInputs, [&](std::shared_ptr<LT> current) {
+            sortedLayers.push_back(current);
+        }, false);
+
+        if (!res) {
+            THROW_IE_EXCEPTION << "Sorting not possible, due to existed loop.";
+        }
+
+        std::reverse(std::begin(sortedLayers), std::end(sortedLayers));
+        currentLayer = getNextLayer();
+    }
+    bool operator!=(const INetworkIterator& that) const {
+        return !operator==(that);
+    }
+    bool operator==(const INetworkIterator& that) const {
+        return network == that.network && currentLayer == that.currentLayer;
+    }
+    typename INetworkIterator::reference operator*() {
+        if (nullptr == currentLayer) {
+            THROW_IE_EXCEPTION << "iterator out of bound";
+        }
+        return currentLayer;
+    }
+
+    INetworkIterator& operator++() {
+        currentLayer = getNextLayer();
+        return *this;
+    }
+
+    const INetworkIterator<NT, LT> operator++(int) {
+        INetworkIterator<NT, LT> retval = *this;
+        ++(*this);
+        return retval;
+    }
+
+private:
+    std::vector<std::shared_ptr<LT>> sortedLayers;
+    std::shared_ptr<LT> currentLayer;
+    size_t currentIdx;
+    NT *network = nullptr;
+
+    std::shared_ptr<LT> getNextLayer() {
+        return (sortedLayers.size() > currentIdx) ? sortedLayers[currentIdx++] : nullptr;
+    }
+
+    template<class T>
+    inline bool forestDFS(const std::vector<std::shared_ptr<LT>>& heads, const T &visit, bool bVisitBefore) {
+        if (heads.empty()) {
+            return true;
+        }
+
+        std::unordered_map<idx_t, bool> visited;
+        for (auto & layer : heads) {
+            if (!DFS(visited, layer, visit, bVisitBefore)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    template<class T>
+    inline bool DFS(std::unordered_map<idx_t, bool> &visited,
+                    const std::shared_ptr<LT> &layer,
+                    const T &visit,
+                    bool visitBefore) {
+        if (layer == nullptr) {
+            return true;
+        }
+
+        if (visitBefore)
+            visit(layer);
+
+        visited[layer->getId()] = false;
+        for (const auto &connection : network->getLayerConnections(layer->getId())) {
+            if (connection.to().layerId() == layer->getId()) {
+                continue;
+            }
+            const auto outLayer = network->getLayer(connection.to().layerId());
+            auto i = visited.find(outLayer->getId());
+            if (i != visited.end()) {
+                /**
+                 * A cycle is detected if we enter a node that is not completed yet
+                 */
+                if (!i->second) {
+                    return false;
+                }
+                continue;
+            }
+
+            if (!DFS(visited, outLayer, visit, visitBefore)) {
+                return false;
+            }
+        }
+        if (!visitBefore)
+            visit(layer);
+        visited[layer->getId()] = true;
+        return true;
+    }
+};
+
+}  // namespace details
+}  // namespace InferenceEngine
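A sketch showing how this iterator template can be driven directly. The INetwork and ILayer names are assumed from ie_inetwork.hpp, which must provide the getInputs, getLayer and getLayerConnections calls the template relies on; in practice the iterator would normally be obtained from the network object rather than constructed by hand.

    #include <details/ie_inetwork_iterator.hpp>
    #include <ie_inetwork.hpp>

    void walkLayers(InferenceEngine::INetwork* network) {
        using InferenceEngine::details::INetworkIterator;
        using InferenceEngine::INetwork;
        using InferenceEngine::ILayer;
        INetworkIterator<INetwork, ILayer> it(network);         // topologically sorted, inputs first
        INetworkIterator<INetwork, ILayer> end(network, true);  // toEnd = true builds the end sentinel
        for (; it != end; ++it) {
            const std::shared_ptr<ILayer>& layer = *it;  // visit each layer exactly once
            (void)layer;
        }
    }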
index 6f5aa01..a1b55dd 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 588dc24..8d823ad 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 6e348ee..3afe7c5 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 6953bc1..b280cc1 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -37,9 +36,8 @@ class PreAllocator : public IAllocator {
     }
     /**
      * @brief The PreAllocator class does not utilize this function
-     * @param handle Memory handle to unlock
      */
-    void  unlock(void * handle) noexcept override {}
+    void  unlock(void *) noexcept override {}  // NOLINT
 
     /**
      * @brief Returns a pointer to preallocated memory
@@ -55,10 +53,11 @@ class PreAllocator : public IAllocator {
     }
     /**
      * @brief The PreAllocator class cannot release the handle
-     * @param handle Memory handle to release
      * @return false
      */
-    bool   free(void* handle) noexcept override { return false;}
+    bool   free(void *) noexcept override {  // NOLINT
+        return false;
+    }
 
     /**
      * @brief Deletes current allocator. 
index 4c6dd3f..6b93d26 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 6932360..a4973ff 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -86,7 +85,7 @@ public:
     * @brief The main constructor
     * @param name Name of a shared library file
     */
-    explicit SOPointer(const std::string &name)
+    explicit SOPointer(const file_name_t &name)
         : _so_loader(new Loader(name.c_str()))
         , _pointedObj(details::shared_from_irelease(
             SymbolLoader<Loader>(_so_loader).template instantiateSymbol<T>(SOCreatorTrait<T>::name))) {
@@ -161,6 +160,6 @@ protected:
  * @param name Name of the shared library file
  */
 template <class T>
-inline std::shared_ptr<T> make_so_pointer(const std::string & name) = delete;
+inline std::shared_ptr<T> make_so_pointer(const file_name_t & name) = delete;
 
 }  // namespace InferenceEngine
index 75b192e..27be898 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -15,7 +14,9 @@
 // Avoidance of Windows.h to include winsock library.
 #define _WINSOCKAPI_
 // Avoidance of Windows.h to define min/max.
+#ifndef NOMINMAX
 #define NOMINMAX
+#endif
 #include <windows.h>
 #include <direct.h>
 
@@ -35,7 +36,7 @@ private:
      *        WinAPI LoadLibrary rules
      * @param pluginName Full or relative path to the plugin library
      */
-    explicit SharedObjectLoader(const char* pluginName) {
+    explicit SharedObjectLoader(LPCTSTR pluginName) {
         char cwd[1024];
         shared_object = LoadLibrary(pluginName);
         if (!shared_object) {
diff --git a/inference-engine/include/gna/gna_config.hpp b/inference-engine/include/gna/gna_config.hpp
new file mode 100644 (file)
index 0000000..29b4342
--- /dev/null
@@ -0,0 +1,78 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief A header that defines advanced GNA plugin-related properties.
+ * These properties should be used in the SetConfig() and LoadNetwork() methods of plugins
+ *
+ * @file gna_config.hpp
+ */
+
+#pragma once
+
+#include <string>
+#include "../ie_plugin_config.hpp"
+
+namespace InferenceEngine {
+
+namespace GNAConfigParams {
+
+#define GNA_CONFIG_KEY(name) InferenceEngine::GNAConfigParams::_CONFIG_KEY(GNA_##name)
+#define GNA_CONFIG_VALUE(name) InferenceEngine::GNAConfigParams::GNA_##name
+
+#define DECLARE_GNA_CONFIG_KEY(name) DECLARE_CONFIG_KEY(GNA_##name)
+#define DECLARE_GNA_CONFIG_VALUE(name) DECLARE_CONFIG_VALUE(GNA_##name)
+
+/**
+* @brief Scale factor that is calculated by the user in order to use the static quantization feature.
+* The value should be a floating point number serialized to a string with . (dot) as the decimal separator
+*/
+DECLARE_GNA_CONFIG_KEY(SCALE_FACTOR);
+
+/**
+* @brief By default, the GNA API works in Int16 precision; however, this can be adjusted if necessary.
+* Currently supported values are I16 and I8
+*/
+DECLARE_GNA_CONFIG_KEY(PRECISION);
+
+
+/**
+* @brief If turned on, dumps the GNA firmware model into the specified file
+*/
+DECLARE_GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE);
+
+/**
+* @brief GNA proc_type setting that should be one of GNA_AUTO, GNA_HW, GNA_SW, GNA_SW_EXACT
+*/
+DECLARE_GNA_CONFIG_KEY(DEVICE_MODE);
+
+DECLARE_GNA_CONFIG_VALUE(AUTO);
+DECLARE_GNA_CONFIG_VALUE(HW);
+DECLARE_GNA_CONFIG_VALUE(SW);
+DECLARE_GNA_CONFIG_VALUE(SW_EXACT);
+
+/**
+* @brief If enabled, produces the minimum memory footprint for the loaded network in GNA memory; the default value is YES
+*/
+DECLARE_GNA_CONFIG_KEY(COMPACT_MODE);
+
+/**
+* @brief The option to enable/disable the uniformly distributed PWL algorithm.
+* By default (when the value is NO), the optimized algorithm "Recursive Descent Algorithm for Finding
+* the Optimal Minimax Piecewise Linear Approximation of Convex Functions" is used.
+* If the value is YES, a simple uniform distribution is used to create the PWL approximation of activation functions.
+* Uniform distribution usually gives a poorer approximation for the same number of segments
+*/
+DECLARE_GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN);
+
+/**
+* @brief By default, the GNA plugin uses one worker thread for inference computations.
+* This parameter allows you to create up to 127 threads for software modes.
+*
+* Note that multithreading mode does not guarantee the same computation order as the order
+* of issuing. Additionally, in this case, software modes do not implement any serialization.
+*/
+DECLARE_GNA_CONFIG_KEY(LIB_N_THREADS);
+}  // namespace GNAConfigParams
+}  // namespace InferenceEngine
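A hedged usage sketch for these keys. It assumes the `InferencePlugin`/`PluginDispatcher` C++ wrappers (not shown in this patch) and a GNA plugin discoverable on the default search path; the scale factor and thread count values are illustrative only:

```cpp
#include <inference_engine.hpp>
#include <gna/gna_config.hpp>

int main() {
    using namespace InferenceEngine;
    // Load the GNA plugin through the dispatcher and wrap it.
    InferencePlugin plugin(PluginDispatcher({""}).getPluginByDevice("GNA"));
    plugin.SetConfig({
        {GNA_CONFIG_KEY(DEVICE_MODE),   GNA_CONFIG_VALUE(SW_EXACT)},  // bit-exact software mode
        {GNA_CONFIG_KEY(SCALE_FACTOR),  "1024.0"},                    // dot as decimal separator
        {GNA_CONFIG_KEY(LIB_N_THREADS), "4"}                          // software-mode worker threads
    });
    return 0;
}
```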
index b326e1d..b9f5f5c 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 9a77c4b..3a71e75 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 
 #include "details/ie_no_copy.hpp"
 
-#if defined(_WIN32) && !defined(USE_STATIC_IE)
-    #define INFERENCE_ENGINE_CDECL
-    #ifdef IMPLEMENT_INFERENCE_ENGINE_API
+#if defined(USE_STATIC_IE) || ( defined(__GNUC__) && (__GNUC__ < 4) )
+    #define INFERENCE_ENGINE_API(TYPE) extern "C" TYPE
+    #define INFERENCE_ENGINE_API_CPP(type) type
+    #define INFERENCE_ENGINE_API_CLASS(type)    type
+    #define INFERENCE_ENGINE_CDECL __attribute__((cdecl))
+#else
+    #if defined(_WIN32)
+        #define INFERENCE_ENGINE_CDECL
+        #ifdef IMPLEMENT_INFERENCE_ENGINE_API
             #define INFERENCE_ENGINE_API(type) extern "C"   __declspec(dllexport) type __cdecl
             #define INFERENCE_ENGINE_API_CPP(type)  __declspec(dllexport) type __cdecl
             #define INFERENCE_ENGINE_API_CLASS(type)        __declspec(dllexport) type
-    #else
+        #else
             #define INFERENCE_ENGINE_API(type) extern "C"  __declspec(dllimport) type __cdecl
             #define INFERENCE_ENGINE_API_CPP(type)  __declspec(dllimport) type __cdecl
             #define INFERENCE_ENGINE_API_CLASS(type)   __declspec(dllimport) type
+        #endif
+    #else
+        #define INFERENCE_ENGINE_CDECL __attribute__((cdecl))
+        #ifdef IMPLEMENT_INFERENCE_ENGINE_API
+            #define INFERENCE_ENGINE_API(type) extern "C" __attribute__((visibility("default"))) type
+            #define INFERENCE_ENGINE_API_CPP(type) __attribute__((visibility("default"))) type
+            #define INFERENCE_ENGINE_API_CLASS(type) __attribute__((visibility("default"))) type
+        #else
+            #define INFERENCE_ENGINE_API(type)   extern "C"   type
+            #define INFERENCE_ENGINE_API_CPP(type)   type
+            #define INFERENCE_ENGINE_API_CLASS(type)   type
+        #endif
     #endif
-#else
-#define INFERENCE_ENGINE_API(TYPE) extern "C" TYPE
-#define INFERENCE_ENGINE_API_CPP(type) type
-#define INFERENCE_ENGINE_API_CLASS(type)    type
-#define INFERENCE_ENGINE_CDECL __attribute__((cdecl))
 #endif
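A minimal sketch of how these macros are consumed in a public header; `MyFunc`, `MyCppFunc`, and `MyClass` are hypothetical, and the header name is assumed. Which branch the macros expand through depends on the platform and on `USE_STATIC_IE`/`IMPLEMENT_INFERENCE_ENGINE_API`:

```cpp
#include <ie_api.h>  // assumed header carrying the macros above

INFERENCE_ENGINE_API(int) MyFunc(int x) noexcept;         // C entry point with C calling convention
INFERENCE_ENGINE_API_CPP(int) MyCppFunc(int x) noexcept;  // C++ function with export/import linkage
class INFERENCE_ENGINE_API_CLASS(MyClass) {};             // class with exported linkage
```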
index c6d0203..21267a3 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -126,7 +125,7 @@ public:
      * @param layout New layout to set
      * @return Total number of elements (a product of all the dimensions)
      */
-    size_t Resize(const SizeVector &dims, Layout layout = Layout::ANY) {
+    size_t Resize(const SizeVector &dims, Layout layout = Layout::ANY) noexcept {
         bool bret = deallocate();
 
         if (layout != Layout::ANY) {
@@ -147,9 +146,9 @@ public:
      * @param layout New layout to set
      * @return The total number of elements (a product of all the dims)
      */
-    size_t Reshape(const SizeVector &dims, Layout layout = Layout::ANY) {
+    size_t Reshape(const SizeVector &dims, Layout layout = Layout::ANY) noexcept {
         if (product(tensorDesc.getDims()) != product(dims)) {
-            THROW_IE_EXCEPTION << "cannot reshape when total size changes";
+            return 0;
         }
 
         if (layout != Layout::ANY) {
@@ -164,28 +163,28 @@ public:
      * @deprecated Please use TensorDesc for working with dimensions.
      * @brief Returns the tensor dimensions vector with reversed order.
      */
-    const SizeVector dims() const {
+    const SizeVector dims() const noexcept {
         return SizeVector(tensorDesc.getDims().rbegin(), tensorDesc.getDims().rend());
     }
 
     /**
      * @brief Returns the tensor description
      */
-    const TensorDesc &getTensorDesc() const {
+    const TensorDesc &getTensorDesc() const noexcept {
         return tensorDesc;
     }
 
     /**
      * @brief Returns the total number of elements (a product of all the dims)
      */
-    size_t size() const {
+    size_t size() const noexcept {
         return product(tensorDesc.getDims());
     }
 
     /**
      * @brief Returns the size of the current Blob in bytes.
      */
-    size_t byteSize() const {
+    size_t byteSize() const noexcept {
         return product(tensorDesc.getDims()) * element_size();
     }
 
@@ -199,27 +198,27 @@ public:
      * @brief Allocates memory to store the data.
      * Abstract method.
      */
-    virtual void allocate() = 0;
+    virtual void allocate() noexcept = 0;
 
     /**
      * @brief Releases previously allocated data.
      * Abstract method.
      */
-    virtual bool deallocate() = 0;
+    virtual bool deallocate() noexcept = 0;
 
     /**
      * @brief Gets access to the allocated memory.
      * Abstract method.
      * @return A LockedMemory object
      */
-    virtual LockedMemory<void> buffer() = 0;
+    virtual LockedMemory<void> buffer() noexcept = 0;
 
     /**
      * @brief Gets read-only access to the allocated memory.
      * Abstract method.
      * @return A LockedMemory object
      */
-    virtual LockedMemory<const void> cbuffer() const = 0;
+    virtual LockedMemory<const void> cbuffer() const noexcept = 0;
 
 protected:
     /**
@@ -232,7 +231,7 @@ protected:
      * @param dims Reference to a vector with dimension values of type size_t
      * @return Result of multiplication
      */
-    static size_t product(const SizeVector &dims) {
+    static size_t product(const SizeVector &dims) noexcept {
         if (dims.empty())
             return 0;
         return std::accumulate(std::begin(dims), std::end(dims), (size_t) 1, std::multiplies<size_t>());
@@ -401,7 +400,7 @@ public:
      * @brief Creates an new empty rvalue LockedMemory object.
      * @return rvalue for the empty locked object of type T
      */
-    virtual LockedMemory<T> data() {
+    virtual LockedMemory<T> data() noexcept {
         return std::move(lockme<T>());
     }
 
@@ -409,7 +408,7 @@ public:
      * @brief Creates a new empty rvalue read-only LockedMemory object.
      * @return rvalue for the empty locked const object of type T.
      */
-    virtual LockedMemory<const T> readOnly() const {
+    virtual LockedMemory<const T> readOnly() const noexcept {
         return std::move(lockme<const T>());
     }
 
@@ -418,7 +417,7 @@ public:
       * @brief Copies data from the given vector to the blob.
       * @param that Vector of values to copy to the blob
       */
-    void set(const std::vector<T> &that) {
+    void set(const std::vector<T> &that)  {
         if (tensorDesc.getDims().size() != 0 && that.size() != product(tensorDesc.getDims()))
             THROW_IE_EXCEPTION << "Size mismatch between dims and vector";
         if (tensorDesc.getDims().size() == 0) {
@@ -435,7 +434,7 @@ public:
     /**
      * @brief Allocates or reallocates memory
      */
-    void allocate() override {
+    void allocate() noexcept override {
         if (_handle != nullptr) {
             getAllocator()->free(_handle);
         }
@@ -445,7 +444,7 @@ public:
     /**
      * @brief Frees all allocated data
      */
-    bool deallocate() override {
+    bool deallocate() noexcept override {
         return free();
     }
 
@@ -453,7 +452,7 @@ public:
      * @brief Creates a new LockedMemory instance holding void pointer.
      * @return LockedMemory instance holding void pointer
      */
-    LockedMemory<void> buffer() override {
+    LockedMemory<void> buffer() noexcept override {
         return std::move(lockme<void>());
     }
 
@@ -461,7 +460,7 @@ public:
      * @brief Creates a new LockedMemory instance holding constant void pointer.
      * @return LockedMemory instance holding constant void pointer
      */
-    LockedMemory<const void> cbuffer() const override {
+    LockedMemory<const void> cbuffer() const noexcept override {
         return std::move(lockme<const void>());
     }
 
@@ -589,6 +588,7 @@ protected:
  */
 template<class Type>
 inline typename TBlob<Type>::Ptr make_shared_blob(Precision p, Layout l, const SizeVector &dims) {
+    IE_ASSERT(p.hasStorageType<Type>());
     return std::make_shared<TBlob<Type>>(p, l, dims);
 }
 
@@ -602,6 +602,7 @@ inline typename TBlob<Type>::Ptr make_shared_blob(Precision p, Layout l, const S
  */
 template<class Type>
 inline typename TBlob<Type>::Ptr make_shared_blob(Precision p, const SizeVector &dims) {
+    IE_ASSERT(p.hasStorageType<Type>());
     return make_shared_blob<Type>(p, TensorDesc::getLayoutByDims(dims), dims);
 }
 
@@ -615,6 +616,7 @@ inline typename TBlob<Type>::Ptr make_shared_blob(Precision p, const SizeVector
  */
 template<typename Type, class TArg>
 inline typename InferenceEngine::TBlob<Type>::Ptr make_shared_blob(Precision p, Layout l, const TArg &arg) {
+    IE_ASSERT(p.hasStorageType<Type>());
     return std::make_shared<InferenceEngine::TBlob<Type>>(p, l, arg);
 }
 
@@ -628,6 +630,7 @@ inline typename InferenceEngine::TBlob<Type>::Ptr make_shared_blob(Precision p,
  */
 template<typename Type, class TArg>
 inline typename InferenceEngine::TBlob<Type>::Ptr make_shared_blob(Precision p, const TArg &arg) {
+    IE_ASSERT(p.hasStorageType<Type>());
     return make_shared_blob<Type, TArg>(p, TensorDesc::getLayoutByDims(arg), arg);
 }
 
@@ -639,6 +642,7 @@ inline typename InferenceEngine::TBlob<Type>::Ptr make_shared_blob(Precision p,
  */
 template<typename Type>
 inline typename InferenceEngine::TBlob<Type>::Ptr make_shared_blob(const TensorDesc& tensorDesc) {
+    IE_ASSERT(tensorDesc.getPrecision().hasStorageType<Type>());
     return std::make_shared<InferenceEngine::TBlob<Type>>(tensorDesc);
 }
 
@@ -652,6 +656,7 @@ inline typename InferenceEngine::TBlob<Type>::Ptr make_shared_blob(const TensorD
  */
 template<typename Type>
 inline typename InferenceEngine::TBlob<Type>::Ptr make_shared_blob(const TensorDesc& tensorDesc, Type * ptr, size_t size = 0) {
+    IE_ASSERT(tensorDesc.getPrecision().hasStorageType<Type>());
     return std::make_shared<InferenceEngine::TBlob<Type>>(tensorDesc, ptr, size);
 }
 
@@ -682,13 +687,14 @@ inline typename InferenceEngine::TBlob<TypeTo>::Ptr make_shared_blob(const TBlob
 /**
  * @deprecated Use TensorDesc in order to create Blob::Ptr.
  * @brief Creates a blob with the given precision.
- * @tparam Type Type of the shared pointer to be created
+ * @tparam TypeTo Type of the shared pointer to be created
  * @param p Given precision
  * @return A shared pointer to the blob created
  */
-template<typename Type>
-inline typename InferenceEngine::TBlob<Type>::Ptr make_shared_blob(Precision p, Layout l = NCHW) {
-    return std::make_shared<TBlob<Type>>(p, l);
+template<typename TypeTo>
+inline typename InferenceEngine::TBlob<TypeTo>::Ptr make_shared_blob(Precision p, Layout l = NCHW) {
+    IE_ASSERT(p.hasStorageType<TypeTo>());
+    return std::make_shared<TBlob<TypeTo>>(p, l);
 }
 
 /**
@@ -703,6 +709,7 @@ inline typename InferenceEngine::TBlob<Type>::Ptr make_shared_blob(Precision p,
  */
 template<typename TypeTo>
 inline typename TBlob<TypeTo>::Ptr make_shared_blob(Precision p, Layout l, SizeVector dims, const std::vector<TypeTo> &arg) {
+    IE_ASSERT(p.hasStorageType<TypeTo>());
     auto blob = std::make_shared<TBlob<TypeTo>>(p, l, dims);
     blob->set(arg);
     return blob;
@@ -719,6 +726,7 @@ inline typename TBlob<TypeTo>::Ptr make_shared_blob(Precision p, Layout l, SizeV
  */
 template<typename TypeTo>
 inline typename TBlob<TypeTo>::Ptr make_shared_blob(Precision p, Layout l, const std::vector<TypeTo> &arg) {
+    IE_ASSERT(p.hasStorageType<TypeTo>());
     auto blob = std::make_shared<TBlob<TypeTo>>(p, l);
     blob->set(arg);
     return blob;
@@ -734,6 +742,7 @@ inline typename TBlob<TypeTo>::Ptr make_shared_blob(Precision p, Layout l, const
  */
 template<typename TypeTo>
 inline typename TBlob<TypeTo>::Ptr make_shared_blob(Precision p, const std::vector<TypeTo> &arg) {
+    IE_ASSERT(p.hasStorageType<TypeTo>());
     return make_shared_blob<TypeTo>(p, TensorDesc::getLayoutByDims(arg), arg);
 }
 
@@ -749,6 +758,7 @@ inline typename TBlob<TypeTo>::Ptr make_shared_blob(Precision p, const std::vect
  */
 template <typename TypeTo>
 inline typename TBlob<TypeTo>::Ptr make_shared_blob(Precision p, Layout l, const SizeVector &dims, TypeTo * ptr, size_t size = 0) {
+    IE_ASSERT(p.hasStorageType<TypeTo>());
     auto blob = std::make_shared<TBlob<TypeTo>>(p, l, dims, ptr, size);
     return blob;
 }
@@ -764,6 +774,7 @@ inline typename TBlob<TypeTo>::Ptr make_shared_blob(Precision p, Layout l, const
  */
 template <typename TypeTo>
 inline typename TBlob<TypeTo>::Ptr make_shared_blob(Precision p, const SizeVector &dims, TypeTo * ptr, size_t size = 0) {
+    IE_ASSERT(p.hasStorageType<TypeTo>());
     return make_shared_blob<TypeTo>(p, TensorDesc::getLayoutByDims(dims), dims, ptr, size);
 }
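The `IE_ASSERT(p.hasStorageType<Type>())` guards added above tie the template type to the precision's storage type. A hedged sketch, assuming `Precision::FP32` is stored as `float` (as in `ie_precision.hpp`):

```cpp
#include <ie_blob.h>

int main() {
    using namespace InferenceEngine;
    SizeVector dims{1, 3, 224, 224};
    // Matching pair: FP32 precision with a float TBlob passes the assert.
    auto blob = make_shared_blob<float>(Precision::FP32, Layout::NCHW, dims);
    blob->allocate();
    // A mismatched pair, e.g. make_shared_blob<int>(Precision::FP32, ...),
    // would now trip IE_ASSERT instead of silently creating a broken blob.
    return 0;
}
```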
 
diff --git a/inference-engine/include/ie_builders.hpp b/inference-engine/include/ie_builders.hpp
new file mode 100644 (file)
index 0000000..ad2543f
--- /dev/null
@@ -0,0 +1,49 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <builders/ie_network_builder.hpp>
+#include <builders/ie_layer_builder.hpp>
+
+#include <builders/ie_argmax_layer.hpp>
+#include <builders/ie_clamp_layer.hpp>
+#include <builders/ie_concat_layer.hpp>
+#include <builders/ie_const_layer.hpp>
+#include <builders/ie_convolution_layer.hpp>
+#include <builders/ie_crop_layer.hpp>
+#include <builders/ie_ctc_greedy_decoder_layer.hpp>
+#include <builders/ie_deconvolution_layer.hpp>
+#include <builders/ie_detection_output_layer.hpp>
+#include <builders/ie_eltwise_layer.hpp>
+#include <builders/ie_elu_layer.hpp>
+#include <builders/ie_fully_connected_layer.hpp>
+#include <builders/ie_grn_layer.hpp>
+#include <builders/ie_input_layer.hpp>
+#include <builders/ie_memory_layer.hpp>
+#include <builders/ie_mvn_layer.hpp>
+#include <builders/ie_norm_layer.hpp>
+#include <builders/ie_normalize_layer.hpp>
+#include <builders/ie_output_layer.hpp>
+#include <builders/ie_permute_layer.hpp>
+#include <builders/ie_pooling_layer.hpp>
+#include <builders/ie_power_layer.hpp>
+#include <builders/ie_prelu_layer.hpp>
+#include <builders/ie_prior_box_clustered_layer.hpp>
+#include <builders/ie_prior_box_layer.hpp>
+#include <builders/ie_proposal_layer.hpp>
+#include <builders/ie_psroi_pooling_layer.hpp>
+#include <builders/ie_region_yolo_layer.hpp>
+#include <builders/ie_relu6_layer.hpp>
+#include <builders/ie_relu_layer.hpp>
+#include <builders/ie_reorg_yolo_layer.hpp>
+#include <builders/ie_reshape_layer.hpp>
+#include <builders/ie_roi_pooling_layer.hpp>
+#include <builders/ie_scale_shift_layer.hpp>
+#include <builders/ie_sigmoid_layer.hpp>
+#include <builders/ie_simpler_nms_layer.hpp>
+#include <builders/ie_softmax_layer.hpp>
+#include <builders/ie_split_layer.hpp>
+#include <builders/ie_tanh_layer.hpp>
+#include <builders/ie_tile_layer.hpp>
index 6489e8a..e08c265 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <string>
 #include <ostream>
 #include <algorithm>
+#include <cstdlib>
 #include <details/ie_exception.hpp>
 
+#include "ie_unicode.hpp"
+
 namespace InferenceEngine {
 /**
  * @brief Represents tensor size.
@@ -67,12 +69,6 @@ union UserValue {
     void *v_ptr;
 };
 
-enum CellType {
-    ORIG,
-    LSTM,
-    GRU
-};
-
 /**
  * @enum Layout
  * @brief Layouts that the inference engine supports
@@ -83,6 +79,8 @@ enum Layout : uint8_t {
     // I/O data layouts
     NCHW = 1,
     NHWC = 2,
+    NCDHW = 3,
+    NDHWC = 4,
 
     // weight layouts
     OIHW = 64,
diff --git a/inference-engine/include/ie_context.hpp b/inference-engine/include/ie_context.hpp
new file mode 100644 (file)
index 0000000..d7aca90
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief This is a header file for the IE Context class
+ * @file ie_context.hpp
+ */
+#pragma once
+
+#include <details/caseless.hpp>
+#include <ie_iextension.h>
+#include <string>
+#include <vector>
+#include <map>
+
+namespace InferenceEngine {
+
+/**
+ * @brief This class implements a Context object that stores registered extensions and shape inference implementations
+ */
+class INFERENCE_ENGINE_API_CLASS(Context) {
+public:
+    Context();
+
+    /**
+     * @brief Registers extension within the context
+     * @param ext Pointer to already loaded extension
+     */
+    void addExtension(const IShapeInferExtensionPtr& ext);
+
+    /**
+     * @brief Registers Shape Infer implementation within the Context
+     * @param type Layer type
+     * @param impl Shape Infer implementation
+     */
+    void addShapeInferImpl(const std::string& type, const IShapeInferImpl::Ptr& impl);
+
+    /**
+     * @brief Returns the shape infer implementation by layer type
+     * @param type Layer type
+     * @return Shape Infer implementation
+     */
+    IShapeInferImpl::Ptr getShapeInferImpl(const std::string& type);
+
+private:
+    details::caseless_map<std::string, IShapeInferImpl::Ptr> shapeInferImpls;
+};
+
+}  // namespace InferenceEngine
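A hedged usage sketch: the `caseless_map` member implies that layer-type keys are matched case-insensitively, and `customImpl` is assumed to come from an already loaded extension:

```cpp
#include <ie_context.hpp>

void registerCustom(InferenceEngine::Context& ctx,
                    const InferenceEngine::IShapeInferImpl::Ptr& customImpl) {
    ctx.addShapeInferImpl("CustomOp", customImpl);  // register under a layer type
    auto impl = ctx.getShapeInferImpl("customop");  // caseless lookup finds the same entry
}
```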
index 818594e..2088919 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -160,6 +159,6 @@ public:
      */
     const UserValue& getUserObject() const;
 private:
-    TensorDesc tensorDesc;
+    mutable TensorDesc tensorDesc;
 };
 }  // namespace InferenceEngine
index 2a2b9b6..2cc67cc 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -28,6 +27,7 @@ enum class TargetDevice : uint8_t {
     eGPU = 3,
     eFPGA = 4,
     eMYRIAD = 5,
+    eHDDL = 6,
     eGNA = 7,
     eHETERO = 8
 };
@@ -51,6 +51,7 @@ class TargetDeviceInfo {
             DECL_DEVICE(GPU),
             DECL_DEVICE(FPGA),
             DECL_DEVICE(MYRIAD),
+            DECL_DEVICE(HDDL),
             DECL_DEVICE(GNA),
             DECL_DEVICE(HETERO)
         };
@@ -65,6 +66,7 @@ class TargetDeviceInfo {
             { "GPU", InferenceEngine::TargetDevice::eGPU },
             { "FPGA", InferenceEngine::TargetDevice::eFPGA },
             { "MYRIAD", InferenceEngine::TargetDevice::eMYRIAD },
+            { "HDDL", InferenceEngine::TargetDevice::eHDDL },
             { "GNA", InferenceEngine::TargetDevice::eGNA },
             { "BALANCED", InferenceEngine::TargetDevice::eBalanced },
             { "HETERO", InferenceEngine::TargetDevice::eHETERO }
index 3f0e80c..a934a78 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 871d8c9..926dbd6 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -53,7 +52,7 @@ public:
    * @brief Loads extension from a shared library
    * @param name Full or relative path to extension library
    */
-    explicit Extension(const std::string &name)
+    explicit Extension(const file_name_t &name)
             : actual(name) {}
 
     /**
@@ -127,7 +126,7 @@ public:
    * @brief Loads extension from a shared library
    * @param name Full or relative path to extension library
    */
-    explicit ShapeInferExtension(const std::string &name)
+    explicit ShapeInferExtension(const file_name_t &name)
             : actual(name) {}
 
     /**
@@ -192,7 +191,7 @@ protected:
  * @return shared_pointer A wrapper for the given type from a specific shared module
  */
 template<>
-inline std::shared_ptr<IShapeInferExtension> make_so_pointer(const std::string &name) {
+inline std::shared_ptr<IShapeInferExtension> make_so_pointer(const file_name_t &name) {
     return std::make_shared<ShapeInferExtension>(name);
 }
 
@@ -202,7 +201,7 @@ inline std::shared_ptr<IShapeInferExtension> make_so_pointer(const std::string &
  * @return shared_pointer A wrapper for the given type from a specific shared module
  */
 template<>
-inline std::shared_ptr<IExtension> make_so_pointer(const std::string &name) {
+inline std::shared_ptr<IExtension> make_so_pointer(const file_name_t &name) {
     return std::make_shared<Extension>(name);
 }
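With the switch from `std::string` to `file_name_t` (defined in `ie_unicode.hpp`: `std::wstring` on UNICODE Windows builds, `std::string` otherwise), loading code stays the same on narrow-string builds. A hedged sketch with a hypothetical library path:

```cpp
#include <ie_extension.h>

int main() {
    using namespace InferenceEngine;
    // On non-UNICODE builds file_name_t is std::string, so a narrow literal works.
    auto ext = make_so_pointer<IExtension>("libcpu_extension.so");
    auto shapeExt = make_so_pointer<IShapeInferExtension>("libcpu_extension.so");
    return 0;
}
```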
 
index 0a060b1..820c2b4 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 6aa1f84..07b2444 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -29,8 +28,6 @@ namespace InferenceEngine {
  * @brief A collection that contains string as key, and Data smart pointer as value
  */
 using OutputsDataMap = std::map<std::string, DataPtr>;
-class IShapeInferExtension;
-using IShapeInferExtensionPtr = std::shared_ptr<IShapeInferExtension>;
 
 /**
  * @brief This is the main interface to describe the NN topology
@@ -148,8 +145,9 @@ public:
      * @return Status code of the operation
      * @note: Current implementation of the function sets batch size to the first dimension of all layers in the networks.
      * Before calling it make sure that all your layers have batch in the first dimension, otherwise the method works incorrectly.
-     * This limitation is resolved via [Shape Inference feature](./docs/Inference_Engine_Developer_Guide/ShapeInference.md)
+     * This limitation is resolved via the shape inference feature
      * by using InferenceEngine::ICNNNetwork::reshape method.
+     * To read more, refer to the Shape Inference section in the documentation
      */
     virtual StatusCode setBatchSize(size_t size, ResponseDesc* responseDesc) noexcept = 0;
 
@@ -170,7 +168,7 @@ public:
      * @param resp Pointer to the response message that holds a description of an error if any occurred
      * @return Status code of the operation
      */
-    virtual StatusCode reshape(const InputShapes& inputShapes, ResponseDesc* resp) noexcept { return NOT_IMPLEMENTED; };
+    virtual StatusCode reshape(const InputShapes& /*inputShapes*/, ResponseDesc* /*resp*/) noexcept { return NOT_IMPLEMENTED; };
 
     /**
      * @brief Registers extension within the plugin
@@ -179,8 +177,16 @@ public:
      * @return Status code of the operation. OK if succeeded
      */
     virtual StatusCode
-    AddExtension(const IShapeInferExtensionPtr& extension, ResponseDesc* resp) noexcept { return NOT_IMPLEMENTED; };
+    AddExtension(const IShapeInferExtensionPtr& /*extension*/, ResponseDesc* /*resp*/) noexcept { return NOT_IMPLEMENTED; };
+
+    virtual StatusCode getStats(ICNNNetworkStats** /*stats*/, ResponseDesc* /*resp*/) const noexcept { return NOT_IMPLEMENTED; };
 
-    virtual StatusCode getStats(ICNNNetworkStats** stats, ResponseDesc* resp) const noexcept { return NOT_IMPLEMENTED; };
+    /**
+     * @brief Serialize network to IR and weights files.
+     * @param xmlPath Path to output IR file.
+     * @param binPath Path to output weights file.
+     * @return Status code of the operation
+     */
+    virtual StatusCode serialize(const std::string &xmlPath, const std::string &binPath, ResponseDesc* resp) const noexcept = 0;
 };
 }  // namespace InferenceEngine
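A hedged sketch of the new pure-virtual `serialize()` on a network obtained elsewhere; the output paths are hypothetical:

```cpp
#include <ie_icnn_network.hpp>

InferenceEngine::StatusCode saveIR(InferenceEngine::ICNNNetwork& net) {
    InferenceEngine::ResponseDesc resp;
    // Writes the topology to the .xml file and the weights to the .bin file.
    InferenceEngine::StatusCode sc =
        net.serialize("model_copy.xml", "model_copy.bin", &resp);
    // On failure, resp.msg carries the error description.
    return sc;
}
```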
index 904a5f2..440c202 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index efc6f5b..0b0a915 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 54e0b70..c0ea3f8 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -135,8 +134,8 @@ public:
      * @param resp Response descriptor
      * @return Status code
      */
-    virtual StatusCode getShapes(const std::vector<TensorDesc>& inShapes, std::vector<TensorDesc>& outShapes,
-                                 ResponseDesc* resp) noexcept {
+    virtual StatusCode getShapes(const std::vector<TensorDesc>& /*inShapes*/, std::vector<TensorDesc>& /*outShapes*/,
+                                 ResponseDesc* /*resp*/) noexcept {
         return NOT_IMPLEMENTED;
     }
 
@@ -230,11 +229,11 @@ public:
      */
     virtual StatusCode getPrimitiveTypes(char**& types, unsigned int& size, ResponseDesc* resp) noexcept = 0;
 
-    StatusCode getShapeInferTypes(char**& types, unsigned int& size, ResponseDesc* resp) noexcept override {
+    StatusCode getShapeInferTypes(char**&, unsigned int&, ResponseDesc*) noexcept override {
         return NOT_IMPLEMENTED;
     };
 
-    StatusCode getShapeInferImpl(IShapeInferImpl::Ptr& impl, const char* type, ResponseDesc* resp) noexcept override {
+    StatusCode getShapeInferImpl(IShapeInferImpl::Ptr&, const char*, ResponseDesc*) noexcept override {
         return NOT_IMPLEMENTED;
     };
 };
index a202fb3..326c350 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -80,7 +79,7 @@ public:
      */
     virtual void QueryNetwork(const std::string &device,
                               const ICNNNetwork &network,
-                              const std::map<std::string, std::string>& config,
+                              const std::map<std::string, std::string>& /*config*/,
                               QueryNetworkResult &res) noexcept {
         QueryNetwork(device, network, res);
     };
index 708baba..fe09be7 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index f2bfbf8..2c007df 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
diff --git a/inference-engine/include/ie_inetwork.hpp b/inference-engine/include/ie_inetwork.hpp
new file mode 100644 (file)
index 0000000..41c02f0
--- /dev/null
@@ -0,0 +1,366 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief A header file for the Inference Engine Network interface
+ * @file ie_inetwork.hpp
+ */
+#pragma once
+
+#include <utility>
+#include <string>
+#include <memory>
+#include <vector>
+#include <map>
+#include <ie_parameter.hpp>
+#include <ie_context.hpp>
+#include <ie_layouts.h>
+#include <ie_blob.h>
+
+namespace InferenceEngine {
+
+/**
+ * @brief A type of network objects indexes.
+ */
+using idx_t = size_t;
+
+/**
+ * @brief This class contains a pair from layerId and port index
+ */
+class PortInfo {
+public:
+    /**
+     * @brief The constructor creates a PortInfo object for port 0
+     * @param layerID Layer id
+     */
+    PortInfo(idx_t layerID): layer(layerID), port(0) {}  // NOLINT
+
+    /**
+     * @brief The constructor creates a PortInfo object
+     * @param layerID Layer id
+     * @param portID Port id
+     */
+    PortInfo(idx_t layerID, idx_t portID): layer(layerID), port(portID) {}
+
+    /**
+     * @brief Get layer id
+     * @return Layer id
+     */
+    idx_t layerId() const {
+        return layer;
+    }
+
+    /**
+     * @brief Get port id
+     * @return Port id
+     */
+    idx_t portId() const {
+        return port;
+    }
+
+    /**
+     * @brief Compares the given PortInfo object with the current one
+     * @param portInfo PortInfo object to compare with
+     * @return true if the given PortInfo object is equal to the current one, false - otherwise
+     */
+    bool operator==(const PortInfo& portInfo) const {
+        return layer == portInfo.layerId() && port == portInfo.portId();
+    }
+
+    /**
+     * @brief Checks if the given PortInfo object is not equal to the current one
+     * @param portInfo PortInfo object to compare with
+     * @return true if the given PortInfo object is not equal to the current one, false - otherwise
+     */
+    bool operator!=(const PortInfo& portInfo) const {
+        return !(*this == portInfo);
+    }
+
+private:
+    idx_t layer;
+    idx_t port;
+};
+
+/**
+ * @brief This class is the main object to describe the Inference Engine connection.
+ */
+class Connection {
+public:
+    /**
+     * @brief Constructor of a connection object.
+     * @param input pair of the index of input layer and the index of output port
+     * @param output pair of the index of output layer and the index of input port
+     */
+    Connection(const PortInfo& input, const PortInfo& output): input(input), output(output) {}
+
+    /**
+     * @brief Compares the given Connection with the current one
+     * @param connection Connection to compare with
+     * @return true if the given Connection is equal to the current one, false - otherwise
+     */
+    bool operator==(const Connection& connection) const {
+        return input == connection.from() && output == connection.to();
+    }
+
+    /**
+     * @brief Checks if the given Connection is not equal to the current one
+     * @param connection Connection to compare with
+     * @return true if the given Connection is not equal to the current one, false - otherwise
+     */
+    bool operator!=(const Connection& connection) const {
+        return !(*this == connection);
+    }
+
+    /**
+     * Returns a constant reference to a pair of input layer index and output port index.
+     * @return pair of the index of input layer and the index of output port
+     */
+    const PortInfo& from() const {
+        return input;
+    }
+
+    /**
+     * Returns a constant reference to a pair of output layer index and input port index.
+     * @return pair of the index of output layer and the index of input port
+     */
+    const PortInfo& to() const {
+        return output;
+    }
+
+private:
+    PortInfo input;
+    PortInfo output;
+};
+
+/**
+ * @brief This class is the main object to describe the Inference Engine port.
+ */
+class Port {
+public:
+    /**
+     * @brief Default constructor of a port object.
+     */
+    Port() = default;
+    /**
+     * @brief Constructor of a port object with shapes.
+     * @param shapes port shapes
+     */
+    explicit Port(const SizeVector& shapes): pShapes(shapes) {}
+
+    /**
+     * @brief Copy constructor.
+     * @param port object to copy
+     */
+    Port(const Port& port) {
+        this->pShapes = port.pShapes;
+    }
+
+    /**
+     * @brief Returns a constant reference to a vector with shapes.
+     * The shapes should be initialized if the shape vector is empty.
+     * @return constant reference to shapes
+     */
+    const SizeVector& shape() const noexcept {
+        return pShapes;
+    }
+
+    /**
+     * @brief Returns a reference to a vector with shapes.
+     * The shapes should be initialized if the shape vector is empty.
+     * @return reference to shapes
+     */
+    SizeVector& shape() noexcept {
+        return pShapes;
+    }
+
+private:
+    SizeVector pShapes;
+};
+
+/**
+ * @brief This class is the main interface to describe the Inference Engine layer parameters.
+ * All methods here are constant and do not throw exceptions.
+ */
+class IParameters {
+public:
+    /**
+     * @brief A shared pointer to the IParameters object.
+     */
+    using Ptr = std::shared_ptr<IParameters>;
+
+    /**
+     * @brief Virtual destructor for the parameters interface
+     */
+    virtual ~IParameters() = default;
+
+    /**
+     * @brief Returns a constant reference to a map with parameters.
+     * @return Map of parameters
+     */
+    virtual const std::map<std::string, Parameter>& getParameters() const noexcept = 0;
+
+    /**
+     * @brief Returns a constant reference to a constant pointers to constant data.
+     * @return Map of constant pointers to constant data
+     */
+    virtual const std::map<std::string, Blob::CPtr>& getConstantData() const noexcept = 0;
+};
+
+class INetwork;
+template <class T>
+class INetworkIterator;
+
+/**
+ * @brief This class is the main interface to describe the Inference Engine layer.
+ * All methods here are constant and do not throw exceptions.
+ */
+class ILayer {
+public:
+    /**
+     * @brief A shared pointer to the ILayer object
+     */
+    using Ptr = std::shared_ptr<ILayer>;
+    /**
+     * @brief A shared pointer to the const ILayer object
+     */
+    using CPtr = std::shared_ptr<const ILayer>;
+
+    /**
+     * @brief Virtual destructor for the layer interface
+     */
+    virtual ~ILayer() = default;
+
+    /**
+     * @brief Returns the id of the layer.
+     * @return Layer id
+     */
+    virtual idx_t getId() const noexcept = 0;
+
+    /**
+     * @brief Returns a layer name.
+     * @return Layer name
+     */
+    virtual const std::string& getName() const noexcept = 0;
+
+    /**
+     * @brief Returns a layer type.
+     * @return Layer type
+     */
+    virtual const std::string& getType() const noexcept = 0;
+
+    /**
+     * @brief Returns a constant smart pointer reference to a Network interface.
+     * @return Network interface smart pointer
+     */
+    virtual const std::shared_ptr<INetwork>& getGraph() const noexcept = 0;
+
+    /**
+     * @brief Returns a constant smart pointer reference to a Parameters interface.
+     * @return Parameters interface smart pointer
+     */
+    virtual const IParameters::Ptr& getParameters() const noexcept = 0;
+
+    /**
+     * @brief Returns a constant reference to a vector with input ports.
+     * @return Vector of input ports
+     */
+    virtual const std::vector<Port>& getInputPorts() const noexcept = 0;
+
+    /**
+     * @brief Returns a constant reference to a vector with output ports.
+     * @return Vector of output ports
+     */
+    virtual const std::vector<Port>& getOutputPorts() const noexcept = 0;
+};
+
+namespace details {
+
+template<class NT, class LT>
+class INetworkIterator;
+
+}  // namespace details
+
+/**
+ * @brief This class is the main interface to describe the Inference Engine network.
+ *
+ * All methods here are constant and do not throw exceptions.
+ */
+class INetwork {
+public:
+    /**
+     * @brief A shared pointer to the INetwork object.
+     */
+    using Ptr = std::shared_ptr<INetwork>;
+    /**
+     * @brief A constant iterator for INetwork objects definition
+     */
+    using const_iterator = details::INetworkIterator<const INetwork, const ILayer>;
+
+    /**
+     * @brief Virtual destructor for the network interface
+     */
+    virtual ~INetwork() = default;
+
+    /**
+     * @brief Begin network iterator
+     * @return const INetwork iterator
+     */
+    virtual const_iterator begin() const noexcept = 0;
+
+    /**
+     * @brief End network iterator
+     * @return const INetwork iterator
+     */
+    virtual const_iterator end() const noexcept = 0;
+
+    /**
+     * @brief Returns a number of layers in the network.
+     * @return Layers count
+     */
+    virtual size_t size() const noexcept = 0;
+
+    /**
+     * @brief Returns a constant smart pointer to a Layer interface.
+     * If the layer is missing, returns nullptr.
+     * @param id Id of the Layer
+     * @return Layer interface smart pointer
+     */
+    virtual const ILayer::Ptr getLayer(idx_t id) const noexcept = 0;
+
+    /**
+     * @brief Returns a constant vector of input layers.
+     * @return Vector of input layers
+     */
+    virtual const std::vector<ILayer::Ptr> getInputs() const noexcept = 0;
+
+    /**
+     * @brief Returns a constant vector of output layers.
+     * @return Vector of output layers
+     */
+    virtual const std::vector<ILayer::Ptr> getOutputs() const noexcept = 0;
+
+    /**
+     * @brief Returns a constant vector of connections for specific layer.
+     * If the layer is missing, returns empty vector.
+     * @param layerId layer index
+     * @return Vector of connections
+     */
+    virtual const std::vector<Connection> getLayerConnections(idx_t layerId) const noexcept = 0;
+
+    /**
+     * @brief Returns a network name.
+     * @return Network name
+     */
+    virtual const std::string& getName() const noexcept = 0;
+
+    /**
+     * @brief Returns a network context
+     * @return const reference to Context
+     */
+    virtual const Context& getContext() const noexcept = 0;
+};
+
+}  // namespace InferenceEngine
+
+#include <details/ie_inetwork_iterator.hpp>
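A hedged sketch of walking the interface above; it assumes dereferencing the iterator yields the `ILayer` shared pointer, per the iterator over `sortedLayers` included at the end of this header:

```cpp
#include <iostream>
#include <ie_inetwork.hpp>

void dump(const InferenceEngine::INetwork& net) {
    std::cout << net.getName() << ": " << net.size() << " layers\n";
    for (const auto& layer : net) {
        std::cout << "  #" << layer->getId() << ' ' << layer->getName()
                  << " (" << layer->getType() << ")\n";
        // Outgoing edges only: skip connections that terminate at this layer.
        for (const auto& c : net.getLayerConnections(layer->getId())) {
            if (c.to().layerId() == layer->getId()) continue;
            std::cout << "    " << c.from().layerId() << ':' << c.from().portId()
                      << " -> " << c.to().layerId() << ':' << c.to().portId() << '\n';
        }
    }
}
```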
index 24a99bd..17f6a67 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 46d5d3e..4582842 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -22,8 +21,6 @@
 #include "ie_device.hpp"
 #include "ie_layers_property.hpp"
 
-#include "ie_icnn_network.hpp"
-
 namespace InferenceEngine {
 /**
  * @brief This is an internal common Layer parameter parsing arguments
@@ -503,6 +500,10 @@ public:
      * @brief Number of groups
      */
     unsigned int _group = 1u;
+    /**
+     * @brief Auto padding type
+     */
+    std::string _auto_pad;
 
     /**
      * @brief Creates a new ConvolutionLayer instance.
@@ -593,6 +594,10 @@ public:
      * @brief A flag that indicates if padding is excluded or not
      */
     bool _exclude_pad = false;
+    /**
+     * @brief Auto padding type
+     */
+    std::string _auto_pad;
 
     /**
     * @brief Creates a new PoolingLayer instance.
@@ -926,61 +931,33 @@ public:
 };
 
 /**
-* @brief This class represents RNN sequence layer
-*/
-class RNNLayer : public WeightableLayer {
-public:
-    CellType cellType;
-
-    /**
-    * @brief An axis by which iteration is performed. Axis=0 means first input blob dimension is sequence, axis=1 means first dimension is batch.
-    */
-    unsigned int _axis = 1;
-
-    using WeightableLayer::WeightableLayer;
-
-    /**
-    * @brief Creates a new RNNLayer instance.
-    */
-    explicit RNNLayer(const LayerParams &p) : WeightableLayer(p) {}
-};
-
-/**
-* @brief This class represents LSTMCell pseudo-layer to be used in TensorIterator
-*/
-class LSTMCell : public WeightableLayer {
-public:
-    using WeightableLayer::WeightableLayer;
-};
-
-class ICNNNetReader;
-/**
-* @brief This class represents TensorIterator layer
-*/
+ * @brief This class represents TensorIterator layer
+ */
 class TensorIterator : public CNNLayer {
 public:
-    using CNNNetReaderPtr = std::shared_ptr<ICNNNetReader>;
-    CNNNetReaderPtr reader;
-
-    struct BackEdge {
-        int fromLayer;
-        int fromPort;
-        int toLayer;
-        int toPort;
+    struct PortMap {
+        // Data map rule
+        int from;      /**< Index of external data from ins/outs fields of CNNLayer */
+        int to;        /**< Index of internal data in iterator body */
+
+        // Iteration rule
+        int axis;      /**< Axis to iterate through */
+        int stride;    /**< Stride to iterate through */
+        int start;     /**< Start index of iteration range */
+        int end;       /**< Last index of iteration range */
+        int part_size; /**< Part size which will be transferred to the body subnetwork */
     };
 
-    struct Port {
-        int external_port_id;
-        int internal_layer_id;
-        int internal_port_id;
-        int axis;
-        int part_size;
-        int stride;
+    struct Body {
+        std::vector<DataPtr> inputs;
+        std::vector<DataPtr> outputs;
     };
 
-    std::vector<Port> input_ports;
-    std::vector<Port> output_ports;
-    std::vector<BackEdge> backEdges;
+    std::vector<PortMap> input_port_map;
+    std::vector<PortMap> output_port_map;
+    std::vector<PortMap> back_edges;
+
+    Body body;
 
     using CNNLayer::CNNLayer;
 };
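A hedged sketch of describing one iterated input with the new `PortMap`; treating `-1` as "up to the last slice" is a conventional assumption here, not something this patch states:

```cpp
#include <ie_layers.h>

void addIteratedInput(InferenceEngine::TensorIterator& ti) {
    InferenceEngine::TensorIterator::PortMap rule;
    rule.from = 0;       // index into the layer's external ins
    rule.to = 0;         // index of the matching data in body.inputs
    rule.axis = 1;       // slice the external blob along axis 1
    rule.stride = 1;     // advance one slice per iteration
    rule.start = 0;      // begin at the first slice
    rule.end = -1;       // assumed convention: iterate up to the last slice
    rule.part_size = 1;  // slice size handed to the body subnetwork
    ti.input_port_map.push_back(rule);
}
```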
@@ -1045,4 +1022,83 @@ public:
     using WeightableLayer::WeightableLayer;
 };
 
+/**
+ * @brief This class represents a general matrix multiplication operation layer
+ * Formula is: dst := alpha*src1*src2 + beta*src3
+ */
+class GemmLayer : public CNNLayer {
+public:
+    /**
+    * @brief A scale factor of src1 matrix
+    */
+    float alpha = 1.f;
+    /**
+    * @brief A scale factor of src3 matrix
+    */
+    float beta = 1.f;
+    /**
+    * @brief A flag that indicates if the src1 matrix is to be transposed
+    */
+    bool transpose_a = false;
+    /**
+    * @brief A flag that indicates if the src2 matrix is to be transposed
+    */
+    bool transpose_b = false;
+    /**
+    * @brief Creates a new GemmLayer instance.
+    */
+    using CNNLayer::CNNLayer;
+};
+
+/**
+ * @brief This class represents a standard Pad layer
+ * Adds paddings to input tensor
+ */
+class PadLayer : public CNNLayer {
+public:
+    /**
+     * @enum ePadMode
+     * @brief Defines possible modes of pad operation
+     */
+    enum ePadMode {
+        Constant = 0, Edge, Reflect, Symmetric
+    };
+
+    /**
+    * @brief Size of padding in the beginning of each axis
+    */
+    PropertyVector<unsigned int> pads_begin;
+    /**
+    * @brief Size of padding in the end of each axis
+    */
+    PropertyVector<unsigned int> pads_end;
+    /**
+    * @brief Mode of pad operation
+    */
+    ePadMode pad_mode = Constant;
+    /**
+    * @brief A pad value which is used for filling in Constant mode
+    */
+    float pad_value = 0.0f;
+    /**
+    * @brief Creates a new PadLayer instance.
+    */
+    using CNNLayer::CNNLayer;
+};
+
+/**
+ * @brief This class represents a standard Gather layer
+ * Gather slices from Dictionary according to Indexes
+ */
+class GatherLayer : public CNNLayer {
+public:
+    /**
+    * @brief The axis in Dictionary to gather Indexes from
+    */
+    int axis = 0;
+    /**
+    * @brief Creates a new GatherLayer instance.
+    */
+    using CNNLayer::CNNLayer;
+};
 }  // namespace InferenceEngine
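A hedged construction sketch for the new `PadLayer`, assuming the usual `LayerParams{name, type, precision}` aggregate and the `PropertyVector` vector constructor added in this change:

```cpp
#include <vector>
#include <ie_layers.h>

InferenceEngine::PadLayer makePad() {
    using namespace InferenceEngine;
    PadLayer pad(LayerParams{"pad1", "Pad", Precision::FP32});
    // Pad one element on both sides of the two spatial axes of an NCHW tensor.
    pad.pads_begin = PropertyVector<unsigned int>(std::vector<unsigned int>{0, 0, 1, 1});
    pad.pads_end   = PropertyVector<unsigned int>(std::vector<unsigned int>{0, 0, 1, 1});
    pad.pad_mode   = PadLayer::Reflect;  // mirror values instead of filling with pad_value
    return pad;
}
```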
index d0dcc8a..52d434c 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -9,6 +8,8 @@
  */
 #pragma once
 
+#include <vector>
+
 namespace InferenceEngine {
 
 constexpr const int MAX_DIMS_NUMBER = 12;
@@ -32,13 +33,20 @@ public:
         if (len > N) {
             THROW_IE_EXCEPTION << "Property size exceeds limit of: " << N;
         }
-        for (int i = 0; i < len; i++) {
+        for (size_t i = 0; i < len; i++) {
             _axises[i] = val;
             _allocated[i] = true;
         }
         _length = len;
     }
 
+    explicit PropertyVector(const std::vector<T>& values) {
+        size_t i = 0;
+        for (const auto val : values) {
+            insert(i++, val);
+        }
+    }
+
     /**
      * @brief allows access up-to capacity size
      * @param index
index 7e5055e..f4c0e4d 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -218,11 +217,18 @@ public:
      * @param l memory layout
      */
     void setLayout(Layout l) {
-        bool inconsistentLayout = false;
-        switch (layout) {
+        bool inconsistentLayout = true;
+        switch (l) {
             case Layout::C:
                 inconsistentLayout = dims.size() != 1;
                 break;
+            case Layout::BLOCKED:
+                inconsistentLayout = false;
+                break;
+            case Layout::NCDHW:
+            case Layout::NDHWC:
+                inconsistentLayout = dims.size() != 5;
+                break;
             case Layout::OIHW:
             case Layout::NCHW:
             case Layout::NHWC:
@@ -240,7 +246,7 @@ public:
                 break;
         }
         if (inconsistentLayout)
-            THROW_IE_EXCEPTION << "Dims and format are inconsistent.";
+            THROW_IE_EXCEPTION << "Dims(" << std::to_string(dims.size()) << ") and format(" << std::to_string(l) << ") are inconsistent.";
         layout = l;
     }
 
index 0269895..59e81f0 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 708da98..4dbd3f4 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #define IE_THREAD_SEQ 2
 
 #if IE_THREAD == IE_THREAD_TBB
+#define TBB_PREVIEW_LOCAL_OBSERVER 1
+#include "tbb/task_scheduler_observer.h"
 #include "tbb/parallel_for.h"
 #include "tbb/task_arena.h"
 
 #include "tbb/parallel_reduce.h"
 #include "tbb/blocked_range.h"
 #include "tbb/blocked_range2d.h"
+#include "tbb/blocked_range3d.h"
 
 inline int  parallel_get_max_threads() { return tbb::this_task_arena::max_concurrency(); }
 inline int  parallel_get_num_threads() { return parallel_get_max_threads(); }
 inline int  parallel_get_thread_num()  { return tbb::this_task_arena::current_thread_index(); }
 inline void parallel_set_num_threads(int n) { return; }
+inline int  parallel_get_env_threads() { return 0; }
 
 #elif IE_THREAD == IE_THREAD_OMP
+#include <cstdlib>
+#include <string>
 #include <omp.h>
+
+
 /* MSVC still supports omp 2.0 only */
 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
 #   define collapse(x)
@@ -39,8 +46,20 @@ inline int  parallel_get_max_threads() { return omp_get_max_threads(); }
 inline int  parallel_get_num_threads() { return omp_get_num_threads(); }
 inline int  parallel_get_thread_num()  { return omp_get_thread_num(); }
 inline void parallel_set_num_threads(int n) { omp_set_num_threads(n); }
+inline int  parallel_get_env_threads() {
+    int env_cores = 0;
+    if (getenv("OMP_NUM_THREADS") != nullptr) {
+        try {
+            env_cores = std::stoi(getenv("OMP_NUM_THREADS"));
+        } catch (const std::exception&) {
+            env_cores = 0;
+        }
+    }
+    return env_cores;
+}
 
 #elif IE_THREAD == IE_THREAD_SEQ
+inline int  parallel_get_env_threads() { return 1; }
 inline int  parallel_get_max_threads() { return 1; }
 inline int  parallel_get_num_threads() { return 1; }
 inline int  parallel_get_thread_num()  { return 0; }
@@ -75,6 +94,35 @@ void parallel_nt(int nthr, F func) {
 #endif
 }
 
+template <typename F>
+void parallel_nt_static(int nthr, F func) {
+#if IE_THREAD == IE_THREAD_SEQ
+    const bool serial = true;
+#else
+    const bool serial = false;
+#endif
+
+    if (serial || nthr == 1) {
+        func(0, 1);
+        return;
+    }
+
+    if (nthr == 0) nthr = parallel_get_max_threads();
+#if IE_THREAD == IE_THREAD_TBB
+    tbb::parallel_for(0, nthr, [&](int ithr) {
+            func(ithr, nthr);
+        }
+        , tbb::static_partitioner{});
+
+#elif IE_THREAD == IE_THREAD_OMP
+
+#   pragma omp parallel num_threads(nthr)
+    {
+        func(parallel_get_thread_num(), parallel_get_num_threads());
+    }
+#endif
+}
+
 template <typename T0, typename R, typename F>
 R parallel_sum(const T0 D0, R &input, F func) {
 #if IE_THREAD == IE_THREAD_TBB
@@ -91,10 +139,17 @@ R parallel_sum(const T0 D0, R &input, F func) {
         });
 #else
     R sum = input;
+
+#ifdef _MSC_VER
+    using T0_IT = typename std::make_signed<T0>::type;
+#else
+    using T0_IT = T0;
+#endif
+
 #if IE_THREAD == IE_THREAD_OMP
     #pragma omp parallel for reduction(+ : sum) schedule(static)
 #endif
-    for (T0 dim1 = 0; dim1 < D0; dim1++) {
+    for (T0_IT dim1 = 0; dim1 < D0; dim1++) {
         sum += func(dim1);
     }
     return sum;
@@ -120,17 +175,71 @@ R parallel_sum2d(const T0 D0, const T1 D1, R input, F func) {
         });
 #else
     R sum = input;
+
+#ifdef _MSC_VER
+    using T0_IT = typename std::make_signed<T0>::type;
+    using T1_IT = typename std::make_signed<T1>::type;
+#else
+    using T0_IT = T0;
+    using T1_IT = T1;
+#endif
+
 #if IE_THREAD == IE_THREAD_OMP
     #pragma omp parallel for collapse(2) reduction(+ : sum) schedule(static)
 #endif
-    for (T0 dim2 = 0; dim2 < D0; dim2++) {
-        for (T1 dim1 = 0; dim1 < D1; dim1++) {
+    for (T0_IT dim2 = 0; dim2 < D0; dim2++) {
+        for (T1_IT dim1 = 0; dim1 < D1; dim1++) {
             sum += func(dim2, dim1);
         }
     }
     return sum;
 #endif
 }
+template <typename T0, typename T1, typename T2, typename R, typename F>
+R parallel_sum3d(const T0 D0, const T1 D1, const T2 D2, R input, F func) {
+#if IE_THREAD == IE_THREAD_TBB
+    return tbb::parallel_reduce(
+        tbb::blocked_range3d<T0, T1, T2>(0, D0, 0, D1, 0, D2), input,
+        [&](const tbb::blocked_range3d<T0, T1, T2>& r, R init)->R {
+            R sum = init;
+            for (T0 dim1 = r.pages().begin(); dim1 < r.pages().end(); dim1++) {
+                for (T1 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
+                    for (T2 dim3 = r.cols().begin(); dim3 < r.cols().end(); dim3++) {
+                        sum += func(dim1, dim2, dim3);
+                    }
+                }
+            }
+            return sum;
+        },
+        [](R x, R y)->R {
+            return x + y;
+        });
+#else
+    R sum = input;
+
+#ifdef _MSC_VER
+    using T0_IT = typename std::make_signed<T0>::type;
+    using T1_IT = typename std::make_signed<T1>::type;
+    using T2_IT = typename std::make_signed<T2>::type;
+#else
+    using T0_IT = T0;
+    using T1_IT = T1;
+    using T2_IT = T2;
+#endif
+
+#if IE_THREAD == IE_THREAD_OMP
+    #pragma omp parallel for collapse(3) reduction(+ : sum) schedule(static)
+#endif
+    for (T0_IT dim1 = 0; dim1 < D0; dim1++) {
+        for (T1_IT dim2 = 0; dim2 < D1; dim2++) {
+            for (T2_IT dim3 = 0; dim3 < D2; dim3++) {
+                sum += func(dim1, dim2, dim3);
+            }
+        }
+    }
+    return sum;
+#endif
+}
 
 template<typename T>
 inline T parallel_it_init(T start) { return start; }
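A hedged usage sketch for the new helpers; it assumes they live at the scope shown in the hunks above (no enclosing namespace is visible in this diff):

```cpp
#include <cstdio>
#include <ie_parallel.hpp>

int main() {
    // parallel_nt_static: each invocation receives (thread index, thread count);
    // nthr == 0 means "use parallel_get_max_threads()".
    parallel_nt_static(0, [](int ithr, int nthr) {
        std::printf("worker %d of %d\n", ithr, nthr);
    });

    // parallel_sum3d: reduce func(i, j, k) over [0,2) x [0,3) x [0,4).
    int total = parallel_sum3d(2, 3, 4, 0,
                               [](int i, int j, int k) { return i + j + k; });
    std::printf("sum = %d\n", total);  // 72
    return 0;
}
```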
diff --git a/inference-engine/include/ie_parameter.hpp b/inference-engine/include/ie_parameter.hpp
new file mode 100644 (file)
index 0000000..59526ad
--- /dev/null
@@ -0,0 +1,365 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief A header file for the Parameter class
+ * @file ie_parameter.hpp
+ */
+#pragma once
+
+#include <details/ie_exception.hpp>
+#include <algorithm>
+#include <iterator>
+#include <vector>
+#include <cctype>
+#include <string>
+#include <map>
+
+namespace InferenceEngine {
+
+/**
+ * @brief This class represents an object to work with different parameters
+ */
+class Parameter {
+public:
+    /**
+     * @brief Default constructor
+     */
+    Parameter() = default;
+
+    /**
+     * @brief The constructor creates a Parameter object with string value
+     * @param value string value
+     */
+    Parameter(const std::string& value): initialized(true), value(value) {}         // NOLINT
+
+    /**
+     * @brief The constructor creates a Parameter object with template value
+     * @param value template value
+     */
+    template <class T>
+    Parameter(const T& value): initialized(true), value(std::to_string(value)) {}   // NOLINT
+
+    /**
+     * @brief The constructor creates a Parameter object with a vector of template values
+     * @param values vector of template values
+     */
+    template <class T>
+    Parameter(const std::vector<T>& values): initialized(true) {                    // NOLINT
+        for (const auto& val : values) {
+            if (!value.empty())
+                value += ",";
+            value += std::to_string(val);
+        }
+    }
+
+    /**
+     * @brief The cast to string object
+     * Throws exception if parameter was not found.
+     * @return string value
+     */
+    operator std::string() const {                                                  // NOLINT
+        return asString();
+    }
+
+    /**
+     * @brief Returns a string value for the given parameter or returns the default one
+     * @param def Default value of the parameter if not found
+     * @return A string value
+     */
+    std::string asString(std::string def) const {
+        if (!initialized) {
+            return def;
+        }
+        return value;
+    }
+
+    /**
+     * @brief Returns a string value for the given parameter.
+     * Throws exception if parameter was not found.
+     * @return A string value
+     */
+    std::string asString() const {
+        if (!initialized) {
+            THROW_IE_EXCEPTION << "Parameter was not initialized!";
+        }
+        return value;
+    }
+
+    /**
+     * @brief Gets float value for the given parameter
+     * @param def - default value of the parameter if not found
+     * @return float value
+     */
+    float asFloat(float def) const {
+        std::string val = asString(std::to_string(def));
+        try {
+            return std::stof(val);
+        } catch (...) {
+            THROW_IE_EXCEPTION << "Value " << val << " cannot be casted to float.";
+        }
+    }
+
+    /**
+     * @brief Returns a float value for the given layer parameter
+     * @return A float value for the specified parameter
+     */
+    float asFloat() const {
+        std::string val = asString();
+        try {
+            return std::stof(val);
+        } catch (...) {
+            THROW_IE_EXCEPTION << "Value " << val << " cannot be casted to float.";
+        }
+    }
+
+    /**
+     * @brief Returns a vector of float values for the given parameter or returns the default value
+     * @param def Default value of the parameter if not found
+     * @return vector of float values
+     */
+    std::vector<float> asFloats(std::vector<float> def) const {
+        std::string vals = asString("");
+        std::vector<float> result;
+        std::istringstream stream(vals);
+        std::string str;
+        if (vals.empty())
+            return def;
+        while (getline(stream, str, ',')) {
+            try {
+                result.push_back(std::stof(str));
+            } catch (...) {
+                THROW_IE_EXCEPTION << "Value " << vals << " cannot be cast to floats.";
+            }
+        }
+        return result;
+    }
+
+    /**
+     * @brief Returns a vector of float values for the given parameter
+     * @return vector of float values
+     */
+    std::vector<float> asFloats() const {
+        std::string vals = asString();
+        std::vector<float> result;
+        std::istringstream stream(vals);
+        std::string str;
+        while (getline(stream, str, ',')) {
+            try {
+                result.push_back(std::stof(str));
+            } catch (...) {
+                THROW_IE_EXCEPTION << "Value " << vals << " cannot be cast to floats.";
+            }
+        }
+        return result;
+    }
+
+    /**
+     * @brief Returns an integer value for the given parameter or returns the default value
+     * @param def Default value of the parameter if not found
+     * @return An int value for the specified parameter
+     */
+    int asInt(int def) const {
+        std::string val = asString(std::to_string(def));
+        try {
+            return std::stoi(val);
+        } catch (...) {
+            THROW_IE_EXCEPTION << "Value " << val << " cannot be casted to int.";
+        }
+    }
+
+    /**
+     * @brief Returns an integer value for the given parameter
+     * @return An int value for the specified parameter
+     */
+    int asInt() const {
+        std::string val = asString();
+        try {
+            return std::stoi(val);
+        } catch (...) {
+            THROW_IE_EXCEPTION << "Value " << val << " cannot be casted to int.";
+        }
+    }
+
+
+    /**
+     * @brief Returns a vector of int values for the given parameter or returns the default value
+     * @param def Default value of the parameter if not found
+     * @return vector of int values
+     */
+    std::vector<int> asInts(std::vector<int> def) const {
+        std::string vals = asString("");
+        std::vector<int> result;
+        std::istringstream stream(vals);
+        std::string str;
+        if (vals.empty())
+            return def;
+        while (getline(stream, str, ',')) {
+            try {
+                result.push_back(std::stoi(str));
+            } catch (...) {
+                THROW_IE_EXCEPTION << "Value " << vals << " cannot be cast to ints.";
+            }
+        }
+        return result;
+    }
+
+    /**
+     * @brief Returns a vector of int values for the given parameter
+     * @return vector of int values
+     */
+    std::vector<int> asInts() const {
+        std::string vals = asString();
+        std::vector<int> result;
+        std::istringstream stream(vals);
+        std::string str;
+        while (getline(stream, str, ',')) {
+            try {
+                result.push_back(std::stoi(str));
+            } catch (...) {
+                THROW_IE_EXCEPTION << "Value " << vals << " cannot be cast to ints.";
+            }
+        }
+        return result;
+    }
+    /**
+     * @brief Returns an unsigned integer value for the given parameter or returns the default value
+     * @param def Default value of the parameter if not found
+     * @return An unsigned integer value for the specified parameter
+     */
+    unsigned int asUInt(unsigned int def) const {
+        std::string val = asString(std::to_string(def));
+        std::string message = "Value " + val + " cannot be cast to unsigned int.";
+        try {
+            int value = std::stoi(val);
+            if (value < 0) {
+                THROW_IE_EXCEPTION << message;
+            }
+            return static_cast<unsigned int>(value);
+        } catch (...) {
+            THROW_IE_EXCEPTION << message;
+        }
+    }
+
+    /**
+     * @brief Returns an unsigned integer value for the given parameter
+     * @return An unsigned integer value for the specified parameter
+     */
+    unsigned int asUInt() const {
+        std::string val = asString();
+        std::string message = "Value " + val + " cannot be cast to unsigned int.";
+        try {
+            int value = std::stoi(val);
+            if (value < 0) {
+                THROW_IE_EXCEPTION << message;
+            }
+            return static_cast<unsigned int>(value);
+        } catch (...) {
+            THROW_IE_EXCEPTION << message;
+        }
+    }
+
+
+    /**
+     * @brief Returns a vector of unsigned int values for the given parameter or returns the default value
+     * @param def Default value of the parameter if not found
+     * @return vector of unsigned int values
+     */
+    std::vector<unsigned int> asUInts(std::vector<unsigned int> def) const {
+        std::string vals = asString("");
+        std::vector<unsigned int> result;
+        std::istringstream stream(vals);
+        std::string str;
+        std::string message = "Value " + vals + " cannot be cast to unsigned ints.";
+        if (vals.empty())
+            return def;
+        while (getline(stream, str, ',')) {
+            try {
+                int value = std::stoi(str);
+                if (value < 0) {
+                    THROW_IE_EXCEPTION << message;
+                }
+                result.push_back(static_cast<unsigned int>(value));
+            } catch (...) {
+                THROW_IE_EXCEPTION << message;
+            }
+        }
+        return result;
+    }
+
+    /**
+     * @brief Returns a vector of unsigned int values for the given parameter
+     * @return vector of unsigned int values
+     */
+    std::vector<unsigned int> asUInts() const {
+        std::string vals = asString();
+        std::vector<unsigned int> result;
+        std::istringstream stream(vals);
+        std::string str;
+        std::string message = "Value " + vals + " cannot be cast to unsigned ints.";
+        while (getline(stream, str, ',')) {
+            try {
+                int value = std::stoi(str);
+                if (value < 0) {
+                    THROW_IE_EXCEPTION << message;
+                }
+                result.push_back(static_cast<unsigned int>(value));
+            } catch (...) {
+                THROW_IE_EXCEPTION << message;
+            }
+        }
+        return result;
+    }
+
+    /**
+     * @brief Returns a boolean value for the given parameter.
+     * The valid values are (true, false, 1, 0).
+     * @param def Default value of the parameter if not found
+     * @return A boolean value for the specified parameter
+     */
+    bool asBool(bool def) const {
+        std::string val = asString(std::to_string(def));
+        std::string loweredCaseValue;
+        std::transform(val.begin(), val.end(), std::back_inserter(loweredCaseValue), [](char value) {
+            return std::tolower(value);
+        });
+
+        bool result = false;
+
+        if (!(std::istringstream(loweredCaseValue) >> std::boolalpha >> result)) {
+            // attempting parse using non alpha bool
+            return static_cast<bool>(asInt(def));
+        }
+
+        return result;
+    }
+
+    /**
+     * @brief Returns a boolean value for the given parameter.
+     * The valid values are (true, false, 1, 0).
+     * @return A boolean value for the specified parameter
+     */
+    bool asBool() const {
+        std::string val = asString();
+        std::string loweredCaseValue;
+        std::transform(val.begin(), val.end(), std::back_inserter(loweredCaseValue), [](char value) {
+            return std::tolower(value);
+        });
+
+        bool result = false;
+
+        if (!(std::istringstream(loweredCaseValue) >> std::boolalpha >> result)) {
+            // attempting parse using non alpha bool
+            return static_cast<bool>(asInt());
+        }
+
+        return result;
+    }
+
+private:
+    bool initialized = false;  // a default-constructed Parameter holds no value
+    std::string value;
+};
+
+}  // namespace InferenceEngine
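
Since ie_parameter.hpp is new in this drop, a minimal usage sketch of the accessors above may help reviewers (assuming the header is reachable as <ie_parameter.hpp> from inference-engine/include):

```cpp
// Sketch: exercising the Parameter constructors and as* accessors above.
#include <ie_parameter.hpp>
#include <iostream>
#include <string>
#include <vector>

int main() {
    using InferenceEngine::Parameter;

    Parameter ints(std::vector<int>{1, 2, 3});   // stored internally as "1,2,3"
    std::cout << ints.asInts().size() << std::endl;          // 3

    Parameter f(std::string("2.5"));
    std::cout << f.asFloat() << std::endl;                   // 2.5

    Parameter empty;                              // uninitialized parameter
    std::cout << empty.asString("fallback") << std::endl;    // "fallback"

    Parameter flag(std::string("True"));
    std::cout << std::boolalpha << flag.asBool() << std::endl;  // true
    return 0;
}
```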
index df94b6d..5623dd6 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 
 
 #if defined(_WIN32)
-#ifdef IMPLEMENT_INFERENCE_ENGINE_PLUGIN
-#define INFERENCE_PLUGIN_API(type) extern "C"   __declspec(dllexport) type
-#else
-#define INFERENCE_PLUGIN_API(type) extern "C" type
-#endif
-
+    #ifdef IMPLEMENT_INFERENCE_ENGINE_PLUGIN
+        #define INFERENCE_PLUGIN_API(type) extern "C"   __declspec(dllexport) type
+    #else
+        #define INFERENCE_PLUGIN_API(type) extern "C" type
+    #endif
+#elif (__GNUC__ >= 4)
+    #ifdef IMPLEMENT_INFERENCE_ENGINE_PLUGIN
+        #define INFERENCE_PLUGIN_API(type) extern "C"   __attribute__((visibility("default"))) type
+    #else
+        #define INFERENCE_PLUGIN_API(type) extern "C" type
+    #endif
 #else
-#define INFERENCE_PLUGIN_API(TYPE) extern "C" TYPE
+    #define INFERENCE_PLUGIN_API(TYPE) extern "C" TYPE
 #endif
 
 namespace InferenceEngine {
@@ -162,7 +166,7 @@ public:
      * @param network Network object to query
      * @param resp Pointer to the response message that holds a description of an error if any occurred
      */
-    virtual void QueryNetwork(const ICNNNetwork& network, QueryNetworkResult& res) const noexcept {
+    virtual void QueryNetwork(const ICNNNetwork& /*network*/, QueryNetworkResult& res) const noexcept {
         res.rc = InferenceEngine::NOT_IMPLEMENTED;
     }
 
@@ -172,8 +176,8 @@ public:
      * @param config Map of pairs: (config parameter name, config parameter value)
      * @param resp Pointer to the response message that holds a description of an error if any occurred
      */
-    virtual void QueryNetwork(const ICNNNetwork& network,
-                              const std::map<std::string, std::string> &config, QueryNetworkResult& res) const noexcept {
+    virtual void QueryNetwork(const ICNNNetwork& /*network*/,
+                              const std::map<std::string, std::string> &/*config*/, QueryNetworkResult& res) const noexcept {
         res.rc = InferenceEngine::NOT_IMPLEMENTED;
     }
 };
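
For context, a sketch of how a plugin consumes the restructured export macro. The entry-point name and signature below follow the usual Inference Engine plugin convention but are an assumption here, since this hunk only touches the macro itself:

```cpp
// Hypothetical plugin entry point. With IMPLEMENT_INFERENCE_ENGINE_PLUGIN
// defined by the plugin's build, INFERENCE_PLUGIN_API adds
// __declspec(dllexport) on Windows and default visibility on GCC >= 4,
// making the C symbol discoverable by the plugin loader.
INFERENCE_PLUGIN_API(InferenceEngine::StatusCode) CreatePluginEngine(
        InferenceEngine::IInferencePlugin*& plugin,
        InferenceEngine::ResponseDesc* resp) noexcept {
    // ... construct the concrete plugin and assign it to 'plugin' ...
    return InferenceEngine::OK;
}
```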
index 6fba66d..0e3397d 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -39,13 +38,34 @@ DECLARE_CONFIG_VALUE(YES);
 DECLARE_CONFIG_VALUE(NO);
 
 /**
+* @brief Limits the number of threads that are used by the Inference Engine for inference on the CPU.
+*/
+DECLARE_CONFIG_KEY(CPU_THREADS_NUM);
+
+/**
 * @brief The name for setting CPU affinity per thread option.
 * It is passed to IInferencePlugin::SetConfig(), this option should be used with values:
 * PluginConfigParams::YES or PluginConfigParams::NO
+* Ignored if OpenVINO is compiled with OpenMP threading and any affinity-related OpenMP
+* environment variable is set
 */
 DECLARE_CONFIG_KEY(CPU_BIND_THREAD);
 
 /**
+* @brief Optimize CPU execution to maximize throughput.
+* It is passed to IInferencePlugin::SetConfig(), this option should be used with values:
+* - KEY_CPU_THROUGHPUT_NUMA creates as many streams as needed to accommodate NUMA and avoid associated penalties
+* - KEY_CPU_THROUGHPUT_AUTO creates the bare minimum of streams needed to improve the performance;
+*   this is the most portable option if you have no insight into how many cores your target machine will have
+*   (and what the optimal number of streams is)
+* - finally, specifying a positive integer value creates the requested number of streams
+*/
+DECLARE_CONFIG_VALUE(CPU_THROUGHPUT_NUMA);
+DECLARE_CONFIG_VALUE(CPU_THROUGHPUT_AUTO);
+DECLARE_CONFIG_KEY(CPU_THROUGHPUT_STREAMS);
+
+
+/**
 * @brief The name for setting performance counters option.
 * It is passed to IInferencePlugin::SetConfig(), this option should be used with values:
 * PluginConfigParams::YES or PluginConfigParams::NO
@@ -125,10 +145,21 @@ DECLARE_CONFIG_KEY(DEVICE_ID);
 /**
 * @brief the key for enabling exclusive mode for async requests of different executable networks and the same plugin.
 * Sometimes it's necessary to avoid oversubscription requests that are sharing the same device in parallel.
-* E.g. There 2 task executors for CPU device: one - in FPGA, another - in MKLDNN. Parallel execution both of them leads to
-* not optimal CPU usage. More efficient to run the corresponding tasks one by one via single executor.
+* E.g. there are two task executors for the CPU device: one in the Hetero plugin, another in the pure CPU plugin.
+* Parallel execution of both of them might lead to oversubscription and suboptimal CPU usage. It is more efficient
+* to run the corresponding tasks one by one via a single executor.
+* By default, the option is set to YES for hetero cases, and to NO for conventional (single-plugin) cases.
+* Notice that setting YES disables the CPU streams feature (see another config key in this file).
 */
 DECLARE_CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS);
 
+/**
+ * @brief This key enables dumping of the internal primitive graph.
+ * Should be passed into LoadNetwork method to enable dumping of internal graph of primitives and
+ * corresponding configuration information. Value is a name of output dot file without extension.
+ * Files <dot_file_name>_init.dot and <dot_file_name>_perf.dot will be produced.
+ */
+DECLARE_CONFIG_KEY(DUMP_EXEC_GRAPH_AS_DOT);
+
 }  // namespace PluginConfigParams
 }  // namespace InferenceEngine
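
The keys declared above are plain string pairs; a hedged sketch of composing them for a SetConfig()/LoadNetwork() call (the header name ie_plugin_config.hpp is inferred from context, and the surrounding plugin setup is elided):

```cpp
#include <ie_plugin_config.hpp>  // header name inferred from context
#include <map>
#include <string>

std::map<std::string, std::string> makeCpuConfig() {
    using namespace InferenceEngine::PluginConfigParams;
    std::map<std::string, std::string> config;
    config[KEY_CPU_THREADS_NUM] = "4";          // cap the number of inference threads
    config[KEY_CPU_BIND_THREAD] = YES;          // pin inference threads to cores
    config[KEY_CPU_THROUGHPUT_STREAMS] = CPU_THROUGHPUT_AUTO;  // portable streams choice
    return config;
}
```

The benchmark_app changes later in this diff apply the same keys to a real LoadNetwork() call.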
index 257b494..60d729d 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -24,24 +23,24 @@ public:
      * @brief A constructor
      * @param pp Vector of paths to plugin directories
      */
-    explicit PluginDispatcher(const std::vector<std::string> &pp) : pluginDirs(pp) {}
+    explicit PluginDispatcher(const std::vector<file_name_t> &pp) : pluginDirs(pp) {}
 
     /**
     * @brief Loads a plugin from plugin directories
     * @param name Plugin name
     * @return A pointer to the loaded plugin
     */
-    virtual InferencePlugin getPluginByName(const std::string& name) const {
+    virtual InferencePlugin getPluginByName(const file_name_t& name) const {
         std::stringstream err;
         for (auto &pluginPath : pluginDirs) {
             try {
                 return InferencePlugin(InferenceEnginePluginPtr(make_plugin_name(pluginPath, name)));
             }
             catch (const std::exception &ex) {
-                err << "cannot load plugin: " << name << " from " << pluginPath << ": " << ex.what() << ", skipping\n";
+                err << "cannot load plugin: " << fileNameToString(name) << " from " << fileNameToString(pluginPath) << ": " << ex.what() << ", skipping\n";
             }
         }
-        THROW_IE_EXCEPTION << "Plugin " << name << " cannot be loaded: " << err.str() << "\n";
+        THROW_IE_EXCEPTION << "Plugin " << fileNameToString(name) << " cannot be loaded: " << err.str() << "\n";
     }
 
     /**
@@ -77,7 +76,7 @@ public:
         std::stringstream err;
         for (std::string& name : result.names) {
             try {
-                return getPluginByName(name);
+                return getPluginByName(stringToFileName(name));
             }
             catch (const std::exception &ex) {
                 err << "Tried load plugin : " << name << ",  error: " << ex.what() << "\n";
@@ -93,17 +92,26 @@ protected:
     * @param input Plugin name
     * @return The path to the plugin
     */
-    std::string make_plugin_name(const std::string &path, const std::string &input) const {
-        std::string separator =
+    file_name_t make_plugin_name(const file_name_t &path, const file_name_t &input) const {
+        file_name_t separator =
 #if defined _WIN32 || defined __CYGWIN__
-        "\\";
+#   if defined UNICODE
+            L"\\";
+#   else
+            "\\";
+#   endif
 #else
-        "/";
+            "/";
 #endif
         if (path.empty())
-            separator = "";
+            separator = file_name_t();
 #ifdef _WIN32
-        return path + separator + input + ".dll";
+        return path + separator + input +
+#   if defined UNICODE
+            L".dll";
+#   else
+            ".dll";
+#   endif
 #elif __APPLE__
         return path + separator + "lib" + input + ".dylib";
 #else
@@ -111,7 +119,8 @@ protected:
 #endif
     }
 
+
 private:
-    std::vector<std::string> pluginDirs;
+    std::vector<file_name_t> pluginDirs;
 };
 }  // namespace InferenceEngine
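
With pluginDirs widened to file_name_t, call sites pass native path strings. A minimal sketch; the header path and the CPU plugin library name "MKLDNNPlugin" are illustrative assumptions:

```cpp
#include <ie_plugin_dispatcher.hpp>  // header path assumed
#include <vector>

InferenceEngine::InferencePlugin loadCpuPlugin() {
    // An empty path makes make_plugin_name() skip the directory prefix.
    std::vector<file_name_t> dirs = {file_name_t()};
    InferenceEngine::PluginDispatcher dispatcher(dirs);
    // stringToFileName() (from ie_unicode.hpp) widens the name on UNICODE builds.
    return dispatcher.getPluginByName(
            InferenceEngine::stringToFileName("MKLDNNPlugin"));
}
```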
index 5687093..6c10cf5 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 72589c8..d50fe5c 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -80,20 +79,44 @@ public:
         return Precision(8 * sizeof(T), typeName == nullptr ? typeid(T).name() : typeName);
     }
 
+    /** @brief Checks whether the given storage class T can be used to store objects of the current precision */
+    template <class T>
+    bool hasStorageType(const char * typeName = nullptr) const noexcept {
+        if (sizeof(T) != size()) {
+            return false;
+        }
+#define CASE(x, y) case x: return std::is_same<T, y>()
+#define CASE2(x, y1, y2) case x: return std::is_same<T, y1>() || std::is_same<T, y2>()
+
+        switch (precisionInfo.value) {
+            CASE(FP32, float);
+            CASE2(FP16, int16_t, uint16_t);
+            CASE(I16, int16_t);
+            CASE(I32, int32_t);
+            CASE(U16, uint16_t);
+            CASE(U8, uint8_t);
+            CASE(I8, int8_t);
+            CASE2(Q78, int16_t, uint16_t);
+            default : return areSameStrings(name(), typeName == nullptr ? typeid(T).name() : typeName);
+#undef CASE
+#undef CASE2
+        }
+    }
+
     /** @brief Equality operator with Precision object */
-    bool operator == (const Precision  & p) const noexcept  {
+    bool operator == (const Precision  & p) const noexcept {
         return precisionInfo.value == p &&
             precisionInfo.bitsSize == p.precisionInfo.bitsSize &&
             areSameStrings(precisionInfo.name, p.precisionInfo.name);
     }
 
     /** @brief Equality operator with ePrecision enum value */
-    bool operator == (const ePrecision  p) const noexcept  {
+    bool operator == (const ePrecision  p) const noexcept {
         return precisionInfo.value == p;
     }
 
     /** @brief Inequality operator with ePrecision enum value */
-    bool operator != (const ePrecision   p) const noexcept  {
+    bool operator != (const ePrecision   p) const noexcept {
         return precisionInfo.value != p;
     }
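
A short sketch of the new hasStorageType<T>() check added above (the header name ie_precision.hpp is inferred from context):

```cpp
#include <ie_precision.hpp>  // header name inferred from context
#include <cassert>
#include <cstdint>

int main() {
    using InferenceEngine::Precision;

    Precision fp32(Precision::FP32);
    assert(fp32.hasStorageType<float>());    // 32-bit float storage matches
    assert(!fp32.hasStorageType<double>());  // rejected: sizeof(double) != size()

    // FP16 accepts either signed or unsigned 16-bit storage (the CASE2 branch).
    Precision fp16(Precision::FP16);
    assert(fp16.hasStorageType<int16_t>());
    return 0;
}
```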
 
@@ -103,7 +126,7 @@ public:
         return *this;
     }
 
-    /** @brief Cust operator to a bool */
+    /** @brief Cast operator to a bool */
     explicit operator bool() const noexcept {
         return precisionInfo.value != UNSPECIFIED;
     }
@@ -113,7 +136,7 @@ public:
         return precisionInfo.value == UNSPECIFIED;
     }
 
-    /** @brief Cust operator to a ePrecision */
+    /** @brief Cast operator to a ePrecision */
     operator Precision::ePrecision  () const noexcept {
         return precisionInfo.value;
     }
@@ -162,7 +185,7 @@ public:
     template<Precision::ePrecision precision>
     static PrecisionInfo makePrecisionInfo(const char * name);
 
-    static bool areSameStrings(const char *l, const char *r) {
+    static bool areSameStrings(const char *l, const char *r) noexcept {
         if (l == r)
             return true;
 
@@ -208,7 +231,7 @@ struct PrecisionTrait<Precision::FP32> {
 
 template<>
 struct PrecisionTrait<Precision::FP16> {
-    using value_type = uint16_t;
+    using value_type = int16_t;
 };
 template<>
 struct PrecisionTrait<Precision::Q78> {
index bc5d7bd..1b984ff 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index e24f276..d4e4fbc 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 4b11ffb..5f71dc9 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
diff --git a/inference-engine/include/ie_unicode.hpp b/inference-engine/include/ie_unicode.hpp
new file mode 100644 (file)
index 0000000..f8231fa
--- /dev/null
@@ -0,0 +1,61 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief This is a header file with common inference engine definitions.
+ * @file ie_common.h
+ */
+#pragma once
+
+#include <vector>
+#include <memory>
+#include <string>
+#include <ostream>
+#include <algorithm>
+#include <cstdlib>
+#include <details/ie_exception.hpp>
+
+#ifdef UNICODE
+typedef wchar_t tchar;
+typedef std::wstring file_name_t;
+#else
+typedef char tchar;
+typedef std::string file_name_t;
+#endif
+
+namespace InferenceEngine {
+
+/**
+* @brief Conversion from possibly-wide character string to a single-byte chain.
+*/
+inline std::string fileNameToString(const file_name_t& str) {
+#ifdef UNICODE
+    size_t maxlen = (str.length() + 1) * sizeof(wchar_t) / sizeof(char);
+    std::vector<char> mbstr(maxlen);
+    mbstr[0] = 0;
+    std::wcstombs(&mbstr[0], str.c_str(), maxlen);
+    std::string res = std::string(&mbstr[0]);
+    return res;
+#else
+    return str;
+#endif
+}
+
+/**
+* @brief Conversion from single-byte character string to a possibly-wide one
+*/
+inline file_name_t stringToFileName(const std::string& str) {
+#ifdef UNICODE
+    size_t maxlen = str.length() + 1;
+    std::vector<wchar_t> wcstr(maxlen);
+    wcstr[0] = 0;
+    std::mbstowcs(&wcstr[0], str.c_str(), maxlen);
+    file_name_t res = file_name_t(&wcstr[0]);
+    return res;
+#else
+    return str;
+#endif
+}
+
+}  // namespace InferenceEngine
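
A round-trip sketch of the two helpers; on non-UNICODE builds both are identity functions, as the #else branches show:

```cpp
#include <ie_unicode.hpp>
#include <iostream>
#include <string>

int main() {
    // With UNICODE defined, the helpers convert through mbstowcs/wcstombs
    // using the current C locale; otherwise they pass the string through.
    file_name_t native = InferenceEngine::stringToFileName("model.xml");
    std::string back = InferenceEngine::fileNameToString(native);
    std::cout << back << std::endl;  // model.xml
    return 0;
}
```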
index bd2981e..2ba9f02 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 36ab093..d743115 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 08f043e..352d943 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 88be546..fdb70e2 100755 (executable)
@@ -22,7 +22,6 @@ function yes_or_no {
 # install dependencies
 if [[ -f /etc/lsb-release ]]; then
     # Ubuntu
-    system_ver=`cat /etc/lsb-release | grep -i "DISTRIB_RELEASE" | cut -d "=" -f2`
     sudo -E apt update
     sudo -E apt-get install -y \
             build-essential \
@@ -41,6 +40,7 @@ if [[ -f /etc/lsb-release ]]; then
             automake \
             libtool \
             autoconf \
+            libpng12-dev \
             libcairo2-dev \
             libpango1.0-dev \
             libglib2.0-dev \
@@ -52,11 +52,6 @@ if [[ -f /etc/lsb-release ]]; then
             gstreamer1.0-plugins-base \
             libusb-1.0-0-dev \
             libopenblas-dev
-    if [ $system_ver = "18.04" ]; then
-           sudo -E apt-get install -y libpng-dev
-    else
-           sudo -E apt-get install -y libpng12-dev
-    fi 
 else
     # CentOS 7.x
     sudo -E yum install -y centos-release-scl epel-release
index a3eb147..1f7bb9f 100644 (file)
@@ -1,6 +1,7 @@
 # Copyright (C) 2018 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
+
 cmake_minimum_required (VERSION 2.8)
 
 project(Samples)
@@ -11,7 +12,9 @@ if (CMAKE_BUILD_TYPE STREQUAL "")
 endif()
 
 if (NOT(BIN_FOLDER))
-    if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+    if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
+        set (ARCH armv7l)
+    elseif("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
         set (ARCH intel64)
     else()
         set (ARCH ia32)
@@ -47,42 +50,54 @@ else ()
     set (LIBRARY_OUTPUT_PATH ${LIBRARY_OUTPUT_DIRECTORY}/lib)
 endif()
 
-find_package(InferenceEngine 1.4 REQUIRED)
+# Use this flag if you need to throw a custom message in case the IE package is not found.
+if (IE_NOT_FOUND_MESSAGE)
+    find_package(InferenceEngine 1.5 QUIET)
+    if (NOT(InferenceEngine_FOUND))
+        message(FATAL_ERROR ${IE_NOT_FOUND_MESSAGE})
+    endif()
+else()
+    find_package(InferenceEngine 1.5 REQUIRED)
+endif()
 
 if (WIN32)
-    if(NOT "${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+    if (NOT "${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
         message(FATAL_ERROR "Only 64-bit supported on Windows")
     endif()
 
-    set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS _CRT_SECURE_NO_WARNINGS)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_SCL_SECURE_NO_WARNINGS -DNOMINMAX")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc") #no asynchronous structured exception handling
-    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /LARGEADDRESSAWARE")
-    if (THREADING STREQUAL "OMP")
-        find_package(OpenMP)
-        if (OPENMP_FOUND)
-            set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-            set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-        endif()
-    endif()
+    set_property (DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS _CRT_SECURE_NO_WARNINGS)
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_SCL_SECURE_NO_WARNINGS -DNOMINMAX")
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc") #no asynchronous structured exception handling
+    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /LARGEADDRESSAWARE")
 else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Werror=return-type ")
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Werror=return-type ")
     if (APPLE)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-command-line-argument")
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-command-line-argument")
     elseif(UNIX)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wuninitialized -Winit-self -Wmaybe-uninitialized")
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wuninitialized -Winit-self")
+        if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wmaybe-uninitialized")
+        endif()
     endif()
 endif()
 
+
 ####################################
 ## to use C++11
 set (CMAKE_CXX_STANDARD 11)
 set (CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
+set (CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
 ####################################
 
-set(GFLAGS_IS_SUBPROJECT TRUE)
-add_subdirectory(thirdparty/gflags)
+set (GFLAGS_IS_SUBPROJECT TRUE)
+set (HAVE_SYS_STAT_H 1)
+set (HAVE_INTTYPES_H 1)
+
+if (WIN32)
+    # add_compile_options("/WX")
+else()
+    add_compile_options("-Werror")
+endif()
 
 # Properties->C/C++->General->Additional Include Directories
 include_directories (
@@ -93,38 +108,32 @@ include_directories (
 )
 
 if (UNIX)
-    SET(LIB_DL dl)
-endif()
-
-# Find OpenCV library if exists
-find_package(OpenCV)
-if(OpenCV_FOUND)
-    include_directories(${OpenCV_INCLUDE_DIRS})
-    add_definitions(-DUSE_OPENCV)
-else()
-    set (BUILD_VALIDATION_APP OFF)
-    message(WARNING "No suitable OpenCV version detected, BUILD_VALIDATION_APP is set to OFF")
+    set (LIB_DL dl)
 endif()
 
+add_subdirectory(thirdparty/gflags)
 add_subdirectory(common/format_reader)
 
-####################################################
-# SAMPLES list
-####################################################
-add_subdirectory(classification_sample)
-add_subdirectory(classification_sample_async)
-add_subdirectory(hello_autoresize_classification)
-add_subdirectory(hello_classification)
-add_subdirectory(hello_request_classification)
-add_subdirectory(hello_shape_infer_ssd)
-add_subdirectory(object_detection_sample_ssd)
-add_subdirectory(style_transfer_sample)
-
-add_subdirectory(benchmark_app)
-add_subdirectory(calibration_tool)
-if (BUILD_VALIDATION_APP)
-    add_subdirectory(validation_app)
-else()
-    message(STATUS "Validation app build is switched off")
-endif()
-####################################################
+# collect all samples subdirectories
+file(GLOB subdirs RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *)
+# skip building of unnecessary subdirs
+list(REMOVE_ITEM subdirs archived common thirdparty)
+
+foreach (dir ${subdirs})
+    if (IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${dir})
+        # check if a subdirectory contains CMakeLists.txt. In this case we can build it.
+        file(GLOB is_sample_dir "${CMAKE_CURRENT_SOURCE_DIR}/${dir}/CMakeLists.txt")
+        if(is_sample_dir)
+            # check if specified sample/demo is found.
+            if (BUILD_SAMPLE_NAME)
+                list(FIND BUILD_SAMPLE_NAME ${dir} index)
+            endif()
+            if (index EQUAL -1)
+                message(STATUS "${dir} SKIPPED")
+            else()
+                # Include subdirectory to the project.
+                add_subdirectory(${dir})
+            endif()
+        endif()
+    endif()
+endforeach()
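
With the glob-based layout above, a single sample can still be selected at configure time via BUILD_SAMPLE_NAME; an illustrative invocation (paths are placeholders):

```sh
# Configure only the classification sample; every other subdirectory that
# contains a CMakeLists.txt is reported as SKIPPED by the loop above.
mkdir -p build && cd build
cmake -DBUILD_SAMPLE_NAME=classification_sample ..
```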
index f3d6061..87db730 100644 (file)
@@ -1,25 +1,11 @@
-# Copyright (c) 2018 Intel Corporation
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#      http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 cmake_minimum_required(VERSION 2.8)
 
 set (TARGET_NAME "benchmark_app")
 
-if( BUILD_SAMPLE_NAME AND NOT ${BUILD_SAMPLE_NAME} STREQUAL ${TARGET_NAME} )
-    message(STATUS "SAMPLE ${TARGET_NAME} SKIPPED")
-    return()
-endif()
-
 file (GLOB SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
         )
index e3a125c..ab0bbd7 100644 (file)
@@ -1,10 +1,13 @@
 # Benchmark Application Demo
 
-This topic demonstrates how to run the Benchmark Application demo, which performs inference using convolutional networks.
+This topic demonstrates how to use the Benchmark Application to estimate deep learning inference performance on supported devices. Performance can be measured for two inference modes: synchronous and asynchronous. 
+
+> **NOTE:** This topic describes usage of C++ implementation of the Benchmark Application. For the Python* implementation, refer to [Benchmark Application (Python*)](./samples/python_samples/benchmark_app/README.md)
+
 
 ## How It Works
 
-**NOTE:** To achieve benchmark results similar to the official published results, set CPU frequency to 2.9GHz and GPU frequency to 1GHz.
+**NOTE:** To achieve benchmark results similar to the official published results, set CPU frequency to 2.9GHz and GPU frequency to 1GHz.
 
 Upon the start-up, the application reads command-line parameters and loads a network and images to the Inference Engine plugin. The number of infer requests and execution approach depend on a mode defined with the `-api` command-line parameter.
 
@@ -56,15 +59,24 @@ Options:
 
 Running the application with the empty list of options yields the usage message given above and an error message.
 
-To run the demo, you can use one-layer public models or one-layer pre-trained and optimized models delivered with the package that support images as input.
+You can run the application with models that have a single four-dimensional input layer and accept images as input,
+for example, the public AlexNet and GoogLeNet models that can be downloaded
+with the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader).
+
+> **NOTE**: To run the application, the model should be first converted to the Inference Engine format (\*.xml + \*.bin) 
+using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
 
-For example, to do inference on an image using a trained network with multiple outputs on CPU, run the following command:
+For example, to perform inference on CPU in the synchronous mode and get estimated performance metrics for the AlexNet model, run the following command:
+
+```sh
+./benchmark_app -i <path_to_image>/inputImage.bmp -m <path_to_model>/alexnet_fp32.xml -d CPU -api sync
+```
 
+For the asynchronous mode:
 ```sh
-./benchmark_app -i <path_to_image>/inputImage.bmp -m <path_to_model>/multiple-output.xml -d CPU
+./benchmark_app -i <path_to_image>/inputImage.bmp -m <path_to_model>/alexnet_fp32.xml -d CPU -api async
 ```
 
-**NOTE**: Public models should be first converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/Model_Optimizer_Developer_Guide/Deep_Learning_Model_Optimizer_DevGuide.md).
 
 ## Demo Output
 
@@ -84,4 +96,6 @@ For asynchronous API, the application outputs only throughput:
 ```
 
 ## See Also
-* [Using Inference Engine Samples](./docs/Inference_Engine_Developer_Guide/Samples_Overview.md)
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
+* [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md)
+* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader)
index 3d92cb2..6ae2ffa 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -44,16 +43,24 @@ static const char target_device_message[] = "Specify a target device to infer on
 static const char iterations_count_message[] = "Optional. Number of iterations. " \
 "If not specified, the number of iterations is calculated depending on a device.";
 
-/// @brief message for iterations count
+/// @brief message for requests count
 static const char infer_requests_count_message[] = "Optional. Number of infer requests (default value is 2).";
 
+/// @brief message for #threads for CPU inference
+static const char infer_num_threads_message[] = "Optional. Number of threads to use for inference on the CPU "
+                                                "(including Hetero cases).";
+
 /// @brief message for user library argument
 static const char custom_cpu_library_message[] = "Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.";
 
 /// @brief message for clDNN custom kernels desc
 static const char custom_cldnn_message[] = "Required for GPU custom kernels. Absolute path to an .xml file with the kernels description.";
 
-static const char batch_size_message[] = "Batch size value. If not specified, the batch size value is determined from IR";
+static const char batch_size_message[] = "Optional. Batch size value. If not specified, the batch size value is determined from the IR";
+
+/// @brief message for CPU threads pinning option
+static const char infer_threads_pinning_message[] = "Optional. Enable (\"YES\" is default value) or disable (\"NO\") " \
+                                                  "CPU threads pinning for CPU-involved inference.";
 
 /// @brief Define flag for showing help message <br>
 DEFINE_bool(h, false, help_message);
@@ -91,11 +98,15 @@ DEFINE_int32(niter, 0, iterations_count_message);
 /// @brief Number of infer requests in parallel
 DEFINE_int32(nireq, 2, infer_requests_count_message);
 
+/// @brief Number of threads to use for inference on the CPU (also affects Hetero cases)
+DEFINE_int32(nthreads, 0, infer_num_threads_message);
+
 /// @brief Define parameter for batch size <br>
 /// Default is 0 (that means don't specify)
 DEFINE_int32(b, 0, batch_size_message);
 
-
+/// @brief Define parameter for CPU threads pinning
+DEFINE_string(pin, "YES", infer_threads_pinning_message);
 /**
 * @brief This function show a help message
 */
@@ -116,4 +127,7 @@ static void showUsage() {
     std::cout << "    -c \"<absolute_path>\"    " << custom_cldnn_message << std::endl;
     std::cout << "    -nireq \"<integer>\"      " << infer_requests_count_message << std::endl;
     std::cout << "    -b \"<integer>\"          " << batch_size_message << std::endl;
+    std::cout << "    Some CPU-specific performance options" << std::endl;
+    std::cout << "    -nthreads \"<integer>\"   " << infer_num_threads_message << std::endl;
+    std::cout << "    -pin \"YES\"/\"NO\"       " << infer_threads_pinning_message << std::endl;
 }
index 68b7bf7..134287b 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -177,7 +176,7 @@ int main(int argc, char *argv[]) {
             const InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32;
 
             /** Set the precision of output data provided by the user, should be called before load of the network to the plugin **/
-            outData->precision = outputPrecision;
+            outData->setPrecision(outputPrecision);
             InferenceEngine::TBlob<float>::Ptr output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
             output->allocate();
             outputBlobs[item.first] = output;
@@ -186,7 +185,17 @@ int main(int argc, char *argv[]) {
         // --------------------------- 5. Loading model to the plugin ------------------------------------------
 
         slog::info << "Loading model to the plugin" << slog::endl;
-        const std::map<std::string, std::string> networkConfig;
+        std::map<std::string, std::string> networkConfig;
+        if (FLAGS_d.find("CPU") != std::string::npos) {  // CPU supports few special performance-oriented keys
+            // limit threading for CPU portion of inference
+            if (FLAGS_nthreads != 0)
+                networkConfig[PluginConfigParams::KEY_CPU_THREADS_NUM] = std::to_string(FLAGS_nthreads);
+            // pin threads for CPU portion of inference
+            networkConfig[PluginConfigParams::KEY_CPU_BIND_THREAD] = FLAGS_pin;
+            // for pure CPU execution, prefer more throughput-oriented execution via streams
+            if (FLAGS_api == "async" && FLAGS_d == "CPU")
+                networkConfig[PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS] = std::to_string(FLAGS_nireq);
+        }
         InferenceEngine::ExecutableNetwork exeNetwork = plugin.LoadNetwork(cnnNetwork, networkConfig);
 
         // --------------------------- 6. Performance measurements stuff ------------------------------------------
@@ -218,6 +227,10 @@ int main(int argc, char *argv[]) {
                 slog::info << "Start inference synchronously (" << durationInNanoseconds * 0.000001 << " ms duration)" << slog::endl << slog::endl;
             }
 
+            // warm-up inference - excluded from the measured scope
+            inferRequest.Infer();
+            inferRequest.Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
+
             const auto startTime = Time::now();
             auto currentTime = Time::now();
 
index 7ec85ed..f69a6e7 100644 (file)
@@ -1,16 +1,7 @@
-# Copyright (c) 2018 Intel Corporation
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#      http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 cmake_minimum_required(VERSION 2.8)
 
 set (TARGET_NAME "calibration_tool")
@@ -36,14 +27,11 @@ file (GLOB MAIN_HEADERS
 source_group("src" FILES ${MAIN_SRC})
 source_group("include" FILES ${MAIN_HEADERS})
 
-# opencv include folders
-find_package(OpenCV QUIET COMPONENTS core imgproc highgui imgcodecs)
+# Find OpenCV components if exist
+find_package(OpenCV COMPONENTS imgcodecs QUIET)
 if(NOT(OpenCV_FOUND))
-    find_package(OpenCV QUIET COMPONENTS world)
-    if(NOT(OpenCV_FOUND))
-        message(WARNING "No suitable OpenCV version detected, " ${TARGET_NAME} " skipped")
-        return()
-    endif()
+    message(WARNING "OPENCV is disabled or not found, " ${TARGET_NAME} " skipped")
+    return()
 endif()
 
 # Properties->C/C++->General->Additional Include Directories
@@ -51,7 +39,6 @@ include_directories (${CMAKE_CURRENT_SOURCE_DIR}/../classification_sample/core
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_CURRENT_SOURCE_DIR}/../common/os/windows
         ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-        ${OpenCV_INCLUDE_DIRS}
         ${CMAKE_CURRENT_SOURCE_DIR}/../validation_app)
 
 link_directories(${LIB_FOLDER})
index b05b11b..f40c671 100644 (file)
@@ -22,6 +22,7 @@ Available options:
       -t "RawOD" to collect only statistics for Object Detection network and write statistics to IR. With this option, a model is not calibrated. For calibration and statisctics collection, use "-t OD" instead
     -i <path>                 Required. Path to a directory with validation images. For Classification models, the directory must contain folders named as labels with images inside or a .txt file with a list of images. For Object Detection models, the dataset must be in VOC format.
     -m <path>                 Required. Path to an .xml file with a trained model, including model name and extension.
+    -lbl <path>               Labels file path. The labels file contains names of the dataset classes
     -l <absolute_path>        Required for CPU custom layers. Absolute path to a shared library with the kernel implementations.
     -c <absolute_path>        Required for GPU custom kernels. Absolute path to an .xml file with the kernel descriptions.
     -d <device>               Target device to infer on: CPU (default), GPU, FPGA, or MYRIAD. The application looks for a suitable plugin for the specified device.
@@ -31,9 +32,10 @@ Available options:
     -ppWidth W                Preprocessing width (overrides -ppSize, used with ppType="ResizeCrop")
     -ppHeight H               Preprocessing height (overrides -ppSize, used with ppType="ResizeCrop")
     --dump                    Dump file names and inference results to a .csv file
-    -subset                  Number of pictures from the whole validation set tocreate the calibration dataset. Default value is 0, which stands forthe whole provided dataset
-    -output <output_IR>      Output name for calibrated model. Default is <original_model_name>_i8.xml|bin
-    -threshold               Threshold for a maximum accuracy drop of quantized model. Must be an integer number (percents) without a percent sign. Default value is 1, which stands for accepted accuracy drop in 1%
+    -subset                   Number of pictures from the whole validation set to create the calibration dataset. Default value is 0, which stands for the whole provided dataset
+    -output <output_IR>       Output name for calibrated model. Default is <original_model_name>_i8.xml|bin
+    -threshold                Threshold for a maximum accuracy drop of quantized model. Must be an integer number (percents) without a percent sign. Default value is 1, which stands for an accepted accuracy drop of 1%
+    -stream_output            Flag for printing progress as plain text. When used, the interactive progress bar is replaced with multiline output
 
     Classification-specific options:
       -Czb true               "Zero is a background" flag. Some networks are trained with a modified dataset where the class IDs  are enumerated from 1, but 0 is an undefined "background" class (which is never detected)
@@ -72,18 +74,21 @@ If you decide to use the subset of the given dataset, use the ImageNet-like form
 instead of "folder as classes" format. This brings a more accurate calibration as you are likely to get images
 representing different classes.
 
-For example, to calibrate the pretrained TensorFlow\* `inception_v4_tf.xml` classification model,
-run the following command:
+To run the sample you can use classification models that can be downloaded with the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or other image classification models.
+
+For example, to calibrate the trained Caffe\* `resnet-50` classification model, run the following command:
 
 ```bash
-./calibration_tool -t C -m inception_v4_tf.xml -i ILSVRC2012_val.txt -Czb false -ppType "ResizeCrop" -ppSize 342 -b 1 -d CPU -subset 2000
+./calibration_tool -t C -m resnet-50.xml -i ILSVRC2012_val.txt -Czb false -ppType "ResizeCrop" -ppSize 342 -b 1 -d CPU -subset 2000
 ```
 
+> **NOTE**: To run the tool for a model, the model should be first converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
 ## Calibrate Object Detection Model
 
 This topic demonstrates how to run the Calibration Tool on the Object Detection CNN on a set of images. Please
 review the list of Object Detection models used for validation of the Calibration Tool
-in the [8-bit Inference Introduction](./docs/Inference_Engine_Developer_Guide/Int8Inference.md).
+in the [8-bit Inference Introduction](./docs/IE_DG/Int8Inference.md).
 Any network that can be inferred with the Inference Engine and has the same input and output
 format as the SSD CNN should be supported as well.
 
@@ -100,4 +105,4 @@ Once you have prepared the dataset, you can calibrate the model on it by running
 
 ## See Also
 
-* [Using Inference Engine Samples](./docs/Inference_Engine_Developer_Guide/Samples_Overview.md)
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
index 21ce5f1..d4cf7fe 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -155,7 +154,13 @@ void Int8Calibrator::collectFP32Statistic() {
         _cBatch = networkReaderC.getNetwork().getBatchSize();
     } else {
         // Not zero means "use the specified value"
-        networkReaderC.getNetwork().setBatchSize(_cBatch);
+        auto input_shapes = networkReaderC.getNetwork().getInputShapes();
+        std::string input_name;
+        SizeVector input_shape;
+        std::tie(input_name, input_shape) = *input_shapes.begin();
+        input_shape[0] = _cBatch;
+        input_shapes[input_name] = input_shape;
+        networkReaderC.getNetwork().reshape(input_shapes);
     }
 
     /** Extract model name and load weights **/
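
The same setBatchSize-to-reshape replacement recurs in two more hunks below; in isolation the idiom looks like this (the CNNNetwork include path is an assumption):

```cpp
// Sketch of the reshape idiom used throughout this file: override the batch
// dimension of the first input instead of calling setBatchSize().
#include <cpp/ie_cnn_network.h>  // include path assumed for CNNNetwork
#include <cstddef>
#include <string>
#include <tuple>

void overrideBatch(InferenceEngine::CNNNetwork& network, std::size_t batch) {
    auto input_shapes = network.getInputShapes();
    std::string input_name;
    InferenceEngine::SizeVector input_shape;
    std::tie(input_name, input_shape) = *input_shapes.begin();
    input_shape[0] = batch;                  // dimension 0 is the batch
    input_shapes[input_name] = input_shape;
    network.reshape(input_shapes);           // propagate new shapes through the graph
}
```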
@@ -213,7 +218,13 @@ void Int8Calibrator::validateInt8Config(const InferenceEngine::NetworkStatsMap &
         _cBatch = networkReaderC.getNetwork().getBatchSize();
     } else {
         // Not zero means "use the specified value"
-        networkReaderC.getNetwork().setBatchSize(_cBatch);
+        auto input_shapes = networkReaderC.getNetwork().getInputShapes();
+        std::string input_name;
+        SizeVector input_shape;
+        std::tie(input_name, input_shape) = *input_shapes.begin();
+        input_shape[0] = _cBatch;
+        input_shapes[input_name] = input_shape;
+        networkReaderC.getNetwork().reshape(input_shapes);
     }
 
     /** Extract model name and load weights **/
@@ -237,90 +248,97 @@ void Int8Calibrator::validateInt8Config(const InferenceEngine::NetworkStatsMap &
     _inferRequestI8C = executable_network.CreateInferRequest();
 }
 
-CNNNetwork Int8Calibrator::createICNNNetworkForLayer(CNNLayer::Ptr layerToClone) {
+CNNNetwork Int8Calibrator::createICNNNetworkForLayer(CNNLayer::Ptr layerToClone, bool hasReLU) {
     CNNLayer::Ptr layerRelU = layerToClone->outData[0]->inputTo.begin()->second;
 
     InferenceEngine::CNNNetReader reader1;
-    std::string inpuitName = layerToClone->insData[0].lock()->name;
-    std::string model = "<net name=\"L\" version=\"2\" batch=\"1\"><layers> "        \
+    DataPtr inputData = layerToClone->insData[0].lock();
+    std::string inputName = inputData->name;
+
+    size_t inputBatch = inputData->getTensorDesc().getDims()[0];
+    size_t inputChannels = inputData->getTensorDesc().getDims()[1];
+    size_t inputHeight = inputData->getTensorDesc().getDims()[2];
+    size_t inputWidth = inputData->getTensorDesc().getDims()[3];
+
+    DataPtr outputData = layerToClone->outData[0];
+    size_t outputBatch = outputData->getTensorDesc().getDims()[0];
+    size_t outputChannels = outputData->getTensorDesc().getDims()[1];
+    size_t outputHeight = outputData->getTensorDesc().getDims()[2];
+    size_t outputWidth = outputData->getTensorDesc().getDims()[3];
+
+    ConvolutionLayer *pConvS = dynamic_cast<ConvolutionLayer *>(layerToClone.get());
+
+    std::string model = "<net name=\"L\" version=\"2\" batch=\"1\"><layers> "\
         "<layer name=\"" +
-        inpuitName +
+        inputName +
         "\" type=\"Input\" precision=\"FP32\" id=\"0\"> "\
         "<output>"\
         "<port id=\"0\">"\
-        "<dim>1</dim>"\
-        "<dim>3</dim>"\
-        "<dim>224</dim>"\
-        "<dim>224</dim>"\
+        "<dim>" + std::to_string(inputBatch) + "</dim>"\
+        "<dim>" + std::to_string(inputChannels) + "</dim>"\
+        "<dim>" + std::to_string(inputHeight) + "</dim>"\
+        "<dim>" + std::to_string(inputWidth) + "</dim>"\
         "</port>"\
         "</output>"\
-        "</layer>" \
+        "</layer>"\
         "<layer name=\"" +
         layerToClone->name +
-        "\" type=\"Convolution\" precision=\"FP32\" id=\"1\">" \
-        "<convolution_data stride-x=\"2\" stride-y=\"2\" pad-x=\"3\" pad-y=\"3\" kernel-x=\"7\" kernel-y=\"7\" output=\"64\" group=\"1\" />"\
+        "\" type=\"Convolution\" precision=\"FP32\" id=\"1\">"\
+        "<convolution_data stride-x=\"" + std::to_string(pConvS->_stride_x) +
+        "\" stride-y=\"" + std::to_string(pConvS->_stride_y) +
+        "\" pad-x=\"" + std::to_string(pConvS->_padding_x) +
+        "\" pad-y=\"" + std::to_string(pConvS->_padding_y) +
+        "\" kernel-x=\"" + std::to_string(pConvS->_kernel_x) +
+        "\" kernel-y=\"" + std::to_string(pConvS->_kernel_y) +
+        "\" dilation-x=\"" + std::to_string(pConvS->_dilation_x) +
+        "\" dilation-y=\"" + std::to_string(pConvS->_dilation_y) +
+        "\" output=\"" + std::to_string(pConvS->_out_depth) +
+        "\" group=\"" + std::to_string(pConvS->_group) + "\" />"\
         "<input>"\
         "<port id=\"1\">"\
-        "<dim>1</dim>"\
-        "<dim>3</dim>"\
-        "<dim>224</dim>"\
-        "<dim>224</dim>"\
+        "<dim>" + std::to_string(inputBatch) + "</dim>"\
+        "<dim>" + std::to_string(inputChannels) + "</dim>"\
+        "<dim>" + std::to_string(inputHeight) + "</dim>"\
+        "<dim>" + std::to_string(inputWidth) + "</dim>"\
         "</port>"\
         "</input>"\
         "<output>"\
         "<port id=\"2\">"\
-        "<dim>1</dim>"\
-        "<dim>64</dim>"\
-        "<dim>112</dim>"\
-        "<dim>112</dim>"\
-        "</port>"\
-        "</output>"\
-        "</layer>"\
-        "<layer name=\"" +
-        layerRelU->name +
-        "\" type=\"ReLU\" precision=\"FP32\" id=\"2\">"\
-        "<input>"
-        "<port id=\"3\">"\
-        "<dim>1</dim>"\
-        "<dim>64</dim>"\
-        "<dim>112</dim>"\
-        "<dim>112</dim>"\
-        "</port>"\
-        "</input>"\
-        "<output>"\
-        "<port id=\"4\">"\
-        "<dim>1</dim>"\
-        "<dim>64</dim>"\
-        "<dim>112</dim>"\
-        "<dim>112</dim>"\
-        "</port>"\
-        "</output>"\
-        "</layer>"\
-        "<layer name=\"" +
-        layerToClone->name +
-        "_\" type=\"ScaleShift\" precision=\"FP32\" id=\"3\">"\
-        "<input>"
-        "<port id=\"5\">"\
-        "<dim>1</dim>"\
-        "<dim>64</dim>"\
-        "<dim>112</dim>"\
-        "<dim>112</dim>"\
-        "</port>"\
-        "</input>"\
-        "<output>"\
-        "<port id=\"6\">"\
-        "<dim>1</dim>"\
-        "<dim>64</dim>"\
-        "<dim>112</dim>"\
-        "<dim>112</dim>"\
+        "<dim>" + std::to_string(outputBatch) + "</dim>"\
+        "<dim>" + std::to_string(outputChannels) + "</dim>"\
+        "<dim>" + std::to_string(outputHeight) + "</dim>"\
+        "<dim>" + std::to_string(outputWidth) + "</dim>"\
         "</port>"\
         "</output>"\
-        "</layer>"\
-        "</layers> <edges>"\
-        "<edge from-layer=\"0\" from-port=\"0\" to-layer=\"1\" to-port=\"1\"/> "\
-        "<edge from-layer=\"1\" from-port=\"2\" to-layer=\"2\" to-port=\"3\"/> "\
-        "<edge from-layer=\"2\" from-port=\"4\" to-layer=\"3\" to-port=\"5\"/> "\
-        "</edges></net>";
+        "</layer>";
+    if (hasReLU) {
+        model += "<layer name=\"" +
+            layerRelU->name +
+            "\" type=\"ReLU\" precision=\"FP32\" id=\"2\">"\
+            "<input>"
+            "<port id=\"3\">"\
+            "<dim>" + std::to_string(outputBatch) + "</dim>"\
+            "<dim>" + std::to_string(outputChannels) + "</dim>"\
+            "<dim>" + std::to_string(outputHeight) + "</dim>"\
+            "<dim>" + std::to_string(outputWidth) + "</dim>"\
+            "</port>"\
+            "</input>"\
+            "<output>"\
+            "<port id=\"4\">"\
+            "<dim>" + std::to_string(outputBatch) + "</dim>"\
+            "<dim>" + std::to_string(outputChannels) + "</dim>"\
+            "<dim>" + std::to_string(outputHeight) + "</dim>"\
+            "<dim>" + std::to_string(outputWidth) + "</dim>"\
+            "</port>"\
+            "</output>"\
+            "</layer>";
+    }
+    model += "</layers> <edges>"\
+        "<edge from-layer=\"0\" from-port=\"0\" to-layer=\"1\" to-port=\"1\"/> ";
+    if (hasReLU) {
+        model += "<edge from-layer=\"1\" from-port=\"2\" to-layer=\"2\" to-port=\"3\"/> ";
+    }
+    model += "</edges></net>";
 
     reader1.ReadNetwork(model.c_str(), model.length());
     ICNNNetwork &n = reader1.getNetwork();
@@ -331,107 +349,11 @@ CNNNetwork Int8Calibrator::createICNNNetworkForLayer(CNNLayer::Ptr layerToClone)
 
     CNNLayerPtr convLayer;
     n.getLayerByName(layerToClone->name.c_str(), convLayer, nullptr);
-    ConvolutionLayer *pConvS = dynamic_cast<ConvolutionLayer *>(layerToClone.get());
     ConvolutionLayer *pConvT = dynamic_cast<ConvolutionLayer *>(convLayer.get());
-    pConvT->_kernel_x = pConvS->_kernel_x;
-    pConvT->_kernel_y = pConvS->_kernel_y;
-    pConvT->_stride_x = pConvS->_stride_x;
-    pConvT->_stride_y = pConvS->_stride_y;
-    pConvT->_out_depth = pConvS->_out_depth;
-    pConvT->_padding_x = pConvS->_padding_x;
-    pConvT->_padding_y = pConvS->_padding_y;
-    pConvT->_dilation_x = pConvS->_dilation_x;
-    pConvT->_dilation_y = pConvS->_dilation_y;
-    pConvT->_group = pConvS->_group;
     pConvT->_weights = pConvS->_weights;
     pConvT->_biases = pConvS->_biases;
     pConvT->blobs = pConvS->blobs;
 
-    std::shared_ptr<Data> cur = layerToClone->insData[0].lock();
-    if (cur == nullptr) {
-        THROW_IE_EXCEPTION << "[Samples] shared ptr layerToClone->insData[0].lock() return nullptr";
-    }
-    DataPtr inputEdge = std::make_shared<Data>(*cur.get());
-
-    inputEdge->getInputTo().clear();
-    inputEdge->name = inpuitName;
-    inputEdge->creatorLayer = inputLayer;
-    inputEdge->inputTo[layerToClone->name] = convLayer;
-    inputEdge->getInputTo().clear();
-    inputEdge->inputTo[layerToClone->name] = convLayer;
-
-    inputs.begin()->second->setInputData(inputEdge);
-
-    convLayer->insData.clear();
-    convLayer->insData.push_back(inputEdge);
-
-    inputLayer->outData.clear();
-    inputLayer->outData.push_back(inputEdge);
-
-    DataPtr convEdge = std::make_shared<Data>(*layerToClone->outData[0].get());
-    convEdge->getInputTo().clear();
-    convEdge->creatorLayer = convLayer;
-    convEdge->name = convLayer->name;
-    convLayer->outData.clear();
-    convLayer->outData.push_back(convEdge);
-
-    CNNLayerPtr reluLayer;
-    n.getLayerByName(layerRelU->name.c_str(), reluLayer, nullptr);
-    DataPtr reluEdge = std::make_shared<Data>(*layerRelU->outData[0].get());
-    reluEdge->getInputTo().clear();
-    reluEdge->creatorLayer = reluLayer;
-    reluEdge->name = reluLayer->name;
-    reluLayer->insData.clear();
-    reluLayer->insData.push_back(convEdge);
-    reluLayer->outData.clear();
-    reluLayer->outData.push_back(reluEdge);
-
-    convEdge->inputTo[reluLayer->name] = reluLayer;
-
-    CNNLayerPtr ssLayer;
-    std::string ssLayerName = convLayer->name + "_";
-    n.getLayerByName(ssLayerName.c_str(), ssLayer, nullptr);
-    DataPtr ssEdge = std::make_shared<Data>(*layerRelU->outData[0].get());
-    ssEdge->getInputTo().clear();
-    ssEdge->creatorLayer = ssLayer;
-    ssEdge->name = ssLayer->name;
-    ssLayer->insData.clear();
-    ssLayer->insData.push_back(reluEdge);
-    ssLayer->outData.clear();
-    ssLayer->outData.push_back(ssEdge);
-
-    reluEdge->inputTo[ssLayer->name] = ssLayer;
-
-    n.addOutput(ssLayer->name);
-
-    // filling weights and biases
-    size_t channels = ssEdge->getTensorDesc().getDims()[1];
-    Blob::Ptr weights = nullptr;
-    SizeVector wdims;
-    wdims.push_back(channels);
-    weights = make_shared_blob<float, const SizeVector>(Precision::FP32, Layout::C, wdims);
-    weights->allocate();
-    float *dataw = weights->buffer().as<float *>();
-    for (size_t i = 0; i < channels; i++) {
-        dataw[i] = 1.0f;
-    }
-    ssLayer->blobs["weights"] = weights;
-
-    Blob::Ptr biases = nullptr;
-    SizeVector bdims;
-    bdims.push_back(channels);
-    biases = make_shared_blob<float, const SizeVector>(Precision::FP32, Layout::C, bdims);
-    biases->allocate();
-    float *datab = biases->buffer().as<float *>();
-    for (size_t i = 0; i < channels; i++) {
-        datab[i] = 0.0f;
-    }
-    ssLayer->blobs["biases"] = biases;
-
-    auto wss = dynamic_cast<WeightableLayer*>(ssLayer.get());
-    wss->_weights = weights;
-    wss->_biases = biases;
-
     return reader1.getNetwork();
 }
 
@@ -442,7 +364,13 @@ void Int8Calibrator::collectByLayerStatistic(const InferenceEngine::NetworkStats
     networkReaderC.ReadNetwork(_modelFileNameI8C);
     if (!networkReaderC.isParseSuccess()) THROW_IE_EXCEPTION << "Cannot load model: parsing failed";
     if (_cBatch != 0) {
-        networkReaderC.getNetwork().setBatchSize(_cBatch);
+        auto input_shapes = networkReaderC.getNetwork().getInputShapes();
+        std::string input_name;
+        SizeVector input_shape;
+        std::tie(input_name, input_shape) = *input_shapes.begin();
+        input_shape[0] = _cBatch;
+        input_shapes[input_name] = input_shape;
+        networkReaderC.getNetwork().reshape(input_shapes);
     }
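This reshape-based batch override replaces the old setBatchSize call and recurs later in the file. A minimal sketch of the pattern, assuming `network` is a parsed CNNNetwork and `newBatch > 0`:

```cpp
// Override the batch dimension on every input, then propagate the new shapes.
auto shapes = network.getInputShapes();  // std::map<std::string, SizeVector>
for (auto &item : shapes) {
    item.second[0] = newBatch;           // dimension 0 is the batch (N)
}
network.reshape(shapes);
```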
 
     /** Extract model name and load weights **/
@@ -474,49 +402,51 @@ void Int8Calibrator::collectByLayerStatistic(const InferenceEngine::NetworkStats
 
         // if only one output from conv and if it is an output to relu
         bool quattization = false;
-        if (layerToClone->outData.size() == 1 && layerToClone->outData[0]->inputTo.size() == 1) {
+        if (layerToClone->outData.size() == 1
+            && layerToClone->outData[0]->inputTo.size() == 1
+            && CaselessEq<std::string>()(layerToClone->outData[0]->inputTo.begin()->second->name, "relu")) {
             layerRelU = layerToClone->outData[0]->inputTo.begin()->second;
-            if (layerRelU->type == "ReLU") {
-                quattization = true;
-            }
         }
 
-        if (quattization) {
-            CNNNetwork n = createICNNNetworkForLayer(layerToClone);
-            if (_cBatch != 0) {
-                n.setBatchSize(_cBatch);
-            }
+        CNNNetwork n = createICNNNetworkForLayer(layerToClone, layerRelU ? true : false);
+        if (_cBatch != 0) {
+            auto input_shapes = n.getInputShapes();
+            std::string input_name;
+            SizeVector input_shape;
+            std::tie(input_name, input_shape) = *input_shapes.begin();
+            input_shape[0] = _cBatch;
+            input_shapes[input_name] = input_shape;
+            n.reshape(input_shapes);
+        }
 
-            // Initialize statistic
-            ICNNNetworkStats *pstats = nullptr;
-            ICNNNetwork &in = n;
-            StatusCode s = in.getStats(&pstats, nullptr);
-            if (s == StatusCode::OK && pstats) {
-                pstats->setNodesStats(stat);
-            }
+        // Initialize statistic
+        ICNNNetworkStats *pstats = nullptr;
+        ICNNNetwork &in = n;
+        StatusCode s = in.getStats(&pstats, nullptr);
+        if (s == StatusCode::OK && pstats) {
+            pstats->setNodesStats(stat);
+        }
 
-            InferenceEngine::InputsDataMap inputs = n.getInputsInfo();
-            DataPtr q = inputs.begin()->second->getInputData();
+        InferenceEngine::InputsDataMap inputs = n.getInputsInfo();
+        DataPtr q = inputs.begin()->second->getInputData();
 
-            ExecutableNetwork enetwork = _pluginI8C.LoadNetwork(n, { { CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), CONFIG_VALUE(YES) } });
-            _singleLayerNetworks.push_back(enetwork);
-            InferenceEngine::InferRequest request = enetwork.CreateInferRequest();
-            std::string inpuitName = layerToClone->insData[0].lock()->name;
-            request.SetBlob(inpuitName, _inferRequestI8C.GetBlob(inpuitName));
-            _singleLayerRequests[layerToClone->name] = { request, layerRelU->name, layerToClone->name };
-        }
+        ExecutableNetwork enetwork = _pluginI8C.LoadNetwork(n, { { CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), CONFIG_VALUE(YES) } });
+        _singleLayerNetworks.push_back(enetwork);
+        InferenceEngine::InferRequest request = enetwork.CreateInferRequest();
+        std::string inputName = layerToClone->insData[0].lock()->name;
+        request.SetBlob(inputName, _inferRequestI8C.GetBlob(inputName));
+        _singleLayerRequests[layerToClone->name] = { request, layerRelU ? layerRelU->name : layerToClone->name, layerToClone->name };
     }
 }
 
 
-void Int8Calibrator::collectCalibrationStatistic() {
+void Int8Calibrator::collectCalibrationStatistic(size_t pics) {
     if (_collectByLayer) {
         std::map<std::string, SingleLayerData>::iterator it = _singleLayerRequests.begin();
         while (it != _singleLayerRequests.end()) {
             it->second._request.Infer();
             Blob::Ptr expected = _inferRequestI8C.GetBlob(it->second._outputName);
-            std::string i8Out = it->second._outputI8Name + "_";
-            Blob::Ptr result = it->second._request.GetBlob(i8Out.c_str());
+            Blob::Ptr result = it->second._request.GetBlob(it->second._outputName);
             float diff = compare_NRMSD(result, expected);
             it->second._int8Accuracy.push_back(diff);
             it++;
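The loop above scores each single-layer INT8 network against the FP32 reference through compare_NRMSD. That helper is not part of this diff; the sketch below is only an assumption of its shape, a root-mean-square deviation normalized by the reference dynamic range:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>

// Hypothetical NRMSD between an INT8 result and its FP32 reference (n > 0).
float nrmsd(const float *result, const float *reference, std::size_t n) {
    float sum = 0.f, mn = reference[0], mx = reference[0];
    for (std::size_t i = 0; i < n; ++i) {
        const float d = result[i] - reference[i];
        sum += d * d;
        mn = std::min(mn, reference[i]);
        mx = std::max(mx, reference[i]);
    }
    const float range = (mx > mn) ? (mx - mn) : 1.f;  // guard constant tensors
    return std::sqrt(sum / n) / range;
}
```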
@@ -533,11 +463,12 @@ void Int8Calibrator::collectCalibrationStatistic() {
 
             size_t N, C, statCount;
             if (outBlob->dims().size() == 4 && outBlob->layout() == Layout::NCHW) {
-                N = outBlob->dims()[3];
+                // TODO(amalyshe) change to using the tensor descriptor
+                N = pics;
                 C = outBlob->dims()[2];
                 statCount = C;
             } else if (outBlob->dims().size() == 2 && outBlob->layout() == Layout::NC) {
-                N = outBlob->dims()[1];
+                N = pics;
                 C = outBlob->dims()[0];
                 statCount = 1;
             } else {
@@ -627,13 +558,23 @@ ClassificationCalibrator::ClassificationCalibrator(int nPictures, const std::str
     _cBatch = flags_b;
 }
 
-shared_ptr<Processor::InferenceMetrics> ClassificationCalibrator::Process() {
+shared_ptr<Processor::InferenceMetrics> ClassificationCalibrator::Process(bool stream_output) {
     inferRequest = _inferRequestI8C;
     int top1Result = 0, total = 0;
 
     ClassificationSetGenerator generator;
 
+    try {
+        generator.readLabels(labelFileName);
+    } catch (InferenceEngine::details::InferenceEngineException& ex) {
+        slog::warn << "Can't read labels file " << labelFileName << slog::endl;
+    }
     auto validationMap = generator.getValidationMap(imagesPath);
+
+    if (validationMap.size() == 0) {
+        THROW_IE_EXCEPTION << "The validation dataset in " << imagesPath << " is empty. Check the dataset file or folder and the labels file";
+    }
+
     ImageDecoder decoder;
 
     // ----------------------------Do inference-------------------------------------------------------------
@@ -646,7 +587,7 @@ shared_ptr<Processor::InferenceMetrics> ClassificationCalibrator::Process() {
     }
 
 
-    ConsoleProgress progress(_nPictures);
+    ConsoleProgress progress(_nPictures, stream_output);
 
     CalibrationMetrics im;
 
@@ -675,12 +616,11 @@ shared_ptr<Processor::InferenceMetrics> ClassificationCalibrator::Process() {
         ipics += batch;
 
         Infer(progress, filesWatched, im);
-        collectCalibrationStatistic();
+        collectCalibrationStatistic(b);
 
         std::vector<unsigned> results;
         auto firstOutputData = firstOutputBlob->buffer().as<PrecisionTrait<Precision::FP32>::value_type *>();
         InferenceEngine::TopResults(1, *firstOutputBlob, results);
-
         for (int i = 0; i < b; i++) {
             int expc = expected[i];
             if (zeroBackground) expc++;
@@ -712,15 +652,17 @@ SSDObjectDetectionCalibrator::SSDObjectDetectionCalibrator(int nPictures, const
     _modelFileNameI8C = modelFileName;
     _pluginI8C = plugin;
     _nPictures = nPictures;
+    _cBatch = flags_b;
 }
 
-shared_ptr<Processor::InferenceMetrics> SSDObjectDetectionCalibrator::Process() {
+shared_ptr<Processor::InferenceMetrics> SSDObjectDetectionCalibrator::Process(bool stream_output) {
     inferRequest = _inferRequestI8C;
 
     // Parsing PASCAL VOC2012 format
     VOCAnnotationParser vocAnnParser;
     VOCAnnotationCollector annCollector(annotationsPath);
 
+
     if (annCollector.annotations().size() == 0) {
         ObjectDetectionInferenceMetrics emptyIM(this->threshold);
 
@@ -762,7 +704,7 @@ shared_ptr<Processor::InferenceMetrics> SSDObjectDetectionCalibrator::Process()
         _nPictures = annCollector.annotations().size();
     }
 
-    ConsoleProgress progress(_nPictures);
+    ConsoleProgress progress(_nPictures, stream_output);
 
     ObjectDetectionInferenceMetrics im(threshold);
 
@@ -807,23 +749,21 @@ shared_ptr<Processor::InferenceMetrics> SSDObjectDetectionCalibrator::Process()
             ipics++;
         }
 
-        if (files.size() == batch) {
-            InferenceEngine::StatusCode sts;
-            InferenceEngine::ResponseDesc dsc;
+        InferenceEngine::StatusCode sts;
+        InferenceEngine::ResponseDesc dsc;
 
-            // Infer model
-            Infer(progress, filesWatched, im);
-            collectCalibrationStatistic();
+        // Infer model
+        Infer(progress, filesWatched, im);
+        collectCalibrationStatistic(b);
 
-            // Processing the inference result
-            std::map<std::string, std::list<DetectedObject>> detectedObjects = processResult(files);
+        // Processing the inference result
+        std::map<std::string, std::list<DetectedObject>> detectedObjects = processResult(files);
 
-            // Calculating similarity
-            //
-            for (int b = 0; b < files.size(); b++) {
-                ImageDescription result(detectedObjects[files[b]]);
-                im.apc.consumeImage(result, scaledDesiredForFiles.at(files[b]));
-            }
+        // Calculating similarity
+        //
+        for (int b = 0; b < files.size(); b++) {
+            ImageDescription result(detectedObjects[files[b]]);
+            im.apc.consumeImage(result, scaledDesiredForFiles.at(files[b]));
         }
     }
     progress.finish();
index f533e33..05e7c1e 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -90,7 +89,7 @@ protected:
      * This function should be called from the final calibrator after each Infer for each picture.
      * It calculates the per-layer accuracy drop and also collects activation value statistics.
      */
-    void collectCalibrationStatistic();
+    void collectCalibrationStatistic(size_t pics);
 
     /**
      * This function should be called from the calibration class after Infer of all pictures
@@ -127,7 +126,8 @@ private:
      * Since the Inference Engine API is mostly directed at loading a network from IR, we need to create
      * such an IR first, read it through a stream, and modify the network to match the required parameters
      */
-    InferenceEngine::CNNNetwork createICNNNetworkForLayer(InferenceEngine::CNNLayer::Ptr layerToClone);
+    InferenceEngine::CNNNetwork createICNNNetworkForLayer(InferenceEngine::CNNLayer::Ptr layerToClone,
+                                                          bool hasReLU);
 
     std::map<std::string, float> _layersAccuracyDrop;
     std::vector<InferenceEngine::ExecutableNetwork> _singleLayerNetworks;
@@ -157,7 +157,7 @@ public:
                               InferenceEngine::InferencePlugin plugin, CsvDumper &dumper, const std::string &flags_l,
                               PreprocessingOptions preprocessingOptions, bool zeroBackground);
 
-    shared_ptr<InferenceMetrics> Process()override;
+    shared_ptr<InferenceMetrics> Process(bool stream_output = false) override;
 };
 
 
@@ -174,5 +174,5 @@ public:
                                  InferencePlugin plugin, CsvDumper &dumper,
                                  const std::string &flags_a, const std::string &classes_list_file);
 
-    shared_ptr<InferenceMetrics> Process()override;
+    shared_ptr<InferenceMetrics> Process(bool stream_output = false) override;
 };
index fd95a29..cd01014 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -30,7 +29,6 @@
 #include "calibrator_processors.h"
 #include "SSDObjectDetectionProcessor.hpp"
 #include "YOLOObjectDetectionProcessor.hpp"
-#include "network_serializer.h"
 #include "ie_icnn_network_stats.hpp"
 #include "details/caseless.hpp"
 
@@ -93,12 +91,17 @@ static const char custom_cldnn_message[] = "Required for GPU custom kernels. "
 /// @brief Message for user library argument
 static const char custom_cpu_library_message[] = "Required for CPU custom layers. "
                                                  "Absolute path to a shared library with the kernel implementations.";
+/// @brief Message for labels file
+static const char labels_file_message[] = "Labels file path. The labels file contains names of the dataset classes";
 
 static const char zero_background_message[] = "\"Zero is a background\" flag. Some networks are trained with a modified"
                                               " dataset where the class IDs "
                                               " are enumerated from 1, but 0 is an undefined \"background\" class"
                                               " (which is never detected)";
 
+static const char stream_output_message[] = "Flag for printing progress as plain text. When used, the interactive progress"
+                                            " bar is replaced with multiline output";
+
 /// @brief Network type options and their descriptions
 static const char* types_descriptions[][2] = {
     { "C", "calibrate Classification network and write the calibrated network to IR" },
@@ -177,10 +180,15 @@ DEFINE_string(l, "", custom_cpu_library_message);
 /// @brief Define parameter for accuracy drop threshold
 DEFINE_double(threshold, 1.0f, accuracy_threshold_message);
 
+/// @brief Define flag for printing progress as plain text
+DEFINE_bool(stream_output, false, stream_output_message);
+
 DEFINE_int32(subset, 0, number_of_pictures_message);
 
 DEFINE_string(output, "", output_model_name);
 
+DEFINE_string(lbl, "", labels_file_message);
+
 /**
  * @brief This function shows a help message
  */
@@ -196,6 +204,7 @@ static void showUsage() {
     }
     std::cout << "    -i <path>                 " << image_message << std::endl;
     std::cout << "    -m <path>                 " << model_message << std::endl;
+    std::cout << "    -lbl <path>               " << labels_file_message << std::endl;
     std::cout << "    -l <absolute_path>        " << custom_cpu_library_message << std::endl;
     std::cout << "    -c <absolute_path>        " << custom_cldnn_message << std::endl;
     std::cout << "    -d <device>               " << target_device_message << std::endl;
@@ -219,6 +228,9 @@ static void showUsage() {
     std::cout << "      -ODa <path>             " << obj_detection_annotations_message << std::endl;
     std::cout << "      -ODc <file>             " << obj_detection_classes_message << std::endl;
     std::cout << "      -ODsubdir <name>        " << obj_detection_subdir_message << std::endl << std::endl;
+
+    std::cout << std::endl;
+    std::cout << "    -stream_output                   " << stream_output_message << std::endl;
 }
 
 enum NetworkType {
@@ -270,8 +282,7 @@ void SaveCalibratedIR(const std::string &originalName,
     }
 
     slog::info << "Write calibrated network to " << outModelName << ".(xml|bin) IR file\n";
-    CNNNetworkSerializer serializer;
-    serializer.Serialize(outModelName + ".xml", outModelName + ".bin", networkReader.getNetwork());
+    networkReader.getNetwork().serialize(outModelName + ".xml", outModelName + ".bin");
 }
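The sample-local CNNNetworkSerializer is dropped here in favor of the serialize method exposed by the network object itself. A minimal sketch of the replacement call:

```cpp
// Write the calibrated network back to an IR pair (.xml topology + .bin weights).
InferenceEngine::CNNNetwork network = networkReader.getNetwork();
network.serialize(outModelName + ".xml", outModelName + ".bin");
```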
 
 /**
@@ -321,7 +332,6 @@ int main(int argc, char *argv[]) {
             // Checking required OD-specific options
             if (FLAGS_ODa.empty()) ee << UserException(11, "Annotations folder is not specified for object detection (missing -a option)");
             if (FLAGS_ODc.empty()) ee << UserException(12, "Classes file is not specified (missing -c option)");
-            if (FLAGS_b > 0) ee << UserException(13, "Batch option other than 0 is not supported for Object Detection networks");
         }
 
         if (!ee.empty()) throw ee;
@@ -384,7 +394,7 @@ int main(int argc, char *argv[]) {
         if (netType == Classification || netType == RawC) {
             processor = std::shared_ptr<Processor>(
                 new ClassificationCalibrator(FLAGS_subset, FLAGS_m, FLAGS_d, FLAGS_i, FLAGS_b,
-                                                plugin, dumper, FLAGS_l, preprocessingOptions, FLAGS_Czb));
+                                                plugin, dumper, FLAGS_lbl, preprocessingOptions, FLAGS_Czb));
         } else if (netType == ObjDetection || netType == RawOD) {
             if (FLAGS_ODkind == "SSD") {
                 processor = std::shared_ptr<Processor>(
@@ -411,7 +421,7 @@ int main(int argc, char *argv[]) {
             slog::info << "Collecting activation statistics" << slog::endl;
         }
         calibrator->collectFP32Statistic();
-        shared_ptr<Processor::InferenceMetrics> pIMFP32 = processor->Process();
+        shared_ptr<Processor::InferenceMetrics> pIMFP32 = processor->Process(FLAGS_stream_output);
         const CalibrationMetrics* mFP32 = dynamic_cast<const CalibrationMetrics*>(pIMFP32.get());
         std:: cout << "  FP32 Accuracy: " << OUTPUT_FLOATING(100.0 * mFP32->AccuracyResult) << "% " << std::endl;
 
@@ -427,7 +437,7 @@ int main(int argc, char *argv[]) {
                 std::cout << "Validate int8 accuracy, threshold for activation statistics = " << threshold << std::endl;
                 InferenceEngine::NetworkStatsMap tmpStatMap = calibrator->getStatistic(threshold);
                 calibrator->validateInt8Config(tmpStatMap, {});
-                shared_ptr<Processor::InferenceMetrics> pIM_I8 = processor->Process();
+                shared_ptr<Processor::InferenceMetrics> pIM_I8 = processor->Process(FLAGS_stream_output);
                 const CalibrationMetrics *mI8 = dynamic_cast<const CalibrationMetrics *>(pIM_I8.get());
                 if (maximalAccuracy < mI8->AccuracyResult) {
                     maximalAccuracy = mI8->AccuracyResult;
@@ -446,7 +456,7 @@ int main(int argc, char *argv[]) {
                 slog::info << "Collecting intermediate per-layer accuracy drop" << slog::endl;
                 // getting statistic on accuracy drop by layers
                 calibrator->collectByLayerStatistic(statMap);
-                processor->Process();
+                processor->Process(FLAGS_stream_output);
                 // starting to reduce number of layers being converted to Int8
                 std::map<std::string, float>  layersAccuracyDrop = calibrator->layersAccuracyDrop();
 
@@ -463,7 +473,7 @@ int main(int argc, char *argv[]) {
                     slog::info << "Returning of '" << it->second << "' to FP32 precision, start validation\n";
                     layersToInt8[it->second] = false;
                     calibrator->validateInt8Config(statMap, layersToInt8);
-                    pIM_I8 = processor->Process();
+                    pIM_I8 = processor->Process(FLAGS_stream_output);
                     mI8 = dynamic_cast<const CalibrationMetrics *>(pIM_I8.get());
                     maximalAccuracy = mI8->AccuracyResult;
                     if ((mFP32->AccuracyResult - maximalAccuracy) > (FLAGS_threshold / 100)) {
diff --git a/inference-engine/samples/calibration_tool/network_serializer.h b/inference-engine/samples/calibration_tool/network_serializer.h
deleted file mode 100644 (file)
index d0b91ae..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright (C) 2018 Intel Corporation
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include "inference_engine.hpp"
-#include <pugixml/pugixml.hpp>
-#include <string>
-
-/** Class for serialization of model been presented as ICNNNetwork to the disk
- */
-class CNNNetworkSerializer {
-public:
-    void Serialize(const std::string &xmlPath, const std::string &binPath,
-                   InferenceEngine::ICNNNetwork& network);
-
-protected:
-    void updateStdLayerParams(InferenceEngine::CNNLayer::Ptr layer);
-};
index b412fdc..4c80190 100644 (file)
@@ -1,25 +1,11 @@
-# Copyright (c) 2018 Intel Corporation
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#      http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 cmake_minimum_required(VERSION 2.8)
 
 set (TARGET_NAME "classification_sample")
 
-if( BUILD_SAMPLE_NAME AND NOT ${BUILD_SAMPLE_NAME} STREQUAL ${TARGET_NAME} )
-    message(STATUS "SAMPLE ${TARGET_NAME} SKIPPED")
-    return()
-endif()
-
 file (GLOB SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
         )
index 83f00de..26e943b 100644 (file)
@@ -1,7 +1,7 @@
 # Image Classification Sample
 
-This topic demonstrates how to build and run the Image Classification sample application, which doe
-inference using image classification networks like AlexNet and GoogLeNet.
+This topic demonstrates how to run the Image Classification sample application, which performs
+inference using image classification networks such as AlexNet and GoogLeNet.
 
 ## Running
 
@@ -15,36 +15,30 @@ InferenceEngine:
 classification_sample [OPTION]
 Options:
 
-    -h                      
-                            Print a usage message.
-    -i "<path1>" "<path2>"
-                            Required. Path to a folder with images or path to an image files: a .ubyte file for LeNet
-                            and a .bmp file for the other networks.
-    -m "<path>"             
-                            Required. Path to an .xml file with a trained model.
-        -l "<absolute_path>"
-                            Optional. Absolute path to library with MKL-DNN (CPU) custom layers (*.so).
+    -h                        Print a usage message.
+    -i "<path1>" "<path2>"    Required. Path to a folder with images or path to an image files: a .ubyte file for LeNet
+                              and a .bmp file for the other networks.
+    -m "<path>"               Required. Path to an .xml file with a trained model.
+        -l "<absolute_path>"  Optional. Absolute path to library with MKL-DNN (CPU) custom layers (*.so).
         Or
-        -c "<absolute_path>"
-                            Optional. Absolute path to clDNN (GPU) custom layers config (*.xml).
-    -pp "<path>"            
-                            Path to a plugin folder.
-    -d "<device>"           
-                            Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. Sample will look for a suitable plugin for device specified
-    -nt "<integer>"         
-                            Number of top results (default 10)
-    -ni "<integer>"         
-                            Number of iterations (default 1)
-    -pc                     
-                            Enables per-layer performance report
-    -p_msg                  
-                            Enables messages from a plugin
+        -c "<absolute_path>"  Optional. Absolute path to clDNN (GPU) custom layers config (*.xml).
+    -pp "<path>"              Path to a plugin folder.
+    -d "<device>"             Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. Sample will look for a suitable plugin for device specified
+    -nt "<integer>"           Number of top results (default 10)
+    -ni "<integer>"           Number of iterations (default 1)
+    -pc                       Enables per-layer performance report
+    -p_msg                    Enables messages from a plugin
 
 ```
 
-Running the application with the empty list of options yields the usage message given above and an error message.
+Running the application with an empty list of options yields the usage message given above.
+
+To run the sample, you can use AlexNet and GoogLeNet models that can be downloaded with the OpenVINO [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader) or other image classification models.
+
+> **IMPORTANT**: To run the sample, the model should be first converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
+For example, to perform inference of an AlexNet model (previously converted to the Inference Engine format) on CPU, use the following command:
 
-You can do inference on an image using a trained AlexNet network on Intel&reg; Processors using the following command:
 ```sh
 ./classification_sample -i <path_to_image>/cat.bmp -m <path_to_model>/alexnet_fp32.xml
 ```
@@ -65,4 +59,6 @@ Engine plugin. When inference is done, the application creates an
 output image and outputs data to the standard output stream.
 
 ## See Also 
-* [Using Inference Engine Samples](./docs/Inference_Engine_Developer_Guide/Samples_Overview.md)
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
+* [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md)
+* [Model Downloader](https://github.com/opencv/open_model_zoo/tree/2018/model_downloader)
\ No newline at end of file
index e6d70e8..96e6e41 100644 (file)
@@ -1,25 +1,11 @@
-# Copyright (c) 2018 Intel Corporation
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#      http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 cmake_minimum_required(VERSION 2.8)
 
 set (TARGET_NAME "classification_sample_async")
 
-if( BUILD_SAMPLE_NAME AND NOT ${BUILD_SAMPLE_NAME} STREQUAL ${TARGET_NAME} )
-    message(STATUS "SAMPLE ${TARGET_NAME} SKIPPED")
-    return()
-endif()
-
 file (GLOB SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
         )
index 4fbea9a..995a5d6 100644 (file)
@@ -64,6 +64,8 @@ You can do inference on an image using a trained AlexNet network on FPGA with fa
 ./classification_sample_async -i <path_to_image>/cat.bmp -m <path_to_model>/alexnet_fp32.xml -nt 5 -d HETERO:FPGA,CPU -nireq 2 -ni 200
 ```
 
+> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
 ### Outputs
 
 By default the application outputs top-10 inference results for each infer request.
@@ -80,4 +82,4 @@ Then in the loop it starts inference for the current infer request and switch fo
 When inference is done, the application outputs data to the standard output stream.
 
 ## See Also
-* [Using Inference Engine Samples](./docs/Inference_Engine_Developer_Guide/Samples_Overview.md)
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
index 95d2ff4..c0a202c 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -45,6 +44,9 @@ static const char iterations_count_message[] = "Number of iterations (default 1)
 /// @brief message for iterations count
 static const char ninfer_request_message[] = "Number of infer request for pipelined mode (default 1)";
 
+/// @brief message for #threads for CPU inference
+static const char infer_num_threads_message[] = "Optional. Number of threads to use for inference on the CPU "
+                                                "(including Hetero cases).";
 
 /// @brief message for clDNN custom kernels desc
 static const char custom_cldnn_message[] = "Required for clDNN (GPU)-targeted custom kernels. "\
@@ -54,6 +56,10 @@ static const char custom_cldnn_message[] = "Required for clDNN (GPU)-targeted cu
 static const char custom_cpu_library_message[] = "Required for MKLDNN (CPU)-targeted custom layers. " \
                                                  "Absolute path to a shared library with the kernels impl.";
 
+// @brief message for CPU threads pinning option
+static const char cpu_threads_pinning_message[] = "Optional. Enable (\"YES\", default) or disable (\"NO\") " \
+                                                  "CPU threads pinning for CPU-involved inference.";
+
 /// @brief message for plugin messages
 static const char plugin_message[] = "Enables messages from a plugin";
 
@@ -98,6 +104,13 @@ DEFINE_int32(nireq, 1, ninfer_request_message);
 /// @brief Enable plugin messages
 DEFINE_bool(p_msg, false, plugin_message);
 
+/// @brief CPU threads pinning option ("YES" or "NO")
+DEFINE_string(pin, "YES", cpu_threads_pinning_message);
+
+/// @brief Number of threads to use for inference on the CPU (also affects Hetero cases)
+DEFINE_int32(nthreads, 0, infer_num_threads_message);
+
+
 /**
 * @brief This function show a help message
 */
@@ -119,4 +132,7 @@ static void showUsage() {
     std::cout << "    -pc                     " << performance_counter_message << std::endl;
     std::cout << "    -nireq \"<integer>\"      " << ninfer_request_message << std::endl;
     std::cout << "    -p_msg                  " << plugin_message << std::endl;
+    std::cout << "    Some CPU-specific performance options" << std::endl;
+    std::cout << "    -nthreads \"<integer>\"   " << infer_num_threads_message << std::endl;
+    std::cout << "    -pin \"YES\"/\"NO\"       " << cpu_threads_pinning_message << std::endl;
 }
index c424484..e8428ef 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -185,10 +184,18 @@ int main(int argc, char *argv[]) {
         slog::info << "Loading model to the plugin" << slog::endl;
 
         std::map<std::string, std::string> config;
-        if (FLAGS_pc) {
+        if (FLAGS_pc)
             config[PluginConfigParams::KEY_PERF_COUNT] = PluginConfigParams::YES;
+        if (FLAGS_d.find("CPU") != std::string::npos) {  // CPU supports few special performance-oriented keys
+            // limit threading for CPU portion of inference
+            config[PluginConfigParams::KEY_CPU_THREADS_NUM] = std::to_string(FLAGS_nthreads);
+            // pin threads for CPU portion of inference
+            config[PluginConfigParams::KEY_CPU_BIND_THREAD] = FLAGS_pin;
+            // for pure CPU execution, more throughput-oriented execution via streams
+            if (FLAGS_d == "CPU")
+                config[PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS] = std::to_string(FLAGS_nireq);
         }
-        ExecutableNetwork executable_network = plugin.LoadNetwork(network, {});
+        ExecutableNetwork executable_network = plugin.LoadNetwork(network, config);
         // -----------------------------------------------------------------------------------------------------
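The hunk above routes the new -nthreads, -pin, and -nireq flags into plugin configuration keys before LoadNetwork. A condensed sketch of the same pattern with illustrative values (the key names come from ie_plugin_config.hpp):

```cpp
// CPU-specific performance keys; the values here are examples only.
std::map<std::string, std::string> config;
config[InferenceEngine::PluginConfigParams::KEY_CPU_THREADS_NUM] = "4";         // cap inference threads
config[InferenceEngine::PluginConfigParams::KEY_CPU_BIND_THREAD] = "YES";       // pin threads to cores
config[InferenceEngine::PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS] = "2";  // throughput streams
InferenceEngine::ExecutableNetwork exe = plugin.LoadNetwork(network, config);
```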
 
         // --------------------------- 5. Create infer request -------------------------------------------------
@@ -237,6 +244,9 @@ int main(int argc, char *argv[]) {
         typedef std::chrono::duration<double, std::ratio<1, 1000>> ms;
         typedef std::chrono::duration<float> fsec;
 
+        // warming up
+        inferRequests[0].StartAsync();
+        inferRequests[0].Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
         double total = 0.0;
         /** Start inference & calc performance **/
         auto t0 = Time::now();
@@ -244,16 +254,11 @@ int main(int argc, char *argv[]) {
         size_t currentInfer = 0;
         size_t prevInfer = (FLAGS_nireq > 1) ? 1 : 0;
 
-
-        // warming up
-        inferRequests[0].StartAsync();
-        inferRequests[0].Wait(10000);
-
         for (int iter = 0; iter < FLAGS_ni + FLAGS_nireq; ++iter) {
             if (iter < FLAGS_ni) {
                 inferRequests[currentInfer].StartAsync();
             }
-            inferRequests[prevInfer].Wait(10000);
+            inferRequests[prevInfer].Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
 
             currentInfer++;
             if (currentInfer >= FLAGS_nireq) {
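For reference, the warm-up plus pipelined StartAsync/Wait scheme this hunk converges on, sketched with assumed names (`requests`, `nireq`, `niters`):

```cpp
// One warm-up run outside the measured loop, then a rolling async pipeline.
requests[0].StartAsync();
requests[0].Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);

size_t current = 0, previous = (nireq > 1) ? 1 : 0;
for (int iter = 0; iter < niters + nireq; ++iter) {
    if (iter < niters) {
        requests[current].StartAsync();  // keep the pipeline fed
    }
    requests[previous].Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
    current = (current + 1) % nireq;
    previous = (previous + 1) % nireq;
}
```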
index abc86b5..0498e0a 100644 (file)
@@ -1,16 +1,6 @@
-# Copyright (c) 2018 Intel Corporation
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 
 cmake_minimum_required(VERSION 2.8)
 
@@ -24,19 +14,20 @@ file (GLOB LIBRARY_HEADERS
         ${CMAKE_CURRENT_SOURCE_DIR}/*.h
         )
 
-# Find OpenCV libray if exists
-find_package(OpenCV)
-if(OpenCV_FOUND)
-    include_directories(${OpenCV_INCLUDE_DIRS})
+# Find OpenCV components if exist
+find_package(OpenCV COMPONENTS imgcodecs QUIET)
+if(NOT(OpenCV_FOUND))
+    message(WARNING "OPENCV is disabled or not found, " ${TARGET_NAME} " is built without OPENCV support")
 else()
-    message(STATUS "OPENCV is disabled or not found, " ${TARGET_NAME} " is built without OPENCV support")
+    add_definitions(-DUSE_OPENCV)
 endif()
 
+add_definitions(-DIMPLEMENT_FORMAT_READER)
+
 if(UNIX)
     list(REMOVE_ITEM MAIN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/dllmain.cpp)
-else()
-    add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_API)
 endif()
+add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_API)
 
 # Create named folders for the sources within the .vcproj
 # Empty name lists them directly under the .vcproj
index d26ce50..56822ff 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index f0c6591..53ca373 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 1a17a5f..a698431 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -33,6 +32,6 @@ void Registry::RegisterReader(CreatorFunction f) {
     _data.push_back(f);
 }
 
-FORMAT_READER_API(Reader*)CreateFormatReader(const char *filename) {
+FORMAT_READER_API(Reader*) CreateFormatReader(const char *filename) {
     return Registry::CreateReader(filename);
 }
\ No newline at end of file
index 3a3f551..8a4cfcd 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <vector>
 #include <iostream>
 
-#ifdef _WIN32
-    #define FORMAT_READER_API(TYPE) extern "C"   __declspec(dllexport)  TYPE __cdecl
-#else  // Linux and Mac
-    #define FORMAT_READER_API(TYPE) extern "C" TYPE
+#if defined(_WIN32)
+# ifdef IMPLEMENT_FORMAT_READER
+# define FORMAT_READER_API(type) extern "C"   __declspec(dllexport) type
+# else
+# define FORMAT_READER_API(type) extern "C" type
+# endif
+#elif(__GNUC__ >= 4)
+# ifdef IMPLEMENT_FORMAT_READER
+#  define FORMAT_READER_API(type) extern "C"   __attribute__((visibility("default"))) type
+# else
+#  define FORMAT_READER_API(type) extern "C" type
+# endif
+#else
+# define FORMAT_READER_API(TYPE) extern "C" TYPE
 #endif
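On the consumer side IMPLEMENT_FORMAT_READER stays undefined, so the macro collapses to a plain extern "C" declaration and the factory is simply imported:

```cpp
// Hypothetical caller; returns nullptr when no registered reader matches the file.
FormatReader::Reader *reader = CreateFormatReader("input.bmp");
```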
 
 
@@ -69,4 +78,4 @@ public:
  * \brief Function to create a reader
  * @return FormatReader pointer
  */
-FORMAT_READER_API(FormatReader::Reader*)CreateFormatReader(const char *filename);
\ No newline at end of file
+FORMAT_READER_API(FormatReader::Reader*) CreateFormatReader(const char *filename);
\ No newline at end of file
index dc42e60..b29b39b 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -10,7 +9,7 @@
 
 #include <opencv2/opencv.hpp>
 
-#include <../samples/slog.hpp>
+#include <samples/slog.hpp>
 
 using namespace std;
 using namespace FormatReader;
index c665209..764b5b4 100644 (file)
@@ -1,8 +1,10 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
-
+/**
+ * \brief Register for readers
+ * \file register.h
+ */
 #pragma once
 
 #include <format_reader.h>
index bae8282..40bcf9e 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -45,7 +44,7 @@ struct dirent {
 };
 
 class DIR {
-    WIN32_FIND_DATA FindFileData;
+    WIN32_FIND_DATAA FindFileData;
     HANDLE hFind;
     dirent *next;
 
@@ -63,7 +62,7 @@ public:
             ws += "*";
         else
             ws += "\\*";
-        hFind = FindFirstFile(ws.c_str(), &FindFileData);
+        hFind = FindFirstFileA(ws.c_str(), &FindFileData);
         FindFileData.dwReserved0 = hFind != INVALID_HANDLE_VALUE;
     }
 
@@ -87,7 +86,7 @@ public:
         size_t outSize;
         mbstowcs_s(&outSize, wbuf, 4094, FindFileData.cFileName, 4094);
         next = new dirent(wbuf);
-        FindFileData.dwReserved0 = FindNextFile(hFind, &FindFileData);
+        FindFileData.dwReserved0 = FindNextFileA(hFind, &FindFileData);
         return next;
     }
 };
@@ -108,4 +107,4 @@ static struct dirent *readdir(DIR *dp) {
 
 static void closedir(DIR *dp) {
     delete dp;
-}
\ No newline at end of file
+}
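A short usage sketch of this Windows dirent shim, assuming the usual opendir counterpart defined alongside readdir and closedir (it is not shown in this hunk):

```cpp
// Enumerate a folder through the POSIX-style shim; entries are only counted
// here, since the dirent layout is platform-specific.
DIR *dp = opendir("images");
if (dp != nullptr) {
    size_t files = 0;
    while (readdir(dp) != nullptr) {
        ++files;  // one dirent per directory entry
    }
    closedir(dp);
}
```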
index 5917e7f..88c87e3 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <algorithm>
 #include <chrono>
 
-#ifdef USE_OPENCV
-    #include <opencv2/opencv.hpp>
-#endif
-
 #include <ie_plugin_dispatcher.hpp>
 #include <ie_plugin_ptr.hpp>
 #include <cpp/ie_cnn_net_reader.h>
@@ -88,8 +83,8 @@ static InferenceEngine::TargetDevice getDeviceFromStr(const std::string &deviceN
 * @param device - device to infer on
 * @return Plugin pointer
 */
-static InferenceEngine::InferenceEnginePluginPtr selectPlugin(const std::vector<std::string> &pluginDirs,
-                                                              const std::string &plugin,
+static InferenceEngine::InferenceEnginePluginPtr selectPlugin(const std::vector<file_name_t> &pluginDirs,
+                                                              const file_name_t &plugin,
                                                               InferenceEngine::TargetDevice device) {
     InferenceEngine::PluginDispatcher dispatcher(pluginDirs);
 
@@ -107,8 +102,8 @@ static InferenceEngine::InferenceEnginePluginPtr selectPlugin(const std::vector<
  * @param device - string representation of device to infer on
  * @return Plugin pointer
  */
-static UNUSED InferenceEngine::InferenceEnginePluginPtr selectPlugin(const std::vector<std::string> &pluginDirs,
-                                                                     const std::string &plugin,
+static UNUSED InferenceEngine::InferenceEnginePluginPtr selectPlugin(const std::vector<file_name_t> &pluginDirs,
+                                                                     const file_name_t &plugin,
                                                                      const std::string &device) {
     return selectPlugin(pluginDirs, plugin, getDeviceFromStr(device));
 }
@@ -118,7 +113,7 @@ static UNUSED InferenceEngine::InferenceEnginePluginPtr selectPlugin(const std::
  * @param filepath - full file name
  * @return filename without extension
  */
-static UNUSED std::string fileNameNoExt(const std::string &filepath) {
+static std::string fileNameNoExt(const std::string &filepath) {
     auto pos = filepath.rfind('.');
     if (pos == std::string::npos) return filepath;
     return filepath.substr(0, pos);
@@ -197,8 +192,8 @@ static UNUSED std::ostream &operator<<(std::ostream &os, const PluginVersion &ve
 }
 
 inline void printPluginVersion(InferenceEngine::InferenceEnginePluginPtr ptr, std::ostream& stream) {
-    const PluginVersion *pluginVersion = nullptr;
-    ptr->GetVersion((const InferenceEngine::Version*&)pluginVersion);
+    const InferenceEngine::Version *pluginVersion = nullptr;
+    ptr->GetVersion(pluginVersion);
     stream << pluginVersion << std::endl;
 }
 
@@ -519,11 +514,11 @@ static UNUSED void addRectangles(unsigned char *data, size_t height, size_t widt
         if (w < 0) w = 0;
         if (h < 0) h = 0;
 
-        if (x >= width) { x = width - 1; w = 0; thickness = 1; }
-        if (y >= height) { y = height - 1; h = 0; thickness = 1; }
+        if (static_cast<std::size_t>(x) >= width) { x = width - 1; w = 0; thickness = 1; }
+        if (static_cast<std::size_t>(y) >= height) { y = height - 1; h = 0; thickness = 1; }
 
-        if (x + w >= width) { w = width - x - 1; }
-        if (y + h >= height) { h = height - y - 1; }
+        if (static_cast<std::size_t>(x + w) >= width) { w = width - x - 1; }
+        if (static_cast<std::size_t>(y + h) >= height) { h = height - y - 1; }
 
         thickness = std::min(std::min(thickness, w / 2 + 1), h / 2 + 1);
 
@@ -532,26 +527,26 @@ static UNUSED void addRectangles(unsigned char *data, size_t height, size_t widt
         for (int t = 0; t < thickness; t++) {
             shift_first = (y + t) * width * 3;
             shift_second = (y + h - t) * width * 3;
-            for (int i = x; i < x + w + 1; i++) {
-                data[shift_first + i * 3] = colors.at(cls).red();
-                data[shift_first + i * 3 + 1] = colors.at(cls).green();
-                data[shift_first + i * 3 + 2] = colors.at(cls).blue();
-                data[shift_second + i * 3] = colors.at(cls).red();
-                data[shift_second + i * 3 + 1] = colors.at(cls).green();
-                data[shift_second + i * 3 + 2] = colors.at(cls).blue();
+            for (int ii = x; ii < x + w + 1; ii++) {
+                data[shift_first + ii * 3] = colors.at(cls).red();
+                data[shift_first + ii * 3 + 1] = colors.at(cls).green();
+                data[shift_first + ii * 3 + 2] = colors.at(cls).blue();
+                data[shift_second + ii * 3] = colors.at(cls).red();
+                data[shift_second + ii * 3 + 1] = colors.at(cls).green();
+                data[shift_second + ii * 3 + 2] = colors.at(cls).blue();
             }
         }
 
         for (int t = 0; t < thickness; t++) {
             shift_first = (x + t) * 3;
             shift_second = (x + w - t) * 3;
-            for (int i = y; i < y + h + 1; i++) {
-                data[shift_first + i * width * 3] = colors.at(cls).red();
-                data[shift_first + i * width * 3 + 1] = colors.at(cls).green();
-                data[shift_first + i * width * 3 + 2] = colors.at(cls).blue();
-                data[shift_second + i * width * 3] = colors.at(cls).red();
-                data[shift_second + i * width * 3 + 1] = colors.at(cls).green();
-                data[shift_second + i * width * 3 + 2] = colors.at(cls).blue();
+            for (int ii = y; ii < y + h + 1; ii++) {
+                data[shift_first + ii * width * 3] = colors.at(cls).red();
+                data[shift_first + ii * width * 3 + 1] = colors.at(cls).green();
+                data[shift_first + ii * width * 3 + 2] = colors.at(cls).blue();
+                data[shift_second + ii * width * 3] = colors.at(cls).red();
+                data[shift_second + ii * width * 3 + 1] = colors.at(cls).green();
+                data[shift_second + ii * width * 3 + 2] = colors.at(cls).blue();
             }
         }
     }
@@ -710,8 +705,8 @@ public:
     float xmin, xmax, ymin, ymax, prob;
     bool difficult;
 
-    DetectedObject(int objectType, float xmin, float ymin, float xmax, float ymax, float prob, bool difficult = false)
-        : objectType(objectType), xmin(xmin), xmax(xmax), ymin(ymin), ymax(ymax), prob(prob), difficult(difficult) {
+    DetectedObject(int _objectType, float _xmin, float _ymin, float _xmax, float _ymax, float _prob, bool _difficult = false)
+        : objectType(_objectType), xmin(_xmin), xmax(_xmax), ymin(_ymin), ymax(_ymax), prob(_prob), difficult(_difficult) {
     }
 
     DetectedObject(const DetectedObject& other) = default;
@@ -781,8 +776,8 @@ public:
     const std::list<DetectedObject> alist;
     const bool check_probs;
 
-    explicit ImageDescription(const std::list<DetectedObject> &alist, bool check_probs = false)
-            : alist(alist), check_probs(check_probs) {
+    explicit ImageDescription(const std::list<DetectedObject> &_alist, bool _check_probs = false)
+            : alist(_alist), check_probs(_check_probs) {
     }
 
     static float ioUMultiple(const ImageDescription &detectedObjects, const ImageDescription &desiredObjects) {
@@ -815,8 +810,6 @@ public:
             float coeff = 1.0;
             if (check_probs) {
                 if (bestJ != doB.end()) {
-                    DetectedObject test = *bestJ;
-                    DetectedObject test1 = *doS.begin();
                     float mn = std::min((*bestJ).prob, (*doS.begin()).prob);
                     float mx = std::max((*bestJ).prob, (*doS.begin()).prob);
 
@@ -867,23 +860,20 @@ private:
     }
 
 public:
-    explicit AveragePrecisionCalculator(double threshold) : threshold(threshold) { }
+    explicit AveragePrecisionCalculator(double _threshold) : threshold(_threshold) { }
 
     // gt_bboxes -> des
     // bboxes -> det
 
     void consumeImage(const ImageDescription &detectedObjects, const ImageDescription &desiredObjects) {
-            // Collecting IoU values
-        int tp = 0, fp = 0;
-
+        // Collecting IoU values
         std::vector<bool> visited(desiredObjects.alist.size(), false);
         std::vector<DetectedObject> bboxes{ std::begin(detectedObjects.alist), std::end(detectedObjects.alist) };
         std::sort(bboxes.begin(), bboxes.end(), SortBBoxDescend);
 
 
         for (auto&& detObj : bboxes) {
-                // Searching for the best match to this detection
-
+            // Searching for the best match to this detection
             // Searching for desired object
             float overlap_max = -1;
             int jmax = -1;
@@ -932,8 +922,6 @@ public:
 
         std::map<int, double> res;
 
-        double AP = 0;
-        double q = 0;
         for (auto m : matches) {
             // Sorting
             std::sort(m.second.begin(), m.second.end(), SortPairDescend);
@@ -1054,65 +1042,3 @@ static UNUSED void addRectangles(unsigned char *data, size_t height, size_t widt
         }
     }
 }
-
-#ifdef USE_OPENCV
-/**
-* @brief Sets image data stored in cv::Mat object to a given Blob object.
-* @param orig_image - given cv::Mat object with an image data.
-* @param blob - Blob object which to be filled by an image data.
-* @param batchIndex - batch index of an image inside of the blob.
-*/
-template <typename T>
-void matU8ToBlob(const cv::Mat& orig_image, InferenceEngine::Blob::Ptr& blob, int batchIndex = 0) {
-    InferenceEngine::SizeVector blobSize = blob->getTensorDesc().getDims();
-    const size_t width = blobSize[3];
-    const size_t height = blobSize[2];
-    const size_t channels = blobSize[1];
-    T* blob_data = blob->buffer().as<T*>();
-
-    cv::Mat resized_image(orig_image);
-    if (width != orig_image.size().width || height!= orig_image.size().height) {
-        cv::resize(orig_image, resized_image, cv::Size(width, height));
-    }
-
-    int batchOffset = batchIndex * width * height * channels;
-
-    for (size_t c = 0; c < channels; c++) {
-        for (size_t  h = 0; h < height; h++) {
-            for (size_t w = 0; w < width; w++) {
-                blob_data[batchOffset + c * width * height + h * width + w] =
-                        resized_image.at<cv::Vec3b>(h, w)[c];
-            }
-        }
-    }
-}
-
-/**
- * @brief Wraps data stored inside of a passed cv::Mat object by new Blob pointer.
- * @note: No memory allocation is happened. The blob just points to already existing
- *        cv::Mat data.
- * @param mat - given cv::Mat object with an image data.
- * @return resulting Blob pointer.
- */
-static InferenceEngine::Blob::Ptr wrapMat2Blob(const cv::Mat &mat) {
-    size_t channels = mat.channels();
-    size_t height = mat.size().height;
-    size_t width = mat.size().width;
-
-    size_t strideH = mat.step.buf[0];
-    size_t strideW = mat.step.buf[1];
-
-    bool is_dense =
-            strideW == channels &&
-            strideH == channels * width;
-
-    if (!is_dense) THROW_IE_EXCEPTION
-                << "Doesn't support conversion from not dense cv::Mat";
-
-    InferenceEngine::TensorDesc tDesc(InferenceEngine::Precision::U8,
-                                      {1, channels, height, width},
-                                      InferenceEngine::Layout::NHWC);
-
-    return InferenceEngine::make_shared_blob<uint8_t>(tDesc, mat.data);
-}
-#endif
\ No newline at end of file
diff --git a/inference-engine/samples/common/samples/ocv_common.hpp b/inference-engine/samples/common/samples/ocv_common.hpp
new file mode 100644 (file)
index 0000000..c979cd3
--- /dev/null
@@ -0,0 +1,73 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief a header file with common samples functionality using OpenCV
+ * @file ocv_common.hpp
+ */
+
+#pragma once
+
+#include <samples/common.hpp>
+#include <opencv2/opencv.hpp>
+
+/**
+* @brief Sets image data stored in cv::Mat object to a given Blob object.
+* @param orig_image - given cv::Mat object with image data.
+* @param blob - Blob object to be filled with image data.
+* @param batchIndex - batch index of an image inside of the blob.
+*/
+template <typename T>
+void matU8ToBlob(const cv::Mat& orig_image, InferenceEngine::Blob::Ptr& blob, int batchIndex = 0) {
+    InferenceEngine::SizeVector blobSize = blob->getTensorDesc().getDims();
+    const size_t width = blobSize[3];
+    const size_t height = blobSize[2];
+    const size_t channels = blobSize[1];
+    T* blob_data = blob->buffer().as<T*>();
+
+    cv::Mat resized_image(orig_image);
+    if (width != orig_image.size().width || height!= orig_image.size().height) {
+        cv::resize(orig_image, resized_image, cv::Size(width, height));
+    }
+
+    int batchOffset = batchIndex * width * height * channels;
+
+    for (size_t c = 0; c < channels; c++) {
+        for (size_t  h = 0; h < height; h++) {
+            for (size_t w = 0; w < width; w++) {
+                blob_data[batchOffset + c * width * height + h * width + w] =
+                        resized_image.at<cv::Vec3b>(h, w)[c];
+            }
+        }
+    }
+}
+
+/**
+ * @brief Wraps data stored inside of a passed cv::Mat object by new Blob pointer.
+ * @note: No memory allocation happens. The blob just points to the already existing
+ *        cv::Mat data.
+ * @param mat - given cv::Mat object with an image data.
+ * @return resulting Blob pointer.
+ */
+static InferenceEngine::Blob::Ptr wrapMat2Blob(const cv::Mat &mat) {
+    size_t channels = mat.channels();
+    size_t height = mat.size().height;
+    size_t width = mat.size().width;
+
+    size_t strideH = mat.step.buf[0];
+    size_t strideW = mat.step.buf[1];
+
+    bool is_dense =
+            strideW == channels &&
+            strideH == channels * width;
+
+    if (!is_dense) THROW_IE_EXCEPTION
+                << "Doesn't support conversion from not dense cv::Mat";
+
+    InferenceEngine::TensorDesc tDesc(InferenceEngine::Precision::U8,
+                                      {1, channels, height, width},
+                                      InferenceEngine::Layout::NHWC);
+
+    return InferenceEngine::make_shared_blob<uint8_t>(tDesc, mat.data);
+}
\ No newline at end of file
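A hypothetical usage of the two helpers in this new header; the image path and `inputBlob` (a pre-allocated FP32 network input) are placeholders:

```cpp
#include <samples/ocv_common.hpp>

cv::Mat image = cv::imread("cat.bmp");                     // decoded as 8-bit BGR
InferenceEngine::Blob::Ptr wrapped = wrapMat2Blob(image);  // zero-copy U8 NHWC view
matU8ToBlob<float>(image, inputBlob, 0);                   // or copy into batch slot 0
```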
index b13cfa3..23eb8d3 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -58,7 +57,7 @@ public:
     }
 
     // Specializing for LogStreamEndLine to support slog::endl
-    LogStream& operator<< (const LogStreamEndLine &arg) {
+    LogStream& operator<< (const LogStreamEndLine &/*arg*/) {
         _new_line = true;
 
         (*_log_stream) << std::endl;
index f807be4..d70a974 100644 (file)
@@ -1,40 +1,24 @@
-# Copyright (c) 2018 Intel Corporation
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#      http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 cmake_minimum_required(VERSION 2.8)
 
 set (TARGET_NAME "hello_autoresize_classification")
 
-if( BUILD_SAMPLE_NAME AND NOT ${BUILD_SAMPLE_NAME} STREQUAL ${TARGET_NAME} )
-    message(STATUS "SAMPLE ${TARGET_NAME} SKIPPED")
-    return()
-endif()
-
 file (GLOB SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
         )
 
-# Find OpenCV libray if exists
-find_package(OpenCV)
+# Find OpenCV components if exist
+find_package(OpenCV COMPONENTS imgcodecs QUIET)
 if(NOT(OpenCV_FOUND))
-    message(STATUS "OPENCV is disabled or not found, " ${TARGET_NAME} " skiped")
+    message(WARNING "OPENCV is disabled or not found, " ${TARGET_NAME} " skipped")
     return()
 endif()
 
 source_group("src" FILES ${SRC})
 
-include_directories(${OpenCV_INCLUDE_DIRS})
-
 link_directories(${LIB_FOLDER})
 
 # Create library file from sources.
index e264d28..524ec22 100644 (file)
@@ -3,7 +3,7 @@
 This topic describes how to run the Hello Autoresize Classification sample application.
 The sample is a simplified version of [Image Classification Sample](./samples/classification_sample/README.md).
 It is intended to demonstrate the use of the new input autoresize API of the Inference Engine in applications. Refer to
-[Integrate with customer application New Request API](./docs/Inference_Engine_Developer_Guide/Integrate_with_customer_application_new_API.md) for details.
+[Integrate with customer application New Request API](./docs/IE_DG/Integrate_with_customer_application_new_API.md) for details.
 
 There is also a new API to crop a ROI object and set it as input without additional memory re-allocation.
 Properly demonstrating this API requires running several networks in a pipeline, which is out of scope of this sample. A condensed sketch of the autoresize flow is shown below.
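A condensed sketch of the autoresize flow, assuming `input_info`, `input_name`, and `infer_request` are the objects the sample creates (`wrapMat2Blob` is the helper shown earlier in this change, presumably in `samples/ocv_common.hpp`):

```cpp
// Ask the plugin to resize the input and handle the NHWC layout on the fly.
input_info->setLayout(InferenceEngine::Layout::NHWC);
input_info->getPreProcess().setResizeAlgorithm(InferenceEngine::RESIZE_BILINEAR);
// ... load the network and create infer_request ...
cv::Mat image = cv::imread(input_image_path);
infer_request.SetBlob(input_name, wrapMat2Blob(image));  // no manual resize
infer_request.Infer();
```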
@@ -18,9 +18,11 @@ You can do inference on an image using a trained AlexNet network on Intel&reg; P
 ./hello_autoresize_classification <path_to_model>/alexnet_fp32.xml <path_to_image>/cat.bmp CPU
 ```
 
+> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
 ### Outputs
 
 The application outputs top-10 inference results. 
 
 ## See Also 
-* [Using Inference Engine Samples](./docs/Inference_Engine_Developer_Guide/Samples_Overview.md)
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
index cbd0ecf..2ac9337 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -9,9 +8,8 @@
 #include <string>
 #include <cstdlib>
 
-#include <opencv2/opencv.hpp>
 #include <inference_engine.hpp>
-#include <samples/common.hpp>
+#include <samples/ocv_common.hpp>
 
 using namespace InferenceEngine;
 
index e32e977..9531a21 100644 (file)
@@ -1,33 +1,19 @@
-# Copyright (c) 2018 Intel Corporation
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#      http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 cmake_minimum_required(VERSION 2.8)
 
 set (TARGET_NAME "hello_classification")
 
-if( BUILD_SAMPLE_NAME AND NOT ${BUILD_SAMPLE_NAME} STREQUAL ${TARGET_NAME} )
-    message(STATUS "SAMPLE ${TARGET_NAME} SKIPPED")
-    return()
-endif()
-
 file (GLOB SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
         )
 
-# Find OpenCV libray if exists
-find_package(OpenCV)
+# Find OpenCV components if exist
+find_package(OpenCV COMPONENTS imgcodecs QUIET)
 if(NOT(OpenCV_FOUND))
-    message(STATUS "OPENCV is disabled or not found, " ${TARGET_NAME} " skiped")
+    message(WARNING "OPENCV is disabled or not found, " ${TARGET_NAME} " skipped")
     return()
 endif()
 
@@ -35,13 +21,16 @@ endif()
 # Empty name lists them directly under the .vcproj
 source_group("src" FILES ${SRC})
 
-include_directories(${OpenCV_INCLUDE_DIRS})
-
 link_directories(${LIB_FOLDER})
 
 # Create library file from sources.
 add_executable(${TARGET_NAME} ${SRC})
 
+if(WIN32)
+       # This target supports UNICODE on Windows
+       set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_FLAGS "/D_UNICODE /DUNICODE")
+endif()
+
 set_target_properties(${TARGET_NAME} PROPERTIES "CMAKE_CXX_FLAGS" "${CMAKE_CXX_FLAGS} -fPIE"
 COMPILE_PDB_NAME ${TARGET_NAME})
 
index abc0108..d9482e1 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -9,32 +8,47 @@
 #include <string>
 #include <cstdlib>
 
+#ifdef UNICODE
+#include <tchar.h>
+#endif
+
 #include <opencv2/opencv.hpp>
 #include <inference_engine.hpp>
 
 using namespace InferenceEngine;
 
+#ifndef UNICODE
+#define tcout std::cout
+#define _T(STR) STR
+#else
+#define tcout std::wcout
+#endif
+
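+// In UNICODE builds on Windows, a wide-character entry point (wmain) and
+// wcout are used so that non-ASCII paths are preserved; otherwise the
+// narrow-character equivalents apply.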
+#ifndef UNICODE
 int main(int argc, char *argv[]) {
+#else
+int wmain(int argc, wchar_t *argv[]) {
+#endif
     try {
         // ------------------------------ Parsing and validation of input args ---------------------------------
         if (argc != 3) {
-            std::cout << "Usage : ./hello_classification <path_to_model> <path_to_image>" << std::endl;
+            tcout << _T("Usage : ./hello_classification <path_to_model> <path_to_image>") << std::endl;
             return EXIT_FAILURE;
         }
 
-        const std::string input_model{argv[1]};
-        const std::string input_image_path{argv[2]};
+        const file_name_t input_model{argv[1]};
+        const file_name_t input_image_path{argv[2]};
         // -----------------------------------------------------------------------------------------------------
 
         // --------------------------- 1. Load Plugin for inference engine -------------------------------------
-        PluginDispatcher dispatcher({"../../../lib/intel64", ""});
+        PluginDispatcher dispatcher({_T("../../../lib/intel64"), _T("")});
         InferencePlugin plugin(dispatcher.getSuitablePlugin(TargetDevice::eCPU));
         // -----------------------------------------------------------------------------------------------------
 
         // --------------------------- 2. Read IR Generated by ModelOptimizer (.xml and .bin files) ------------
         CNNNetReader network_reader;
-        network_reader.ReadNetwork(input_model);
-        network_reader.ReadWeights(input_model.substr(0, input_model.size() - 4) + ".bin");
+        network_reader.ReadNetwork(fileNameToString(input_model));
+        network_reader.ReadWeights(fileNameToString(input_model).substr(0, input_model.size() - 4) + ".bin");
         network_reader.getNetwork().setBatchSize(1);
         CNNNetwork network = network_reader.getNetwork();
         // -----------------------------------------------------------------------------------------------------
@@ -64,7 +78,7 @@ int main(int argc, char *argv[]) {
 
         // --------------------------- 6. Prepare input --------------------------------------------------------
 
-        cv::Mat image = cv::imread(input_image_path);
+        cv::Mat image = cv::imread(fileNameToString(input_image_path));
 
         /* Resize manually and copy data from the image to the input blob */
         Blob::Ptr input = infer_request.GetBlob(input_name);
index 42a2ba8..8818453 100644 (file)
@@ -1,33 +1,19 @@
-# Copyright (c) 2018 Intel Corporation
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#      http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 cmake_minimum_required(VERSION 2.8)
 
 set (TARGET_NAME "hello_request_classification")
 
-if( BUILD_SAMPLE_NAME AND NOT ${BUILD_SAMPLE_NAME} STREQUAL ${TARGET_NAME} )
-    message(STATUS "SAMPLE ${TARGET_NAME} SKIPPED")
-    return()
-endif()
-
 file (GLOB SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
         )
 
-# Find OpenCV libray if exists
-find_package(OpenCV)
+# Find OpenCV components if exist
+find_package(OpenCV COMPONENTS imgcodecs QUIET)
 if(NOT(OpenCV_FOUND))
-    message(STATUS "OPENCV is disabled or not found, " ${TARGET_NAME} " skiped")
+    message(WARNING "OPENCV is disabled or not found, " ${TARGET_NAME} " skipped")
     return()
 endif()
 
@@ -35,8 +21,6 @@ endif()
 # Empty name lists them directly under the .vcproj
 source_group("src" FILES ${SRC})
 
-include_directories(${OpenCV_INCLUDE_DIRS})
-
 link_directories(${LIB_FOLDER})
 
 # Create library file from sources.
index dd2caeb..708fa81 100644 (file)
@@ -3,7 +3,7 @@
 This topic describes how to run the Hello Infer Classification sample application.
 The sample is a simplified version of the [Image Classification Sample](./samples/classification_sample/README.md).
 It demonstrates how to use the new Infer Request API of the Inference Engine in applications. Refer to 
-[Integrate with customer application New Request API](./docs/Inference_Engine_Developer_Guide/Integrate_with_customer_application_new_API.md) for details.
+[Integrate with customer application New Request API](./docs/IE_DG/Integrate_with_customer_application_new_API.md) for details.
 
 ## Running
 
@@ -12,10 +12,12 @@ You can do inference on an image using a trained AlexNet network on Intel&reg; P
 ./hello_autoresize_classification <path_to_model>/alexnet_fp32.xml <path_to_image>/cat.bmp CPU
 ```
 
+> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
 ### Outputs
 
 The application outputs top-10 inference results. 
 
 
 ## See Also 
-* [Using Inference Engine Samples](./docs/Inference_Engine_Developer_Guide/Samples_Overview.md)
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
index 3cad38d..ffc9856 100644 (file)
@@ -15,11 +15,6 @@ cmake_minimum_required(VERSION 2.8)
 
 set(TARGET_NAME "hello_shape_infer_ssd")
 
-if (BUILD_SAMPLE_NAME AND NOT ${BUILD_SAMPLE_NAME} STREQUAL ${TARGET_NAME})
-    message(STATUS "SAMPLE ${TARGET_NAME} SKIPPED")
-    return()
-endif ()
-
 file(GLOB SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
         )
@@ -28,10 +23,10 @@ file(GLOB HEADERS
         ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp
         )
 
-# Find OpenCV libray if exists
-find_package(OpenCV)
+# Find OpenCV components if exist
+find_package(OpenCV COMPONENTS imgcodecs QUIET)
 if(NOT(OpenCV_FOUND))
-    message(STATUS "OPENCV is disabled or not found, " ${TARGET_NAME} " skiped")
+    message(WARNING "OPENCV is disabled or not found, " ${TARGET_NAME} " skipped")
     return()
 endif()
 
@@ -40,8 +35,6 @@ endif()
 source_group("src" FILES ${SRC})
 source_group("headers" FILES ${HEADERS})
 
-include_directories(${OpenCV_INCLUDE_DIRS})
-
 link_directories(${LIB_FOLDER})
 
 # Create library file from sources.
index 3193d8d..f275abc 100644 (file)
@@ -1,7 +1,7 @@
 # Hello Shape Infer Sample
 
 This topic demonstrates how to run the Hello Shape Infer SSD application, which does inference using object detection
-networks like SSD-VGG. The sample shows how to use [Shape Inference feature](./docs/Inference_Engine_Developer_Guide/ShapeInference.md).
+networks like SSD-VGG. The sample shows how to use [Shape Inference feature](./docs/IE_DG/ShapeInference.md).
 
 ## Running
 
@@ -10,6 +10,8 @@ You can use the following command to do inference on Intel&reg; Processors on an
 ./hello_shape_infer_ssd <path_to_model>/ssd_300.xml <path_to_image>/500x500.bmp CPU 3
 ```
 
+> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
 ### Outputs
 
 The application renders an image with detected objects enclosed in rectangles. It outputs the list of classes
@@ -17,4 +19,4 @@ of the detected objects along with the respective confidence values and the coor
 rectangles to the standard output stream.
 
 ## See Also
-* [Using Inference Engine Samples](./docs/Inference_Engine_Developer_Guide/Samples_Overview.md)
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
index bbd143a..020b941 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -7,10 +6,9 @@
 #include <memory>
 #include <string>
 
-#include <opencv2/opencv.hpp>
 #include <inference_engine.hpp>
-#include <samples/common.hpp>
 #include <ext_list.hpp>
+#include <samples/ocv_common.hpp>
 
 #include "shape_infer_extension.hpp"
 
@@ -96,7 +94,7 @@ int main(int argc, char* argv[]) {
             throw std::logic_error("Incorrect output dimensions for SSD model");
         }
         if (output_info == nullptr) {
-            THROW_IE_EXCEPTION << "[SAMPLES] shared_ptr ouput_info == nullptr";
+            THROW_IE_EXCEPTION << "[SAMPLES] internal error - output information is empty";
         }
 
         output_info->setPrecision(Precision::FP32);
@@ -161,9 +159,9 @@ int main(int argc, char* argv[]) {
                           << image_id << std::endl;
             }
         }
+
         cv::imwrite("hello_shape_infer_ssd_output.jpg", image);
         std::cout << "The resulting image was saved in the file: hello_shape_infer_ssd_output.jpg" << std::endl;
-
         // -----------------------------------------------------------------------------------------------------
     } catch (const std::exception& ex) {
         std::cerr << ex.what() << std::endl;
diff --git a/inference-engine/samples/lenet_network_graph_builder/CMakeLists.txt b/inference-engine/samples/lenet_network_graph_builder/CMakeLists.txt
new file mode 100644 (file)
index 0000000..aab4788
--- /dev/null
@@ -0,0 +1,37 @@
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+cmake_minimum_required(VERSION 2.8)
+
+set (TARGET_NAME "lenet_network_graph_builder")
+
+file (GLOB MAIN_SRC
+        ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+        )
+
+file (GLOB MAIN_HEADERS
+        ${CMAKE_CURRENT_SOURCE_DIR}/*.h
+        )
+
+# Create named folders for the sources within the .vcproj
+# Empty name lists them directly under the .vcproj
+source_group("src" FILES ${MAIN_SRC})
+source_group("include" FILES ${MAIN_HEADERS})
+
+
+link_directories(${LIB_FOLDER})
+
+# Create executable file from sources.
+add_executable(${TARGET_NAME} ${MAIN_SRC} ${MAIN_HEADERS})
+
+add_dependencies(${TARGET_NAME} gflags)
+
+set_target_properties(${TARGET_NAME} PROPERTIES "CMAKE_CXX_FLAGS" "${CMAKE_CXX_FLAGS} -fPIE"
+        COMPILE_PDB_NAME ${TARGET_NAME})
+
+target_link_libraries(${TARGET_NAME} ${InferenceEngine_LIBRARIES} gflags format_reader)
+
+if(UNIX)
+    target_link_libraries( ${TARGET_NAME} ${LIB_DL} pthread)
+endif()
\ No newline at end of file
diff --git a/inference-engine/samples/lenet_network_graph_builder/LeNet.bin b/inference-engine/samples/lenet_network_graph_builder/LeNet.bin
new file mode 100644 (file)
index 0000000..7ce66d0
Binary files /dev/null and b/inference-engine/samples/lenet_network_graph_builder/LeNet.bin differ
diff --git a/inference-engine/samples/lenet_network_graph_builder/README.md b/inference-engine/samples/lenet_network_graph_builder/README.md
new file mode 100644 (file)
index 0000000..d7fdfb7
--- /dev/null
@@ -0,0 +1,54 @@
+# Lenet Number Classifications Network using Graph Builder API
+
+This sample demonstrates how to use the Inference Engine Graph Builder API to build a network and execute inference, using the LeNet classification network as an example.
+No XML file is required to build the network: the Graph Builder API allows constructing a network "on the fly" from source code. The sample uses single-channel ubyte images as input.
+<br>
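+
+A condensed sketch of what `main.cpp` in this sample does with the Builder API (weights handling omitted; see the full source below for the real layer parameters):
+
+```cpp
+Builder::Network builder("LeNet");
+size_t id = builder.addLayer(Builder::InputLayer("data").setPort(Port({1, 1, 28, 28})));
+id = builder.addLayer({{id}}, Builder::ReLULayer("relu1").setNegativeSlope(0.0f));
+builder.addLayer({PortInfo(id)}, Builder::OutputLayer("out"));
+CNNNetwork network{Builder::convertToICNNNetwork(builder.build())};
+```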
+
+## Running
+
+Running the application with the <code>-h</code> option yields the following usage message:
+```sh
+./lenet_network_graph_builder -h
+InferenceEngine:
+    API version ............ <version>
+    Build .................. <number>
+
+lenet_network_graph_builder [OPTION]
+Options:
+
+    -h                      Print a usage message.
+    -m "<path>"             Path to a .bin file with weights for trained model
+    -i "<path>"             Required. Path to image or folder with images
+    -d "<device>"           Specify the target device to infer on this. Sample will look for a suitable plugin for device specified(default value is CPU)
+    -pp "<path>"            Path to a plugin folder
+    -pc                     Enables per-layer performance report
+    -nt "<integer>"         Number of top results (default 10)
+    -ni "<integer>"         Number of iterations (default 1)
+
+```
+
+Running the application with an empty list of options yields the usage message given above.
+
+For example, to do inference on a ubyte image using a GPU, run the following command:
+```sh
+./lenet_network_graph_builder -i <path_to_image> -m <path_to_weights_file> -d GPU
+```
+
+### Outputs
+
+By default, the application outputs the top-10 inference results for each infer request.
+In addition, it reports a throughput value measured in frames per second.
+
+### How it works
+
+Upon start-up, the sample reads the command line parameters and builds a network using the Graph Builder API and the passed weights file.
+Then the application loads the built network and an image to the Inference Engine plugin.
+
+When inference is done, the application outputs the inference results to the standard output stream.
+
+## See Also
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
diff --git a/inference-engine/samples/lenet_network_graph_builder/lenet_network_graph_builder.hpp b/inference-engine/samples/lenet_network_graph_builder/lenet_network_graph_builder.hpp
new file mode 100644 (file)
index 0000000..7cb59e2
--- /dev/null
@@ -0,0 +1,90 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <gflags/gflags.h>
+#include <iostream>
+
+#ifdef _WIN32
+#include <os/windows/w_dirent.h>
+#else
+#include <dirent.h>
+#endif
+
+#define DEFAULT_PATH_P "./lib"
+
+/// @brief message for help argument
+static const char help_message[] = "Print a usage message";
+
+/// @brief message for images argument
+static const char input_message[] = "Required. Path to image or folder with images";
+
+/// @brief message for model argument
+static const char model_message[] = "Path to a .bin file with weights for trained model";
+
+/// @brief message for assigning cnn calculation to device
+static const char target_device_message[] = "Specify the target device to infer on. " \
+                                            "Sample will look for a suitable plugin for the device specified " \
+                                            "(default value is CPU)";
+
+/// @brief message for plugin_path argument
+static const char plugin_path_message[] = "Path to a plugin folder";
+
+/// @brief message for performance counters
+static const char performance_counter_message[] = "Enables per-layer performance report";
+
+/// @brief message for top results number
+static const char ntop_message[] = "Number of top results (default 10)";
+
+/// @brief message for iterations count
+static const char iterations_count_message[] = "Number of iterations (default 1)";
+
+/// \brief Define flag for showing help message <br>
+DEFINE_bool(h, false, help_message);
+
+/// \brief Define parameter for setting the weights file <br>
+/// It is a parameter
+DEFINE_string(m, "", model_message);
+
+/// \brief Define parameter for set image file <br>
+/// It is a required parameter
+DEFINE_string(i, "", input_message);
+
+/// \brief Define parameter for the target device to infer on <br>
+DEFINE_string(d, "CPU", target_device_message);
+
+/// \brief Define parameter for setting the path to plugins <br>
+/// Default is empty (the sample falls back to ../../../lib/intel64)
+DEFINE_string(pp, "", plugin_path_message);
+
+/// @brief Enable per-layer performance report
+DEFINE_bool(pc, false, performance_counter_message);
+
+/// @brief Top results number (default 10) <br>
+DEFINE_int32(nt, 10, ntop_message);
+
+/// @brief Iterations count (default 1)
+DEFINE_int32(ni, 1, iterations_count_message);
+
+/**
+ * \brief This function shows a help message
+ */
+static void showUsage() {
+    std::cout << std::endl;
+    std::cout << "lenet_network_graph_builder [OPTION]" << std::endl;
+    std::cout << "Options:" << std::endl;
+    std::cout << std::endl;
+    std::cout << "    -h                      " << help_message << std::endl;
+    std::cout << "    -m \"<path>\"             " << model_message << std::endl;
+    std::cout << "    -i \"<path>\"             " << input_message << std::endl;
+    std::cout << "    -d \"<device>\"           " << target_device_message << std::endl;
+    std::cout << "    -pp \"<path>\"            " << plugin_path_message << std::endl;
+    std::cout << "    -pc                     " << performance_counter_message << std::endl;
+    std::cout << "    -nt \"<integer>\"         " << ntop_message << std::endl;
+    std::cout << "    -ni \"<integer>\"         " << iterations_count_message << std::endl;
+}
+
diff --git a/inference-engine/samples/lenet_network_graph_builder/main.cpp b/inference-engine/samples/lenet_network_graph_builder/main.cpp
new file mode 100644 (file)
index 0000000..cd9031a
--- /dev/null
@@ -0,0 +1,332 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <fstream>
+#include <vector>
+#include <string>
+#include <memory>
+
+#include <inference_engine.hpp>
+#include <ie_builders.hpp>
+#include <ie_utils.hpp>
+#include <format_reader_ptr.h>
+
+#include <samples/common.hpp>
+#include <samples/slog.hpp>
+#include <samples/args_helper.hpp>
+
+#include <gflags/gflags.h>
+#include "lenet_network_graph_builder.hpp"
+
+using namespace InferenceEngine;
+
+bool ParseAndCheckCommandLine(int argc, char *argv[]) {
+    slog::info << "Parsing input parameters" << slog::endl;
+
+    gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true);
+    if (FLAGS_h) {
+        showUsage();
+        return false;
+    }
+
+    if (FLAGS_ni <= 0) {
+        throw std::logic_error("Incorrect value for ni argument. It should be more than 0");
+    }
+
+    if (FLAGS_nt <= 0 || FLAGS_nt > 10) {
+        throw std::logic_error("Incorrect value for nt argument. It should be more than 0 and less than 10");
+    }
+
+    return true;
+}
+
+void readFile(const std::string &file_name, void *buffer, size_t maxSize) {
+    std::ifstream inputFile;
+
+    inputFile.open(file_name, std::ios::binary | std::ios::in);
+    if (!inputFile.is_open()) {
+        throw std::logic_error("cannot open file weight file");
+    }
+    if (!inputFile.read(reinterpret_cast<char *>(buffer), maxSize)) {
+        inputFile.close();
+        throw std::logic_error("cannot read bytes from weight file");
+    }
+
+    inputFile.close();
+}
+
+TBlob<uint8_t>::CPtr ReadWeights(std::string filepath) {
+    std::ifstream weightFile(filepath, std::ifstream::ate | std::ifstream::binary);
+    int64_t fileSize = weightFile.tellg();
+
+    if (fileSize < 0) {
+        throw std::logic_error("Incorrect weight file");
+    }
+
+    size_t ulFileSize = static_cast<size_t>(fileSize);
+
+    TBlob<uint8_t>::Ptr weightsPtr(new TBlob<uint8_t>(Precision::FP32, C, {ulFileSize}));
+    weightsPtr->allocate();
+    readFile(filepath, weightsPtr->buffer(), ulFileSize);
+
+    return weightsPtr;
+}
+
+/**
+ * @brief The entry point for the Inference Engine LeNet network graph builder sample
+ * @file lenet_network_graph_builder/main.cpp
+ * @example lenet_network_graph_builder/main.cpp
+ */
+int main(int argc, char *argv[]) {
+    try {
+        slog::info << "InferenceEngine: " << GetInferenceEngineVersion() << slog::endl;
+
+        if (!ParseAndCheckCommandLine(argc, argv)) {
+            return 0;
+        }
+
+        /** This vector stores paths to the processed images **/
+        std::vector<std::string> images;
+        parseInputFilesArguments(images);
+        if (images.empty()) {
+            throw std::logic_error("No suitable images were found");
+        }
+
+        // --------------------------- 1. Load Plugin for inference engine -------------------------------------
+        slog::info << "Loading plugin" << slog::endl;
+        InferencePlugin plugin = PluginDispatcher({FLAGS_pp, "../../../lib/intel64", ""}).getPluginByDevice(FLAGS_d);
+        printPluginVersion(plugin, std::cout);
+
+        /** Per layer metrics **/
+        if (FLAGS_pc) {
+            plugin.SetConfig({ { PluginConfigParams::KEY_PERF_COUNT, PluginConfigParams::YES } });
+        }
+        // -----------------------------------------------------------------------------------------------------
+
+        //--------------------------- 2. Create network using graph builder ------------------------------------
+        TBlob<uint8_t>::CPtr weightsPtr = ReadWeights(FLAGS_m);
+
+        Builder::Network builder("LeNet");
+        size_t layerId = builder.addLayer(Builder::InputLayer("data").setPort(Port({1, 1, 28, 28})));
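+        // The weight/bias blobs below are zero-copy views into weightsPtr;
+        // offsets are counted in float elements (expressions such as
+        // "102280 / 4" convert byte offsets into element offsets).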
+        auto ptrWeights = make_shared_blob(TensorDesc(Precision::FP32, {500}, Layout::C),
+                weightsPtr->cbuffer().as<float *>());
+        auto ptrBiases = make_shared_blob(TensorDesc(Precision::FP32, {20}, Layout::C),
+                weightsPtr->cbuffer().as<float *>() + 500);
+        layerId = builder.addLayer({{layerId}}, Builder::ConvolutionLayer("conv1").setKernel({5, 5}).setDilation({1, 1})
+                  .setGroup(1).setStrides({1, 1}).setOutDepth(20).setPaddingsBegin({0, 0}).setPaddingsEnd({0, 0})
+                  .setWeights(ptrWeights).setBiases(ptrBiases));
+        layerId = builder.addLayer({{layerId}}, Builder::PoolingLayer("pool1").setExcludePad(true).setKernel({2, 2})
+                  .setPaddingsBegin({0, 0}).setPaddingsEnd({0, 0})
+                  .setPoolingType(Builder::PoolingLayer::PoolingType::MAX)
+                  .setRoundingType(Builder::PoolingLayer::RoundingType::CEIL).setStrides({2, 2}));
+        ptrWeights = make_shared_blob(TensorDesc(Precision::FP32, {25000}, Layout::C),
+                weightsPtr->cbuffer().as<float *>() + 520);
+        ptrBiases = make_shared_blob(TensorDesc(Precision::FP32, {50}, Layout::C),
+                weightsPtr->cbuffer().as<float *>() + 25520);
+        layerId = builder.addLayer({{layerId}}, Builder::ConvolutionLayer("conv2").setDilation({1, 1}).setGroup(1)
+                  .setKernel({5, 5}).setOutDepth(50).setPaddingsBegin({0, 0}).setPaddingsEnd({0, 0})
+                  .setStrides({1, 1}).setWeights(ptrWeights).setBiases(ptrBiases));
+        layerId = builder.addLayer({{layerId}}, Builder::PoolingLayer("pool2").setExcludePad(true).setKernel({2, 2})
+                  .setPaddingsBegin({0, 0}).setPaddingsEnd({0, 0}).setPoolingType(Builder::PoolingLayer::PoolingType::MAX)
+                  .setRoundingType(Builder::PoolingLayer::RoundingType::CEIL).setStrides({2, 2}));
+        ptrWeights = make_shared_blob(TensorDesc(Precision::FP32, {400000}, Layout::C),
+                weightsPtr->cbuffer().as<float *>() + 102280 / 4);
+        ptrBiases = make_shared_blob(TensorDesc(Precision::FP32, {500}, Layout::C),
+                weightsPtr->cbuffer().as<float *>() + 1702280 / 4);
+        layerId = builder.addLayer({{layerId}}, Builder::FullyConnectedLayer("ip1").setOutputNum(500)
+                  .setWeights(ptrWeights).setBiases(ptrBiases));
+        layerId = builder.addLayer({{layerId}}, Builder::ReLULayer("relu1").setNegativeSlope(0.0f));
+        ptrWeights = make_shared_blob(TensorDesc(Precision::FP32, {5000}, Layout::C),
+                weightsPtr->cbuffer().as<float *>() + 1704280 / 4);
+        ptrBiases = make_shared_blob(TensorDesc(Precision::FP32, {10}, Layout::C),
+                weightsPtr->cbuffer().as<float *>() + 1724280 / 4);
+        layerId = builder.addLayer({{layerId}}, Builder::FullyConnectedLayer("ip2").setOutputNum(10)
+                  .setWeights(ptrWeights).setBiases(ptrBiases));
+        layerId = builder.addLayer({{layerId}}, Builder::SoftMaxLayer("prob").setAxis(1));
+        size_t outputId = builder.addLayer({PortInfo(layerId)}, Builder::OutputLayer("sf_out"));
+
+        CNNNetwork network{Builder::convertToICNNNetwork(builder.build())};
+        // -----------------------------------------------------------------------------------------------------
+
+        // --------------------------- 3. Configure input & output ---------------------------------------------
+        // --------------------------- Prepare input blobs -----------------------------------------------------
+        slog::info << "Preparing input blobs" << slog::endl;
+
+        InputsDataMap inputInfo = network.getInputsInfo();
+        if (inputInfo.size() != 1) {
+            throw std::logic_error("Sample supports topologies only with 1 input");
+        }
+
+        auto inputInfoItem = *inputInfo.begin();
+
+        /** Specifying the precision and layout of input data provided by the user.
+         * This should be called before load of the network to the plugin **/
+        inputInfoItem.second->setPrecision(Precision::FP32);
+        inputInfoItem.second->setLayout(Layout::NCHW);
+
+        std::vector<std::shared_ptr<unsigned char>> imagesData;
+        for (auto & i : images) {
+            FormatReader::ReaderPtr reader(i.c_str());
+            if (reader.get() == nullptr) {
+                slog::warn << "Image " + i + " cannot be read!" << slog::endl;
+                continue;
+            }
+            /** Store image data **/
+            std::shared_ptr<unsigned char> data(
+                    reader->getData(inputInfoItem.second->getTensorDesc().getDims()[3],
+                                    inputInfoItem.second->getTensorDesc().getDims()[2]));
+            if (data.get() != nullptr) {
+                imagesData.push_back(data);
+            }
+        }
+
+        if (imagesData.empty()) {
+            throw std::logic_error("Valid input images were not found!");
+        }
+
+        /** Setting batch size using image count **/
+        network.setBatchSize(imagesData.size());
+        size_t batchSize = network.getBatchSize();
+        slog::info << "Batch size is " << std::to_string(batchSize) << slog::endl;
+
+        // --------------------------- Prepare output blobs -----------------------------------------------------
+        slog::info << "Checking that the outputs are as the demo expects" << slog::endl;
+        OutputsDataMap outputInfo(network.getOutputsInfo());
+        std::string firstOutputName;
+
+        for (auto & item : outputInfo) {
+            if (firstOutputName.empty()) {
+                firstOutputName = item.first;
+            }
+            DataPtr outputData = item.second;
+            if (!outputData) {
+                throw std::logic_error("output data pointer is not valid");
+            }
+
+            item.second->setPrecision(Precision::FP32);
+        }
+
+        if (outputInfo.size() != 1) {
+            throw std::logic_error("This demo accepts networks having only one output");
+        }
+
+        DataPtr& output = outputInfo.begin()->second;
+        auto outputName = outputInfo.begin()->first;
+
+        const SizeVector outputDims = output->getTensorDesc().getDims();
+        const int classCount = outputDims[1];
+
+        if (classCount > 10) {
+            throw std::logic_error("Incorrect number of output classes for LeNet network");
+        }
+
+        if (outputDims.size() != 2) {
+            throw std::logic_error("Incorrect output dimensions for LeNet");
+        }
+        output->setPrecision(Precision::FP32);
+        output->setLayout(Layout::NC);
+
+        // -----------------------------------------------------------------------------------------------------
+
+        // --------------------------- 4. Loading model to the plugin ------------------------------------------
+        slog::info << "Loading model to the plugin" << slog::endl;
+        ExecutableNetwork exeNetwork = plugin.LoadNetwork(network, {});
+        // -----------------------------------------------------------------------------------------------------
+
+        // --------------------------- 5. Create infer request -------------------------------------------------
+        InferRequest infer_request = exeNetwork.CreateInferRequest();
+        // -----------------------------------------------------------------------------------------------------
+
+        // --------------------------- 6. Prepare input --------------------------------------------------------
+        /** Iterate over all the input blobs **/
+        for (const auto & item : inputInfo) {
+            /** Creating input blob **/
+            Blob::Ptr input = infer_request.GetBlob(item.first);
+
+            /** Filling input tensor with images. First b channel, then g and r channels **/
+            size_t num_channels = input->getTensorDesc().getDims()[1];
+            size_t image_size = input->getTensorDesc().getDims()[2] * input->getTensorDesc().getDims()[3];
+
+            auto data = input->buffer().as<PrecisionTrait<Precision::FP32>::value_type*>();
+
+            /** Iterate over all input images **/
+            for (size_t image_id = 0; image_id < imagesData.size(); ++image_id) {
+                /** Iterate over all pixel in image (b,g,r) **/
+                for (size_t pid = 0; pid < image_size; pid++) {
+                    /** Iterate over all channels **/
+                    for (size_t ch = 0; ch < num_channels; ++ch) {
+                        /**          [images stride + channels stride + pixel id ] all in bytes            **/
+                        data[image_id * image_size * num_channels + ch * image_size + pid ] = imagesData.at(image_id).get()[pid*num_channels + ch];
+                    }
+                }
+            }
+        }
+        inputInfo = {};
+        // -----------------------------------------------------------------------------------------------------
+
+        // --------------------------- 7. Do inference ---------------------------------------------------------
+        typedef std::chrono::high_resolution_clock Time;
+        typedef std::chrono::duration<double, std::ratio<1, 1000>> ms;
+        typedef std::chrono::duration<float> fsec;
+
+        double total = 0.0;
+        /** Start inference & calc performance **/
+        for (int iter = 0; iter < FLAGS_ni; ++iter) {
+            auto t0 = Time::now();
+            infer_request.Infer();
+            auto t1 = Time::now();
+            fsec fs = t1 - t0;
+            ms d = std::chrono::duration_cast<ms>(fs);
+            total += d.count();
+        }
+        // -----------------------------------------------------------------------------------------------------
+
+        // --------------------------- 8. Process output -------------------------------------------------------
+        slog::info << "Processing output blobs" << slog::endl;
+
+        const Blob::Ptr outputBlob = infer_request.GetBlob(firstOutputName);
+        auto outputData = outputBlob->buffer().as<PrecisionTrait<Precision::FP32>::value_type*>();
+
+        /** Validating -nt value **/
+        const int resultsCnt = outputBlob->size() / batchSize;
+        if (FLAGS_nt > resultsCnt || FLAGS_nt < 1) {
+            slog::warn << "-nt " << FLAGS_nt << " is not available for this network (-nt should be less than " \
+                      << resultsCnt+1 << " and more than 0)\n            will be used maximal value : " << resultsCnt;
+            FLAGS_nt = resultsCnt;
+        }
+
+        /** This vector stores id's of top N results **/
+        std::vector<unsigned> results;
+        TopResults(FLAGS_nt, *outputBlob, results);
+
+        std::cout << std::endl << "Top " << FLAGS_nt << " results:" << std::endl << std::endl;
+
+        /** Print the result iterating over each batch **/
+        for (int image_id = 0; image_id < batchSize; ++image_id) {
+            std::cout << "Image " << images[image_id] << std::endl << std::endl;
+            for (size_t id = image_id * FLAGS_nt, cnt = 0; cnt < FLAGS_nt; ++cnt, ++id) {
+                std::cout.precision(7);
+                /** Getting probability for resulting class **/
+                const auto result = outputData[results[id] + image_id*(outputBlob->size() / batchSize)];
+                std::cout << std::left << std::fixed << "Number: " << results[id] << "; Probability: " << result << std::endl;
+            }
+            std::cout << std::endl;
+        }
+        // -----------------------------------------------------------------------------------------------------
+        std::cout << std::endl << "total inference time: " << total << std::endl;
+        std::cout << "Average running time of one iteration: " << total / static_cast<double>(FLAGS_ni) << " ms" << std::endl;
+        std::cout << std::endl << "Throughput: " << 1000 * static_cast<double>(FLAGS_ni) * batchSize / total << " FPS" << std::endl;
+        std::cout << std::endl;
+        // -----------------------------------------------------------------------------------------------------
+
+        /** Show performance results **/
+        if (FLAGS_pc) {
+            printPerformanceCounts(infer_request, std::cout);
+        }
+    } catch  (const std::exception &ex) {
+        slog::err << ex.what() << slog::endl;
+        return 3;
+    }
+    return 0;
+}
\ No newline at end of file
index 43403a9..60cd38e 100644 (file)
@@ -1,25 +1,11 @@
-# Copyright (c) 2018 Intel Corporation
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#      http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 cmake_minimum_required(VERSION 2.8)
 
 set (TARGET_NAME "object_detection_sample_ssd")
 
-if( BUILD_SAMPLE_NAME AND NOT ${BUILD_SAMPLE_NAME} STREQUAL ${TARGET_NAME} )
-    message(STATUS "SAMPLE ${TARGET_NAME} SKIPPED")
-    return()
-endif()
-
 file (GLOB MAIN_SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
         )
index 116a7e6..dc6f477 100644 (file)
@@ -33,13 +33,19 @@ Running the application with the empty list of options yields the usage message
 
 To run the sample, you can use a set of pre-trained and optimized models delivered with the package or a Caffe* public model.
 
-**NOTE**: A public model should be converted to the Inference Engine format (`.xml` + `.bin`) using the Model Optimizer tool. For Model Optimizer documentation, see https://software.intel.com/en-us/articles/OpenVINO-ModelOptimizer.
+**NOTE**: A public model should be converted to the Inference Engine format (`.xml` + `.bin`) using the Model Optimizer tool. For Model Optimizer documentation, see https://software.intel.com/en-us/articles/OpenVINO-ModelOptimizer.
 
-For example, to do inference on a CPU with the OpenVINO&trade; toolkit person detection SSD model (`<INSTAL_DIR>/deployment_tools/intel_models/person-detection-retail-00013`), run the following command:
+For example, to do inference on a CPU with the OpenVINO&trade; toolkit person detection SSD models, run one of the following commands:
 
 ```sh
-./object_detection_sample_ssd -i <path_to_image>/inputImage.bmp -m person-detection-retail-0013.xml -d CPU
+./object_detection_sample_ssd -i <path_to_image>/inputImage.bmp -m <INSTALL_DIR>/deployment_tools/intel_models/person-detection-retail-0013/FP32/person-detection-retail-0013.xml -d CPU
 ```
+or
+```sh
+./object_detection_sample_ssd -i <path_to_image>/inputImage.jpg -m <INSTALL_DIR>/deployment_tools/intel_models/person-detection-retail-0002/FP32/person-detection-retail-0002.xml -d CPU
+```
+
+> **NOTE**: Before running the sample with another trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
 
 ### Outputs
 
@@ -54,4 +60,4 @@ Engine plugin. When inference is done, the application creates an
 output image and outputs data to the standard output stream.
 
 ## See Also 
-* [Using Inference Engine Samples](./docs/Inference_Engine_Developer_Guide/Samples_Overview.md)
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
diff --git a/inference-engine/samples/perfcheck/CMakeLists.txt b/inference-engine/samples/perfcheck/CMakeLists.txt
new file mode 100644 (file)
index 0000000..bc08b7d
--- /dev/null
@@ -0,0 +1,55 @@
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set(TARGET_NAME perfcheck)
+
+find_package(OpenCV COMPONENTS imgproc QUIET)
+if(NOT(OpenCV_FOUND))
+    message(WARNING "OPENCV is disabled or not found, " ${TARGET_NAME} " skipped")
+    return()
+endif()
+
+file(GLOB SOURCES *.cpp)
+
+add_executable(${TARGET_NAME} ${SOURCES})
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    target_compile_options(${TARGET_NAME}
+        PRIVATE "-Wall"
+        PRIVATE "-Wextra"
+        PRIVATE "-Wformat"
+        PRIVATE "-Wno-missing-field-initializers"
+    )
+endif()
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    target_compile_options(${TARGET_NAME}
+        PRIVATE "-Weverything"
+        PRIVATE "-Wno-c++98-compat"
+        PRIVATE "-Wno-global-constructors"
+        PRIVATE "-Wno-missing-variable-declarations"
+        PRIVATE "-Wno-exit-time-destructors"
+        PRIVATE "-Wno-undefined-func-template"
+    )
+endif()
+
+target_include_directories(${TARGET_NAME} SYSTEM PRIVATE
+    "${IE_MAIN_SOURCE_DIR}/include"
+    "${IE_MAIN_SOURCE_DIR}/samples/common"
+    "${IE_MAIN_SOURCE_DIR}/samples/common/format_reader"
+    "${IE_MAIN_SOURCE_DIR}/samples/common/os/windows"
+    "${CMAKE_SOURCE_DIR}/src/vpu/graph_transformer/include"
+    ${OpenCV_INCLUDE_DIRS}
+)
+
+add_dependencies(${TARGET_NAME} gflags IE::ie_cpu_extension)
+target_link_libraries(${TARGET_NAME} PRIVATE
+    ${InferenceEngine_LIBRARIES} format_reader
+    IE::ie_cpu_extension
+    ${OpenCV_LIBS}
+    gflags
+    ${CMAKE_DL_LIBS}
+)
+
+set_target_properties(${TARGET_NAME} PROPERTIES "CMAKE_CXX_FLAGS" "${CMAKE_CXX_FLAGS} -fPIE")
diff --git a/inference-engine/samples/perfcheck/README.md b/inference-engine/samples/perfcheck/README.md
new file mode 100644 (file)
index 0000000..daf0448
--- /dev/null
@@ -0,0 +1,73 @@
+# Perfcheck Sample
+
+This topic demonstrates how to build and run the Perfcheck sample application, which estimates performance by calculating minimum, average, and maximum FPS.
+
+## How It Works
+
+Upon start-up, the sample application reads command line parameters and loads a network and its inputs from a given directory to the Inference Engine plugin.
+Then the application starts infer requests in asynchronous mode until the specified number of iterations is finished.
+After the inference stage, the Perfcheck sample computes the total execution time, divides it into 10 intervals, and evaluates minimum, average, and maximum FPS among these intervals.
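+For example, with the numbers from the sample output below: 1000 iterations with batch 1 over 8954.61 ms give an average of 1000 * 1000 / 8954.61 ≈ 111.7 FPS.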
+
+## Running
+
+Running the application with the <code>-h</code> option yields the following usage message:
+
+```sh
+./perfcheck -h
+[ INFO ] Inference Engine:
+        API version ............ <version>
+        Build .................. <number>
+
+perfcheck [OPTIONS]
+[OPTIONS]:
+        -m                       <value>        Required. Path to an .xml file with a trained model.
+        -h                                      Optional. Print a usage message.
+        -d                       <value>        Optional. Specify the target device to infer on. Sample will look for a suitable plugin for the device specified. Default value: CPU.
+        -pp                      <value>        Optional. Path to a plugin folder.
+        -l                       <value>        Optional. Required for CPU custom layers. Absolute path to a shared library with the kernels implementation.
+        -c                       <value>        Optional. Required for GPU custom kernels. Absolute path to an .xml file with the kernels description.
+        -inputs_dir              <value>        Optional. Path to a folder with images and binaries for inputs. Default value: ".".
+        -config                  <value>        Optional. Path to a configuration file.
+        -num_iterations          <value>        Optional. Specify number of iterations. Default value: 1000. Must be greater than or equal to 1000.
+        -batch                   <value>        Optional. Specify batch. Default value: 1.
+        -num_networks            <value>        Optional. Specify number of networks. Default value: 1. Must be less than or equal to 16.
+        -num_requests            <value>        Optional. Specify number of infer requests. Default value depends on specified device.
+        -num_fpga_devices        <value>        Optional. Specify number of FPGA devices. Default value: 1.
+```
+
+Running the application with an empty list of options yields an error message.
+
+You can use the following command to do inference on Intel® Processors on images from a folder using a trained Faster R-CNN network:
+
+```sh
+./perfcheck -m <path_to_model>/faster_rcnn.xml -inputs_dir <path_to_inputs> -d CPU
+```
+
+> **NOTE**: Public models should be first converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](https://software.intel.com/en-us/articles/OpenVINO-ModelOptimizer).
+
+## Sample Output
+
+The application outputs performance statistics: total execution time (in milliseconds), number of iterations, batch size, and minimum, average, and maximum FPS.
+Example of sample output:
+
+```sh
+[ INFO ] Inference Engine:
+       API version ............ <version>
+       Build .................. <number>
+[ INFO ] Loading network files:
+[ INFO ]       <path_to_model_xml_file>
+[ INFO ]       <path_to_model_bin_file>
+[ INFO ] Loading network 0
+[ INFO ] All networks are loaded
+
+Total time:     8954.61 ms
+Num iterations: 1000
+Batch:          1
+Min fps:        110.558
+Avg fps:        111.674
+Max fps:        112.791
+```
+
+## See Also
+
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
diff --git a/inference-engine/samples/perfcheck/main.cpp b/inference-engine/samples/perfcheck/main.cpp
new file mode 100644 (file)
index 0000000..88d5de9
--- /dev/null
@@ -0,0 +1,552 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#if defined(_WIN32)
+#include <os/windows/w_dirent.h>
+#else
+#include <sys/stat.h>
+#include <dirent.h>
+#endif
+
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+#include <memory>
+#include <map>
+#include <cmath>
+#include <future>
+#include <atomic>
+#include <algorithm>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <mutex>
+#include <limits>
+
+#include <gflags/gflags.h>
+#include <opencv2/opencv.hpp>
+
+#include "inference_engine.hpp"
+#include "ext_list.hpp"
+
+//#include "vpu/vpu_plugin_config.hpp"
+#include "samples/common.hpp"
+#include "samples/slog.hpp"
+
+#include "perfcheck.h"
+
+
+static bool parseCommandLine(int *argc, char ***argv) {
+    gflags::ParseCommandLineNonHelpFlags(argc, argv, true);
+
+    if (FLAGS_h) {
+        showUsage();
+        return false;
+    }
+
+    if (FLAGS_m.empty()) {
+        throw std::invalid_argument("Path to model xml file is required");
+    }
+
+    if (FLAGS_num_iterations < MIN_ITERATIONS) {
+        throw std::invalid_argument("Number of iterations must be not smaller than 1000. "
+                                    "Got " + std::to_string(FLAGS_num_iterations));
+    }
+
+    if (MAX_NETWORKS < FLAGS_num_networks) {
+        throw std::invalid_argument("Only number of networks not greater than " + std::to_string(MAX_NETWORKS) + " "
+                                    "is supported. Got " + std::to_string(FLAGS_num_networks));
+    }
+
+    if (FLAGS_d.empty()) {
+        throw std::invalid_argument("Plugin name is required");
+    }
+
+    if (1 < *argc) {
+        std::stringstream message;
+        message << "Unknown arguments: ";
+        for (auto arg = 1; arg < *argc; arg++) {
+            message << (*argv)[arg];
+            if (arg + 1 < *argc) {
+                message << " ";
+            }
+        }
+        throw std::invalid_argument(message.str());
+    }
+
+    return true;
+}
+
+static std::map<std::string, std::string> parseConfig(const std::string &configName, char comment = '#') {
+    std::map<std::string, std::string> config = {};
+
+    std::ifstream file(configName);
+    if (!file.is_open()) {
+        return config;
+    }
+
+    std::string key, value;
+    while (file >> key >> value) {
+        if (key.empty() || key[0] == comment) {
+            continue;
+        }
+        config[key] = value;
+    }
+
+    return config;
+}
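+
+// Example of a file accepted by parseConfig (whitespace-separated key/value
+// pairs; a pair whose key begins with '#' is skipped). The key name below is
+// illustrative only:
+//
+//     #disabled_key some_value
+//     PERF_COUNT YES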
+
+static std::size_t getNumberRequests(const std::string &plugin) {
+    static const std::unordered_map<std::string, std::size_t> supported_plugins = {
+        { "MYRIAD", 4   },
+        { "HDDL",   100 },
+        { "FPGA",   3   },
+    };
+
+    auto device = plugin;
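+    // For a heterogeneous string such as "HETERO:FPGA,CPU", extract the
+    // primary device ("FPGA"); it selects the default request count below.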
+    if (plugin.find("HETERO:") == 0) {
+        auto separator   = plugin.find(",");
+        auto deviceBegin = std::string("HETERO:").size();
+        auto deviceEnd   = separator == std::string::npos ? plugin.size() : separator;
+        device = plugin.substr(deviceBegin, deviceEnd - deviceBegin);
+    }
+
+    auto num_requests = supported_plugins.find(device);
+    return num_requests == supported_plugins.end() ? 1 : num_requests->second;
+}
+
+#if defined(WIN32)
+typedef std::chrono::time_point<std::chrono::steady_clock> time_point;
+#else
+typedef std::chrono::time_point<std::chrono::system_clock> time_point;
+#endif
+
+static void printFPS(std::size_t num_requests, std::size_t num_intervals, const std::vector<time_point> &points) {
+    std::size_t num_exclude = 2 * num_requests;
+    /* evaluate from the end of previous */
+    std::size_t first_point = num_exclude - 1;
+    std::size_t last_point  = points.size() - num_exclude;
+    auto begin = points[first_point];
+    auto end   = points[last_point - 1];
+
+    using ms = std::chrono::duration<double, std::ratio<1, 1000>>;
+
+    auto num_iterations = last_point - first_point - 1;
+    auto total = std::chrono::duration_cast<ms>(end - begin).count();
+    auto avg_fps = static_cast<double>(num_iterations) * 1000.0 * FLAGS_batch / total;
+
+    auto min_fps = std::numeric_limits<double>::max();
+    auto max_fps = std::numeric_limits<double>::min();
+    double step = total / num_intervals;
+    std::size_t first_point_in_interval = first_point + 1;
+    auto first_time_in_interval = std::chrono::time_point_cast<ms>(begin);
+    for (std::size_t interval = 0; interval < num_intervals; interval++) {
+        std::size_t num_points_in_interval = 0;
+        auto last_time_in_interval = first_time_in_interval + ms(step);
+        if (interval == num_intervals - 1) {
+            last_time_in_interval = end;
+        }
+
+        while (first_point_in_interval + num_points_in_interval < last_point &&
+               points[first_point_in_interval + num_points_in_interval] <= last_time_in_interval) {
+            num_points_in_interval++;
+        }
+
+        double fps = num_points_in_interval * FLAGS_batch / step * 1000;
+        min_fps = std::min(min_fps, fps);
+        max_fps = std::max(max_fps, fps);
+
+        first_point_in_interval += num_points_in_interval;
+        first_time_in_interval = last_time_in_interval;
+    }
+
+    std::cout << std::endl;
+    std::cout << "Total time:     " << total << " ms";
+    std::cout << std::endl;
+
+    std::cout << "Num iterations: " << num_iterations << std::endl;
+    std::cout << "Batch:          " << FLAGS_batch << std::endl;
+
+    std::cout << "Min fps:        " << min_fps << std::endl;
+    std::cout << "Avg fps:        " << avg_fps << std::endl;
+    std::cout << "Max fps:        " << max_fps << std::endl;
+}
+
+template<typename T>
+static bool isImage(const T &blob) {
+    auto descriptor = blob->getTensorDesc();
+    if (descriptor.getLayout() != InferenceEngine::NCHW) {
+        return false;
+    }
+
+    auto channels = descriptor.getDims()[1];
+    return channels == 3;
+}
+
+static std::vector<std::string> extractFilesByExtension(const std::string &directory, const std::string &extension) {
+    std::vector<std::string> files;
+
+    DIR *dir = opendir(directory.c_str());
+    if (!dir) {
+        throw std::invalid_argument("Can not open " + directory);
+    }
+
+    auto getExtension = [](const std::string &name) {
+        auto extensionPosition = name.rfind('.', name.size());
+        return extensionPosition == std::string::npos ? "" : name.substr(extensionPosition + 1, name.size() - 1);
+    };
+
+    dirent *ent = nullptr;
+    while ((ent = readdir(dir))) {
+        std::string file_name = ent->d_name;
+        if (getExtension(file_name) != extension) {
+            continue;
+        }
+
+        std::stringstream stream;
+        stream << directory << "/" << file_name;
+
+        auto full_file_name = stream.str();
+
+        struct stat st = {};
+        if (stat(full_file_name.c_str(), &st) != 0) {
+            continue;
+        }
+
+        bool is_directory = (st.st_mode & S_IFDIR) != 0;
+        if (is_directory) {
+            continue;
+        }
+
+        files.push_back(full_file_name);
+    }
+
+    closedir(dir);
+
+    return files;
+}
+
+static float asfloat(uint32_t v) {
+    union {
+        float f;
+        std::uint32_t u;
+    } converter = {0};
+    converter.u = v;
+    return converter.f;
+}
+
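+// Scalar float32 -> float16 conversion: NaN/Inf pass through, finite values
+// are rounded to nearest via a half-ULP bias, too-small magnitudes collapse
+// to signed zero or the smallest normal half, and overflow saturates to the
+// largest finite half value.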
+static short f32tof16(float x) {
+    static float min16 = asfloat((127 - 14) << 23);
+
+    static float max16 = asfloat(((127 + 15) << 23) | 0x007FE000);
+    static uint32_t max16f16 = ((15 + 15) << 10) | 0x3FF;
+
+    static constexpr std::uint32_t EXP_MASK_F32 = 0x7F800000U;
+
+    union {
+        float f;
+        uint32_t u;
+    } v = {0};
+    v.f = x;
+
+    uint32_t s = (v.u >> 16) & 0x8000;
+
+    v.u &= 0x7FFFFFFF;
+
+    if ((v.u & EXP_MASK_F32) == EXP_MASK_F32) {
+        if (v.u & 0x007FFFFF) {
+            return static_cast<short>(s | (v.u >> (23 - 10)) | 0x0200);
+        } else {
+            return static_cast<short>(s | (v.u >> (23 - 10)));
+        }
+    }
+
+    float halfULP = asfloat(v.u & EXP_MASK_F32) * asfloat((127 - 11) << 23);
+    v.f += halfULP;
+
+    if (v.f < min16 * 0.5f) {
+        return static_cast<short>(s);
+    }
+
+    if (v.f < min16) {
+        return static_cast<short>(s | (1 << 10));
+    }
+
+    if (v.f >= max16) {
+        return static_cast<short>(max16f16 | s);
+    }
+
+    v.u -= ((127 - 15) << 23);
+
+    v.u >>= (23 - 10);
+
+    return static_cast<short>(v.u | s);
+}
+
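+// Fills the blob from an image file: reads it with OpenCV, resamples to the
+// blob's spatial size with nearest-neighbor interpolation, and scatters the
+// pixels in NCHW or NHWC order (FP32 or FP16), replicating the image across
+// the whole batch.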
+static void loadImage(const std::string &imageFilename, InferenceEngine::Blob::Ptr &blob) {
+    InferenceEngine::TensorDesc tensDesc = blob->getTensorDesc();
+
+    cv::Mat image = cv::imread(imageFilename);
+    if (image.empty()) {
+        throw std::invalid_argument("Can not read image from " + imageFilename);
+    }
+
+    std::size_t batch = blob->dims()[3];
+    std::size_t w = blob->dims()[0];
+    std::size_t h = blob->dims()[1];
+    auto img_w = static_cast<std::size_t>(image.cols);
+    auto img_h = static_cast<std::size_t>(image.rows);
+
+    auto numBlobChannels = blob->dims()[2];
+    auto numImageChannels = static_cast<std::size_t>(image.channels());
+    if (numBlobChannels != numImageChannels && numBlobChannels != 1) {
+        throw std::invalid_argument("Input channels mismatch: image channels " + std::to_string(numImageChannels) +
+                                    ", network channels " + std::to_string(numBlobChannels) +
+                                    ", expecting count of image channels are equal to count if network channels"
+                                    "or count of network channels are equal to 1");
+    }
+
+    auto nPixels = w * h;
+    unsigned char *RGB8 = image.data;
+    float xscale = 1.0f * img_w / w;
+    float yscale = 1.0f * img_h / h;
+
+    for (std::size_t n = 0; n != batch; n++) {
+        for (std::size_t i = 0; i < h; ++i) {
+            auto y = static_cast<std::size_t>(std::floor((i + 0.5f) * yscale));
+            for (std::size_t j = 0; j < w; ++j) {
+                auto x = static_cast<std::size_t>(std::floor((j + 0.5f) * xscale));
+                for (std::size_t k = 0; k < numBlobChannels; k++) {
+                    float value = 1.0f * RGB8[(y * img_w + x) * numImageChannels + k];
+                    if (InferenceEngine::Precision::FP16 == tensDesc.getPrecision()) {
+                        if (tensDesc.getLayout() == InferenceEngine::NHWC) {
+                            blob->buffer().as<std::int16_t *>()[n * h * w * numBlobChannels + (i * w + j) * numBlobChannels + k] = f32tof16(value);
+                        } else {
+                            blob->buffer().as<std::int16_t *>()[n * h * w * numBlobChannels + (i * w + j) + k * nPixels] = f32tof16(value);
+                        }
+                    } else {
+                        if (tensDesc.getLayout() == InferenceEngine::NHWC) {
+                            blob->buffer().as<float *>()[n * h * w * numBlobChannels + (i * w + j) * numBlobChannels + k] = value;
+                        } else {
+                            blob->buffer().as<float *>()[n * h * w * numBlobChannels + (i * w + j) + k * nPixels] = value;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void loadBinaryTensor(const std::string &binaryFileName, InferenceEngine::Blob::Ptr &blob) {
+    InferenceEngine::TensorDesc tensDesc = blob->getTensorDesc();
+
+    std::ifstream binaryFile(binaryFileName, std::ios_base::binary | std::ios_base::ate);
+    if (!binaryFile) {
+        throw std::invalid_argument("Can not open \"" + binaryFileName + "\"");
+    }
+
+    auto fileSize = static_cast<std::size_t>(binaryFile.tellg());
+    binaryFile.seekg(0, std::ios_base::beg);
+    if (!binaryFile.good()) {
+        throw std::invalid_argument("Can not read \"" + binaryFileName + "\"");
+    }
+
+    auto networkSize = blob->size() * sizeof(float);
+    if (fileSize != networkSize) {
+        throw std::invalid_argument("File \"" + binaryFileName + "\" contains " + std::to_string(fileSize) + " bytes "
+                                    "but network expects " + std::to_string(networkSize));
+    }
+
+    for (std::size_t i = 0; i < blob->size(); i++) {
+        float src = 0.f;
+        binaryFile.read(reinterpret_cast<char *>(&src), sizeof(float));
+        if (InferenceEngine::Precision::FP16 == tensDesc.getPrecision()) {
+            blob->buffer().as<std::int16_t *>()[i] = f32tof16(src);
+        } else {
+            blob->buffer().as<float *>()[i] = src;
+        }
+    }
+}
+
+static void loadInputs(std::size_t requestIdx, const std::vector<std::string> &images,
+                       const std::vector<std::string> &binaries, InferenceEngine::InferRequest &request,
+                       InferenceEngine::CNNNetwork &network) {
+    for (auto &&input : network.getInputsInfo()) {
+        auto blob = request.GetBlob(input.first);
+
+        if (isImage(blob)) {
+            loadImage(images[requestIdx % images.size()], blob);
+        } else {
+            loadBinaryTensor(binaries[requestIdx % binaries.size()], blob);
+        }
+    }
+}
+
+int main(int argc, char *argv[]) {
+    try {
+        slog::info << "Inference Engine: " << InferenceEngine::GetInferenceEngineVersion() << slog::endl;
+
+        if (!parseCommandLine(&argc, &argv)) {
+            return EXIT_SUCCESS;
+        }
+
+        std::string binFileName = fileNameNoExt(FLAGS_m) + ".bin";
+        slog::info << "Loading network files:" <<
+            slog::endl << "\t" << FLAGS_m <<
+            slog::endl << "\t" << binFileName <<
+        slog::endl;
+
+        InferenceEngine::CNNNetReader networkReader;
+        networkReader.ReadNetwork(FLAGS_m);
+        networkReader.ReadWeights(binFileName);
+
+        auto network = networkReader.getNetwork();
+        network.setBatchSize(FLAGS_batch);
+
+        if (FLAGS_d.find("MYRIAD") != std::string::npos || FLAGS_d.find("HDDL") != std::string::npos) {
+            /**
+             * on VPU devices FP16 precision avoids extra conversion operations and delivers better performance
+             **/
+            for (auto &&input : network.getInputsInfo()) {
+                input.second->setPrecision(InferenceEngine::Precision::FP16);
+            }
+
+            for (auto &&output : network.getOutputsInfo()) {
+                output.second->setPrecision(InferenceEngine::Precision::FP16);
+            }
+        }
+
+        auto plugin = InferenceEngine::PluginDispatcher({FLAGS_pp, "../../../lib/intel64", ""}).getPluginByDevice(FLAGS_d);
+
+        /* If CPU device, load the default library with extensions that come with the product */
+        if (FLAGS_d.find("CPU") != std::string::npos) {
+            /**
+             * cpu_extensions library is compiled from the "extension" folder containing
+             * custom MKLDNNPlugin layer implementations. These layers are not supported
+             * by mkldnn, but they can be useful for running inference on custom topologies.
+             **/
+            plugin.AddExtension(std::make_shared<InferenceEngine::Extensions::Cpu::CpuExtensions>());
+        }
+
+        if (!FLAGS_l.empty()) {
+            plugin.AddExtension(InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(FLAGS_l));
+            slog::info << "CPU Extension loaded: " << FLAGS_l << slog::endl;
+        }
+
+        if (!FLAGS_c.empty()) {
+            /* clDNN Extensions are loaded from an .xml description and OpenCL kernel files */
+            plugin.SetConfig({{InferenceEngine::PluginConfigParams::KEY_CONFIG_FILE, FLAGS_c}});
+            slog::info << "GPU Extension loaded: " << FLAGS_c << slog::endl;
+        }
+
+        auto config = parseConfig(FLAGS_config);
+        std::vector<InferenceEngine::ExecutableNetwork> networks(FLAGS_num_networks);
+        for (std::size_t net = 0; net < networks.size(); ++net) {
+            slog::info << "Loading network " << net;
+            if (FLAGS_d.find("FPGA") != std::string::npos) {
+                if (FLAGS_num_fpga_devices != 1) {
+                    config[InferenceEngine::PluginConfigParams::KEY_DEVICE_ID] = std::to_string(net % FLAGS_num_fpga_devices);
+                    slog::info << " to device " << (net % FLAGS_num_fpga_devices);
+                }
+            }
+            slog::info << slog::endl;
+
+            networks[net] = plugin.LoadNetwork(network, config);
+        }
+        slog::info << "All networks are loaded" << slog::endl;
+
+        auto num_requests = FLAGS_num_requests == 0 ? getNumberRequests(FLAGS_d) : FLAGS_num_requests;
+
+        auto images = extractFilesByExtension(FLAGS_inputs_dir, "bmp");
+        auto hasImageInput = [](const InferenceEngine::CNNNetwork &net) {
+            auto inputs = net.getInputsInfo();
+            auto isImageInput = [](const InferenceEngine::InputsDataMap::value_type &input) {
+                return isImage(input.second);
+            };
+            return std::any_of(inputs.begin(), inputs.end(), isImageInput);
+        };
+
+        if (hasImageInput(network) && images.empty()) {
+            throw std::invalid_argument("The directory \"" + FLAGS_inputs_dir + "\" does not contain images for network");
+        }
+
+        auto binaries = extractFilesByExtension(FLAGS_inputs_dir, "bin");
+        auto hasBinaryInput = [](const InferenceEngine::CNNNetwork &net) {
+            auto inputs = net.getInputsInfo();
+            auto isBinaryInput = [](const InferenceEngine::InputsDataMap::value_type &input) {
+                return !isImage(input.second);
+            };
+            return std::any_of(inputs.begin(), inputs.end(), isBinaryInput);
+        };
+
+        if (hasBinaryInput(network) && binaries.empty()) {
+            throw std::invalid_argument("The directory \"" + FLAGS_inputs_dir + "\" does not contain binaries for network");
+        }
+
+        std::size_t iteration{0};
+        std::mutex dump_time;
+        std::atomic<std::size_t> num_finished{0};
+
+        std::promise<void> done;
+        num_requests *= FLAGS_num_networks;
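+        // schedule 2 * num_requests extra iterations before and after the requested FLAGS_num_iterations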
+        std::size_t num_iterations = 2 * num_requests + FLAGS_num_iterations + 2 * num_requests;
+
+        std::vector<InferenceEngine::InferRequest> requests(num_requests);
+        std::vector<time_point> time_points(num_iterations);
+
+        using callback_t = std::function<void(InferenceEngine::InferRequest, InferenceEngine::StatusCode)>;
+
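+        // create one infer request per slot; each completion callback re-submits its request
+        // until the total iteration budget is exhausted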
+        for (std::size_t request = 0; request < num_requests; ++request) {
+            requests[request] = networks[request % networks.size()].CreateInferRequest();
+
+            loadInputs(request, images, binaries, requests[request], network);
+
+            callback_t callback =
+                [num_requests, num_iterations, &iteration, &time_points, &dump_time, &num_finished, &done]
+                (InferenceEngine::InferRequest inferRequest, InferenceEngine::StatusCode code) {
+                if (code != InferenceEngine::StatusCode::OK) {
+                    THROW_IE_EXCEPTION << "Infer request failed with code " << code;
+                }
+
+                std::size_t current_finished_iteration = 0;
+                {
+                    std::lock_guard<std::mutex> lock(dump_time);
+
+                    current_finished_iteration = iteration++;
+                    if (current_finished_iteration < num_iterations) {
+                        time_points[current_finished_iteration] = std::chrono::high_resolution_clock::now();
+                    }
+                }
+
+                if (current_finished_iteration < num_iterations - 1) {
+                    inferRequest.StartAsync();
+                } else {
+                    if (++num_finished == num_requests) {
+                        done.set_value();
+                    }
+                }
+            };
+
+            requests[request].SetCompletionCallback<callback_t>(callback);
+        }
+
+        auto doneFuture = done.get_future();
+
+        for (auto &&request : requests) {
+            request.StartAsync();
+        }
+
+        doneFuture.wait();
+
+        printFPS(num_requests, 10, time_points);
+    } catch (const std::exception &error) {
+        slog::err << error.what() << slog::endl;
+        return EXIT_FAILURE;
+    } catch (...) {
+        slog::err << "Unknown/internal exception happened." << slog::endl;
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git a/inference-engine/samples/perfcheck/perfcheck.h b/inference-engine/samples/perfcheck/perfcheck.h
new file mode 100644 (file)
index 0000000..01419f1
--- /dev/null
@@ -0,0 +1,90 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+static constexpr std::size_t MIN_ITERATIONS = 1000;
+static constexpr std::size_t MAX_NETWORKS   = 16;
+
+/// @brief message for model argument
+static constexpr char model_message[] = "Required. Path to an .xml file with a trained model.";
+DEFINE_string(m, "", model_message);
+
+/// @brief message for help argument
+static constexpr char help_message[] = "Optional. Print a usage message.";
+DEFINE_bool(h, false, help_message);
+
+/// @brief message for target_device argument
+static constexpr char target_device_message[] = "Optional. Specify the target device to infer on. " \
+"The sample will look for a suitable plugin for the specified device. Default: CPU.";
+DEFINE_string(d, "CPU", target_device_message);
+
+/// @brief message for plugin_path argument
+static constexpr char plugin_path_message[] = "Optional. Path to a plugin folder.";
+DEFINE_string(pp, "", plugin_path_message);
+
+/// @brief message for custom_cpu_library argument
+static constexpr char custom_cpu_library_message[] = "Optional. Required for CPU custom layers. " \
+"Absolute path to a shared library with the kernels implementation.";
+DEFINE_string(l, "", custom_cpu_library_message);
+
+/// @brief message for custom_gpu_library argument
+static constexpr char custom_gpu_library_message[] = "Optional. Required for GPU custom kernels. "\
+"Absolute path to the xml file with the kernels description.";
+DEFINE_string(c, "",  custom_gpu_library_message);
+
+/// @brief message for inputs_dir argument
+static constexpr char inputs_dir_message[] = "Optional. Path to a folder with images and binaries for inputs. " \
+"Default value: \".\".";
+DEFINE_string(inputs_dir, ".", inputs_dir_message);
+
+/// @brief message for config argument
+static constexpr char config_message[] = "Optional. Path to a configuration file.";
+DEFINE_string(config, "", config_message);
+
+/// @brief message for num_iterations argument
+static constexpr char num_iterations_message[] = "Optional. Specify number of iterations. " \
+"Default value: 1000. Must be greater than or equal to 1000.";
+DEFINE_uint32(num_iterations, MIN_ITERATIONS, num_iterations_message);
+
+/// @brief message for batch argument
+static constexpr char batch_message[] = "Optional. Specify batch. Default value: 1.";
+DEFINE_uint32(batch, 1, batch_message);
+
+/// @brief message for num_networks argument
+static constexpr char num_networks_message[] = "Optional. Specify number of networks. Default value: 1. Must be less than or equal to 16.";
+DEFINE_uint32(num_networks, 1, num_networks_message);
+
+/// @brief message for num_requests argument
+static constexpr char num_requests_message[] = "Optional. Specify number of infer requests. " \
+"Default value depends on specified device.";
+DEFINE_uint32(num_requests, 0, num_requests_message);
+
+/// @brief message for num_fpga_devices argument
+static constexpr char num_fpga_devices_message[]  = "Optional. Specify number of FPGA devices. Default value: 1.";
+DEFINE_uint32(num_fpga_devices, 1, num_fpga_devices_message);
+
+/**
+* \brief This function shows a help message
+*/
+static void showUsage() {
+    std::cout << std::endl;
+    std::cout << "perfcheck [OPTIONS]" << std::endl;
+    std::cout << "[OPTIONS]:" << std::endl;
+    std::cout << "\t-m                \t <value> \t" << model_message              << std::endl;
+    std::cout << "\t-h                \t         \t" << help_message               << std::endl;
+    std::cout << "\t-d                \t <value> \t" << target_device_message      << std::endl;
+    std::cout << "\t-pp               \t <value> \t" << plugin_path_message        << std::endl;
+    std::cout << "\t-l                \t <value> \t" << custom_cpu_library_message << std::endl;
+    std::cout << "\t-c                \t <value> \t" << custom_gpu_library_message << std::endl;
+    std::cout << "\t-inputs_dir       \t <value> \t" << inputs_dir_message         << std::endl;
+    std::cout << "\t-config           \t <value> \t" << config_message             << std::endl;
+    std::cout << "\t-num_iterations   \t <value> \t" << num_iterations_message     << std::endl;
+    std::cout << "\t-batch            \t <value> \t" << batch_message              << std::endl;
+    std::cout << "\t-num_networks     \t <value> \t" << num_networks_message       << std::endl;
+    std::cout << "\t-num_requests     \t <value> \t" << num_requests_message       << std::endl;
+    std::cout << "\t-num_fpga_devices \t <value> \t" << num_fpga_devices_message   << std::endl;
+
+    std::cout << std::endl;
+}
diff --git a/inference-engine/samples/speech_sample/CMakeLists.txt b/inference-engine/samples/speech_sample/CMakeLists.txt
new file mode 100644 (file)
index 0000000..33e7e72
--- /dev/null
@@ -0,0 +1,37 @@
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+cmake_minimum_required(VERSION 2.8)
+
+set (TARGET_NAME "speech_sample")
+
+file (GLOB MAIN_SRC
+        ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+        )
+
+file (GLOB MAIN_HEADERS
+        ${CMAKE_CURRENT_SOURCE_DIR}/*.h
+        )
+
+# Create named folders for the sources within the .vcproj
+# Empty name lists them directly under the .vcproj
+source_group("src" FILES ${MAIN_SRC})
+source_group("include" FILES ${MAIN_HEADERS})
+
+
+link_directories(${LIB_FOLDER})
+
+# Create executable file from sources.
+add_executable(${TARGET_NAME} ${MAIN_SRC} ${MAIN_HEADERS})
+
+add_dependencies(${TARGET_NAME} gflags)
+
+set_target_properties(${TARGET_NAME} PROPERTIES "CMAKE_CXX_FLAGS" "${CMAKE_CXX_FLAGS} -fPIE"
+COMPILE_PDB_NAME ${TARGET_NAME})
+
+target_link_libraries(${TARGET_NAME} ${InferenceEngine_LIBRARIES} gflags)
+
+if(UNIX)
+    target_link_libraries( ${TARGET_NAME} ${LIB_DL} pthread)
+endif()
diff --git a/inference-engine/samples/speech_sample/README.md b/inference-engine/samples/speech_sample/README.md
new file mode 100644 (file)
index 0000000..31f2b8d
--- /dev/null
@@ -0,0 +1,208 @@
+# Automatic Speech Recognition Sample
+
+This topic shows how to run the speech sample application, which
+demonstrates acoustic model inference based on Kaldi\* neural networks
+and speech feature vectors.
+
+## Running
+
+### Usage
+
+Running the application with the `-h` option yields the following
+usage message:
+
+```sh
+$ ./speech_sample -h
+InferenceEngine: 
+    API version ............ <version>
+    Build .................. <number>
+
+speech_sample [OPTION]
+Options:
+
+    -h                      Print a usage message.
+    -i "<path>"             Required. Path to an .ark file.
+    -m "<path>"             Required. Path to an .xml file with a trained model (required if -rg is missing).
+    -o "<path>"             Output file name (default name is scores.ark).
+    -l "<absolute_path>"    Required for MKLDNN (CPU)-targeted custom layers.Absolute path to a shared library with the kernels impl.
+    -d "<device>"           Specify the target device to infer on; CPU, GPU, GNA_AUTO, GNA_HW, GNA_SW, GNA_SW_EXACT is acceptable. Sample will look for a suitable plugin for device specified
+    -p                      Plugin name. For example MKLDNNPlugin. If this parameter is pointed, the sample will look for this plugin only
+    -pp                     Path to a plugin folder.
+    -pc                     Enables performance report
+    -q "<mode>"             Input quantization mode:  static (default), dynamic, or user (use with -sf).
+    -qb "<integer>"         Weight bits for quantization:  8 or 16 (default)
+    -sf "<double>"          Optional user-specified input scale factor for quantization (use with -q user).
+    -bs "<integer>"         Batch size 1-8 (default 1)
+    -r "<path>"             Read reference score .ark file and compare scores.
+    -rg "<path>"            Read GNA model from file using path/filename provided (required if -m is missing).
+    -wg "<path>"            Write GNA model to file using path/filename provided.
+    -we "<path>"            Write GNA embedded model to file using path/filename provided.
+    -nthreads "<integer>"   Optional. Number of threads to use for concurrent async inference requests on the GNA.
+
+```
+
+Running the application with an empty list of options yields the
+usage message given above and an error message.
+
+### Model Preparation
+
+> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
+You can use the following Model Optimizer command to convert a Kaldi
+nnet1 or nnet2 neural network to Intel IR format:
+
+```sh
+$ python3 mo.py --framework kaldi --input_model wsj_dnn5b_smbr.nnet --counts wsj_dnn5b_smbr.counts --remove_output_softmax
+```
+
+Assuming that the model optimizer (`mo.py`), Kaldi-trained neural
+network, `wsj_dnn5b_smbr.nnet`, and Kaldi class counts file,
+`wsj_dnn5b_smbr.counts`, are in the working directory, this produces
+the Intel IR network consisting of `wsj_dnn5b_smbr.xml` and
+`wsj_dnn5b_smbr.bin`.
+
+The following pretrained models are available:
+
+* wsj\_dnn5b\_smbr
+* rm\_lstm4f
+* rm\_cnn4a\_smbr
+
+All of them can be downloaded from [https://download.01.org/openvinotoolkit/2018_R3/models_contrib/GNA/](https://download.01.org/openvinotoolkit/2018_R3/models_contrib/GNA/).
+
+
+### Speech Inference
+
+Once the IR is created, you can use the following command to do
+inference on Intel&reg; processors with the GNA co-processor (or
+emulation library):
+
+```sh
+$ ./speech_sample -d GNA_AUTO -bs 2 -i wsj_dnn5b_smbr_dev93_10.ark -m wsj_dnn5b_smbr_fp32.xml -o scores.ark -r wsj_dnn5b_smbr_dev93_scores_10.ark
+```
+
+Here, the floating point Kaldi-generated reference neural network
+scores (`wsj_dnn5b_smbr_dev93_scores_10.ark`) corresponding to the input
+feature file (`wsj_dnn5b_smbr_dev93_10.ark`) are assumed to be available
+for comparison.
+
+### Sample Output
+
+The acoustic log likelihood sequences for all utterances are stored in
+the Kaldi ARK file, `scores.ark`.  If the `-r` option is used, a report on
+the statistical score error is generated for each utterance such as
+the following:
+
+```sh
+Utterance 0: 4k0c0301
+   Average inference time per frame: 6.26867 ms
+         max error: 0.0667191
+         avg error: 0.00473641
+     avg rms error: 0.00602212
+       stdev error: 0.00393488
+```
+
+## How it works
+
+Upon start-up, the speech_sample application reads command line parameters
+and loads a Kaldi-trained neural network along with Kaldi ARK speech
+feature vector file to the Inference Engine plugin. It then performs
+inference on all speech utterances stored in the input ARK
+file. Context-windowed speech frames are processed in batches of 1-8
+frames according to the `-bs` parameter.  Batching across utterances is
+not supported by this sample.  When inference is done, the application
+creates an output ARK file.  If the `-r` option is given, error
+statistics are provided for each speech utterance as shown above.
+
+### GNA-specific details
+
+#### Quantization
+
+If the GNA device is selected (for example, using the `-d` GNA flag),
+the GNA Inference Engine plugin quantizes the model and input feature
+vector sequence to integer representation before performing inference.
+Several parameters control neural network quantization.  The `-q` flag
+determines the quantization mode.  Three modes are supported: static,
+dynamic, and user-defined.  In static quantization mode, the first
+utterance in the input ARK file is scanned for dynamic range.  The
+scale factor (floating point scalar multiplier) required to scale the
+maximum input value of the first utterance to 16384 (15 bits) is used
+for all subsequent inputs.  The neural network is quantized to
+accommodate the scaled input dynamic range.  In user-defined
+quantization mode, the user may specify a scale factor via the `-sf`
+flag that will be used for static quantization.  In dynamic
+quantization mode, the scale factor for each input batch is computed
+just before inference on that batch.  The input and network are
+(re)quantized on-the-fly using an efficient procedure.
+
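+As a minimal sketch, assuming a plain float feature buffer (this mirrors the
+sample's own `ScaleFactorForQuantization` helper rather than the plugin's
+internal code), the static scale factor can be computed like this:
+
+```cpp
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+
+// Illustration only: map the largest absolute feature value to 16384 (15 bits).
+static float staticScaleFactor(const float *features, std::size_t n) {
+    float maxAbs = 0.0f;
+    for (std::size_t i = 0; i < n; ++i) {
+        maxAbs = std::max(maxAbs, std::abs(features[i]));
+    }
+    return (maxAbs == 0.0f) ? 1.0f : 16384.0f / maxAbs;
+}
+```
+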
+The `-qb` flag provides a hint to the GNA plugin regarding the preferred
+target weight resolution for all layers.  For example, when `-qb 8` is
+specified, the plugin will use 8-bit weights wherever possible in the
+network.  Note that it is not always possible to use 8-bit weights due
+to GNA hardware limitations.  For example, convolutional layers always
+use 16-bit weights (GNA hardware versions 1 and 2).  This limitation
+will be removed in GNA hardware version 3 and higher.
+
+#### Execution Modes
+
+Several execution modes are supported via the `-d` flag.  If the device
+is set to `CPU` and the GNA plugin is selected, the GNA device is
+emulated in fast-but-not-bit-exact mode.  If the device is set to
+`GNA_AUTO`, then the GNA hardware is used if available and the driver is
+installed.  Otherwise, the GNA device is emulated in
+fast-but-not-bit-exact mode.  If the device is set to `GNA_HW`, then the
+GNA hardware is used if available and the driver is installed.
+Otherwise, an error will occur.  If the device is set to `GNA_SW`, the
+GNA device is emulated in fast-but-not-bit-exact mode.  Finally, if
+the device is set to `GNA_SW_EXACT`, the GNA device is emulated in
+bit-exact mode.
+
+#### Loading and Saving Models
+
+The GNA plugin supports loading and saving of the GNA-optimized model
+(non-IR) via the `-rg` and `-wg` flags.  Thereby, it is possible to avoid
+the cost of full model quantization at run time. The GNA plugin also
+supports export of firmware-compatible embedded model images for the
+Intel® Speech Enabling Developer Kit and Amazon Alexa* Premium
+Far-Field Voice Development Kit via the `-we` flag (save only).
+
+In addition to performing inference directly from a GNA model file, these options make it possible to:
+- Convert from IR format to GNA format model file (`-m`, `-wg`)
+- Convert from IR format to embedded format model file (`-m`, `-we`)
+- Convert from GNA format to embedded format model file (`-rg`, `-we`)
+
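+For example, a hypothetical dump run (the model name matches the pretrained
+model above; the scale factor value here is arbitrary) could convert an IR
+into a GNA model file:
+
+```sh
+$ ./speech_sample -m wsj_dnn5b_smbr_fp32.xml -q user -sf 2048.0 -wg wsj_dnn5b_smbr.gna
+```
+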
+## Use of Sample in Kaldi* Speech Recognition Pipeline
+
+The Wall Street Journal DNN model used in this example was prepared
+using the Kaldi s5 recipe and the Kaldi Nnet (nnet1) framework.  It is
+possible to recognize speech by substituting the `speech_sample` for
+Kaldi's nnet-forward command.  Since the speech_sample does not yet 
+use pipes, it is necessary to use temporary files for speaker-
+transformed feature vectors and scores when running the Kaldi speech
+recognition pipeline.  The following operations assume that feature
+extraction was already performed according to the `s5` recipe and that
+the working directory within the Kaldi source tree is `egs/wsj/s5`.
+1. Prepare a speaker-transformed feature set given the feature transform specified
+  in `final.feature_transform` and the feature files specified in `feats.scp`:
+```sh
+nnet-forward --use-gpu=no final.feature_transform "ark,s,cs:copy-feats scp:feats.scp ark:- |" ark:feat.ark
+```
+2. Score the feature set using the `speech_sample`:
+```sh
+./speech_sample -d GNA_AUTO -bs 8 -i feat.ark -m wsj_dnn5b_smbr_fp32.xml -o scores.ark
+```
+3. Run the Kaldi decoder to produce n-best text hypotheses and select most likely text given the WFST (`HCLG.fst`), vocabulary (`words.txt`), and TID/PID mapping (`final.mdl`):
+```sh
+latgen-faster-mapped --max-active=7000 --max-mem=50000000 --beam=13.0 --lattice-beam=6.0 --acoustic-scale=0.0833 --allow-partial=true --word-symbol-table=words.txt final.mdl HCLG.fst ark:scores.ark ark:-| lattice-scale --inv-acoustic-scale=13 ark:- ark:- | lattice-best-path --word-symbol-table=words.txt ark:- ark,t:-  > out.txt &
+```
+4. Run the word error rate tool to check accuracy given the vocabulary (`words.txt`) and reference transcript (`test_filt.txt`):
+```sh
+cat out.txt | utils/int2sym.pl -f 2- words.txt | sed s:\<UNK\>::g | compute-wer --text --mode=present ark:test_filt.txt ark,p:-
+```
+
+## Links 
+
+- [Main Page](index.html)
+- [Use of the Inference Engine](./docs/IE_DG/Integrate_with_customer_application.md)
+- [Intel's Deep Learning Model Optimizer Developer Guide](https://software.intel.com/en-us/model-optimizer-devguide)
+- [Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
+- [Deep Learning Deployment Toolkit Web Page](https://software.intel.com/en-us/computer-vision-sdk)
diff --git a/inference-engine/samples/speech_sample/main.cpp b/inference-engine/samples/speech_sample/main.cpp
new file mode 100644 (file)
index 0000000..e0dc005
--- /dev/null
@@ -0,0 +1,839 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "speech_sample.hpp"
+
+#include <gflags/gflags.h>
+#include <functional>
+#include <iostream>
+#include <memory>
+#include <map>
+#include <fstream>
+#include <random>
+#include <string>
+#include <vector>
+#include <utility>
+#include <time.h>
+#include <thread>
+#include <chrono>
+#include <limits>
+#include <iomanip>
+#include <inference_engine.hpp>
+#include <gna/gna_config.hpp>
+
+#include <samples/common.hpp>
+#include <samples/slog.hpp>
+#include <samples/args_helper.hpp>
+
+#ifndef ALIGN
+#define ALIGN(memSize, pad)   ((static_cast<int>((memSize) + pad - 1) / pad) * pad)
+#endif
+#define MAX_SCORE_DIFFERENCE 0.0001f
+#define MAX_VAL_2B_FEAT 16384
+
+using namespace InferenceEngine;
+
+typedef std::chrono::high_resolution_clock Time;
+typedef std::chrono::duration<double, std::ratio<1, 1000>> ms;
+typedef std::chrono::duration<float> fsec;
+typedef struct {
+    uint32_t numScores;
+    uint32_t numErrors;
+    float threshold;
+    float maxError;
+    float rmsError;
+    float sumError;
+    float sumRmsError;
+    float sumSquaredError;
+    float maxRelError;
+    float sumRelError;
+    float sumSquaredRelError;
+} score_error_t;
+
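+// Kaldi ARK binary record layout (as parsed below): utterance name, NUL,
+// "BFM ", 0x04, rows (uint32), 0x04, columns (uint32), rows * columns float32 values.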
+void GetKaldiArkInfo(const char *fileName,
+                     uint32_t numArrayToFindSize,
+                     uint32_t *ptrNumArrays,
+                     uint32_t *ptrNumMemoryBytes) {
+    uint32_t numArrays = 0;
+    uint32_t numMemoryBytes = 0;
+
+    std::ifstream in_file(fileName, std::ios::binary);
+    if (in_file.good()) {
+        while (!in_file.eof()) {
+            std::string line;
+            uint32_t numRows = 0u, numCols = 0u, num_bytes = 0u;
+            std::getline(in_file, line, '\0');  // read variable length name followed by space and NUL
+            std::getline(in_file, line, '\4');  // read "BFM" followed by space and control-D
+            if (line.compare("BFM ") != 0) {
+                break;
+            }
+            in_file.read(reinterpret_cast<char *>(&numRows), sizeof(uint32_t));  // read number of rows
+            std::getline(in_file, line, '\4');                                   // read control-D
+            in_file.read(reinterpret_cast<char *>(&numCols), sizeof(uint32_t));  // read number of columns
+            num_bytes = numRows * numCols * sizeof(float);
+            in_file.seekg(num_bytes, in_file.cur);                               // read data
+
+            if (numArrays == numArrayToFindSize) {
+                numMemoryBytes += num_bytes;
+            }
+            numArrays++;
+        }
+        in_file.close();
+    } else {
+        fprintf(stderr, "Failed to open %s for reading in GetKaldiArkInfo()!\n", fileName);
+        exit(-1);
+    }
+
+    if (ptrNumArrays != NULL) *ptrNumArrays = numArrays;
+    if (ptrNumMemoryBytes != NULL) *ptrNumMemoryBytes = numMemoryBytes;
+}
+
+void LoadKaldiArkArray(const char *fileName, uint32_t arrayIndex, std::string &ptrName, std::vector<uint8_t> &memory,
+                       uint32_t *ptrNumRows, uint32_t *ptrNumColumns, uint32_t *ptrNumBytesPerElement) {
+    std::ifstream in_file(fileName, std::ios::binary);
+    if (in_file.good()) {
+        uint32_t i = 0;
+        while (i < arrayIndex) {
+            std::string line;
+            uint32_t numRows = 0u, numCols = 0u;
+            std::getline(in_file, line, '\0');  // read variable length name followed by space and NUL
+            std::getline(in_file, line, '\4');  // read "BFM" followed by space and control-D
+            if (line.compare("BFM ") != 0) {
+                break;
+            }
+            in_file.read(reinterpret_cast<char *>(&numRows), sizeof(uint32_t));     // read number of rows
+            std::getline(in_file, line, '\4');                                     // read control-D
+            in_file.read(reinterpret_cast<char *>(&numCols), sizeof(uint32_t));     // read number of columns
+            in_file.seekg(numRows * numCols * sizeof(float), in_file.cur);         // read data
+            i++;
+        }
+        if (!in_file.eof()) {
+            std::string line;
+            std::getline(in_file, ptrName, '\0');     // read variable length name followed by space and NUL
+            std::getline(in_file, line, '\4');       // read "BFM" followed by space and control-D
+            if (line.compare("BFM ") != 0) {
+                fprintf(stderr, "Cannot find array specifier in file %s in LoadKaldiArkArray()!\n", fileName);
+                exit(-1);
+            }
+            in_file.read(reinterpret_cast<char *>(ptrNumRows), sizeof(uint32_t));        // read number of rows
+            std::getline(in_file, line, '\4');                                            // read control-D
+            in_file.read(reinterpret_cast<char *>(ptrNumColumns), sizeof(uint32_t));    // read number of columns
+            size_t willWrite = *ptrNumRows * *ptrNumColumns * sizeof(float);
+            in_file.read(reinterpret_cast<char *>(&memory.front()), willWrite);  // read array data
+        }
+        in_file.close();
+    } else {
+        fprintf(stderr, "Failed to open %s for reading in GetKaldiArkInfo()!\n", fileName);
+        exit(-1);
+    }
+
+    *ptrNumBytesPerElement = sizeof(float);
+}
+
+void SaveKaldiArkArray(const char *fileName,
+                       bool shouldAppend,
+                       std::string name,
+                       void *ptrMemory,
+                       uint32_t numRows,
+                       uint32_t numColumns) {
+    std::ios_base::openmode mode = std::ios::binary;
+    if (shouldAppend) {
+        mode |= std::ios::app;
+    }
+    std::ofstream out_file(fileName, mode);
+    if (out_file.good()) {
+        out_file.write(name.c_str(), name.length());  // write name
+        out_file.write("\0", 1);
+        out_file.write("BFM ", 4);
+        out_file.write("\4", 1);
+        out_file.write(reinterpret_cast<char *>(&numRows), sizeof(uint32_t));
+        out_file.write("\4", 1);
+        out_file.write(reinterpret_cast<char *>(&numColumns), sizeof(uint32_t));
+        out_file.write(reinterpret_cast<char *>(ptrMemory), numRows * numColumns * sizeof(float));
+        out_file.close();
+    } else {
+        throw std::runtime_error(std::string("Failed to open ") + fileName + " for writing in SaveKaldiArkArray()!");
+    }
+}
+
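+// Returns the factor that scales the largest absolute value in the buffer to targetMax (1.0 if all values are zero).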
+float ScaleFactorForQuantization(void *ptrFloatMemory, float targetMax, uint32_t numElements) {
+    float *ptrFloatFeat = reinterpret_cast<float *>(ptrFloatMemory);
+    float max = 0.0;
+    float scaleFactor;
+
+    for (uint32_t i = 0; i < numElements; i++) {
+        if (fabs(ptrFloatFeat[i]) > max) {
+            max = fabs(ptrFloatFeat[i]);
+        }
+    }
+
+    if (max == 0) {
+        scaleFactor = 1.0;
+    } else {
+        scaleFactor = targetMax / max;
+    }
+
+    return (scaleFactor);
+}
+
+void ClearScoreError(score_error_t *error) {
+    error->numScores = 0;
+    error->numErrors = 0;
+    error->maxError = 0.0;
+    error->rmsError = 0.0;
+    error->sumError = 0.0;
+    error->sumRmsError = 0.0;
+    error->sumSquaredError = 0.0;
+    error->maxRelError = 0.0;
+    error->sumRelError = 0.0;
+    error->sumSquaredRelError = 0.0;
+}
+
+void UpdateScoreError(score_error_t *error, score_error_t *totalError) {
+    totalError->numErrors += error->numErrors;
+    totalError->numScores += error->numScores;
+    totalError->sumRmsError += error->rmsError;
+    totalError->sumError += error->sumError;
+    totalError->sumSquaredError += error->sumSquaredError;
+    if (error->maxError > totalError->maxError) {
+        totalError->maxError = error->maxError;
+    }
+    totalError->sumRelError += error->sumRelError;
+    totalError->sumSquaredRelError += error->sumSquaredRelError;
+    if (error->maxRelError > totalError->maxRelError) {
+        totalError->maxRelError = error->maxRelError;
+    }
+}
+
+uint32_t CompareScores(float *ptrScoreArray,
+                       void *ptrRefScoreArray,
+                       score_error_t *scoreError,
+                       uint32_t numRows,
+                       uint32_t numColumns) {
+    uint32_t numErrors = 0;
+
+    ClearScoreError(scoreError);
+
+    float *A = ptrScoreArray;
+    float *B = reinterpret_cast<float *>(ptrRefScoreArray);
+    for (uint32_t i = 0; i < numRows; i++) {
+        for (uint32_t j = 0; j < numColumns; j++) {
+            float score = A[i * numColumns + j];
+            float refscore = B[i * numColumns + j];
+            float error = fabs(refscore - score);
+            float rel_error = error / (static_cast<float>(fabs(refscore)) + 1e-20f);
+            float squared_error = error * error;
+            float squared_rel_error = rel_error * rel_error;
+            scoreError->numScores++;
+            scoreError->sumError += error;
+            scoreError->sumSquaredError += squared_error;
+            if (error > scoreError->maxError) {
+                scoreError->maxError = error;
+            }
+            scoreError->sumRelError += rel_error;
+            scoreError->sumSquaredRelError += squared_rel_error;
+            if (rel_error > scoreError->maxRelError) {
+                scoreError->maxRelError = rel_error;
+            }
+            if (error > scoreError->threshold) {
+                numErrors++;
+            }
+        }
+    }
+    scoreError->rmsError = sqrt(scoreError->sumSquaredError / (numRows * numColumns));
+    scoreError->sumRmsError += scoreError->rmsError;
+    scoreError->numErrors = numErrors;
+
+    return (numErrors);
+}
+
+float StdDevError(score_error_t error) {
+    return (sqrt(error.sumSquaredError / error.numScores
+                 - (error.sumError / error.numScores) * (error.sumError / error.numScores)));
+}
+
+float StdDevRelError(score_error_t error) {
+    return (sqrt(error.sumSquaredRelError / error.numScores
+                 - (error.sumRelError / error.numScores) * (error.sumRelError / error.numScores)));
+}
+
+#if !defined(__arm__) && !defined(_M_ARM)
+#if defined(_WIN32) || defined(WIN32)
+#include <intrin.h>
+#include <windows.h>
+#else
+
+#include <cpuid.h>
+
+#endif
+
+inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
+                         unsigned int *ecx, unsigned int *edx) {
+    size_t level = *eax;
+#if defined(_WIN32) || defined(WIN32)
+    int regs[4] = {static_cast<int>(*eax), static_cast<int>(*ebx), static_cast<int>(*ecx), static_cast<int>(*edx)};
+    __cpuid(regs, level);
+    *eax = static_cast<uint32_t>(regs[0]);
+    *ebx = static_cast<uint32_t>(regs[1]);
+    *ecx = static_cast<uint32_t>(regs[2]);
+    *edx = static_cast<uint32_t>(regs[3]);
+#else
+    __get_cpuid(level, eax, ebx, ecx, edx);
+#endif
+}
+
+// return GNA module frequency in MHz
+float getGnaFrequencyMHz() {
+    uint32_t level = 0;
+    uint32_t eax = 1;
+    uint32_t ebx = 0;
+    uint32_t ecx = 0;
+    uint32_t edx = 0;
+    uint32_t family = 0;
+    uint32_t model = 0;
+    const uint8_t sixth_family = 6;
+    const uint8_t cannon_lake_model = 102;
+    const uint8_t gemini_lake_model = 122;
+
+    native_cpuid(&eax, &ebx, &ecx, &edx);
+    family = (eax >> 8) & 0xF;
+
+    // model is the concatenation of two fields
+    // | extended model | model |
+    // copy extended model data
+    model = (eax >> 16) & 0xF;
+    // shift
+    model <<= 4;
+    // copy model data
+    model += (eax >> 4) & 0xF;
+
+    if (family == sixth_family && model == cannon_lake_model) {
+        return 400;
+    } else if (family == sixth_family &&
+               model == gemini_lake_model) {
+        return 200;
+    } else {
+        // counters are not supported, so just return the default value
+        return 1;
+    }
+}
+
+#endif  // !defined(__arm__) && !defined(_M_ARM)
+
+void printReferenceCompareResults(score_error_t const &totalError,
+                                  size_t framesNum,
+                                  std::ostream &stream) {
+    stream << "         max error: " <<
+           totalError.maxError << std::endl;
+    stream << "         avg error: " <<
+           totalError.sumError / totalError.numScores << std::endl;
+    stream << "     avg rms error: " <<
+           totalError.sumRmsError / framesNum << std::endl;
+    stream << "       stdev error: " <<
+           StdDevError(totalError) << std::endl << std::endl;
+    stream << std::endl;
+}
+
+void printPerformanceCounters(std::map<std::string,
+        InferenceEngine::InferenceEngineProfileInfo> const &utterancePerfMap,
+                              size_t callsNum,
+                              std::ostream &stream) {
+#if !defined(__arm__) && !defined(_M_ARM)
+    stream << std::endl << "Performance counts:" << std::endl;
+    stream << std::setw(10) << std::right << "" << "Counter descriptions";
+    stream << std::setw(22) << "Utt scoring time";
+    stream << std::setw(18) << "Avg infer time";
+    stream << std::endl;
+
+    stream << std::setw(46) << "(ms)";
+    stream << std::setw(24) << "(us per call)";
+    stream << std::endl;
+
+    for (const auto &it : utterancePerfMap) {
+        std::string const &counter_name = it.first;
+        float current_units = it.second.realTime_uSec;
+        float call_units = current_units / callsNum;
+        float freq = 1.0;
+        // if GNA HW counters
+        // get frequency of GNA module
+        freq = getGnaFrequencyMHz();
+        current_units /= freq * 1000;
+        call_units /= freq;
+        stream << std::setw(30) << std::left << counter_name.substr(4, counter_name.size() - 1);
+        stream << std::setw(16) << std::right << current_units;
+        stream << std::setw(21) << std::right << call_units;
+        stream << std::endl;
+    }
+    stream << std::endl;
+#endif
+}
+
+void getPerformanceCounters(InferenceEngine::InferRequest &request,
+                            std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfCounters) {
+    auto retPerfCounters = request.GetPerformanceCounts();
+
+    for (const auto &pair : retPerfCounters) {
+        perfCounters[pair.first] = pair.second;
+    }
+}
+
+void sumPerformanceCounters(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> const &perfCounters,
+                            std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &totalPerfCounters) {
+    for (const auto &pair : perfCounters) {
+        totalPerfCounters[pair.first].realTime_uSec += pair.second.realTime_uSec;
+    }
+}
+
+bool ParseAndCheckCommandLine(int argc, char *argv[]) {
+    // ---------------------------Parsing and validation of input args--------------------------------------
+    slog::info << "Parsing input parameters" << slog::endl;
+
+    gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true);
+    if (FLAGS_h) {
+        showUsage();
+        return false;
+    }
+    bool isDumpMode = !FLAGS_wg.empty() || !FLAGS_we.empty();
+
+    // the input file may be omitted only in dump mode when an external scale factor is provided
+    if (FLAGS_i.empty() && (!isDumpMode || FLAGS_q.compare("user") != 0)) {
+        if (isDumpMode) {
+            throw std::logic_error("In model dump mode either static quantization is used (-i) or user scale"
+                                   " factor need to be provided. See -q user option");
+        }
+        throw std::logic_error("Input file not set. Please use -i.");
+    }
+
+    if (FLAGS_m.empty() && FLAGS_rg.empty()) {
+        throw std::logic_error("Either IR file (-m) or GNAModel file (-rg) need to be set.");
+    }
+
+    if ((!FLAGS_m.empty() && !FLAGS_rg.empty())) {
+        throw std::logic_error("Only one of -m and -rg is allowed.");
+    }
+
+    if ((FLAGS_d.compare("GPU") != 0) && (FLAGS_d.compare("CPU") != 0) && (FLAGS_d.compare("GNA_AUTO") != 0) &&
+        (FLAGS_d.compare("GNA_HW") != 0)
+        && (FLAGS_d.compare("GNA_SW") != 0) && (FLAGS_d.compare("GNA_SW_EXACT") != 0)) {
+        throw std::logic_error("Specified device is not supported.");
+    }
+
+    float scaleFactorInput = static_cast<float>(FLAGS_sf);
+    if (scaleFactorInput <= 0.0f) {
+        throw std::logic_error("Scale factor out of range (must be non-negative).");
+    }
+
+    uint32_t batchSize = (uint32_t) FLAGS_bs;
+    if ((batchSize < 1) || (batchSize > 8)) {
+        throw std::logic_error("Batch size out of range (1..8).");
+    }
+
+    /** default is static quantization **/
+    if ((FLAGS_q.compare("static") != 0) && (FLAGS_q.compare("dynamic") != 0) && (FLAGS_q.compare("user") != 0)) {
+        throw std::logic_error("Quantization mode not supported (static, dynamic, user).");
+    }
+
+    if (FLAGS_q.compare("dynamic") == 0) {
+        throw std::logic_error("Dynamic quantization not yet supported.");
+    }
+
+    if (FLAGS_qb != 16 && FLAGS_qb != 8) {
+        throw std::logic_error("Only 8 or 16 bits supported.");
+    }
+
+    if (FLAGS_nthreads <= 0) {
+        throw std::logic_error("Not valid value for 'nthreads' argument. It should be > 0 ");
+    }
+
+    return true;
+}
+
+/**
+ * @brief The entry point for inference engine automatic speech recognition sample
+ * @file speech_sample/main.cpp
+ * @example speech_sample/main.cpp
+ */
+int main(int argc, char *argv[]) {
+    try {
+        slog::info << "InferenceEngine: " << GetInferenceEngineVersion() << slog::endl;
+
+        // ------------------------------ Parsing and validation of input args ---------------------------------
+        if (!ParseAndCheckCommandLine(argc, argv)) {
+            return 0;
+        }
+
+        if (FLAGS_l.empty()) {
+            slog::info << "No extensions provided" << slog::endl;
+        }
+
+        bool useGna = (FLAGS_d.find("GNA") != std::string::npos);
+        auto deviceStr = FLAGS_d.substr(0, (FLAGS_d.find("_")));
+        float scaleFactorInput = static_cast<float>(FLAGS_sf);
+        uint32_t batchSize = (uint32_t) FLAGS_bs;
+        /** Extract input ark file name **/
+        std::string inputArkName = fileNameNoExt(FLAGS_i) + ".ark";
+
+        uint32_t numUtterances(0), numBytesThisUtterance(0);
+        if (!FLAGS_i.empty()) {
+            GetKaldiArkInfo(inputArkName.c_str(), 0, &numUtterances, &numBytesThisUtterance);
+        }
+        // -----------------------------------------------------------------------------------------------------
+
+        // --------------------------- 1. Load Plugin for inference engine -------------------------------------
+        slog::info << "Loading plugin" << slog::endl;
+        /** Loading plugin for device **/
+        InferencePlugin plugin = PluginDispatcher({FLAGS_pp, "../../../lib/intel64", ""}).getPluginByDevice(deviceStr);
+
+        /** Printing plugin version **/
+        std::cout << plugin.GetVersion() << std::endl << std::endl;
+        // -----------------------------------------------------------------------------------------------------
+
+        // --------------------------- 2. Read IR Generated by ModelOptimizer (.xml and .bin files) ------------
+        slog::info << "Loading network files" << slog::endl;
+
+        CNNNetReader netBuilder;
+        if (!FLAGS_m.empty()) {
+            /** Read network model **/
+            netBuilder.ReadNetwork(FLAGS_m);
+
+            /** Extract model name and load weights **/
+            std::string binFileName = fileNameNoExt(FLAGS_m) + ".bin";
+            netBuilder.ReadWeights(binFileName);
+
+            // -------------------------------------------------------------------------------------------------
+
+            // --------------------------- 3. Set batch size ---------------------------------------------------
+            /** Set batch size.  Unlike in imaging, batching in time (rather than space) is done for speech recognition. **/
+            netBuilder.getNetwork().setBatchSize(batchSize);
+            slog::info << "Batch size is " << std::to_string(netBuilder.getNetwork().getBatchSize())
+                       << slog::endl;
+        }
+
+        /** Setting plugin parameter for per layer metrics **/
+        std::map<std::string, std::string> gnaPluginConfig;
+        std::map<std::string, std::string> genericPluginConfig;
+        if (FLAGS_d.compare("CPU") != 0) {
+            gnaPluginConfig[GNAConfigParams::KEY_GNA_DEVICE_MODE] = FLAGS_d;
+        }
+        if (FLAGS_pc) {
+            genericPluginConfig[PluginConfigParams::KEY_PERF_COUNT] = PluginConfigParams::YES;
+        }
+
+        if (FLAGS_q.compare("user") == 0) {
+            std::cout << "[ INFO ] Using scale factor of " << FLAGS_sf << std::endl;
+            gnaPluginConfig[GNA_CONFIG_KEY(SCALE_FACTOR)] = std::to_string(FLAGS_sf);
+        } else {  // "static" quantization with calculated scale factor
+            std::string name;
+            std::vector<uint8_t> ptrFeatures;
+            uint32_t numArrays(0), numBytes(0), numFrames(0), numFrameElements(0), numBytesPerElement(0);
+            GetKaldiArkInfo(inputArkName.c_str(), 0, &numArrays, &numBytes);
+            ptrFeatures.resize(numBytes);
+            LoadKaldiArkArray(inputArkName.c_str(),
+                              0,
+                              name,
+                              ptrFeatures,
+                              &numFrames,
+                              &numFrameElements,
+                              &numBytesPerElement);
+            scaleFactorInput =
+                    ScaleFactorForQuantization(ptrFeatures.data(), MAX_VAL_2B_FEAT, numFrames * numFrameElements);
+            slog::info << "Using scale factor of " << scaleFactorInput << " calculated from first utterance."
+                       << slog::endl;
+            gnaPluginConfig[GNA_CONFIG_KEY(SCALE_FACTOR)] = std::to_string(scaleFactorInput);
+        }
+
+        if (FLAGS_qb == 8) {
+            gnaPluginConfig[GNAConfigParams::KEY_GNA_PRECISION] = "I8";
+        } else {
+            gnaPluginConfig[GNAConfigParams::KEY_GNA_PRECISION] = "I16";
+        }
+
+        gnaPluginConfig[GNAConfigParams::KEY_GNA_LIB_N_THREADS] = std::to_string(FLAGS_nthreads);
+        gnaPluginConfig[GNA_CONFIG_KEY(COMPACT_MODE)] = CONFIG_VALUE(NO);
+        // -----------------------------------------------------------------------------------------------------
+
+        // --------------------------- 4. Write model to file --------------------------------------------------
+        // Embedded GNA model dumping (for Intel(R) Speech Enabling Developer Kit)
+        if (!FLAGS_we.empty()) {
+            gnaPluginConfig[GNAConfigParams::KEY_GNA_FIRMWARE_MODEL_IMAGE] = FLAGS_we;
+        }
+        // -----------------------------------------------------------------------------------------------------
+
+        // --------------------------- 5. Loading model to the plugin ------------------------------------------
+
+        if (useGna) {
+            genericPluginConfig.insert(std::begin(gnaPluginConfig), std::end(gnaPluginConfig));
+        }
+        auto t0 = Time::now();
+        ExecutableNetwork executableNet;
+        if (!FLAGS_m.empty()) {
+            slog::info << "Loading model to the plugin" << slog::endl;
+            executableNet = plugin.LoadNetwork(netBuilder.getNetwork(), genericPluginConfig);
+        } else {
+            slog::info << "Importing model to the plugin" << slog::endl;
+            executableNet = plugin.ImportNetwork(FLAGS_rg.c_str(), genericPluginConfig);
+        }
+
+
+        ms loadTime = std::chrono::duration_cast<ms>(Time::now() - t0);
+        slog::info << "Model loading time " << loadTime.count() << " ms" << slog::endl;
+
+        // --------------------------- 6. Exporting gna model using InferenceEngine AOT API---------------------
+        if (!FLAGS_wg.empty()) {
+            slog::info << "Writing GNA Model to file " << FLAGS_wg << slog::endl;
+            t0 = Time::now();
+            executableNet.Export(FLAGS_wg);
+            ms exportTime = std::chrono::duration_cast<ms>(Time::now() - t0);
+            slog::info << "Exporting time " << exportTime.count() << " ms" << slog::endl;
+            return 0;
+        }
+
+        if (!FLAGS_we.empty()) {
+            slog::info << "Exported GNA embedded model to file " << FLAGS_we << slog::endl;
+            return 0;
+        }
+
+        std::vector<std::pair<InferRequest, size_t>> inferRequests(FLAGS_nthreads);
+        for (auto& inferRequest : inferRequests) {
+            inferRequest = {executableNet.CreateInferRequest(), -1};
+        }
+        // -----------------------------------------------------------------------------------------------------
+
+        // --------------------------- 7. Prepare input blobs --------------------------------------------------
+        /** Taking information about all topology inputs **/
+        ConstInputsDataMap cInputInfo = executableNet.GetInputsInfo();
+        InputsDataMap inputInfo;
+        if (!FLAGS_m.empty()) {
+            inputInfo = netBuilder.getNetwork().getInputsInfo();
+        }
+
+        /** Stores all input blobs data **/
+        if (cInputInfo.size() != 1) {
+            throw std::logic_error("Sample supports only topologies with  1 input");
+        }
+
+        Blob::Ptr ptrInputBlob = inferRequests[0].first.GetBlob(cInputInfo.begin()->first);
+
+        /** configure input precision if model loaded from IR **/
+        for (auto &item : inputInfo) {
+            Precision inputPrecision = Precision::FP32;  // specify Precision::I16 to provide quantized inputs
+            item.second->setPrecision(inputPrecision);
+            item.second->getInputData()->layout = NC;  // row major layout
+        }
+
+        // -----------------------------------------------------------------------------------------------------
+
+        // --------------------------- 8. Prepare output blobs -------------------------------------------------
+        ConstOutputsDataMap cOutputInfo(executableNet.GetOutputsInfo());
+        OutputsDataMap outputInfo;
+        if (!FLAGS_m.empty()) {
+            outputInfo = netBuilder.getNetwork().getOutputsInfo();
+        }
+
+        Blob::Ptr ptrOutputBlob = inferRequests[0].first.GetBlob(cOutputInfo.begin()->first);
+
+        for (auto &item : outputInfo) {
+            DataPtr outData = item.second;
+            if (!outData) {
+                throw std::logic_error("output data pointer is not valid");
+            }
+
+            Precision outputPrecision = Precision::FP32;  // specify Precision::I32 to retrieve quantized outputs
+            outData->setPrecision(outputPrecision);
+            outData->layout = NC;  // row major layout
+        }
+        // -----------------------------------------------------------------------------------------------------
+
+        // --------------------------- 9. Do inference ---------------------------------------------------------
+        std::vector<uint8_t> ptrUtterance;
+        std::vector<uint8_t> ptrScores;
+        std::vector<uint8_t> ptrReferenceScores;
+        score_error_t frameError, totalError;
+
+        for (uint32_t utteranceIndex = 0; utteranceIndex < numUtterances; ++utteranceIndex) {
+            std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> utterancePerfMap;
+            std::string uttName;
+            uint32_t numFrames(0), numFrameElementsInput(0), numBytesPerElementInput(0), n(0);
+            uint32_t numFramesReference(0), numFrameElementsReference(0), numBytesPerElementReference(0),
+                    numBytesReferenceScoreThisUtterance(0);
+            const uint32_t numScoresPerFrame = ptrOutputBlob->size() / batchSize;
+            GetKaldiArkInfo(inputArkName.c_str(), utteranceIndex, &n, &numBytesThisUtterance);
+            ptrUtterance.resize(numBytesThisUtterance);
+            LoadKaldiArkArray(inputArkName.c_str(),
+                              utteranceIndex,
+                              uttName,
+                              ptrUtterance,
+                              &numFrames,
+                              &numFrameElementsInput,
+                              &numBytesPerElementInput);
+
+            uint32_t numFrameElementsInputPadded = numFrameElementsInput;
+
+            if (ptrInputBlob->size() != numFrameElementsInputPadded * batchSize) {
+                throw std::logic_error("network input size(" + std::to_string(ptrInputBlob->size()) +
+                                       ") mismatch to ark file size (" +
+                                       std::to_string(numFrameElementsInputPadded * batchSize) + ")");
+            }
+            ptrScores.resize(numFrames * numScoresPerFrame * sizeof(float));
+            if (!FLAGS_r.empty()) {
+                std::string refUtteranceName;
+                GetKaldiArkInfo(FLAGS_r.c_str(), utteranceIndex, &n, &numBytesReferenceScoreThisUtterance);
+                ptrReferenceScores.resize(numBytesReferenceScoreThisUtterance);
+                LoadKaldiArkArray(FLAGS_r.c_str(),
+                                  utteranceIndex,
+                                  refUtteranceName,
+                                  ptrReferenceScores,
+                                  &numFramesReference,
+                                  &numFrameElementsReference,
+                                  &numBytesPerElementReference);
+            }
+
+            double totalTime = 0.0;
+
+            std::cout << "Utterance " << utteranceIndex << ": " << std::endl;
+
+            ClearScoreError(&totalError);
+            totalError.threshold = frameError.threshold = MAX_SCORE_DIFFERENCE;
+            auto inputFrame = &ptrUtterance.front();
+            auto outputFrame = &ptrScores.front();
+
+            size_t frameIndex{0};
+            uint32_t numFramesThisBatch{batchSize};
+
+            auto t0 = Time::now();
+            auto t1 = t0;
+
+            // Doing inference
+            while (frameIndex <= numFrames) {
+                if (frameIndex == numFrames) {
+                    bool hasRequests = false;
+                    for (auto &inferRequest : inferRequests) {
+                        if (inferRequest.second != -1) {
+                            hasRequests = true;
+                        }
+                    }
+                    if (!hasRequests) {
+                        break;
+                    }
+                }
+
+                bool inferRequestFetched = false;
+                for (auto &inferRequest : inferRequests) {
+                    if (frameIndex == numFrames) {
+                        numFramesThisBatch = 1;
+                    } else {
+                        numFramesThisBatch = (numFrames - frameIndex < batchSize) ? (numFrames - frameIndex) : batchSize;
+                    }
+
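+                    // inferRequest.second is -1 when the request is idle; otherwise it
+                    // holds the start frame of the batch that request is processing.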
+                    if (inferRequest.second != -1) {
+                        StatusCode code = inferRequest.first.Wait(
+                                InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
+
+                        if (code != StatusCode::OK) {
+                            continue;
+                        }
+
+                        if (!FLAGS_o.empty()) {
+                            Blob::Ptr outputBlob = inferRequest.first.GetBlob(cOutputInfo.begin()->first);
+                            std::memcpy(outputFrame,
+                                        outputBlob->buffer(),
+                                        outputBlob->byteSize());
+                            outputFrame += numScoresPerFrame * sizeof(float);
+                        }
+
+                        if (!FLAGS_r.empty()) {
+                            Blob::Ptr outputBlob = inferRequest.first.GetBlob(cOutputInfo.begin()->first);
+                            CompareScores(outputBlob->buffer().as<float *>(),
+                                          &ptrReferenceScores[inferRequest.second *
+                                                              numFrameElementsReference *
+                                                              numBytesPerElementReference],
+                                          &frameError,
+                                          numFramesThisBatch,
+                                          numFrameElementsReference);
+                            UpdateScoreError(&frameError, &totalError);
+                        }
+                    }
+
+                    inferRequest.second = -1;
+
+                    if (frameIndex == numFrames) {
+                        continue;
+                    }
+
+                    Blob::Ptr inputBlob = inferRequest.first.GetBlob(cInputInfo.begin()->first);
+                    std::memcpy(inputBlob->buffer(),
+                                inputFrame,
+                                inputBlob->byteSize());
+
+                    inferRequest.first.StartAsync();
+
+                    inferRequest.second = frameIndex;
+                    frameIndex += numFramesThisBatch;
+                    inputFrame += sizeof(float) * numFrameElementsInput * numFramesThisBatch;
+                    inferRequestFetched = true;
+                }
+
+                if (!inferRequestFetched) {
+                    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+                    continue;
+                }
+
+                if (FLAGS_pc) {
+                    std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> callPerfMap;
+                    // retrieve new counters
+                    for (auto inferRequest : inferRequests) {
+                        getPerformanceCounters(inferRequest.first, callPerfMap);
+                        // summarize the retrieved counters with all previous ones
+                        sumPerformanceCounters(callPerfMap, utterancePerfMap);
+                    }
+                }
+            }
+            t1 = Time::now();
+
+            fsec fs = t1 - t0;
+            ms d = std::chrono::duration_cast<ms>(fs);
+            totalTime += d.count();
+
+            // resetting state between utterances
+            for (auto &&state : executableNet.QueryState()) {
+                state.Reset();
+            }
+
+            if (!FLAGS_o.empty()) {
+                bool shouldAppend = (utteranceIndex != 0);
+                SaveKaldiArkArray(FLAGS_o.c_str(), shouldAppend, uttName, &ptrScores.front(),
+                                  numFrames, numScoresPerFrame);
+            }
+
+            /** Show performance results **/
+            std::cout << "Total time in Infer (HW and SW):\t" << totalTime << " ms"
+                      << std::endl;
+            std::cout << "Frames in utterance:\t\t\t" << numFrames << " frames"
+                      << std::endl;
+            std::cout << "Average Infer time per frame:\t\t" << totalTime / static_cast<double>(numFrames) << " ms"
+                      << std::endl;
+            if (FLAGS_pc) {
+                // print
+                printPerformanceCounters(utterancePerfMap, frameIndex, std::cout);
+            }
+            if (!FLAGS_r.empty()) {
+                printReferenceCompareResults(totalError, numFrames, std::cout);
+            }
+            std::cout << "End of Utterance " << utteranceIndex << std::endl << std::endl;
+        }
+        // -----------------------------------------------------------------------------------------------------
+    }
+    catch (const std::exception &error) {
+        slog::err << error.what() << slog::endl;
+        return 1;
+    }
+    catch (...) {
+        slog::err << "Unknown/internal exception happened" << slog::endl;
+        return 1;
+    }
+
+    slog::info << "Execution successful" << slog::endl;
+    return 0;
+}
diff --git a/inference-engine/samples/speech_sample/speech_sample.hpp b/inference-engine/samples/speech_sample/speech_sample.hpp
new file mode 100644 (file)
index 0000000..37cb88f
--- /dev/null
@@ -0,0 +1,163 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <gflags/gflags.h>
+#include <iostream>
+
+#ifdef _WIN32
+#include <os/windows/w_dirent.h>
+#else
+#include <dirent.h>
+#endif
+
+#define DEFAULT_PATH_P "./lib"
+
+/// @brief message for help argument
+static const char help_message[] = "Print a usage message.";
+
+/// @brief message for input file argument
+static const char input_message[] = "Required. Path to an .ark file.";
+
+/// @brief message for plugin_path argument
+static const char plugin_path_message[] = "Path to a plugin folder.";
+
+/// @brief message for model argument
+static const char model_message[] = "Required. Path to an .xml file with a trained model (required if -rg is missing).";
+
+/// @brief message for plugin argument
+static const char plugin_message[] = "Plugin name. For example, MKLDNNPlugin. If this parameter is set, " \
+                                     "the sample will look for this plugin only";
+
+/// @brief message for assigning cnn calculation to device
+static const char target_device_message[] = "Specify the target device to infer on; CPU, GPU, GNA_AUTO, GNA_HW, GNA_SW, and GNA_SW_EXACT are acceptable. " \
+                                            "The sample will look for a suitable plugin for the specified device";
+/// @brief message for performance counters
+static const char performance_counter_message[] = "Enables per-layer performance report";
+
+/// @brief message for user library argument
+static const char custom_cpu_library_message[] = "Required for MKLDNN (CPU)-targeted custom layers. " \
+"Absolute path to a shared library with the kernel implementations.";
+
+/// @brief message for score output argument
+static const char output_message[] = "Output file name (default name is scores.ark).";
+
+/// @brief message for reference score file argument
+static const char reference_score_message[] = "Read reference score .ark file and compare scores.";
+
+/// @brief message for read GNA model argument
+static const char read_gna_model_message[] = "Read GNA model from file using path/filename provided (required if -m is missing).";
+
+/// @brief message for write GNA model argument
+static const char write_gna_model_message[] = "Write GNA model to file using path/filename provided.";
+
+/// @brief message for write GNA embedded model argument
+static const char write_embedded_model_message[] = "Write GNA embedded model to file using path/filename provided.";
+
+/// @brief message for quantization argument
+static const char quantization_message[] = "Input quantization mode:  static (default), dynamic, or user (use with -sf).";
+
+/// @brief message for quantization bits argument
+static const char quantization_bits_message[] = "Weight bits for quantization:  8 or 16 (default)";
+
+/// @brief message for scale factor argument
+static const char scale_factor_message[] = "Optional user-specified input scale factor for quantization (use with -q user).";
+
+/// @brief message for batch size argument
+static const char batch_size_message[] = "Batch size 1-8 (default 1)";
+
+/// @brief message for #threads for CPU inference
+static const char infer_num_threads_message[] = "Optional. Number of threads to use for concurrent async" \
+" inference requests on the GNA.";
+
+/// \brief Define flag for showing help message <br>
+DEFINE_bool(h, false, help_message);
+
+/// \brief Define parameter to set the input file <br>
+/// It is a required parameter
+DEFINE_string(i, "", input_message);
+
+/// \brief Define parameter to set the model file <br>
+/// It is a required parameter
+DEFINE_string(m, "", model_message);
+
+/// \brief Define parameter to set the plugin name <br>
+/// It is an optional parameter
+DEFINE_string(p, "", plugin_message);
+
+/// \brief Define parameter to set the path to plugins <br>
+/// Default is ./lib
+DEFINE_string(pp, DEFAULT_PATH_P, plugin_path_message);
+
+/// \brief Define the target device to infer on <br>
+DEFINE_string(d, "GNA_AUTO", target_device_message);
+
+/// \brief Enable per-layer performance report
+DEFINE_bool(pc, false, performance_counter_message);
+
+/// @brief Absolute path to CPU library with user layers <br>
+/// It is an optional parameter
+DEFINE_string(l, "", custom_cpu_library_message);
+
+/// @brief Output score file name (scores.ark by default)
+DEFINE_string(o, "", output_message);
+
+/// @brief Read reference score file
+DEFINE_string(r, "", reference_score_message);
+
+/// @brief Read GNA model from file (model.bin)
+DEFINE_string(rg, "", read_gna_model_message);
+
+/// @brief Write GNA model to file (model.bin)
+DEFINE_string(wg, "", write_gna_model_message);
+
+/// @brief Write GNA embedded model to file (model.bin)
+DEFINE_string(we, "", write_embedded_model_message);
+
+/// @brief Input quantization mode (default static)
+DEFINE_string(q, "static", quantization_message);
+
+/// @brief Input quantization bits (default 16)
+DEFINE_int32(qb, 16, quantization_bits_message);
+
+/// @brief Scale factor for quantization (default 1.0)
+DEFINE_double(sf, 1.0, scale_factor_message);
+
+/// @brief Batch size (default 1)
+DEFINE_int32(bs, 1, batch_size_message);
+
+/// @brief Number of threads to use for concurrent async inference requests on the GNA
+DEFINE_int32(nthreads, 1, infer_num_threads_message);
+
+/**
+ * \brief This function shows a help message
+ */
+static void showUsage() {
+    std::cout << std::endl;
+    std::cout << "speech_sample [OPTION]" << std::endl;
+    std::cout << "Options:" << std::endl;
+    std::cout << std::endl;
+    std::cout << "    -h                        " << help_message << std::endl;
+    std::cout << "    -i \"<path>\"             " << input_message << std::endl;
+    std::cout << "    -m \"<path>\"             " << model_message << std::endl;
+    std::cout << "    -o \"<path>\"             " << output_message << std::endl;
+    std::cout << "    -l \"<absolute_path>\"    " << custom_cpu_library_message << std::endl;
+    std::cout << "    -d \"<device>\"           " << target_device_message << std::endl;
+    std::cout << "    -p                        " << plugin_message << std::endl;
+    std::cout << "    -pp                       " << plugin_path_message << std::endl;
+    std::cout << "    -pc                       " << performance_counter_message << std::endl;
+    std::cout << "    -q \"<mode>\"             " << quantization_message << std::endl;
+    std::cout << "    -qb \"<integer>\"         " << quantization_bits_message << std::endl;
+    std::cout << "    -sf \"<double>\"          " << scale_factor_message << std::endl;
+    std::cout << "    -bs \"<integer>\"         " << batch_size_message << std::endl;
+    std::cout << "    -r \"<path>\"             " << reference_score_message << std::endl;
+    std::cout << "    -rg \"<path>\"            " << read_gna_model_message << std::endl;
+    std::cout << "    -wg \"<path>\"            " << write_gna_model_message << std::endl;
+    std::cout << "    -we \"<path>\"            " << write_embedded_model_message << std::endl;
+    std::cout << "    -nthreads \"<integer>\"   " << infer_num_threads_message << std::endl;
+}
+
index ee93d8f..bbc971e 100644 (file)
@@ -1,25 +1,11 @@
-# Copyright (c) 2018 Intel Corporation
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#      http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 cmake_minimum_required(VERSION 2.8)
 
 set (TARGET_NAME "style_transfer_sample")
 
-if( BUILD_SAMPLE_NAME AND NOT ${BUILD_SAMPLE_NAME} STREQUAL ${TARGET_NAME} )
-    message(STATUS "DEMO ${TARGET_NAME} SKIPPED")
-    return()
-endif()
-
 file (GLOB MAIN_SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
         )
index 57a087c..89bd837 100644 (file)
@@ -35,11 +35,13 @@ You can do inference on an image using a trained model of NST network on Intel&r
 ./style_transfer_sample -i <path_to_image>/cat.bmp -m <path_to_model>/1_decoder_FP32.xml
 ```
 
+> **NOTE**: Before running the sample with a trained model, make sure the model is converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
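+A conversion call might look like the following (a sketch only: the exact Model Optimizer options depend on the framework the model comes from, and the file names here are placeholders):
+```
+python3 mo.py --input_model <path_to_model>/decoder.caffemodel --input_proto <path_to_model>/decoder.prototxt
+```
+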
 ### Outputs
 
 The application outputs one or more styled images (<code>out(1).bmp</code>) redrawn in the style the model was trained on.
 The look of the output images depends on the model used by the sample.
 
 ## See Also 
-* [Using Inference Engine Samples](./docs/Inference_Engine_Developer_Guide/Samples_Overview.md)
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
 
index a0f50e3..898256e 100644 (file)
@@ -1,16 +1,7 @@
-# Copyright (c) 2018 Intel Corporation
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#      http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 cmake_minimum_required(VERSION 2.8)
 
 set (TARGET_NAME "validation_app")
@@ -30,22 +21,18 @@ file (GLOB MAIN_HEADERS
 source_group("src" FILES ${MAIN_SRC})
 source_group("include" FILES ${MAIN_HEADERS})
 
-# opencv include folders
-find_package(OpenCV QUIET COMPONENTS core imgproc highgui imgcodecs)
+# Find OpenCV components if they exist
+find_package(OpenCV COMPONENTS imgcodecs QUIET)
 if(NOT(OpenCV_FOUND))
-    find_package(OpenCV QUIET COMPONENTS world)
-    if(NOT(OpenCV_FOUND))
-        message(WARNING "No suitable OpenCV version detected, " ${TARGET_NAME} " skipped")
-        return()
-    endif()
+    message(WARNING "OPENCV is disabled or not found, " ${TARGET_NAME} " skipped")
+    return()
 endif()
 
 # Properties->C/C++->General->Additional Include Directories
 include_directories (${CMAKE_CURRENT_SOURCE_DIR}/../classification_sample/core
         ${CMAKE_CURRENT_SOURCE_DIR}/../common
         ${CMAKE_CURRENT_SOURCE_DIR}/../common/os/windows
-        ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-        ${OpenCV_INCLUDE_DIRS})
+        ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
 
 link_directories(${LIB_FOLDER})
 
index 78e6adc..9c52c1e 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -31,7 +30,7 @@ ClassificationProcessor::ClassificationProcessor(const std::string& flags_m, con
             PreprocessingOptions(false, ResizeCropPolicy::ResizeThenCrop, 256, 256), zeroBackground) {
 }
 
-std::shared_ptr<Processor::InferenceMetrics> ClassificationProcessor::Process() {
+std::shared_ptr<Processor::InferenceMetrics> ClassificationProcessor::Process(bool stream_output) {
      slog::info << "Collecting labels" << slog::endl;
      ClassificationSetGenerator generator;
      // try {
@@ -49,7 +48,7 @@ std::shared_ptr<Processor::InferenceMetrics> ClassificationProcessor::Process()
      std::vector<int> expected(batch);
      std::vector<std::string> files(batch);
 
-     ConsoleProgress progress(validationMap.size());
+     ConsoleProgress progress(validationMap.size(), stream_output);
 
      ClassificationInferenceMetrics im;
 
index c620e55..1813ac3 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -35,7 +34,7 @@ public:
     ClassificationProcessor(const std::string& flags_m, const std::string& flags_d, const std::string& flags_i, int flags_b,
             InferenceEngine::InferencePlugin plugin, CsvDumper& dumper, const std::string& flags_l, bool zeroBackground);
 
-    std::shared_ptr<InferenceMetrics> Process();
+    std::shared_ptr<InferenceMetrics> Process(bool stream_output);
     virtual void Report(const InferenceMetrics& im);
     virtual ~ClassificationProcessor() { }
 };
index 6a195cb..6109a96 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -47,7 +46,7 @@ ObjectDetectionProcessor::ObjectDetectionProcessor(const std::string& flags_m, c
     }
 }
 
-shared_ptr<Processor::InferenceMetrics> ObjectDetectionProcessor::Process() {
+shared_ptr<Processor::InferenceMetrics> ObjectDetectionProcessor::Process(bool stream_output) {
     // Parsing PASCAL VOC2012 format
     VOCAnnotationParser vocAnnParser;
     slog::info << "Collecting VOC annotations from " << annotationsPath << slog::endl;
@@ -92,7 +91,7 @@ shared_ptr<Processor::InferenceMetrics> ObjectDetectionProcessor::Process() {
 
     std::vector<VOCAnnotation> expected(batch);
 
-    ConsoleProgress progress(annCollector.annotations().size());
+    ConsoleProgress progress(annCollector.annotations().size(), stream_output);
 
     ObjectDetectionInferenceMetrics im(threshold);
 
index 68816eb..0bb2231 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -44,7 +43,7 @@ public:
             InferenceEngine::InferencePlugin plugin, CsvDumper& dumper,
             const std::string& flags_a, const std::string& classes_list_file, PreprocessingOptions preprocessingOptions, bool scaleSizeToInputSize);
 
-    shared_ptr<InferenceMetrics> Process();
+    shared_ptr<InferenceMetrics> Process(bool stream_output);
     virtual void Report(const InferenceMetrics& im);
     virtual ~ObjectDetectionProcessor() {}
 };
index 7342711..d352331 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -50,7 +49,14 @@ Processor::Processor(const std::string& flags_m, const std::string& flags_d, con
         batch = networkReader.getNetwork().getBatchSize();
     } else {
         // Not zero means "use the specified value"
-        networkReader.getNetwork().setBatchSize(batch);
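+        // Override the batch size by reshaping the network: batch is the first
+        // dimension of the (single) input shape.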
+        auto network = networkReader.getNetwork();
+        auto input_shapes = network.getInputShapes();
+        std::string input_name;
+        SizeVector input_shape;
+        std::tie(input_name, input_shape) = *input_shapes.begin();
+        input_shape[0] = batch;
+        input_shapes[input_name] = input_shape;
+        network.reshape(input_shapes);
     }
 
     if (inputInfo.size() != 1) {
index 51033ab..49d5263 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -58,7 +57,7 @@ public:
     Processor(const std::string& flags_m, const std::string& flags_d, const std::string& flags_i, int flags_b,
             InferenceEngine::InferencePlugin plugin, CsvDumper& dumper, const std::string& approach, PreprocessingOptions preprocessingOptions);
 
-    virtual shared_ptr<InferenceMetrics> Process() = 0;
+    virtual shared_ptr<InferenceMetrics> Process(bool stream_output = false) = 0;
     virtual void Report(const InferenceMetrics& im) {
         double averageTime = im.totalTime / im.nRuns;
 
index f429a1e..4c8af47 100644 (file)
@@ -5,6 +5,8 @@ standard inputs and outputs configuration and to collect simple
 validation metrics for topologies. It supports **top-1** and **top-5** metrics for Classification networks and
 the 11-point **mAP** metric for Object Detection networks.
 
+> **NOTE**: Before running the application with trained models, make sure the models are converted to the Inference Engine format (\*.xml + \*.bin) using the [Model Optimizer tool](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md).
+
 Possible use cases of the tool:
 * Check if the Inference Engine infers the public topologies well (the engineering team uses the Validation Application for
   regular testing)
@@ -27,6 +29,7 @@ Available options:
       -t "OD" for object detection
     -i <path>                 Required. Path to a directory with validation images. For Classification models, the directory must contain folders named as labels with images inside or a .txt file with a list of images. For Object Detection models, the dataset must be in VOC format.
     -m <path>                 Required. Path to an .xml file with a trained model
+    -lbl <path>               Labels file path. The labels file contains names of the dataset classes
     -l <absolute_path>        Required for CPU custom layers. Absolute path to a shared library with the kernel implementations
     -c <absolute_path>        Required for GPU custom kernels. Absolute path to an .xml file with the kernel descriptions.
     -d <device>               Target device to infer on: CPU (default), GPU, FPGA, or MYRIAD. The application looks for a suitable plugin for the specified device.
@@ -66,7 +69,7 @@ When executed, the Validation Application perform the following steps:
       For more information about the format, refer to the <a href="#preparing">Preparing the Dataset</a> section below.
 
 3. Reads the batch size value specified with the `-b` option and loads this number of images to the plugin
-   **Note**: Images loading time is not a part of inference time reported by the application.
+   > **NOTE**: Image loading time is not part of the inference time reported by the application.
 
 4. The plugin infers the model, and the Validation Application collects the statistics.
 
@@ -115,7 +118,7 @@ In this case, a dataset has the following structure:
 This structure means that each folder in dataset directory must have the name of one of the classes and contain all images of this class. In the given example, there are two images that represent the class `apron`, while three other classes have only one image
 each.
 
-**NOTE:** A dataset can contain images of both `.bmp` and `.jpg` formats.
+**NOTE:** A dataset can contain images of both `.bmp` and `.jpg` formats.
 
 The correct way to use such dataset is to specify the path as `-i <path>/dataset`.
 
@@ -144,7 +147,7 @@ a_big_dog.jpg 231
 Each line of the file must contain the name of the image and the ID of the class
 that it represents in the format `<image_name> tabulation <class_id>`. For example, `apron1.bmp` represents the class with ID `411`.
 
-**NOTE:** A dataset can contain images of both `.bmp` and `.jpg` formats.
+**NOTE:** A dataset can contain images of both `.bmp` and `.jpg` formats.
 
 The correct way to use such dataset is to specify the path as `-i <path>/dataset/labels.txt`.
 
@@ -161,7 +164,7 @@ To prepare the VOC dataset, follow the steps below :
   $wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
   tar -xvf VOCtest_06-Nov-2007.tar
   ```
-3. Convert the model with the [Model Optimizer](docs/Model_Optimizer_Developer_Guide/prepare_trained_model/convert_model/Convert_Model_From_Caffe.md).
+3. Convert the model with the [Model Optimizer](docs/MO_DG/prepare_model/convert_model/Convert_Model_From_Caffe.md).
 
 4. Create a proper `.txt` class file from the original `labelmap_voc.prototxt`. The new file must be in
 the following format:
@@ -200,8 +203,8 @@ run the following command to infer a classification model on the selected datase
 
 ## Validate Object Detection Models
 
-**Note**: Validation Application was validated with SSD CNN. Any network that can be inferred by the Inference Engine
-and has the same input and output format as one of these should be supported as well.
+> **NOTE**: The Validation Application was validated with an SSD CNN. Any network that can be inferred by the Inference Engine
+and has the same input and output format as SSD should be supported as well.
 
 Once you have prepared the dataset (refer to the <a href="#preparing">Preparing the Dataset</a> section above),
 run the following command to infer an Object Detection model on the selected dataset:
@@ -281,4 +284,4 @@ dataset. This value repeats the result stated in the
 
 ## See Also
 
-* [Using Inference Engine Samples](./docs/Inference_Engine_Developer_Guide/Samples_Overview.md)
+* [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
index f943e59..94693db 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -25,7 +24,7 @@ int VOCAnnotationParser::parseInt(const pugi::xml_node& node, const int def) {
     std::string val = parseString(node);
     try {
         return std::stoi(val);
-    } catch (const std::invalid_argument& e) {
+    } catch (const std::invalid_argument&) {
         THROW_USER_EXCEPTION(1) << "Can't convert node <" << node.name()
             << "> value \"" << val << "\" to integer";
     }
index 2ce28d1..2ff731d 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -132,7 +131,7 @@ std::vector<std::pair<int, std::string>> ClassificationSetGenerator::validationM
 
         int id = val->second;
         for (auto& image : getDirContents(getFullName(label, dir))) {
-            validationMap.push_back({ id + 1, image });        // [CVS-8200] line in .labels file is counted from 0, but classes are counted from 1
+            validationMap.push_back({ id, image });
         }
     }
     return validationMap;
index 0635bcc..35047a4 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -17,6 +16,7 @@ class ConsoleProgress {
 
     size_t total;
     size_t current = 0;
+    bool stream_output;
     size_t detalization;
 
 public:
@@ -25,8 +25,9 @@ public:
     * @param _total - maximum value that corresponds to 100%
     * @param _detalization - number of symbols(.) to use to represent progress
     */
-    explicit ConsoleProgress(size_t _total, size_t _detalization = DEFAULT_DETALIZATION) :
+    explicit ConsoleProgress(size_t _total, bool _stream_output = false, size_t _detalization = DEFAULT_DETALIZATION) :
             total(_total), detalization(_detalization) {
+        stream_output = _stream_output;
         if (total == 0) {
             total = 1;
         }
@@ -45,8 +46,12 @@ public:
         for (; i < detalization; i++) {
             std::cout << " ";
         }
-        std::cout << "] " << std::fixed << std::setprecision(2) << 100 * static_cast<float>(current) / total << "% done    ";
-        std::flush(std::cout);
+        std::cout << "] " << std::fixed << std::setprecision(2) << 100 * static_cast<float>(current) / total << "% done";
+        if (stream_output) {
+            std::cout << std::endl;
+        } else {
+            std::flush(std::cout);
+        }
     }
 
     /**
index e69b348..a2c9446 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -89,11 +88,17 @@ static const char custom_cldnn_message[] = "Required for GPU custom kernels."
 static const char custom_cpu_library_message[] = "Required for CPU custom layers. "
                                                  "Absolute path to a shared library with the kernel implementations";
 
+/// @brief Message for labels file
+static const char labels_file_message[] = "Labels file path. The labels file contains names of the dataset classes";
+
 static const char zero_background_message[] = "\"Zero is a background\" flag. Some networks are trained with a modified"
                                               " dataset where the class IDs "
                                               " are enumerated from 1, but 0 is an undefined \"background\" class"
                                               " (which is never detected)";
 
+static const char plain_output_message[] = "Flag for plain output. Progress is printed on separate lines instead of an in-place status bar";
+
 /// @brief Network type options and their descriptions
 static const char* types_descriptions[][2] = {
     { "C", "classification" },
@@ -156,6 +161,11 @@ DEFINE_string(c, "", custom_cldnn_message);
 /// It is an optional parameter
 DEFINE_string(l, "", custom_cpu_library_message);
 
+/// @brief Flag for printing plain text
+DEFINE_bool(plain, false, plain_output_message);
+
+DEFINE_string(lbl, "", labels_file_message);
+
 /**
  * @brief This function shows a help message
  */
@@ -171,6 +181,7 @@ static void showUsage() {
     }
     std::cout << "    -i <path>                 " << image_message << std::endl;
     std::cout << "    -m <path>                 " << model_message << std::endl;
+    std::cout << "    -lbl <path>               " << labels_file_message << std::endl;
     std::cout << "    -l <absolute_path>        " << custom_cpu_library_message << std::endl;
     std::cout << "    -c <absolute_path>        " << custom_cldnn_message << std::endl;
     std::cout << "    -d <device>               " << target_device_message << std::endl;
@@ -248,7 +259,6 @@ int main(int argc, char *argv[]) {
             // Checking required OD-specific options
             if (FLAGS_ODa.empty()) ee << UserException(11, "Annotations folder is not specified for object detection (missing -a option)");
             if (FLAGS_ODc.empty()) ee << UserException(12, "Classes file is not specified (missing -c option)");
-            if (FLAGS_b > 0) ee << UserException(13, "Batch option other than 0 is not supported for Object Detection networks");
         }
 
         if (!ee.empty()) throw ee;
@@ -311,7 +321,7 @@ int main(int argc, char *argv[]) {
         if (netType == Classification) {
             processor = std::shared_ptr<Processor>(
                     new ClassificationProcessor(FLAGS_m, FLAGS_d, FLAGS_i, FLAGS_b,
-                                                plugin, dumper, FLAGS_l, preprocessingOptions, FLAGS_Czb));
+                                                plugin, dumper, FLAGS_lbl, preprocessingOptions, FLAGS_Czb));
         } else if (netType == ObjDetection) {
             if (FLAGS_ODkind == "SSD") {
                 processor = std::shared_ptr<Processor>(
@@ -329,7 +339,7 @@ int main(int argc, char *argv[]) {
             THROW_USER_EXCEPTION(2) <<  "Processor pointer is invalid" << FLAGS_ppType;
         }
         slog::info << (FLAGS_d.empty() ? "Plugin: " + FLAGS_p : "Device: " + FLAGS_d) << slog::endl;
-        shared_ptr<Processor::InferenceMetrics> pIM = processor->Process();
+        shared_ptr<Processor::InferenceMetrics> pIM = processor->Process(FLAGS_plain);
         processor->Report(*pIM.get());
 
         if (dumper.dumpEnabled()) {
index 29f793b..cabd78b 100644 (file)
@@ -1,6 +1,7 @@
 # Copyright (C) 2018 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
+
 ####################################
 ## All subsequent projects will use C++11
 set (CMAKE_CXX_STANDARD 11)
@@ -17,6 +18,10 @@ if(ENABLE_CLDNN)
     add_subdirectory(cldnn_engine)
 endif()
 
+if (ENABLE_GNA)
+    add_subdirectory(gna_plugin)
+endif()
+
 add_subdirectory(hetero_plugin)
 
 set(InferenceEngine_LIBRARIES inference_engine)
index 75ac39a..372bae8 100644 (file)
@@ -1,6 +1,7 @@
 # Copyright (C) 2018 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
+
 set (TARGET_NAME "clDNNPlugin")
 
 file (GLOB MAIN_SRC
@@ -14,9 +15,7 @@ file (GLOB LIBRARY_HEADERS
 
 addVersionDefines(cldnn_engine.cpp CI_BUILD_NUMBER CLDNN_VERSION)
 
-if(WIN32)
-    add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN)
-endif()
+add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN)
 
 # Create named folders for the sources within the .vcproj
 # Empty name lists them directly under the .vcproj
index 4720eda..4b79fe6 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -20,6 +19,7 @@
 #include "ie_plugin.hpp"
 #include "ie_plugin_config.hpp"
 #include "details/caseless.hpp"
+#include <details/ie_cnn_network_tools.h>
 
 #undef min
 #undef max
@@ -116,7 +116,7 @@ ExecutableNetworkInternal::Ptr clDNNEngine::LoadExeNetworkImpl(InferenceEngine::
 INFERENCE_PLUGIN_API(StatusCode) CreatePluginEngine(IInferencePlugin *&plugin, ResponseDesc *resp) noexcept {
     try {
         plugin = make_ie_compatible_plugin(
-                {1, 4,
+                {1, 5,
 #ifdef CLDNN_VERSION
                  CLDNN_VERSION,
 #else
@@ -139,27 +139,26 @@ void clDNNEngine::QueryNetwork(const ICNNNetwork& network, QueryNetworkResult& r
 }
 
 void clDNNEngine::QueryNetwork(const ICNNNetwork& network, const std::map<std::string, std::string>& config, QueryNetworkResult& res) const {
-    details::CNNNetworkIterator i(const_cast<ICNNNetwork *>(&network));
-
     std::vector <CNNLayer::Ptr> concats;
-    std::vector <CNNLayer::Ptr> constantBlobs;
-
-    while (i != details::CNNNetworkIterator()) {
-        CNNLayer::Ptr layer = *i;
+    std::vector <CNNLayer::Ptr> nextLayerDependent;
 
+    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network);
+    for (auto layer : sortedLayers) {
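+        // The first four branches are intentionally empty: those layer types are
+        // never reported as supported here, so they fall back to another device.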
         if (CaselessEq<std::string>()(layer->type, "DetectionOutput")) {
         } else if (CaselessEq<std::string>()(layer->type, "PriorBox")) {
         } else if (CaselessEq<std::string>()(layer->type, "Proposal")) {
         } else if (CaselessEq<std::string>()(layer->type, "SimplerNMS")) {
         } else if (CaselessEq<std::string>()(layer->type, "Concat")) {
             concats.push_back(layer);
+        } else if (CaselessEq<std::string>()(layer->type, "reshape")) {
+            nextLayerDependent.push_back(layer);
+        } else if (CaselessEq<std::string>()(layer->type, "permute")) {
+            nextLayerDependent.push_back(layer);
         } else if (CaselessEq<std::string>()(layer->type, "Const")) {
-            constantBlobs.push_back(layer);
+            nextLayerDependent.push_back(layer);
         } else if (CLDNNGraph::IsLayerSupported(layer->type)) {
-            res.supportedLayers.insert((*i)->name);
+            res.supportedLayers.insert(layer->name);
         }
-
-        i++;
     }
 
     // evaluation of concats - if all parent layers are supported, only in this case we
@@ -169,7 +168,10 @@ void clDNNEngine::QueryNetwork(const ICNNNetwork& network, const std::map<std::s
         bool supported = true;
         for (DataWeakPtr insData : concat->insData) {
             CNNLayerPtr prev = insData.lock()->getCreatorLayer().lock();
-            if (res.supportedLayers.find(prev->name) == res.supportedLayers.end()) {
+            // Verify whether the previous layer is unsupported or still undecided;
+            // undecided layers are assigned to the GPU when the next layer is assigned to the GPU.
+            if (res.supportedLayers.find(prev->name) == res.supportedLayers.end()
+                && std::find(nextLayerDependent.begin(), nextLayerDependent.end(), prev) == nextLayerDependent.end()) {
                 supported = false;
             }
         }
@@ -179,16 +181,21 @@ void clDNNEngine::QueryNetwork(const ICNNNetwork& network, const std::map<std::s
 
     // evaluation of constant blobs - if all consumers are on GPU,
     // then leave it on GPU, else - move to other device
-    for (const auto &cblob : constantBlobs) {
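+    // Iterate in reverse topological order so that each undecided layer can use
+    // the decisions already made for its consumers.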
+    for (auto cnl = nextLayerDependent.rbegin();
+        cnl != nextLayerDependent.rend();
+        cnl++) {
         bool supported = true;
-        for (DataPtr out : cblob->outData) {
-            CNNLayerPtr prev = out->getCreatorLayer().lock();
-            if (res.supportedLayers.find(prev->name) == res.supportedLayers.end()) {
-                supported = false;
+        for (DataPtr out : (*cnl)->outData) {
+            for (auto ol : out->inputTo) {
+                if (res.supportedLayers.find(ol.second->name) == res.supportedLayers.end()) {
+                    supported = false;
+                }
             }
         }
+
         if (supported)
-            res.supportedLayers.insert(cblob->name);
+            res.supportedLayers.insert((*cnl)->name);
     }
 }
 
index 01bd1d1..6de94cf 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 6fbd246..fe61da1 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -41,6 +40,7 @@
 #include <CPP/arg_max_min.hpp>
 #include <CPP/mvn.hpp>
 #include <CPP/tile.hpp>
+#include <CPP/border.hpp>
 #include <CPP/lstm.hpp>
 #include <chrono>
 #include <cmath>
@@ -50,6 +50,9 @@
 #include <description_buffer.hpp>
 #include <cldnn/cldnn_config.hpp>
 #include <graph_tools.hpp>
+#include <ie_layers_internal.hpp>
+#include <net_pass.h>
+#include <ie_layers_prv.h>
 #include "cldnn_infer_request.h"
 #include <cpp_interfaces/ie_executor_manager.hpp>
 #include "details/caseless.hpp"
@@ -306,6 +309,11 @@ CLDNNGraph::CLDNNGraph(InferenceEngine::ICNNNetwork& network, const Config& conf
         _taskExecutor = executorManager->getExecutor(TargetDeviceInfo::name(TargetDevice::eGPU));
     }
 
+    bool res = NetPass::CombineLSTMSeq(network) || NetPass::UnrollTI(network);
+    if (!res)
+        THROW_CLDNN_EXCEPTION("Plugin doesn't support Tensor Iterator in pure form. "
+                              "None of the TI optimization patterns could be applied successfully");
+
     if (max_batch > 1) {
         // check topology for applicability
         if (!CanProcessDynBatch(network)) {
@@ -563,6 +571,8 @@ CLDNNGraph::LayerType CLDNNGraph::LayerTypeFromStr(const std::string &str) {
         { "MVN" , MVN },
         { "Unpooling" , Unpooling },
         { "Tile" , Tile },
+        { "Pad" , Pad },
+        { "LSTMCell" , LSTMCell },
         { "RNN" , RNN },
     };
     auto it = LayerNameToType.find(str);
@@ -604,7 +614,6 @@ cldnn::eltwise_mode CLDNNGraph::EltwiseModeFromIEEltwise(InferenceEngine::Eltwis
 cldnn::concatenation::concatenation_axis CLDNNGraph::ConcatAxisFromIEAxis(unsigned axis) {
     switch (axis) {
     case 0:
-        THROW_CLDNN_EXCEPTION("Unsupported concatenation axis: " << axis);  // Currently unsupported (although existing in the API)
         return cldnn::concatenation::concatenation_axis::along_b;
     case 1:
         return cldnn::concatenation::concatenation_axis::along_f;
@@ -946,6 +955,8 @@ void CLDNNGraph::CreateSingleLayerPrimitive(InferenceEngine::CNNLayerPtr &layer)
             break;
         case MVN: CreateMVNPrimitive(layer);
             break;
+        case LSTMCell: CreateLSTMCellPrimitive(layer);
+            break;
         case RNN: CreateRNNPrimitive(layer);
             break;
         case RegionYolo: CreateYOLO2RegionPrimitive(layer);
@@ -954,6 +965,8 @@ void CLDNNGraph::CreateSingleLayerPrimitive(InferenceEngine::CNNLayerPtr &layer)
             break;
         case Tile: CreateTilePrimitive(layer);
             break;
+        case Pad: CreatePadPrimitive(layer);
+            break;
         default: THROW_CLDNN_EXCEPTION("Unknown Layer Type: " << layer->type);
     }
 }
@@ -1076,20 +1089,6 @@ void CLDNNGraph::CreatePReLUPrimitive(InferenceEngine::CNNLayerPtr &layer) {
         THROW_CLDNN_EXCEPTION("Data inserted into PreLu " << preluLayer->name << " is nullptr");
     }
     auto inputDims = inDataPtr->dims;
-    if (inputDims.size() == 2) {
-        // WA for FC output as BF instead of BX
-        // todo: remove this once FC output is changed in clDNN
-        cldnn::primitive_id reshapeID = preluLayer->name + m_workaroundTag;
-        m_topology->add(cldnn::reshape(
-            reshapeID,
-            inputPrimitives[0],
-            cldnn::tensor(TensorValue(inputDims[1]), TensorValue(inputDims[0]), 1, 1)));
-        m_env.primitiveIDs[inputPrimitives[0]] = reshapeID;
-        inputPrimitives[0] = reshapeID;
-        m_env.primitiveIDs[reshapeID] = reshapeID;
-        m_env.profilingIDs.insert(reshapeID);
-    }
-
     static const std::string blobName("weights");
     ValidateGenericLayerBlobs(preluLayer, { blobName });
 
@@ -1400,10 +1399,11 @@ void CLDNNGraph::CreateDeconvolutionPrimitive(InferenceEngine::CNNLayerPtr &laye
     std::vector<cldnn::primitive_id> weightPrimID;
     std::vector<cldnn::primitive_id> biasPrimID;
     CreateWeightAndBiasPrimitives(layer, weightPrimID, biasPrimID);
+    auto allPads = getPaddings(*deconvLayer);
     cldnn::tensor stride = cldnn::tensor(cldnn::batch(1), cldnn::feature(1),
                                          cldnn::spatial(deconvLayer->_stride[X_AXIS], deconvLayer->_stride[Y_AXIS]));
     cldnn::tensor padding = cldnn::tensor(cldnn::batch(0), cldnn::feature(0),
-                                         cldnn::spatial(-deconvLayer->_padding[X_AXIS], -deconvLayer->_padding[Y_AXIS]));
+                                         cldnn::spatial(-allPads.begin[X_AXIS], -allPads.begin[Y_AXIS]));
 
     auto deconvPrim = cldnn::deconvolution(deconvLayer->name,
         inputPrimitives[0],
@@ -1907,8 +1907,9 @@ void CLDNNGraph::CreateFusedSplitConvMergePrimitive(InferenceEngine::CNNLayerPtr
 
     cldnn::tensor stride = cldnn::tensor(cldnn::batch(1), cldnn::feature(1),
                                          cldnn::spatial(convLayer1->_stride[X_AXIS], convLayer1->_stride[Y_AXIS]));
+    auto allPad = getPaddings(*convLayer1);
     cldnn::tensor padding = cldnn::tensor(cldnn::batch(0), cldnn::feature(0),
-                                          cldnn::spatial(-convLayer1->_padding[X_AXIS], -convLayer1->_padding[Y_AXIS]));
+                                          cldnn::spatial(-allPad.begin[X_AXIS], -allPad.begin[Y_AXIS]));
     cldnn::tensor dilation = cldnn::tensor(cldnn::batch(1), cldnn::feature(1),
                                            cldnn::spatial(convLayer1->_dilation[X_AXIS], convLayer1->_dilation[Y_AXIS]));
 
@@ -2066,6 +2067,7 @@ void CLDNNGraph::CreatePoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) {
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
     auto poolLayer = dynamic_cast<InferenceEngine::PoolingLayer *> (layer.get());
 
+    auto allPads = getPaddings(*poolLayer);
     if (poolLayer->outData.size() > 1) {
         // max pooling with argmax
         SizeVector argmaxDims;
@@ -2124,7 +2126,7 @@ void CLDNNGraph::CreatePoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) {
             cldnn::spatial(TensorValue(poolLayer->_kernel[X_AXIS]), TensorValue(poolLayer->_kernel[Y_AXIS])),  // size
             cldnn::spatial(TensorValue(poolLayer->_stride[X_AXIS]), TensorValue(poolLayer->_stride[Y_AXIS])),  // stride
                                                                                                    // input offset (padding) - explicit tensor for 0 bf
-            { 0, 0, -TensorValue(poolLayer->_padding[X_AXIS]), -TensorValue(poolLayer->_padding[Y_AXIS]) },
+            { 0, 0, -TensorValue(allPads.begin[X_AXIS]), -TensorValue(allPads.begin[Y_AXIS]) },
             CldnnTensorFromIEDims(poolLayer->outData[0]->dims));
         m_topology->add(poolPrim);
         m_env.primitiveIDs[realOutputID] = poolLayer->name;
@@ -2136,7 +2138,7 @@ void CLDNNGraph::CreatePoolingPrimitive(InferenceEngine::CNNLayerPtr &layer) {
             cldnn::spatial(TensorValue(poolLayer->_kernel[X_AXIS]), TensorValue(poolLayer->_kernel[Y_AXIS])),  // size
             cldnn::spatial(TensorValue(poolLayer->_stride[X_AXIS]), TensorValue(poolLayer->_stride[Y_AXIS])),  // stride
                                                                                                    // input offset (padding) - explicit tensor for 0 bf
-            { 0, 0, -TensorValue(poolLayer->_padding[X_AXIS]), -TensorValue(poolLayer->_padding[Y_AXIS]) },
+            { 0, 0, -TensorValue(allPads.begin[X_AXIS]), -TensorValue(allPads.begin[Y_AXIS]) },
             CldnnTensorFromIEDims(poolLayer->outData[0]->dims));
     m_topology->add(poolPrim);
         m_env.primitiveIDs[poolLayer->name] = poolLayer->name;
@@ -2488,19 +2490,237 @@ void CLDNNGraph::CreateTilePrimitive(InferenceEngine::CNNLayerPtr &layer) {
     m_env.profilingIDs.insert(tileLayer->name);
 }
 
+void CLDNNGraph::CreatePadPrimitive(InferenceEngine::CNNLayerPtr &layer) {
+    ValidateLayer(layer, 1);
+    auto inputPrimitives = GetPrevLayersPrimitives(layer);
+    auto padLayer = dynamic_cast<InferenceEngine::GenericLayer*> (layer.get());
+
+    auto PadTensorFromArgs = [](const std::string &s) -> cldnn::tensor {
+        std::stringstream ss(s);
+        std::string item;
+        std::vector<cldnn::tensor::value_type> elems;
+        while (std::getline(ss, item, ',')) {
+            elems.push_back(static_cast<cldnn::tensor::value_type>(std::atoll(item.c_str())));
+        }
+
+        while (elems.size() < 4) {
+            elems.push_back(0);
+        }
+
+        // Pads arrive from the IR in NCHW order; swap the two spatial values into clDNN's x, y order
+        auto tmp = elems[2];
+        elems[2] = elems[3];
+        elems[3] = tmp;
+
+        return cldnn::tensor(elems, 0);
+    };
+
+    auto pads_begin = PadTensorFromArgs(padLayer->GetParamAsString("pads_begin"));
+    auto pads_end = PadTensorFromArgs(padLayer->GetParamAsString("pads_end"));
+    std::string mode = padLayer->GetParamAsString("pad_mode");
+    float pad_value = padLayer->GetParamAsFloat("pad_value", 0.0f);
+
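+    // "symmetric" mirrors including the border element (mirror);
+    // "reflect" mirrors without repeating it (mirror_101).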
+    cldnn::border_type border_mode;
+    if (mode == "constant")
+        border_mode = cldnn::border_type::constant;
+    else if (mode == "edge")
+        border_mode = cldnn::border_type::edge;
+    else if (mode == "symmetric")
+        border_mode = cldnn::border_type::mirror;
+    else if (mode == "reflect")
+        border_mode = cldnn::border_type::mirror_101;
+    else
+        THROW_CLDNN_EXCEPTION("Invalid border mode " << mode << " in layer " << padLayer->name);
+
+    auto borderPrim = cldnn::border(
+            padLayer->name,
+            inputPrimitives[0],
+            pads_begin,
+            pads_end,
+            border_mode,
+            pad_value);
+
+    m_env.primitiveIDs[padLayer->name] = padLayer->name;
+    m_topology->add(borderPrim);
+    m_env.profilingIDs.insert(padLayer->name);
+}
+
 std::string get_string_id(size_t i) {
     std::stringstream ss;
     ss << std::setw(5) << std::setfill('0') << i;
     return ss.str();
 }
 
-void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) {
+void CLDNNGraph::CreateLSTMCellPrimitive(InferenceEngine::CNNLayerPtr &layer) {
+    int lstm_batch_size, lstm_input_size, lstm_hidden_size;
     SizeVector in_dims1, in_dims2;
-    bool hasInitialHidden = false, hasInitialCell = false, hasBias = false;
-    bool swap_state = layer->params["swap_state"] == "YES";
+    bool hasBias = false;
     auto inputPrimitives = GetPrevLayersPrimitives(layer);
 
+    auto elementSize = cldnn::data_type_traits::size_of(m_networkPrecision);
+    cldnn::primitive_id weightID = layer->name + m_weightsTag;
+    cldnn::primitive_id recurrentID = layer->name + "_recurrent" + m_weightsTag;
+    cldnn::primitive_id biasID = layer->name + m_biasesTag;
+    auto cellLayer = dynamic_cast<InferenceEngine::LSTMCell*> (layer.get());
+
+    /* check incoming CNN layer and setup required variables */
+    {
+        auto in_data0 = layer->insData[0].lock();
+        if (!in_data0)
+            THROW_IE_EXCEPTION << "Missing first input for LSTMCell layer " << layer->name;
+
+        auto in_dims0 = in_data0->dims;
+        auto out_dims0 = layer->outData[0]->dims;
+
+        lstm_input_size = in_dims0[0];
+        lstm_batch_size = in_dims0[1];
+        lstm_hidden_size = out_dims0[0];
+
+        /* the initial hidden and cell state arrive as the second and third
+        inputs; both are required for LSTMCell */
+
+        auto in_data1 = layer->insData[1].lock();
+        if (!in_data1)
+            THROW_IE_EXCEPTION << "Missing second input for LSTMCell layer " << layer->name;
+        in_dims1 = in_data1->dims;
+
+        auto in_data2 = layer->insData[2].lock();
+        if (!in_data2)
+            THROW_IE_EXCEPTION << "Missing third input for LSTMCell layer " << layer->name;
+        in_dims2 = in_data2->dims;
+
+        if (in_dims0.size() != 2 || in_dims1.size() != 2 || in_dims2.size() != 2)
+            THROW_IE_EXCEPTION << "Wrong input shapes for LSTMCell Layer " << layer->name;
+    }
+
+    /*
+     * Prepare weight/bias memory primitives:
+     *   - split weight blob into W and R
+     *   - rearrange gate order from FICO layout in IR to IOFC expected by clDNN
+     */
+    {
+        cldnn::tensor wTensor = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(lstm_input_size, 4 * lstm_hidden_size));
+        cldnn::tensor rTensor = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(lstm_hidden_size, 4 * lstm_hidden_size));
+        cldnn::layout WLayout = cldnn::layout(m_networkPrecision, m_defaultFormat, wTensor);
+        cldnn::layout RLayout = cldnn::layout(m_networkPrecision, m_defaultFormat, rTensor);
+
+        auto wmem = cldnn::memory::allocate(*(m_env.engine), WLayout);
+        auto wtmpPointer = wmem.pointer<char>();  // implicitly maps buffer - unmap in destructor
+
+        auto rmem = cldnn::memory::allocate(*(m_env.engine), RLayout);
+        auto rtmpPointer = rmem.pointer<char>();
+
+        // FICO -> IOFC
+        const std::vector<size_t> gate_offs{2, 0, 3, 1};
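+        // gate_offs maps each source gate in FICO order to its slot in the IOFC layout: F->2, I->0, C->3, O->1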
+
+        auto wLayer = dynamic_cast<InferenceEngine::WeightableLayer *> (layer.get());
+        auto pWeightsBlob = wLayer->_weights;
+        auto blobBytes = static_cast<const char *>(pWeightsBlob->buffer());
+        const size_t WchunkSz = lstm_input_size * elementSize;
+        const size_t RchunkSz = lstm_hidden_size * elementSize;
+
+        for (int g = 0; g < 4; g++) {
+            auto wBytes = wtmpPointer.data() + gate_offs[g] * lstm_hidden_size * WchunkSz;
+            auto rBytes = rtmpPointer.data() + gate_offs[g] * lstm_hidden_size * RchunkSz;
+            for (int h = 0; h < lstm_hidden_size; h++) {
+                // copy "input size" elements to W
+                for (size_t b = 0; b < WchunkSz; b++) {
+                    wBytes[b] = blobBytes[b];
+                }
+                blobBytes += WchunkSz;
+                wBytes += WchunkSz;
+
+                // copy "lstm_hidden_size" elements to R
+                for (size_t b = 0; b < RchunkSz; b++) {
+                    rBytes[b] = blobBytes[b];
+                }
+                blobBytes += RchunkSz;
+                rBytes += RchunkSz;
+            }
+        }
+
+        m_topology->add(cldnn::data(weightID, wmem));
+        m_topology->add(cldnn::data(recurrentID, rmem));
+
+        /* create bias memory primitive */
+        auto pBiasBlob = wLayer->_biases;
+        if (pBiasBlob != nullptr) {
+            cldnn::tensor bTensor = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(4 * lstm_hidden_size, 1));
+            cldnn::layout BLayout = cldnn::layout(m_networkPrecision, m_defaultFormat, bTensor);
+
+            auto bmem = cldnn::memory::allocate(*(m_env.engine), BLayout);
+            auto btmpPointer = bmem.pointer<char>();
+
+            auto blobBytes = static_cast<const char *>(pBiasBlob->buffer());
+            const size_t BchunkSz = lstm_hidden_size * elementSize;
+
+            for (int g = 0; g < 4; g++) {
+                auto bBytes = btmpPointer.data() + gate_offs[g] * BchunkSz;
+                // copy "lstm_hidden_size" elements to B
+                for (size_t b = 0; b < BchunkSz; b++) {
+                    bBytes[b] = blobBytes[b];
+                }
+                blobBytes += BchunkSz;
+            }
+
+            m_topology->add(cldnn::data(biasID, bmem));
+            hasBias = true;
+        }
+    }
+
+    cldnn::primitive_id inReshapeID = layer->name + "_inReshape";
+    cldnn::primitive_id permuteID = layer->name + "_inputReorder";
+    cldnn::primitive_id inHiddenReshapeID = layer->name + "_inHiddenReshape";
+
+    cldnn::tensor inputShape = { lstm_batch_size, 1, lstm_input_size, 1 };
+    cldnn::tensor hiddenStateShape = { lstm_batch_size, 1, lstm_hidden_size, 1 };
+    cldnn::layout inputLayout = cldnn::layout(m_networkPrecision, cldnn::format::bfyx, inputShape);
+    m_topology->add(cldnn::reshape(inReshapeID, inputPrimitives[0], inputShape));
+    m_topology->add(cldnn::reorder(permuteID, inReshapeID, inputLayout));
+
+    m_topology->add(cldnn::reshape(inHiddenReshapeID+"_1", inputPrimitives[1], hiddenStateShape));
+    m_topology->add(cldnn::reshape(inHiddenReshapeID+"_2", inputPrimitives[2], hiddenStateShape));
+
+    cldnn::tensor hiddenSz = cldnn::tensor{ 1, lstm_batch_size, lstm_hidden_size, 1 };
+    cldnn::tensor cellCropSz = cldnn::tensor{0, 1, 0, 0};
+    std::string hiddenInStr = inHiddenReshapeID+"_1";
+    std::string cellInStr = inHiddenReshapeID+"_2";
+
+    std::string lstm_gemm_id = layer->name + "_lstm_gemm";
+    std::string lstm_elt_id = layer->name + "_lstm_elt";
+    std::string crop_id = layer->name + "_crop";
+
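+    // The cell is decomposed into clDNN primitives: lstm_gemm computes the input and
+    // recurrent projections (plus bias), lstm_elt applies the gate activations, and
+    // two crops split the combined result into the hidden and cell state outputs.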
+    m_topology->add(cldnn::lstm_gemm(lstm_gemm_id, permuteID,
+                                     weightID, recurrentID,
+                                     hasBias ? biasID : "",
+                                     hiddenInStr));
+    m_topology->add(cldnn::lstm_elt(lstm_elt_id, lstm_gemm_id,
+                                    cellInStr));
+
+    cldnn::primitive_id outputHiddenID = layer->name;
+    m_topology->add(cldnn::crop(outputHiddenID, lstm_elt_id, hiddenSz, cldnn::tensor{0, 0, 0, 0}));
+    m_env.primitiveIDs[outputHiddenID] = outputHiddenID;
+    m_env.primitiveIDs[layer->outData[0]->name] = outputHiddenID;
+
+    cldnn::primitive_id outputCellID = layer->outData[1]->name;
+    m_topology->add(cldnn::crop(outputCellID, lstm_elt_id, hiddenSz, cellCropSz));
+    m_env.primitiveIDs[outputCellID] = outputCellID;
+
+    m_env.profilingIDs.insert(layer->name);
+}
+
+void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) {
+    int lstm_batch_size, lstm_sequence_len, lstm_input_size, lstm_hidden_size;
+    SizeVector in_dims1, in_dims2;
+    bool hasInitialHidden = false, hasInitialCell = false, hasBias = false, isForward = true;
+    auto inputPrimitives = GetPrevLayersPrimitives(layer);
 
     auto elementSize = cldnn::data_type_traits::size_of(m_networkPrecision);
     cldnn::primitive_id weightID = layer->name + m_weightsTag;
@@ -2510,7 +2730,7 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) {
 
     /* check incoming CNN layer and setup required variables */
     {
-        if (rnnLayer->cellType != LSTM)
+        if (rnnLayer->cellType != "LSTM")
          THROW_IE_EXCEPTION << "RNN layer supports only LSTM like cell";
 
         auto in_data0 = layer->insData[0].lock();
@@ -2520,7 +2740,7 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) {
         auto in_dims0 = in_data0->dims;
         auto out_dims0 = layer->outData[0]->dims;
 
-        if (1 == rnnLayer->_axis) {
+        if (1 == rnnLayer->axis) {
             lstm_batch_size = in_dims0[2];
             lstm_sequence_len = in_dims0[1];
         } else {
@@ -2535,18 +2755,22 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) {
         if blobs are not null, direct the data from them
         into corresponding LSTM inputs */
 
-        auto in_data1 = layer->insData[swap_state ? 2 : 1].lock();
+        auto in_data1 = layer->insData[1].lock();
         if (in_data1) {
             in_dims1 = in_data1->dims;
             hasInitialHidden = true;
         }
 
-        auto in_data2 = layer->insData[swap_state ? 1 : 2].lock();
+        auto in_data2 = layer->insData[2].lock();
         if (in_data2) {
             in_dims2 = in_data2->dims;
             hasInitialCell = true;
         }
 
+        if (rnnLayer->direction != RNNLayer::RNN_FWD && rnnLayer->direction != RNNLayer::RNN_BWD)
+            THROW_IE_EXCEPTION << "Support only forward and backward direction for RNN Layer " << layer->name;
+        isForward = rnnLayer->direction == RNNLayer::RNN_FWD;
+
         if (in_dims0.size() != 3 || in_dims1.size() != 2 || in_dims2.size() != 2)
             THROW_IE_EXCEPTION << "Wrong input shapes for RNN Layer " << layer->name;
     }
@@ -2650,15 +2874,16 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) {
 
     cldnn::tensor hiddenSz = cldnn::tensor{ 1, lstm_batch_size, lstm_hidden_size, 1 };
     cldnn::tensor cellCropSz = cldnn::tensor{0, 1, 0, 0};
-    std::string hiddenStr = hasInitialHidden ? (swap_state ? inHiddenReshapeID+"_2" : inHiddenReshapeID+"_1") : "";
-    std::string cellStr = hasInitialCell ? (swap_state ? inHiddenReshapeID+"_1" : inHiddenReshapeID+"_2") : "";
+    std::string hiddenStr = hasInitialHidden ? inHiddenReshapeID+"_1" : "";
+    std::string cellStr = hasInitialCell ? inHiddenReshapeID+"_2" : "";
 
     for (int i = 0; i < lstm_sequence_len; ++i) {
         std::string lstm_gemm_id = layer->name + "_lstm_gemm" + get_string_id(i);
         std::string lstm_elt_id = layer->name + "_lstm_elt" + get_string_id(i);
         std::string crop_id = layer->name + "_crop" + get_string_id(i);
 
-        m_topology->add(cldnn::lstm_gemm(lstm_gemm_id, inputSplitID + ":" + get_string_id(i),
+        int seqIdx = isForward ? i : lstm_sequence_len - 1 - i;
+        m_topology->add(cldnn::lstm_gemm(lstm_gemm_id, inputSplitID + ":" + get_string_id(seqIdx),
                                             weightID, recurrentID,
                                             hasBias ? biasID : "",
                                             hiddenStr));
@@ -2675,14 +2900,14 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) {
         } else {
             // last hidden state crop (output 2)
             if (layer->outData.size() > 1) {
-                cldnn::primitive_id outputHiddenID = layer->outData[swap_state ? 2 : 1]->name;
+                cldnn::primitive_id outputHiddenID = layer->outData[1]->name;
                 m_env.primitiveIDs[hiddenStr] = hiddenStr;
                 m_env.primitiveIDs[outputHiddenID] = hiddenStr;
             }
 
             // last cell state crop (output 3)
             if (layer->outData.size() > 2) {
-                cldnn::primitive_id outputCellID = layer->outData[swap_state ? 1 : 2]->name;
+                cldnn::primitive_id outputCellID = layer->outData[2]->name;
                 auto cropPrim = cldnn::crop(outputCellID, lstm_elt_id, hiddenSz, cellCropSz);
                 m_topology->add(cropPrim);
                 m_env.primitiveIDs[outputCellID] = outputCellID;
@@ -2690,13 +2915,15 @@ void CLDNNGraph::CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer) {
         }
     }
 
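+    //  For the backward direction the per-step crops were collected in reverse
+    //  time order; restore the original sequence order before concatenation.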
+    if (!isForward) std::reverse(output_ids_offsets.begin(), output_ids_offsets.end());
+
     // main output (concatenated hidden)
     cldnn::primitive_id concatID = layer->name + "_outputConcat";
     m_topology->add(cldnn::concatenation(concatID, output_ids_offsets, cldnn::concatenation::along_f));
 
     // permute output to [1, batch, sequence, hidden_size]
     cldnn::tensor outputTensor;
-    if (1 == rnnLayer->_axis) {
+    if (1 == rnnLayer->axis) {
         outputTensor = cldnn::tensor(cldnn::batch(1),   cldnn::feature(lstm_batch_size), cldnn::spatial(lstm_hidden_size, lstm_sequence_len));
     } else {
         outputTensor = cldnn::tensor(cldnn::batch(1), cldnn::feature(lstm_sequence_len), cldnn::spatial(lstm_hidden_size, lstm_batch_size));
@@ -2765,8 +2992,9 @@ void CLDNNGraph::CreateConvolutionPrimitive(InferenceEngine::CNNLayerPtr &layer)
 
     cldnn::tensor stride = cldnn::tensor(cldnn::batch(1), cldnn::feature(1),
                                          cldnn::spatial(convLayer->_stride[X_AXIS], convLayer->_stride[Y_AXIS]));
+    auto allPad = getPaddings(*convLayer);
     cldnn::tensor padding = cldnn::tensor(cldnn::batch(0), cldnn::feature(0),
-                                          cldnn::spatial(-convLayer->_padding[X_AXIS], -convLayer->_padding[Y_AXIS]));
+                                          cldnn::spatial(-allPad.begin[X_AXIS], -allPad.begin[Y_AXIS]));
     cldnn::tensor dilation = cldnn::tensor(cldnn::batch(1), cldnn::feature(1),
                                            cldnn::spatial(convLayer->_dilation[X_AXIS], convLayer->_dilation[Y_AXIS]));
 
@@ -2799,12 +3027,16 @@ bool CLDNNGraph::IsValidSplitConvMerge(const InferenceEngine::SplitLayer *splitL
         dynamic_cast<InferenceEngine::ConvolutionLayer *> (GetNextSingleLayer(splitLayer->outData[0]).get());
     auto convLayer2 =
         dynamic_cast<InferenceEngine::ConvolutionLayer *> (GetNextSingleLayer(splitLayer->outData[1]).get());
-    if (!convLayer1 || !convLayer2  // outputs aren't convolutions
-        || convLayer1->precision != convLayer2->precision                       // wrong precision
+    if (!convLayer1 || !convLayer2) {   // outputs aren't convolutions
+        return false;
+    }
+    auto allPad1 = getPaddings(*convLayer1);
+    auto allPad2 = getPaddings(*convLayer2);
+    if (convLayer1->precision != convLayer2->precision                       // wrong precision
         || convLayer1->_fusedWith || convLayer2->_fusedWith                     // convolutions are fused
         || convLayer1->outData.size() != 1 || convLayer2->outData.size() != 1   // more than 1 output for convolutions
-        || convLayer1->_padding[X_AXIS] != convLayer2->_padding[X_AXIS]                     // different padding
-        || convLayer1->_padding[Y_AXIS] != convLayer2->_padding[Y_AXIS]                     // different padding
+        || allPad1.begin[X_AXIS] != allPad2.begin[X_AXIS]                     // different padding
+        || allPad1.begin[Y_AXIS] != allPad2.begin[Y_AXIS]                     // different padding
         || convLayer1->_stride[X_AXIS] != convLayer2->_stride[X_AXIS]                       // different strides
         || convLayer1->_stride[Y_AXIS] != convLayer2->_stride[Y_AXIS]                       // different strides
         || convLayer1->_dilation[X_AXIS] != convLayer2->_dilation[X_AXIS]                   // different dilation
index 52ec068..c26b60a 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -143,6 +142,8 @@ protected:
         MVN,
         Unpooling,
         Tile,
+        Pad,
+        LSTMCell,
         RNN,
         NO_TYPE
     };
@@ -244,7 +245,9 @@ protected:
     void CreateMaxUnpoolingPrimitive(InferenceEngine::CNNLayerPtr &layer);
     void CreateMVNPrimitive(InferenceEngine::CNNLayerPtr &layer);
     void CreateTilePrimitive(InferenceEngine::CNNLayerPtr &layer);
+    void CreatePadPrimitive(InferenceEngine::CNNLayerPtr &layer);
     void CreateRNNPrimitive(InferenceEngine::CNNLayerPtr &layer);
+    void CreateLSTMCellPrimitive(InferenceEngine::CNNLayerPtr &layer);
     void AddConstantBlobInput(InferenceEngine::CNNLayerPtr &layer);
     void CreateCustomLayerPrimitive(InferenceEngine::CNNLayerPtr &layer, CLDNNCustomLayerPtr customLayer);
 };
index a186b77..e36578c 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -521,7 +520,7 @@ void CLDNNInferRequest::InferImpl() {
     IE_PROFILING_AUTO_SCOPE(CLDNN_INFER)
 
     // execute input pre-processing.
-    execDataPreprocessing(_inputs);
+    execDataPreprocessing(_inputs, true);  // "true" stands for serial preprocessing in case of OpenMP
 
     for (auto &item : _inputs) {
         if (m_env.m_max_batch > 1) {
index 688bc0a..5a6de15 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index cd782b2..3001b29 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 5035094..31257da 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index f2f50f1..20b09fb 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 1cc1bfe..445b62a 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 87f78c7..ca9cc27 100644 (file)
@@ -17,17 +17,7 @@ endif()
 file(GLOB_RECURSE SRC *.cpp)
 file(GLOB_RECURSE HDR *.hpp)
 
-if(WIN32)
-    add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_API)
-endif()
-
-if (THREADING STREQUAL "TBB")
-    add_definitions(-DIE_THREAD=IE_THREAD_TBB)
-elseif (THREADING STREQUAL "OMP")
-    add_definitions(-DIE_THREAD=IE_THREAD_OMP)
-else()
-    add_definitions(-DIE_THREAD=IE_THREAD_SEQ)
-endif()
+add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_API)
 
 include_directories (PRIVATE
         ${CMAKE_CURRENT_SOURCE_DIR}/common
@@ -35,18 +25,12 @@ include_directories (PRIVATE
 )
 
 add_library(${TARGET_NAME} SHARED ${SRC} ${HDR})
+set_ie_threading_interface_for(${TARGET_NAME})
 
 set_target_properties(${TARGET_NAME} PROPERTIES OUTPUT_NAME "cpu_extension")
 
-if (THREADING STREQUAL "TBB")
-    target_include_directories(${TARGET_NAME} PUBLIC ${TBB_INCLUDE_DIRS})
-    target_link_libraries(${TARGET_NAME} debug ${TBB_LIBRARIES_RELEASE} optimized ${TBB_LIBRARIES_RELEASE})
-elseif (THREADING STREQUAL "OMP")
-    enable_omp()
-    target_link_libraries(${TARGET_NAME} ${intel_omp_lib})
-endif()
+target_link_libraries(${TARGET_NAME} PRIVATE ${InferenceEngine_LIBRARIES})
 
-target_link_libraries(${TARGET_NAME} ${InferenceEngine_LIBRARIES})
 target_include_directories(${TARGET_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME})
 
index 2fb0243..94aece3 100644 (file)
@@ -3,7 +3,7 @@ CPU Extensions
 
 ## Introducing CPU Extensions
 
-The CPU extensions library contains code of important layers that do not come with the [CPU plugin](./docs/Inference_Engine_Developer_Guide/supported_plugins/CPU.md).
+The CPU extensions library contains implementations of layers that do not come with the [CPU plugin](./docs/IE_DG/supported_plugins/CPU.md).
 You should compile this library and use the <code>AddExtension</code> method in your application to load the extensions for models featuring layers from this library.
 Refer to other samples for <code>AddExtension</code> code examples.
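 
 A minimal sketch of loading the library into a plugin (the dispatcher and extension factory names follow the bundled samples; the device name and empty search path are illustrative):
 
 ```cpp
 #include <ie_plugin_dispatcher.hpp>
 #include <ext_list.hpp>
 
 using namespace InferenceEngine;
 
 int main() {
     // Locate the CPU plugin in the default search paths
     InferencePlugin plugin = PluginDispatcher({""}).getPluginByDevice("CPU");
     // Register the layer implementations from the CPU extensions library
     plugin.AddExtension(std::make_shared<Extensions::Cpu::CpuExtensions>());
     // ... read the network, load it to the plugin, and infer as usual
     return 0;
 }
 ```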
 
@@ -34,8 +34,8 @@ when cross-compiling this library for another platform.
  * SimplerNMS
  * SpatialTransformer
 
-In order to add a new layer, you can use [the extensibility mechanism](./docs/Inference_Engine_Developer_Guide/Integrate_your_kernels_into_IE.md).
+In order to add a new layer, you can use [the extensibility mechanism](./docs/IE_DG/Integrate_your_kernels_into_IE.md).
 
 ## See Also
-* [CPU](./docs/Inference_Engine_Developer_Guide/supported_plugins/CPU.md)
-* [Supported Devices](./docs/Inference_Engine_Developer_Guide/supported_plugins/Supported_Devices.md)
+* [CPU](./docs/IE_DG/supported_plugins/CPU.md)
+* [Supported Devices](./docs/IE_DG/supported_plugins/Supported_Devices.md)
index 38a31d6..ce78847 100644 (file)
@@ -1,7 +1,3 @@
-# Copyright (C) 2018 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-#
-
 #
 # service functions:
 #   set_target_cpu_flags
index d578f98..4c07c2d 100644 (file)
@@ -1,45 +1,18 @@
-# Copyright (C) 2018 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2018 Intel Corporation
 #
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 cmake_minimum_required (VERSION 2.8)
 
-macro(ext_message TRACE_LEVEL)
-    if (${TRACE_LEVEL} STREQUAL FATAL_ERROR)
-        if(InferenceEngine_FIND_REQUIRED)
-            message(FATAL_ERROR "${ARGN}")
-        elseif(NOT InferenceEngine_FIND_QUIETLY)
-            message(WARNING "${ARGN}")
-        endif()
-        return()
-    elseif(NOT InferenceEngine_FIND_QUIETLY)
-        message(${TRACE_LEVEL} "${ARGN}")
-    endif ()
-endmacro()
-
 include(CPUID)
 include(OptimizationFlags)
-
-macro(enable_omp)
-    if(UNIX) # Linux
-        add_definitions(-fopenmp)
-        find_library(intel_omp_lib iomp5
-            PATHS ${InferenceEngine_INCLUDE_DIRS}/../external/omp/lib
-        )
-    elseif(WIN32) # Windows
-        if(${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC)
-            set(OPENMP_FLAGS "/Qopenmp /openmp")
-            set(CMAKE_SHARED_LINKER_FLAGS " ${CMAKE_SHARED_LINKER_FLAGS} /nodefaultlib:vcomp")
-        elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL Intel)
-            set(OPENMP_FLAGS "/Qopenmp /openmp")
-        else()
-            ext_message(WARNING "Unknown compiler ID. OpenMP support is disabled.")
-        endif()
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
-        find_library(intel_omp_lib
-            libiomp5md
-            PATHS "${InferenceEngine_INCLUDE_DIRS}/../lib/intel64/${CMAKE_BUILD_TYPE}"
-        )
-    endif()
-endmacro(enable_omp)
index 76252dd..9bf0400 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 83d09c3..4fcd25c 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 6d5cb10..9070dda 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 5ac185d..7fb57a9 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 5284a2f..6aaf634 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 12b5057..c6efa6c 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index cab24e7..cb00fda 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -82,9 +81,9 @@ void ExtLayerBase::addConfig(const CNNLayer* layer, std::vector<DataConfigurator
         for (size_t i = 0; i < order.size(); i++) order[i] = i;
 
         if (conf.layout == ConfLayout::BLK8 || conf.layout == ConfLayout::BLK16) {
-            if (data_dims.size() != 4)
+            if (data_dims.size() < 4 || data_dims.size() > 5)
                 THROW_IE_EXCEPTION << "Inapplicable blocking layout."
-                                   << "Tensor should be 4D.";
+                                   << " Tensor should be 4D or 5D.";
 
             int blk_size = conf.layout == ConfLayout::BLK8 ? 8 : 16;
 
index b45c0eb..3fa756a 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index f751652..71c9d71 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index e55bad9..acf58fb 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -71,24 +70,24 @@ public:
                                                     static_cast<size_t>(_num_classes),
                                                     static_cast<size_t>(_num_priors),
                                                     4};
-            _decoded_bboxes = InferenceEngine::make_shared_blob<float>({Precision::UNSPECIFIED, bboxes_size, NCHW});
+            _decoded_bboxes = InferenceEngine::make_shared_blob<float>({Precision::FP32, bboxes_size, NCHW});
             _decoded_bboxes->allocate();
 
             InferenceEngine::SizeVector buf_size{static_cast<size_t>(_num),
                                                  static_cast<size_t>(_num_classes),
                                                  static_cast<size_t>(_num_priors)};
-            _buffer = InferenceEngine::make_shared_blob<int>({Precision::UNSPECIFIED, buf_size, {buf_size, {0, 1, 2}}});
+            _buffer = InferenceEngine::make_shared_blob<int>({Precision::I32, buf_size, {buf_size, {0, 1, 2}}});
             _buffer->allocate();
 
             InferenceEngine::SizeVector indices_size{static_cast<size_t>(_num),
                                                      static_cast<size_t>(_num_classes),
                                                      static_cast<size_t>(_num_priors)};
             _indices = InferenceEngine::make_shared_blob<int>(
-                    {Precision::UNSPECIFIED, indices_size, {indices_size, {0, 1, 2}}});
+                    {Precision::I32, indices_size, {indices_size, {0, 1, 2}}});
             _indices->allocate();
 
             InferenceEngine::SizeVector detections_size{static_cast<size_t>(_num * _num_classes)};
-            _detections_count = InferenceEngine::make_shared_blob<int>({Precision::UNSPECIFIED, detections_size, C});
+            _detections_count = InferenceEngine::make_shared_blob<int>({Precision::I32, detections_size, C});
             _detections_count->allocate();
 
             InferenceEngine::SizeVector conf_size = layer->insData[idx_confidence].lock()->dims;
@@ -103,7 +102,7 @@ public:
             _bbox_sizes->allocate();
 
             InferenceEngine::SizeVector num_priors_actual_size{static_cast<size_t>(_num)};
-            _num_priors_actual = InferenceEngine::make_shared_blob<int>({Precision::UNSPECIFIED, num_priors_actual_size, C});
+            _num_priors_actual = InferenceEngine::make_shared_blob<int>({Precision::I32, num_priors_actual_size, C});
             _num_priors_actual->allocate();
 
             addConfig(layer, {DataConfigurator(ConfLayout::PLN),
diff --git a/inference-engine/src/extension/ext_gather.cpp b/inference-engine/src/extension/ext_gather.cpp
new file mode 100644 (file)
index 0000000..27ae077
--- /dev/null
@@ -0,0 +1,305 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ext_list.hpp"
+#include "ext_base.hpp"
+
+#include <cmath>
+#include <string>
+#include <vector>
+#include <cassert>
+#include <algorithm>
+#include <limits>
+#include "ie_parallel.hpp"
+#include "simple_copy.h"
+
+namespace InferenceEngine {
+namespace Extensions {
+namespace Cpu {
+
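+//  Clamp *idx to the range [min, max): values below min become min,
+//  values at or above max become max - 1.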
+inline void clipping(int *idx, const int min, const int max) {
+    (*idx) = ((*idx) > min) ? (*idx) : min;
+    (*idx) = ((*idx) < max) ? (*idx) : (max - 1);
+}
+
+class GatherImpl: public ILayerExecImpl {
+public:
+    StatusCode init(LayerConfig& config, ResponseDesc *resp) noexcept override {
+        for (auto& input : config.inConfs) {
+            for (auto& offset : input.desc.getBlockingDesc().getOffsetPaddingToData()) {
+                if (offset) {
+                    return GENERAL_ERROR;
+                }
+            }
+        }
+        for (auto& output : config.outConfs) {
+            for (auto& offset : output.desc.getBlockingDesc().getOffsetPaddingToData()) {
+                if (offset) {
+                    return GENERAL_ERROR;
+                }
+            }
+        }
+
+        //  Check for holes in tensors
+        SizeVector dictionary_dims = config.inConfs[GATHER_DICTIONARY].desc.getDims();
+        SizeVector indexes_dims = config.inConfs[GATHER_INDEXES].desc.getDims();
+        SizeVector out_dims = config.outConfs[0].desc.getDims();
+        size_t idx_size = 1;
+        for (auto dims : indexes_dims)
+            idx_size *= dims;
+
+        size_t dct_size = 1;
+        for (auto dims : dictionary_dims)
+            dct_size *= dims;
+
+        size_t out_size = 1;
+        for (auto dims : out_dims)
+            out_size *= dims;
+
+        size_t dctSV = config.inConfs[GATHER_DICTIONARY].desc.getBlockingDesc().getStrides()[0];
+        size_t dctDV = config.inConfs[GATHER_DICTIONARY].desc.getBlockingDesc().getBlockDims()[0];
+        size_t idxSV = config.inConfs[GATHER_INDEXES].desc.getBlockingDesc().getStrides()[0];
+        size_t idxDV = config.inConfs[GATHER_INDEXES].desc.getBlockingDesc().getBlockDims()[0];
+        size_t outSV = config.outConfs[0].desc.getBlockingDesc().getStrides()[0];
+        size_t outDV = config.outConfs[0].desc.getBlockingDesc().getBlockDims()[0];
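+        //  A tensor has no "holes" when its outermost stride times its outermost
+        //  block dimension equals the total element count (i.e. it is dense).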
+        if (outSV * outDV == out_size && idxSV * idxDV == idx_size && dctSV * dctDV == dct_size)
+            withHoles = NONE;
+        else if (outSV * outDV != out_size && idxSV * idxDV == idx_size && dctSV * dctDV == dct_size)
+            withHoles = OUTPUT;
+
+        return OK;
+    };
+
+    StatusCode getSupportedConfigurations(std::vector<LayerConfig>& conf, ResponseDesc *resp) noexcept override {
+        if (!errorMsg.empty()) {
+            if (resp) {
+                errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
+            }
+            return GENERAL_ERROR;
+        }
+        conf = confs;
+        return OK;
+    };
+
+    explicit GatherImpl(const CNNLayer* layer) {
+        try {
+            if (layer->insData.size() != 2 || layer->outData.empty())
+                THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
+
+            Precision inIdxPrecision = layer->insData[GATHER_INDEXES].lock()->getTensorDesc().getPrecision();
+            if (inIdxPrecision != Precision::FP32 &&
+                inIdxPrecision != Precision::I32 &&
+                inIdxPrecision != Precision::U16 &&
+                inIdxPrecision != Precision::I16 &&
+                inIdxPrecision != Precision::U8 &&
+                inIdxPrecision != Precision::I8)
+                THROW_IE_EXCEPTION << "Incorrect input precision. Only FP32|I32|U16|I16|U8|I8 are supported!";
+
+            //  Remove redundant dimensions
+            const SizeVector& dictionary_dims = layer->insData[GATHER_DICTIONARY].lock()->getTensorDesc().getDims();
+            size_t actualAxis = 0;
+            SizeVector dims_actual;
+            for (size_t i = 0; i < dictionary_dims.size(); i++) {
+                if (dictionary_dims[i] > 1) {
+                    for (size_t j = i; j < dictionary_dims.size(); j++)
+                        dims_actual.push_back(dictionary_dims[j]);
+                    break;
+                }
+            }
+
+            if (dims_actual.size() == 0)
+                THROW_IE_EXCEPTION << "Incorrect input parameters dimension!";
+
+            axis = static_cast<int>(layer->GetParamAsInt("axis"));
+            // The dictionary must have rank of at least axis + 1
+            if (axis > 0 && (static_cast<int>(dims_actual.size()) - axis) < 1)
+                THROW_IE_EXCEPTION << "Incorrect input parameters dimensions and axis number!";
+            else if (axis < 0 && (static_cast<int>(dims_actual.size()) + axis) < 0)
+                THROW_IE_EXCEPTION << "Incorrect input parameters dimensions and axis number!";
+
+            if (axis < 0)
+                axis += dims_actual.size();
+
+            //  Find number of dictionaries, index range and data length
+            for (size_t i = 0; i < axis; i++)
+                numDictionaries *= dims_actual[i];
+            indexRange = dims_actual[axis];
+            for (size_t i = axis + 1; i < dims_actual.size(); i++)
+                dataLength *= dims_actual[i];
+
+            if (dataLength == 0)
+                THROW_IE_EXCEPTION << "Incorrect input parameters dimension!";
+
+            LayerConfig config;
+            DataConfig dataConfigIdx, dataConfigDct;
+            const SizeVector& indexes_dims = layer->insData[GATHER_INDEXES].lock()->getTensorDesc().getDims();
+            dataConfigDct.desc = TensorDesc(InferenceEngine::Precision(InferenceEngine::Precision::FP32), dictionary_dims, InferenceEngine::Layout::ANY);
+            dataConfigIdx.desc = TensorDesc(inIdxPrecision, indexes_dims, InferenceEngine::Layout::ANY);
+            if (GATHER_DICTIONARY == 0) {
+                config.inConfs.push_back(dataConfigDct);
+                config.inConfs.push_back(dataConfigIdx);
+            } else {
+                config.inConfs.push_back(dataConfigIdx);
+                config.inConfs.push_back(dataConfigDct);
+            }
+
+            DataConfig dataConfigOut;
+            const SizeVector& out_dims = layer->outData[0]->getTensorDesc().getDims();
+            SizeVector blocks = out_dims;
+            SizeVector order(blocks.size());
+            SizeVector dimOffsets(blocks.size());
+            SizeVector strides(blocks.size());
+            size_t offset(std::numeric_limits<size_t>::max());
+            for (size_t i = 0; i < order.size(); i++) {
+                strides[i] = std::numeric_limits<size_t>::max();
+                dimOffsets[i] = 0;
+                order[i] = i;
+            }
+            dataConfigOut.desc = TensorDesc(InferenceEngine::Precision(InferenceEngine::Precision::FP32), out_dims,
+                                                                      { blocks, order, offset, dimOffsets, strides });
+            config.outConfs.push_back(dataConfigOut);
+            config.dynBatchSupport = false;
+            confs.push_back(config);
+        } catch (InferenceEngine::details::InferenceEngineException &ex) {
+            errorMsg = ex.what();
+        }
+    }
+
+    StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs,
+                       ResponseDesc *resp) noexcept override {
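+        //  Dispatch on the index tensor precision; the dictionary and output are FP32.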
+        switch (inputs[GATHER_INDEXES]->precision()) {
+            case Precision::FP32:
+                gather(inputs[GATHER_INDEXES]->cbuffer().as<const float *>(), inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0], withHoles);
+                break;
+            case Precision::I32:
+                gather(inputs[GATHER_INDEXES]->cbuffer().as<const int32_t *>(), inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0], withHoles);
+                break;
+            case Precision::U16:
+                gather(inputs[GATHER_INDEXES]->cbuffer().as<const uint16_t *>(), inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0], withHoles);
+                break;
+            case Precision::I16:
+                gather(inputs[GATHER_INDEXES]->cbuffer().as<const int16_t *>(), inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0], withHoles);
+                break;
+            case Precision::U8:
+                gather(inputs[GATHER_INDEXES]->cbuffer().as<const uint8_t *>(), inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0], withHoles);
+                break;
+            case Precision::I8:
+                gather(inputs[GATHER_INDEXES]->cbuffer().as<const int8_t *>(), inputs[GATHER_INDEXES], inputs[GATHER_DICTIONARY], outputs[0], withHoles);
+                break;
+            default:
+                return GENERAL_ERROR;
+        }
+
+        return OK;
+    }
+
+protected:
+    enum class ConfLayout { ANY, PLN, BLK8, BLK16 };
+    std::string errorMsg;
+    std::vector<LayerConfig> confs;
+
+private:
+    enum HolesMode {
+        NONE = 0,
+        OUTPUT = 1,
+        ALL = 2
+    };
+
+    template <typename data_t>
+    void gather(data_t *src_dataIdx, Blob::Ptr indexes, Blob::Ptr dictionary, Blob::Ptr output, HolesMode withHoles);
+
+    int axis = 0;
+    size_t numDictionaries = 1;
+    size_t indexRange = 0;
+    size_t dataLength = 1;
+    const size_t GATHER_DICTIONARY = 0;
+    const size_t GATHER_INDEXES = 1;
+    HolesMode withHoles = ALL;
+};
+
+template <typename data_t>
+void GatherImpl::gather(data_t *src_dataIdx, Blob::Ptr indexes, Blob::Ptr dictionary, Blob::Ptr output, HolesMode withHoles) {
+    size_t src_dataIdxSize = indexes->size();
+    size_t dataSize = sizeof(float) * dataLength;
+
+    if (withHoles == GatherImpl::NONE) {  //  No holes in tensors
+        const float *src_dataDict = dictionary->cbuffer().as<const float *>() + dictionary->getTensorDesc().getBlockingDesc().getOffsetPadding();
+        float* dst_data = output->cbuffer().as<float *>() + output->getTensorDesc().getBlockingDesc().getOffsetPadding();
+        src_dataIdx += indexes->getTensorDesc().getBlockingDesc().getOffsetPadding();
+
+        if (axis == 0) {
+            parallel_for(src_dataIdxSize, [&](size_t i) {
+                int idx = static_cast<int>(src_dataIdx[i]);
+
+                //  Index clipping
+                clipping(&idx, 0, indexRange);
+
+                //  Copying data to destination from Dictionary
+                simple_copy(&dst_data[dataLength * i],
+                            output->byteSize() - (dataLength * i),
+                            &src_dataDict[dataLength * idx],
+                            dataSize);
+            });
+        } else {
+            parallel_for(src_dataIdxSize, [&](size_t i) {
+                int idx = static_cast<int>(src_dataIdx[i]);
+
+                //  Index clipping
+                clipping(&idx, 0, indexRange);
+
+                //  Copying data to destination from Dictionary
+                for (size_t j = 0; j < numDictionaries; j++) {
+                    simple_copy(&dst_data[dataLength * (i + j * src_dataIdxSize)],
+                                output->byteSize() - (dataLength * (i + j * src_dataIdxSize)),
+                                &src_dataDict[dataLength * (idx + j * indexRange)],
+                                dataSize);
+                }
+            });
+        }
+    } else if (withHoles == GatherImpl::OUTPUT) {  //  If only the output tensor has holes
+        const float *src_dataDict = dictionary->cbuffer().as<const float *>() + dictionary->getTensorDesc().getBlockingDesc().getOffsetPadding();
+        float* dst_data = output->cbuffer().as<float *>();
+        src_dataIdx += indexes->getTensorDesc().getBlockingDesc().getOffsetPadding();
+
+        parallel_for(src_dataIdxSize, [&](size_t i) {
+            int idx = static_cast<int>(src_dataIdx[i]);
+
+            //  Index clipping
+            clipping(&idx, 0, indexRange);
+
+            //  Copying data to destination from Dictionary
+            for (size_t j = 0; j < numDictionaries; j++) {
+                for (size_t k = 0; k < dataLength; k++) {
+                    dst_data[output->getTensorDesc().offset(k + dataLength * (i + j * src_dataIdxSize))] =
+                        src_dataDict[k + dataLength * (idx + j * indexRange)];
+                }
+            }
+        });
+    } else {  //  If input and output tensors have holes
+        const float *src_dataDict = dictionary->cbuffer().as<const float *>();
+        float* dst_data = output->cbuffer().as<float *>();
+
+        parallel_for(src_dataIdxSize, [&](size_t i) {
+            int idx = static_cast<int>(src_dataIdx[indexes->getTensorDesc().offset(i)]);
+
+            //  Index clipping
+            clipping(&idx, 0, indexRange);
+
+            //  Copying data to destination from Dictionary
+            for (size_t j = 0; j < numDictionaries; j++) {
+                for (size_t k = 0; k < dataLength; k++) {
+                    dst_data[output->getTensorDesc().offset(k + dataLength * (i + j * src_dataIdxSize))] =
+                        src_dataDict[dictionary->getTensorDesc().offset(k + dataLength * (idx + j * indexRange))];
+                }
+            }
+        });
+    }
+}
+
+REG_FACTORY_FOR(ImplFactory<GatherImpl>, Gather);
+
+}  // namespace Cpu
+}  // namespace Extensions
+}  // namespace InferenceEngine
index 2524479..4810d9d 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index f37f859..64ff20d 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 4569437..6aa139d 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -82,11 +81,14 @@ void CpuExtensions::collectTypes(char**& types, unsigned int& size, const std::m
     size = count;
 }
 
+}  // namespace Cpu
+}  // namespace Extensions
+
 
 // Exported function
 INFERENCE_EXTENSION_API(StatusCode) CreateExtension(IExtension*& ext, ResponseDesc* resp) noexcept {
     try {
-        ext = new CpuExtensions();
+        ext = new Extensions::Cpu::CpuExtensions();
         return OK;
     } catch (std::exception& ex) {
         if (resp) {
@@ -97,7 +99,15 @@ INFERENCE_EXTENSION_API(StatusCode) CreateExtension(IExtension*& ext, ResponseDe
     }
 }
 
-}  // namespace Cpu
-}  // namespace Extensions
-}  // namespace InferenceEngine
+// Exported function
+INFERENCE_EXTENSION_API(StatusCode) CreateShapeInferExtension(IShapeInferExtension*& ext, ResponseDesc* resp) noexcept {
+    IExtension * pExt = nullptr;
+    StatusCode  result = CreateExtension(pExt, resp);
+    if (result == OK) {
+        ext = pExt;
+    }
 
+    return result;
+}
+
+}  // namespace InferenceEngine
index a9f85a5..6e83e7e 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -36,13 +35,13 @@ public:
 
     void GetVersion(const InferenceEngine::Version*& versionInfo) const noexcept override;
 
-    void SetLogCallback(InferenceEngine::IErrorListener& listener) noexcept override {};
+    void SetLogCallback(InferenceEngine::IErrorListener& /*listener*/) noexcept override {}
 
-    void Unload() noexcept override {};
+    void Unload() noexcept override {}
 
     void Release() noexcept override {
         delete this;
-    };
+    }
 
     static void AddExt(std::string name, ext_factory factory);
 
index 96fca92..27f8b9f 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -53,67 +52,94 @@ public:
         float* src_data = inputs[0]->buffer();
         float* dst_data = outputs[0]->buffer();
 
-        SizeVector dims = inputs[0]->getTensorDesc().getDims();
-
-        int N = static_cast<int>((dims.size() > 0) ? dims[0] : 1);
-        int C = static_cast<int>((dims.size() > 1) ? dims[1] : 1);
-        int H = static_cast<int>((dims.size() > 2) ? dims[2] : 1);
-        int W = static_cast<int>((dims.size() > 3) ? dims[3] : 1);
-
-        if (inputs[0]->layout() == NCHW) {
-            mvn_pln(src_data, dst_data, N, C, H, W);
+        if (inputs[0]->layout() == NCHW || inputs[0]->layout() == NCDHW) {
+            mvn_pln(src_data, dst_data, inputs[0]->getTensorDesc().getDims());
         } else {
-            mvn_blk(src_data, dst_data, N, C, H, W);
+            mvn_blk(src_data, dst_data, inputs[0]->getTensorDesc().getDims());
         }
 
         return OK;
     }
 
 private:
-    void mvn_pln(const float* src_data, float* dst_data, int N, int C, int H, int W);
-    void mvn_blk(const float* src_data, float* dst_data, int N, int C, int H, int W);
+    void mvn_pln(const float* src_data, float* dst_data, const SizeVector& dims);
+    void mvn_blk(const float* src_data, float* dst_data, const SizeVector& dims);
 
     bool across_channels = false;
     bool normalize_variance = true;
     float eps = 1e-9f;
 };
 
-void MVNImpl::mvn_pln(const float* src_data, float* dst_data, int N, int C, int H, int W) {
-    for (int b = 0; b < N; b++) {
+void MVNImpl::mvn_pln(const float* src_data, float* dst_data, const SizeVector& dims) {
+    size_t dims_size = dims.size();
+    size_t N = (dims_size > 0) ? dims[0] : 1lu;
+    size_t C = (dims_size > 1) ? dims[1] : 1lu;
+    size_t D = (dims_size > 4) ? dims[dims_size - 3] : 1lu;
+    size_t H = (dims_size > 3) ? dims[dims_size - 2] : 1lu;
+    size_t W = (dims_size > 2) ? dims[dims_size - 1] : 1lu;
+
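+    //  Precomputed element counts: C1 = one plane (H*W), C2 = one channel (D*H*W),
+    //  C3 = one batch (C*D*H*W)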
+    size_t C1 = H * W;
+    size_t C2 = C1 * D;
+    size_t C3 = C2 * C;
+
+    for (size_t b = 0lu; b < N; b++) {
         // Calculate mean value
+        size_t cb = b * C3;
         if (across_channels) {
             double mean = 0.0;
             mean = parallel_sum(C, mean, [&](int c)->double {
                 double mean_internal = 0.0;
-                for (int h = 0; h < H; h++) {
-                    for (int w = 0; w < W; w++) {
-                        mean_internal += src_data[b*C*H*W + c*H*W + h*W + w];
+                size_t cc = cb + c * C2;
+                for (size_t d = 0lu; d < D; d++) {
+                    size_t cd = cc + d * C1;
+                    for (size_t h = 0lu; h < H; h++) {
+                        size_t ch = cd + h * W;
+                        for (size_t w = 0lu; w < W; w++) {
+                            mean_internal += src_data[ch + w];
+                        }
                     }
                 }
                 return mean_internal;
             });
 
-            mean /= C*H*W;
+            mean /= C3;
             parallel_for(C, [&](int c) {
-                for (int h = 0; h < H; h++) {
-                    for (int w = 0; w < W; w++) {
-                        dst_data[b*C*H*W + c*H*W + h*W + w] = src_data[b*C*H*W + c*H*W + h*W + w] - mean;
+                size_t cc = cb + c * C2;
+                for (size_t d = 0lu; d < D; d++) {
+                    size_t cd = cc + d * C1;
+                    for (size_t h = 0lu; h < H; h++) {
+                        size_t ch = cd + h * W;
+                        for (size_t w = 0lu; w < W; w++) {
+                            size_t cw = ch + w;
+                            dst_data[cw] = src_data[cw] - mean;
+                        }
                     }
                 }
             });
         } else {
-            parallel_for(C, [&](int c) {
-                double mean = 0;
-                for (int h = 0; h < H; h++) {
-                    for (int w = 0; w < W; w++) {
-                        mean += src_data[b*C*H*W + c*H*W + h*W + w];
+            parallel_for(C, [&](size_t c) {
+                double mean = 0.0;
+                size_t cc = cb + c * C2;
+                for (size_t d = 0lu; d < D; d++) {
+                    size_t cd = cc + d * C1;
+                    for (size_t h = 0lu; h < H; h++) {
+                        size_t ch = cd + h * W;
+                        for (size_t w = 0lu; w < W; w++) {
+                            mean += src_data[ch + w];
+                        }
                     }
                 }
-                mean /= H*W;
 
-                for (int h = 0; h < H; h++) {
-                    for (int w = 0; w < W; w++) {
-                        dst_data[b*C*H*W + c*H*W + h*W + w] = src_data[b*C*H*W + c*H*W + h*W + w] - mean;
+                mean /= static_cast<double>(C2);
+
+                for (size_t d = 0lu; d < D; d++) {
+                    size_t cd = cc + d * C1;
+                    for (size_t h = 0lu; h < H; h++) {
+                        size_t ch = cd + h * W;
+                        for (size_t w = 0lu; w < W; w++) {
+                            size_t cw = ch + w;
+                            dst_data[cw] = src_data[cw] - mean;
+                        }
                     }
                 }
             });
@@ -121,44 +147,65 @@ void MVNImpl::mvn_pln(const float* src_data, float* dst_data, int N, int C, int
     }
 
     if (normalize_variance) {
-        for (int b = 0; b < N; b++) {
+        for (size_t b = 0lu; b < N; b++) {
             // Calculate variances value
+            size_t cb = b * C3;
             if (across_channels) {
                 double variance = 0.0;
                 variance = parallel_sum(C, variance, [&](int c)->double {
                     double variance_internal = 0.0;
-                    for (int h = 0; h < H; h++) {
-                        for (int w = 0; w < W; w++) {
-                            variance_internal += std::pow(dst_data[b*C*H*W + c*H*W + h*W + w], 2);
+                    size_t cc = cb + c * C2;
+                    for (size_t d = 0lu; d < D; d++) {
+                        size_t cd = cc + d * C1;
+                        for (size_t h = 0lu; h < H; h++) {
+                            size_t ch = cd + h * W;
+                            for (size_t w = 0lu; w < W; w++) {
+                                variance_internal += std::pow(dst_data[ch + w], 2);
+                            }
                         }
                     }
                     return variance_internal;
                 });
 
-                variance /= C*H*W;
-                variance = std::pow(variance, 0.5f);
+                variance /= C3;
                 variance += eps;
+                variance = std::pow(variance, 0.5f);
                 parallel_for(C, [&](int c) {
-                    for (int h = 0; h < H; h++) {
-                        for (int w = 0; w < W; w++) {
-                            dst_data[b*C*H*W + c*H*W + h*W + w] /= variance;
+                    size_t cc = cb + c * C2;
+                    for (size_t d = 0lu; d < D; d++) {
+                        size_t cd = cc + d * C1;
+                        for (size_t h = 0lu; h < H; h++) {
+                            size_t ch = cd + h * W;
+                            for (size_t w = 0lu; w < W; w++) {
+                                dst_data[ch + w] /= variance;
+                            }
                         }
                     }
                 });
             } else {
-                parallel_for(C, [&](int c) {
-                    double variance = 0;
-                    for (int h = 0; h < H; h++) {
-                        for (int w = 0; w < W; w++) {
-                            variance += std::pow(dst_data[b*C*H*W + c*H*W + h*W + w], 2);
+                parallel_for(C, [&](size_t c) {
+                    double variance = 0.0;
+                    size_t cc = cb + c * C2;
+                    for (size_t d = 0lu; d < D; d++) {
+                        size_t cd = cc + d * C1;
+                        for (size_t h = 0lu; h < H; h++) {
+                            size_t ch = cd + h * W;
+                            for (size_t w = 0lu; w < W; w++) {
+                                variance += std::pow(dst_data[ch + w], 2);
+                            }
                         }
                     }
-                    variance /= H*W;
-                    variance = std::pow(variance, 0.5f);
+
+                    variance /= static_cast<double>(C2);
                     variance += eps;
-                    for (int h = 0; h < H; h++) {
-                        for (int w = 0; w < W; w++) {
-                            dst_data[b*C*H*W + c*H*W + h*W + w] /= variance;
+                    variance = std::pow(variance, 0.5f);
+                    for (size_t d = 0lu; d < D; d++) {
+                        size_t cd = cc + d * C1;
+                        for (size_t h = 0lu; h < H; h++) {
+                            size_t ch = cd + h * W;
+                            for (size_t w = 0lu; w < W; w++) {
+                                dst_data[ch + w] /= variance;
+                            }
                         }
                     }
                 });
@@ -167,11 +214,11 @@ void MVNImpl::mvn_pln(const float* src_data, float* dst_data, int N, int C, int
     }
 }
 
-void MVNImpl::mvn_blk(const float* src_data, float* dst_data, int N, int C, int H, int W) {
+void MVNImpl::mvn_blk(const float* src_data, float* dst_data, const SizeVector& dims) {
 #if defined(HAVE_AVX512F)
     size_t blk_size = 16;
 #else
-    size_t blk_size = 8;
+    size_t blk_size = 8lu;
 #endif
 
 #if defined(HAVE_AVX512F)
@@ -179,116 +226,164 @@ void MVNImpl::mvn_blk(const float* src_data, float* dst_data, int N, int C, int
 #elif defined(HAVE_AVX2)
     typedef __m256 vec_type;
 #endif
+    size_t dims_size = dims.size();
+    size_t N = (dims_size > 0) ? dims[0] : 1lu;
+    size_t C = (dims_size > 1) ? dims[1] : 1lu;
+    size_t D = (dims_size > 4) ? dims[dims_size - 3] : 1lu;
+    size_t H = (dims_size > 3) ? dims[dims_size - 2] : 1lu;
+    size_t W = (dims_size > 2) ? dims[dims_size - 1] : 1lu;
 
     int CB = div_up(C, static_cast<int>(blk_size));
 
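+    //  Precomputed offsets for the blocked layout: C0 = one row of blocks,
+    //  C1 = one plane, C2 = one channel block, C3 = one batch;
+    //  C4/C5 = dense per-channel/per-batch element counts used for averaging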
+    size_t C0 = W * blk_size;
+    size_t C1 = C0 * H;
+    size_t C2 = C1 * D;
+    size_t C3 = C2 * CB;
+    size_t C4 = D * H * W;
+    size_t C5 = C * D * H * W;
+
     if (normalize_variance) {
-        for (int b = 0; b < N; b++) {
+        for (size_t b = 0lu; b < N; b++) {
+            size_t ccb = b * C3;
             if (across_channels) {
-                float mean = 0.0f;
-                mean = parallel_sum2d(CB, H, mean, [&](int cb, int h)->float {
-                    float mean_internal = 0.0;
-                    for (int w = 0; w < W; w++) {
-                        for (int c = 0; c < std::min(blk_size, C - cb * blk_size); c++) {
-                            size_t src_offset = b*CB*H*W*blk_size + cb*H*W*blk_size + h*W*blk_size + w*blk_size + c;
-
-                            mean_internal += src_data[src_offset];
+                double mean = 0.0;
+                mean = parallel_sum3d(CB, D, H, mean, [&](size_t cb, size_t d, size_t h)->double {
+                    size_t ccbd = ccb + cb * C2 + d * C1 + h * C0;
+                    size_t min_cb = std::min(blk_size, C - cb * blk_size);
+                    double mean_internal = 0.0;
+                    for (size_t w = 0lu; w < W; w++) {
+                        size_t cw = ccbd + w * blk_size;
+                        for (size_t c = 0lu; c < min_cb; c++) {
+                            mean_internal += src_data[cw + c];
                         }
                     }
                     return mean_internal;
                 });
 
-                mean /= C * H * W;
-
-                float variance = 0.0f;
-                variance = parallel_sum2d(CB, H, variance, [&](int cb, int h)->float {
-                    float variance_internal = 0.0;
-                    for (int w = 0; w < W; w++) {
-                        for (int c = 0; c < std::min(blk_size, C - cb * blk_size); c++) {
-                            size_t src_offset = b*CB*H*W*blk_size + cb*H*W*blk_size + h*W*blk_size + w*blk_size + c;
+                mean /= static_cast<double>(C5);
 
-                            variance_internal += std::pow(src_data[src_offset] - mean, 2);
+                double variance = 0.0;
+                variance = parallel_sum3d(CB, D, H, variance, [&](size_t cb, size_t d, size_t h)->double {
+                    size_t ccbd = ccb + cb * C2 + d * C1 + h * C0;
+                    size_t min_cb = std::min(blk_size, C - cb * blk_size);
+                    double variance_internal = 0.0;
+                    for (size_t w = 0lu; w < W; w++) {
+                        size_t cw = ccbd + w * blk_size;
+                        for (size_t c = 0lu; c < min_cb; c++) {
+                            variance_internal += std::pow(static_cast<double>(src_data[cw + c]) - mean, 2);
                         }
                     }
                     return variance_internal;
                 });
 
-                variance /= C*H*W;
-                variance = std::pow(variance, 0.5f);
+                variance /= static_cast<double>(C5);
                 variance += eps;
+                variance = std::pow(variance, 0.5f);
 
-                parallel_for2d(CB, H, [&](int cb, int h) {
-                    for (int w = 0; w < W; w++) {
-                        for (int c = 0; c < std::min(blk_size, C - cb * blk_size); c++) {
-                            size_t src_offset = b*CB*H*W*blk_size + cb*H*W*blk_size + h*W*blk_size + w*blk_size + c;
+                parallel_for3d(CB, D, H, [&](size_t cb, size_t d, size_t h) {
+                    size_t ccbd = ccb + cb * C2 + d * C1 + h * C0;
+                    size_t min_cb = std::min(blk_size, C - cb * blk_size);
+                    for (size_t w = 0lu; w < W; w++) {
+                        size_t cw = ccbd + w * blk_size;
+                        for (size_t c = 0lu; c < min_cb; c++) {
+                            size_t src_offset = cw + c;
 
-                            dst_data[src_offset] = (src_data[src_offset] - mean) / variance;
+                            dst_data[src_offset] = (static_cast<double>(src_data[src_offset]) - mean) / variance;
                         }
                     }
                 });
             } else {
-                parallel_for(CB, [&](int cb) {
-                    size_t src_off = b*CB*H*W*blk_size + cb*H*W*blk_size;
+                parallel_for(CB, [&](size_t cb) {
+                    size_t min_cb = std::min(blk_size, C - cb * blk_size);
+                    size_t src_off = ccb + cb * C2;
 #if defined(HAVE_AVX2) || defined(HAVE_AVX512F)
                     vec_type vmean = _mm_uni_setzero_ps();
-                    for (int h = 0; h < H; h++) {
-                        for (int w = 0; w < W; w++) {
-                            vec_type vsrc = _mm_uni_loadu_ps(src_data + src_off + h*W*blk_size + w*blk_size);
-                            vmean = _mm_uni_add_ps(vmean, vsrc);
+                    for (size_t d = 0lu; d < D; d++) {
+                        size_t cd = src_off + d * C1;
+                        for (size_t h = 0lu; h < H; h++) {
+                            size_t ch = cd + h * C0;
+                            for (size_t w = 0lu; w < W; w++) {
+                                vec_type vsrc = _mm_uni_loadu_ps(src_data + ch + w * blk_size);
+                                vmean = _mm_uni_add_ps(vmean, vsrc);
+                            }
                         }
                     }
 
-                    vec_type vsize = _mm_uni_set1_ps(static_cast<float>(H * W));
+                    vec_type vsize = _mm_uni_set1_ps(static_cast<float>(D * H * W));
                     vmean = _mm_uni_div_ps(vmean, vsize);
 
                     vec_type vvariance = _mm_uni_setzero_ps();
-                    for (int h = 0; h < H; h++) {
-                        for (int w = 0; w < W; w++) {
-                            vec_type vsrc = _mm_uni_loadu_ps(src_data + src_off + h*W*blk_size + w*blk_size);
-                            vsrc = _mm_uni_sub_ps(vsrc, vmean);
-                            vvariance = _mm_uni_add_ps(vvariance, _mm_uni_mul_ps(vsrc, vsrc));
+                    for (size_t d = 0lu; d < D; d++) {
+                        size_t cd = src_off + d * C1;
+                        for (size_t h = 0lu; h < H; h++) {
+                            size_t ch = cd + h * C0;
+                            for (size_t w = 0lu; w < W; w++) {
+                                vec_type vsrc = _mm_uni_loadu_ps(src_data + ch + w * blk_size);
+                                vsrc = _mm_uni_sub_ps(vsrc, vmean);
+                                vvariance = _mm_uni_add_ps(vvariance, _mm_uni_mul_ps(vsrc, vsrc));
+                            }
                         }
                     }
-
                     vvariance = _mm_uni_div_ps(vvariance, vsize);
-                    vvariance = _mm_uni_sqrt_ps(vvariance);
 
                     vec_type veps = _mm_uni_set1_ps(eps);
                     vvariance = _mm_uni_add_ps(vvariance, veps);
 
-                    for (int h = 0; h < H; h++) {
-                        for (int w = 0; w < W; w++) {
-                            vec_type vsrc = _mm_uni_loadu_ps(src_data + src_off + h*W*blk_size + w*blk_size);
-                            vsrc = _mm_uni_sub_ps(vsrc, vmean);
-                            _mm_uni_storeu_ps(dst_data + src_off + h*W*blk_size + w*blk_size, _mm_uni_div_ps(vsrc, vvariance));
+                    vvariance = _mm_uni_sqrt_ps(vvariance);
+
+                    for (size_t d = 0lu; d < D; d++) {
+                        size_t cd = src_off + d * C1;
+                        for (size_t h = 0lu; h < H; h++) {
+                            size_t ch = cd + h * C0;
+                            for (size_t w = 0lu; w < W; w++) {
+                                size_t offset = ch + w * blk_size;
+                                vec_type vsrc = _mm_uni_loadu_ps(src_data + offset);
+                                vsrc = _mm_uni_sub_ps(vsrc, vmean);
+                                _mm_uni_storeu_ps(dst_data + offset, _mm_uni_div_ps(vsrc, vvariance));
+                            }
                         }
                     }
 #else
-                    for (int c = 0; c < std::min(blk_size, C - cb * blk_size); c++) {
-                        float mean = 0;
-                        for (int h = 0; h < H; h++) {
-                            for (int w = 0; w < W; w++) {
-                                mean += src_data[src_off + h*W*blk_size + w*blk_size + c];
+                    for (size_t c = 0; c < min_cb; c++) {
+                        size_t cc = src_off + c;
+
+                        double mean = 0.0;
+                        for (size_t d = 0; d < D; d++) {
+                            size_t cd = cc + d * C1;
+                            for (size_t h = 0; h < H; h++) {
+                                size_t ch = cd + h * C0;
+                                for (size_t w = 0; w < W; w++) {
+                                    mean += src_data[ch + w * blk_size];
+                                }
                             }
                         }
 
-                        mean /= H * W;
-
-                        float variance = 0;
-                        for (int h = 0; h < H; h++) {
-                            for (int w = 0; w < W; w++) {
-                                float value = src_data[src_off + h*W*blk_size + w*blk_size + c] - mean;
-                                variance += std::pow(value, 2);
+                        mean /= static_cast<double>(C4);
+
+                        double variance = 0.0;
+                        for (size_t d = 0lu; d < D; d++) {
+                            size_t cd = cc + d * C1;
+                            for (size_t h = 0lu; h < H; h++) {
+                                size_t ch = cd + h * C0;
+                                for (size_t w = 0lu; w < W; w++) {
+                                    double value = static_cast<double>(src_data[ch + w * blk_size]) - mean;
+                                    variance += std::pow(value, 2);
+                                }
                             }
                         }
 
-                        variance /= H * W;
-                        variance = std::pow(variance, 0.5f);
+                        variance /= static_cast<double>(C4);
                         variance += eps;
+                        variance = std::pow(variance, 0.5f);
 
-                        for (int h = 0; h < H; h++) {
-                            for (int w = 0; w < W; w++) {
-                                dst_data[src_off + h*W*blk_size + w*blk_size + c] = (src_data[src_off + h*W*blk_size + w*blk_size + c] - mean) / variance;
+                        for (size_t d = 0lu; d < D; d++) {
+                            size_t cd = cc + d * C1;
+                            for (size_t h = 0lu; h < H; h++) {
+                                size_t ch = cd + h * C0;
+                                for (size_t w = 0lu; w < W; w++) {
+                                    size_t index = ch + w * blk_size;
+                                    dst_data[index] = (src_data[index] - mean) / variance;
+                                }
                             }
                         }
                     }
@@ -297,67 +392,92 @@ void MVNImpl::mvn_blk(const float* src_data, float* dst_data, int N, int C, int
             }
         }
     } else {
-        for (int b = 0; b < N; b++) {
+        for (size_t b = 0; b < N; b++) {
+            size_t ccb = b * C3;
             if (across_channels) {
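+                // Across-channels branch: one mean over every channel and spatial
+                // position of this batch item (assuming C5 == C * D * H * W) is
+                // computed and subtracted from each element.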
-                float mean = 0.0f;
-                mean = parallel_sum2d(CB, H, mean, [&](int cb, int h)->float {
-                    float mean_internal = 0;
-                    for (int w = 0; w < W; w++) {
-                        for (int c = 0; c < std::min(blk_size, C - cb * blk_size); c++) {
-                            size_t src_offset = b*CB*H*W*blk_size + cb*H*W*blk_size + h*W*blk_size + w*blk_size + c;
-
-                            mean_internal += src_data[src_offset];
+                double mean = 0.0;
+                mean = parallel_sum3d(CB, D, H, mean, [&](size_t cb, size_t d, size_t h)->double {
+                    size_t ccbd = ccb + cb * C2 + d * C1 + h * C0;
+                    size_t min_cb = std::min(blk_size, C - cb * blk_size);
+                    double mean_internal = 0.0;
+                    for (size_t w = 0lu; w < W; w++) {
+                        size_t cw = ccbd + w * blk_size;
+                        for (size_t c = 0lu; c < min_cb; c++) {
+                            mean_internal += src_data[cw + c];
                         }
                     }
                     return mean_internal;
                 });
 
-                mean /= C * H * W;
+                mean /= static_cast<double>(C5);
 
-                parallel_for2d(CB, H, [&](int cb, int h) {
-                    for (int w = 0; w < W; w++) {
-                        for (int c = 0; c < std::min(blk_size, C - cb * blk_size); c++) {
-                            size_t src_offset = b*CB*H*W*blk_size + cb*H*W*blk_size + h*W*blk_size + w*blk_size + c;
+                parallel_for3d(CB, D, H, [&](size_t cb, size_t d, size_t h) {
+                    size_t ccbd = ccb + cb * C2 + d * C1 + h * C0;
+                    size_t min_cb = std::min(blk_size, C - cb * blk_size);
+                    for (size_t w = 0lu; w < W; w++) {
+                        size_t cw = ccbd + w * blk_size;
+                        for (size_t c = 0lu; c < min_cb; c++) {
+                            size_t src_offset = cw + c;
 
                             dst_data[src_offset] = src_data[src_offset] - mean;
                         }
                     }
                 });
             } else {
-                parallel_for(CB, [&](int cb) {
-                    size_t src_off = b*CB*H*W*blk_size + cb*H*W*blk_size;
+                parallel_for(CB, [&](size_t cb) {
+                    size_t min_cb = std::min(blk_size, C - cb * blk_size);
+                    size_t src_off = ccb + cb * C2;
 #if defined(HAVE_AVX2) || defined(HAVE_AVX512F)
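+                    // SIMD path: each vector lane holds one channel of the block,
+                    // so the mean over all D * H * W positions is accumulated for
+                    // blk_size channels at once.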
                     vec_type vmean = _mm_uni_setzero_ps();
-                    for (int h = 0; h < H; h++) {
-                        for (int w = 0; w < W; w++) {
-                            vec_type vsrc = _mm_uni_loadu_ps(src_data + src_off + h*W*blk_size + w*blk_size);
-                            vmean = _mm_uni_add_ps(vmean, vsrc);
+                    for (size_t d = 0lu; d < D; d++) {
+                        size_t cd = src_off + d * C1;
+                        for (size_t h = 0lu; h < H; h++) {
+                            size_t ch = cd + h * C0;
+                            for (size_t w = 0lu; w < W; w++) {
+                                vec_type vsrc = _mm_uni_loadu_ps(src_data + ch + w * blk_size);
+                                vmean = _mm_uni_add_ps(vmean, vsrc);
+                            }
                         }
                     }
 
-                    vec_type vsize = _mm_uni_set1_ps(static_cast<float>(H * W));
+                    vec_type vsize = _mm_uni_set1_ps(static_cast<float>(D * H * W));
                     vmean = _mm_uni_div_ps(vmean, vsize);
 
-                    for (int h = 0; h < H; h++) {
-                        for (int w = 0; w < W; w++) {
-                            vec_type vsrc = _mm_uni_loadu_ps(src_data + src_off + h*W*blk_size + w*blk_size);
-                            _mm_uni_storeu_ps(dst_data + src_off + h*W*blk_size + w*blk_size, _mm_uni_sub_ps(vsrc, vmean));
+                    for (size_t d = 0lu; d < D; d++) {
+                        size_t cd = src_off + d * C1;
+                        for (size_t h = 0lu; h < H; h++) {
+                            size_t ch = cd + h * C0;
+                            for (size_t w = 0lu; w < W; w++) {
+                                size_t offset = ch + w * blk_size;
+                                vec_type vsrc = _mm_uni_loadu_ps(src_data + offset);
+                                _mm_uni_storeu_ps(dst_data + offset, _mm_uni_sub_ps(vsrc, vmean));
+                            }
                         }
                     }
 #else
-                    for (int c = 0; c < std::min(blk_size, C - cb * blk_size); c++) {
-                        float mean = 0;
-                        for (int h = 0; h < H; h++) {
-                            for (int w = 0; w < W; w++) {
-                                mean += src_data[src_off + h*W*blk_size + w*blk_size + c];
+                    for (size_t c = 0lu; c < min_cb; c++) {
+                        size_t cc = src_off + c;
+                        double mean = 0.0;
+                        for (size_t d = 0lu; d < D; d++) {
+                            size_t cd = cc + d * C1;
+                            for (size_t h = 0lu; h < H; h++) {
+                                size_t ch = cd + h * C0;
+                                for (size_t w = 0lu; w < W; w++) {
+                                    mean += src_data[ch + w * blk_size];
+                                }
                             }
                         }
 
-                        mean /= H * W;
+                        mean /= static_cast<double>(C4);
 
-                        for (int h = 0; h < H; h++) {
-                            for (int w = 0; w < W; w++) {
-                                dst_data[src_off + h*W*blk_size + w*blk_size + c] = src_data[src_off + h*W*blk_size + w*blk_size + c] - mean;
+                        for (size_t d = 0lu; d < D; d++) {
+                            size_t cd = cc + d * C1;
+                            for (size_t h = 0lu; h < H; h++) {
+                                size_t ch = cd + h * C0;
+                                for (size_t w = 0lu; w < W; w++) {
+                                    size_t index = ch + w * blk_size;
+                                    dst_data[index] = src_data[index] - mean;
+                                }
                             }
                         }
                     }
index 9849630..0c77e3e 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -26,6 +25,9 @@ public:
             if (layer->insData.size() != 1 || layer->outData.size() != 1)
                 THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
 
+            if (layer->insData[0].lock()->dims.size() < 2 || layer->insData[0].lock()->dims.size() > 4)
+                THROW_IE_EXCEPTION << "Normalize supports from 2D to 4D blobs!";
+
             weights = std::dynamic_pointer_cast<TBlob<float>>(layer->blobs.at("weights"));
             if (!weights)
                 THROW_IE_EXCEPTION << layer->name << " weights is empty!";
diff --git a/inference-engine/src/extension/ext_pad.cpp b/inference-engine/src/extension/ext_pad.cpp
new file mode 100644 (file)
index 0000000..102db13
--- /dev/null
@@ -0,0 +1,258 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ext_list.hpp"
+#include "ext_base.hpp"
+
+#include <cmath>
+#include <string>
+#include <vector>
+#include <cassert>
+#include "ie_parallel.hpp"
+
+namespace InferenceEngine {
+namespace Extensions {
+namespace Cpu {
+
+class PadImpl: public ExtLayerBase {
+public:
+    explicit PadImpl(const CNNLayer* layer) {
+        try {
+            if (layer->insData.empty() || layer->outData.empty())
+                THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
+
+            pads_begin = layer->GetParamAsUInts("pads_begin");
+            std::vector<unsigned int> pads_end = layer->GetParamAsUInts("pads_end");
+
+            src_dims = layer->insData[0].lock()->getTensorDesc().getDims();
+            dst_dims = layer->outData[0]->getTensorDesc().getDims();
+            if (src_dims.size() != dst_dims.size() || pads_begin.size() != src_dims.size())
+                THROW_IE_EXCEPTION << "Incorrect number of input/output dimensions!";
+
+            std::string pad_mode = layer->GetParamAsString("pad_mode");
+            if (pad_mode == "constant") {
+                padMode = CONSTANT;
+            } else if (pad_mode == "edge") {
+                padMode = EDGE;
+            } else if (pad_mode == "reflect") {
+                padMode = REFLECT;
+                for (size_t i = 0; i < src_dims.size(); i++) {
+                    if ((src_dims[i] - 1) < pads_begin[i] || (src_dims[i] - 1) < pads_end[i])
+                        THROW_IE_EXCEPTION << layer->name << " Incorrect pads_begin or pads_end for 'reflect' pad mode";
+                }
+            } else if (pad_mode == "symmetric") {
+                padMode = SYMMETRIC;
+                for (size_t i = 0; i < src_dims.size(); i++) {
+                    if (src_dims[i] < pads_begin[i] || src_dims[i] < pads_end[i])
+                        THROW_IE_EXCEPTION << layer->name << " Incorrect pads_begin or pads_end for 'symmetric' pad mode";
+                }
+            } else {
+                THROW_IE_EXCEPTION << layer->name
+                                   << " Incorrect pad_mode. Only constant|edge|reflect|symmetric modes are supported!";
+            }
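+
+            // Worked example of the four modes for a 1-D row [1 2 3] with
+            // pads_begin = pads_end = 2 (constant shown with pad_value = 0):
+            //   constant  -> 0 0 1 2 3 0 0
+            //   edge      -> 1 1 1 2 3 3 3
+            //   reflect   -> 3 2 1 2 3 2 1
+            //   symmetric -> 2 1 1 2 3 3 2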
+
+            if (padMode == CONSTANT)
+                pad_value = layer->GetParamAsFloat("pad_value", 0.f);
+
+            srcStrides = layer->insData[0].lock()->getTensorDesc().getBlockingDesc().getStrides();
+            dstStrides = layer->outData[0]->getTensorDesc().getBlockingDesc().getStrides();
+            work_amount = dst_dims[0] * dstStrides[0];
+            for (size_t i = 0; i < src_dims.size(); i++)
+                src_o_dms.push_back(src_dims[i] + pads_begin[i]);
+
+            addConfig(layer, { DataConfigurator(ConfLayout::PLN) }, { DataConfigurator(ConfLayout::PLN) });
+        } catch (InferenceEngine::details::InferenceEngineException &ex) {
+            errorMsg = ex.what();
+        }
+    }
+
+    StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept override {
+        const float *src_data = inputs[0]->cbuffer().as<const float *>() +
+            inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
+        float* dst_data = outputs[0]->cbuffer().as<float *>() +
+            outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
+
+        switch (padMode) {
+            case CONSTANT:
+                pad_constant(src_data, dst_data);
+                break;
+            case EDGE:
+                pad_edge(src_data, dst_data);
+                break;
+            case REFLECT:
+                pad_reflect(src_data, dst_data);
+                break;
+            case SYMMETRIC:
+                pad_symmetric(src_data, dst_data);
+                break;
+            default:
+                return GENERAL_ERROR;
+        }
+        return OK;
+    }
+
+private:
+    enum PadMode {
+        CONSTANT = 0,
+        EDGE = 1,
+        REFLECT = 2,
+        SYMMETRIC = 3
+    };
+
+    void pad_constant(const float *src_data, float* dst_data);
+    void pad_edge(const float *src_data, float* dst_data);
+    void pad_reflect(const float *src_data, float* dst_data);
+    void pad_symmetric(const float *src_data, float* dst_data);
+
+    PadMode padMode = CONSTANT;
+    float pad_value;
+    SizeVector src_dims;
+    SizeVector dst_dims;
+    std::vector<unsigned int> pads_begin;
+    SizeVector src_o_dms;
+    SizeVector srcStrides;
+    SizeVector dstStrides;
+    size_t work_amount;
+};
+
+
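+// Helper for the parallel loops below: decompose the flat work-item index
+// 'start' into per-dimension counters (row-major order); the return value is
+// the quotient left over after all dimensions are consumed.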
+inline size_t parallel_init(size_t start, size_t size, std::vector<size_t> &counters, std::vector<size_t> &dims) {
+    for (int j = static_cast<int>(size) - 1; j >= 0; j--) {
+        counters[j] = start % dims[j];
+        start = start / dims[j];
+    }
+    return start;
+}
+
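+// Advance the per-dimension counters by one position, carrying into the next
+// dimension whenever a counter wraps around to zero.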
+inline void parallel_step(size_t size, std::vector<size_t> &counters, std::vector<size_t> &dims) {
+    for (int j = static_cast<int>(size) - 1; j >= 0; j--) {
+        counters[j] = (counters[j] + 1) % dims[j];
+        if (counters[j] != 0)
+            return;
+    }
+}
+
+void PadImpl::pad_constant(const float *src_data, float* dst_data) {
+    int offset = 0;
+    for (size_t i = 0; i < srcStrides.size(); ++i)
+        offset += pads_begin[i] * srcStrides[i];
+
+    parallel_nt(0, [&](const int ithr, const int nthr) {
+        size_t start = 0, end = 0;
+        SizeVector counters(dst_dims.size(), 0);
+        splitter(work_amount, nthr, ithr, start, end);
+
+        parallel_init(start, dst_dims.size(), counters, dst_dims);
+        for (size_t iwork = start; iwork < end; ++iwork) {
+            int inside = 1;  // set to 0 when this output point is pure padding
+            int dstIdx = 0;
+            for (size_t i = 0; i < dstStrides.size(); ++i)
+                dstIdx += counters[i] * dstStrides[i];
+
+            for (size_t i = 0; i < counters.size(); ++i) {
+                if (counters[i] < pads_begin[i] || counters[i] >= src_o_dms[i]) {
+                    dst_data[dstIdx] = pad_value;
+                    inside = 0;
+                    break;
+                }
+            }
+            if (inside) {
+                int srcIdx = 0;
+                for (size_t i = 0; i < srcStrides.size(); ++i)
+                    srcIdx += counters[i] * srcStrides[i];
+                dst_data[dstIdx] = src_data[srcIdx - offset];
+            }
+            parallel_step(dst_dims.size(), counters, dst_dims);
+        }
+    });
+}
+
+void PadImpl::pad_edge(const float *src_data, float* dst_data) {
+    parallel_nt(0, [&](const int ithr, const int nthr) {
+        size_t start = 0, end = 0;
+        SizeVector counters(dst_dims.size(), 0);
+        splitter(work_amount, nthr, ithr, start, end);
+
+        parallel_init(start, dst_dims.size(), counters, dst_dims);
+        for (size_t iwork = start; iwork < end; ++iwork) {
+            int srcIdx = 0;
+            int dstIdx = 0;
+            for (size_t i = 0; i < dstStrides.size(); ++i)
+                dstIdx += counters[i] * dstStrides[i];
+
+            for (size_t i = 0; i < srcStrides.size(); ++i) {
+                int idx = (counters[i] < pads_begin[i]) ? 0 :
+                    ((counters[i] >= src_o_dms[i]) ? (src_dims[i] - 1) : (counters[i] - pads_begin[i]));
+                srcIdx += idx * srcStrides[i];
+            }
+
+            dst_data[dstIdx] = src_data[srcIdx];
+            parallel_step(dst_dims.size(), counters, dst_dims);
+        }
+    });
+}
+
+void PadImpl::pad_reflect(const float *src_data, float* dst_data) {
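+    // For counters past the source region the mirrored source index is
+    // src_2[i] - counters[i], with src_2[i] = src_dims[i] + src_o_dms[i] - 2,
+    // i.e. reflection that does not repeat the border element.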
+    SizeVector src_2;
+    for (size_t i = 0; i < src_dims.size(); i++)
+        src_2.push_back(src_dims[i] + src_o_dms[i] - 2);
+
+    parallel_nt(0, [&](const int ithr, const int nthr) {
+        size_t start = 0, end = 0;
+        SizeVector counters(dst_dims.size(), 0);
+        splitter(work_amount, nthr, ithr, start, end);
+
+        parallel_init(start, dst_dims.size(), counters, dst_dims);
+        for (size_t iwork = start; iwork < end; ++iwork) {
+            int srcIdx = 0;
+            int dstIdx = 0;
+            for (size_t i = 0; i < dstStrides.size(); ++i)
+                dstIdx += counters[i] * dstStrides[i];
+
+            for (size_t i = 0; i < srcStrides.size(); ++i) {
+                int idx = (counters[i] < pads_begin[i]) ? (pads_begin[i] - counters[i]) :
+                    ((counters[i] >= src_o_dms[i]) ? (src_2[i] - counters[i]) : (counters[i] - pads_begin[i]));
+                srcIdx += idx * srcStrides[i];
+            }
+
+            dst_data[dstIdx] = src_data[srcIdx];
+            parallel_step(dst_dims.size(), counters, dst_dims);
+        }
+    });
+}
+
+void PadImpl::pad_symmetric(const float *src_data, float* dst_data) {
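+    // Same scheme as pad_reflect, but with src_2[i] = src_dims[i] + src_o_dms[i] - 1,
+    // so the border element itself is repeated.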
+    SizeVector src_2;
+    for (size_t i = 0; i < src_dims.size(); i++)
+        src_2.push_back(src_dims[i] + src_o_dms[i] - 1);
+
+    parallel_nt(0, [&](const int ithr, const int nthr) {
+        size_t start = 0, end = 0;
+        SizeVector counters(dst_dims.size(), 0);
+        splitter(work_amount, nthr, ithr, start, end);
+
+        parallel_init(start, dst_dims.size(), counters, dst_dims);
+        for (size_t iwork = start; iwork < end; ++iwork) {
+            int srcIdx = 0;
+            int dstIdx = 0;
+            for (size_t i = 0; i < dstStrides.size(); ++i)
+                dstIdx += counters[i] * dstStrides[i];
+
+            for (size_t i = 0; i < srcStrides.size(); ++i) {
+                int idx = (counters[i] < pads_begin[i]) ? (pads_begin[i] - 1 - counters[i]) :
+                    ((counters[i] >= src_o_dms[i]) ? (src_2[i] - counters[i]) : (counters[i] - pads_begin[i]));
+                srcIdx += idx * srcStrides[i];
+            }
+
+            dst_data[dstIdx] = src_data[srcIdx];
+            parallel_step(dst_dims.size(), counters, dst_dims);
+        }
+    });
+}
+
+REG_FACTORY_FOR(ImplFactory<PadImpl>, Pad);
+
+}  // namespace Cpu
+}  // namespace Extensions
+}  // namespace InferenceEngine
index 8f5d5a5..f3666b2 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 0008256..8b948ef 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -20,6 +19,11 @@ public:
         try {
             if (layer->insData.size() != 2 || layer->outData.empty())
                 THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
+
+            if (layer->insData[0].lock()->dims.size() != 4 ||
+                    layer->insData[1].lock()->dims.size() != 4)
+                THROW_IE_EXCEPTION << "PriorBox supports only 4D blobs!";
+
             _offset = layer->GetParamAsFloat("offset");
             _step = layer->GetParamAsFloat("step", 0);
             _min_sizes = layer->GetParamAsFloats("min_size", {});
index f04643a..69807a9 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -19,6 +18,10 @@ public:
             if (layer->insData.size() != 2 || layer->outData.empty())
                 THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
 
+            if (layer->insData[0].lock()->dims.size() != 4 ||
+                    layer->insData[1].lock()->dims.size() != 4)
+                THROW_IE_EXCEPTION << "PriorBoxClustered supports only 4D blobs!";
+
             widths_ = layer->GetParamAsFloats("width", {});
             heights_ = layer->GetParamAsFloats("height", {});
             clip_ = layer->GetParamAsInt("clip");
index 9b05b46..2f93b05 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -328,6 +327,10 @@ public:
         try {
             if (layer->insData.size() != 3 || layer->outData.size() != 1)
                 THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
+
+            if (layer->insData[0].lock()->dims.size() != 4)
+                THROW_IE_EXCEPTION << "Proposal supports only 4D blobs!";
+
             feat_stride_ = static_cast<size_t>(layer->GetParamAsInt("feat_stride"));
             base_size_ = static_cast<size_t>(layer->GetParamAsInt("base_size"));
             min_size_ = static_cast<size_t>(layer->GetParamAsInt("min_size"));
index efdaacc..355a3e6 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 470f9ab..1cda662 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -8,6 +7,7 @@
 #include "defs.h"
 #include "softmax.h"
 #include <vector>
+#include "simple_copy.h"
 
 namespace InferenceEngine {
 namespace Extensions {
@@ -44,7 +44,7 @@ public:
         int IC = (inputs[0]->getTensorDesc().getDims().size() > 1) ? inputs[0]->getTensorDesc().getDims()[1] : 1;
         int B = (inputs[0]->getTensorDesc().getDims().size() > 0) ? inputs[0]->getTensorDesc().getDims()[0] : 1;
 
-        memcpy(dst_data, src_data, B * IC * IH * IW * sizeof(float));
+        simple_copy(dst_data, outputs[0]->byteSize(), src_data, (size_t)B * IC * IH * IW * sizeof(float));
 
         int end_index = 0;
         int num_ = 0;
index 8326a93..ebeecb7 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 0860ff6..531158f 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -14,6 +13,7 @@
 #include <cmath>
 #include <cassert>
 #include "ie_parallel.hpp"
+#include "simple_copy.h"
 
 namespace InferenceEngine {
 namespace Extensions {
@@ -31,6 +31,9 @@ public:
             if (layer->insData.size() != 1 || layer->outData.empty())
                 THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
 
+            if (layer->insData[0].lock()->dims.size() != 4)
+                THROW_IE_EXCEPTION << "Resample supports only 4D blobs!";
+
             type = layer->GetParamAsString("type");
             antialias = static_cast<bool>(layer->GetParamAsInt("antialias"));
 
@@ -65,7 +68,7 @@ public:
         size_t OW = outputs[0]->getTensorDesc().getDims()[3];
 
         if (IW == OW && IH == OH && type == "caffe.ResampleParameter.LINEAR") {
-            memcpy(dst_data, src_data, IN * IC * IH * IW * sizeof(float));
+            simple_copy(dst_data, outputs[0]->byteSize(), src_data, IN * IC * IH * IW * sizeof(float));
             return OK;
         }
 
index 8947946..72b004a 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -203,6 +202,9 @@ public:
             if (layer->insData.size() != 3 || layer->outData.size() != 1)
                 THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
 
+            if (layer->insData[0].lock()->dims.size() != 4)
+                THROW_IE_EXCEPTION << "SimplerNMS supports only 4D blobs!";
+
             min_box_size_ = layer->GetParamAsInt("min_bbox_size");
             feat_stride_ = layer->GetParamAsInt("feat_stride");
             pre_nms_topn_ = layer->GetParamAsInt("pre_nms_topn");
index 6c009b6..a63fb69 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -25,6 +24,9 @@ public:
             if (layer->insData.size() != 2 || layer->outData.empty())
                 THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
 
+            if (layer->insData[0].lock()->dims.size() != 4)
+                THROW_IE_EXCEPTION << "SpatialTransformer supports only 4D blobs!";
+
             addConfig(layer, {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)});
         } catch (InferenceEngine::details::InferenceEngineException &ex) {
             errorMsg = ex.what();
diff --git a/inference-engine/src/extension/simple_copy.cpp b/inference-engine/src/extension/simple_copy.cpp
new file mode 100644 (file)
index 0000000..22d6be0
--- /dev/null
@@ -0,0 +1,21 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <stdint.h>
+#include <string.h>
+#include "simple_copy.h"
+
+int simple_copy(void* dest, size_t destsz, void const* src, size_t count) {
+    size_t i;
+    if (!dest) return -1;
+    if (!src || count > destsz ||
+        count > (dest > src ? ((uintptr_t)dest - (uintptr_t)src)
+                            : ((uintptr_t)src - (uintptr_t)dest))) {
+        // zero out dest if an error is detected
+        memset(dest, 0, destsz);
+        return -1;
+    }
+
+    for (i = 0; i < count; ++i) (reinterpret_cast<uint8_t*>(dest))[i] = (reinterpret_cast<const uint8_t*>(src))[i];
+    return 0;
+}
diff --git a/inference-engine/src/extension/simple_copy.h b/inference-engine/src/extension/simple_copy.h
new file mode 100644 (file)
index 0000000..aaf7521
--- /dev/null
@@ -0,0 +1,24 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <stdlib.h>
+#include "ie_api.h"
+
+/**
+ * @brief Copies bytes between buffers with security enhancements
+ * Copies count bytes from src to dest. If the source and destination
+ * overlap, or if count exceeds destsz, dest is zeroed out and a non-zero
+ * value is returned.
+ * @param dest
+ * pointer to the object to copy to
+ * @param destsz
+ * max number of bytes to modify in the destination (typically the size
+ * of the destination object)
+ * @param src
+ * pointer to the object to copy from
+ * @param count
+ * number of bytes to copy
+ * @return zero on success and non-zero value on error.
+ */
+INFERENCE_ENGINE_API_CPP(int) simple_copy(void* dest, size_t destsz, void const* src, size_t count);
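+
+// Usage sketch (hypothetical buffers, for illustration only):
+//   float dst[16];
+//   float src[16] = {};
+//   if (simple_copy(dst, sizeof(dst), src, sizeof(src)) != 0) {
+//       // dst has been zeroed; handle the error
+//   }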
diff --git a/inference-engine/src/gna_plugin/CMakeLists.txt b/inference-engine/src/gna_plugin/CMakeLists.txt
new file mode 100644 (file)
index 0000000..f6a25b6
--- /dev/null
@@ -0,0 +1,60 @@
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set(TARGET_NAME "GNAPlugin")
+
+file(GLOB_RECURSE SOURCES
+        ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+        )
+
+file(GLOB_RECURSE HEADERS
+        ${CMAKE_CURRENT_SOURCE_DIR}/*.h
+        ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp
+        )
+
+add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN)
+
+find_package(libGNA)
+include_directories(${libGNA_INCLUDE_DIRS})
+
+include_directories(
+        ${CMAKE_SOURCE_DIR}/include
+        ${CMAKE_SOURCE_DIR}/src/inference_engine
+        ${CMAKE_CURRENT_SOURCE_DIR}
+        ${libGNA_INCLUDE_DIRS}
+)
+
+add_definitions(-D_NO_MKL_)
+add_library(${TARGET_NAME} SHARED ${SOURCES} ${HEADERS})
+
+if (LINUX)
+    find_package(Threads)
+endif ()
+
+set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME})
+
+# Saving rpath to the GNA shared library to be used by CI
+log_rpath_remove_top(GNA FALSE "/gna${libGNA_LIBRARY}" TRUE)
+
+target_link_libraries(${TARGET_NAME} inference_engine ${INTEL_ITT_LIBS} ${libGNA_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+
+
+set(TEST_SOURCES
+        "${CMAKE_CURRENT_SOURCE_DIR}/gna_plugin.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/gna_plugin_passes.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/quantization/quantization.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/dnn.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/gna_device.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/pwl_design.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/floatmath.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/dnn_memory.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/util.cpp"
+        "${CMAKE_CURRENT_SOURCE_DIR}/gna_model_serial.cpp")
+
+add_library(${TARGET_NAME}_test_static STATIC ${TEST_SOURCES} ${HEADERS})
+target_compile_definitions(${TARGET_NAME}_test_static
+        PUBLIC -DINTEGER_LOW_P
+               -DUSE_STATIC_IE)
+
+set_target_properties(${TARGET_NAME}_test_static PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}_test_static)
diff --git a/inference-engine/src/gna_plugin/dnn.cpp b/inference-engine/src/gna_plugin/dnn.cpp
new file mode 100644 (file)
index 0000000..8c94f72
--- /dev/null
@@ -0,0 +1,2528 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+extern bool global_debug;
+
+#include <cstdlib>
+#include <cstdio>
+#include <cmath>
+#include <set>
+#include <details/ie_exception.hpp>
+#include <algorithm>
+#include <gna-api-types-xnn.h>
+
+#ifndef _NO_MKL_
+#include <mkl_dnn.h>
+#endif
+#include "dnn.h"
+#ifdef INTEGER_REF
+#include "convnet.h"
+#include "igemv16.h"
+#include "igemv8.h"
+#include "sgemm.h"
+#else
+#include "floatmath.h"
+#endif
+#include "pwl.h"
+#include "util.h"
+#include "gna_plugin_log.hpp"
+
+#ifdef WIN32
+# define rand_r(X) rand()
+#endif
+
+/**
+ * whether to dump weights and biases
+ */
+#define DUMP_WB
+/**
+ * in light mode only layer names are dumped
+ */
+#define LIGHT_DUMP
+
+static int & getDumpFolderId() {
+    static int N = 0;
+    return N;
+}
+
+static std::string getDumpFolderNameGNA() {
+    return std::string("./gna_layers/")+std::to_string(getDumpFolderId() - 1)+"/";
+}
+
+static std::string getDumpFolderName() {
+    return std::string("./layers/")+std::to_string(getDumpFolderId() - 1)+"/";
+}
+
+static std::string getRefFolderName() {
+    return std::string("./ref_layers/")+std::to_string(getDumpFolderId() - 1)+"/";
+}
+
+void AmIntelDnn::BeginNewWrite() {
+    getDumpFolderId()++;
+}
+
+
+void AmIntelDnn::Init(void *ptr_memory,
+                      uint32_t num_memory_bytes,
+                      intel_dnn_number_type_t number_type,
+                      float scale_factor) {
+    ptr_dnn_memory_ = ptr_memory;
+    num_bytes_dnn_memory_ = num_memory_bytes;
+    number_type_ = number_type;
+    input_scale_factor_ = scale_factor;
+
+    ptr_active_outputs_ = nullptr;
+    num_active_outputs_ = 0;
+    num_left_context = 0;
+    num_right_context = 0;
+    do_rotate_input = false;
+    softmax_type = kSoftmaxNone;
+    ptr_sumgroup_sizes = nullptr;
+    num_sumgroup_sizes = 0;
+    ptr_priors = nullptr;
+
+
+    //  component.clear();
+}
+
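+// When no explicit active list is supplied, every output of the last component
+// is treated as active (rows or columns, depending on the component's output
+// orientation); with a list, the count is reset and presumably set elsewhere.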
+void AmIntelDnn::InitActiveList(uint32_t *ptr_active_list) {
+    ptr_active_outputs_ = ptr_active_list;
+    if (ptr_active_list == nullptr) {
+        if (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) {
+            num_active_outputs_ = component[component.size() - 1].num_rows_out;
+        } else {
+            num_active_outputs_ = component[component.size() - 1].num_columns_out;
+        }
+    } else {
+        num_active_outputs_ = 0;
+    }
+}
+
+void AmIntelDnn::AddComponents(uint32_t num_components_to_add) {
+    component.resize(component.size() + num_components_to_add);
+    for (uint32_t i = 0; i < num_components_to_add; i++) {
+        ClearComponent(component.size() - i - 1);
+    }
+}
+
+void AmIntelDnn::ClearComponent(uint32_t component_index) {
+    if (component_index >= component.size()) {  // >= avoids size_t underflow when the list is empty
+        fprintf(stderr, "Error:  attempt to clear non-existent component!\n");
+        throw -1;
+    }
+    component[component_index].num_rows_in = 0;
+    component[component_index].num_columns_in = 0;
+    component[component_index].num_rows_out = 0;
+    component[component_index].num_columns_out = 0;
+    component[component_index].num_bytes_per_input = 0;
+    component[component_index].num_bytes_per_output = 0;
+    component[component_index].operation = kDnnNullOp;
+    component[component_index].macro_operation = kDnnMacroOpNone;
+    component[component_index].orientation_in = kDnnUnknownOrientation;
+    component[component_index].orientation_out = kDnnUnknownOrientation;
+    component[component_index].ptr_inputs = nullptr;
+    component[component_index].ptr_outputs = nullptr;
+    memset(&component[component_index].op, 0, sizeof(component[component_index].op));
+}
+
+void AmIntelDnn::ClearState() {
+    // To support recurrent networks, provide mechanism to clear persistent state
+    // (e.g., between utterances for speech recognition).  For recurrent component,
+    // this means clearing the feedback buffer.  For other components, just clear the
+    // output buffer since any feedback will come from some component's output.
+    for (uint32_t i = 0; i < component.size(); i++) {
+        if (component[i].operation == kDnnRecurrentOp) {
+            memset(component[i].op.recurrent.ptr_feedbacks,
+                   0,
+                   component[i].op.recurrent.num_vector_delay * component[i].num_columns_out
+                       * component[i].num_bytes_per_input);
+        } else {
+            memset(component[i].ptr_outputs,
+                   0,
+                   component[i].num_bytes_per_output * component[i].num_rows_out * component[i].num_columns_out);
+        }
+    }
+}
+
+void AmIntelDnn::InitAffineComponentPrivate(intel_dnn_component_t &comp,
+                                            uint32_t num_rows_in,
+                                            uint32_t num_columns,
+                                            uint32_t num_rows_out,
+                                            uint32_t num_bytes_per_input,
+                                            uint32_t num_bytes_per_output,
+                                            uint32_t num_bytes_per_weight,
+                                            uint32_t num_bytes_per_bias,
+                                            float weight_scale_factor,
+                                            float output_scale_factor,
+                                            void *&ptr_inputs,
+                                            void *&ptr_outputs,
+                                            void *&ptr_weights,
+                                            void *&ptr_biases,
+                                            bool isDiag,
+                                            bool postInitMem) {
+    comp.num_rows_in = num_rows_in;
+    comp.num_columns_in = num_columns;
+    comp.num_rows_out = num_rows_out;
+    comp.num_columns_out = num_columns;
+    comp.num_bytes_per_input = num_bytes_per_input;
+    comp.num_bytes_per_output = num_bytes_per_output;
+    comp.operation = isDiag ? kDnnDiagonalOp : kDnnAffineOp;
+    comp.macro_operation = kDnnMacroOpNone;
+    comp.orientation_in = kDnnInterleavedOrientation;
+    comp.orientation_out = kDnnInterleavedOrientation;
+    comp.op.affine.num_bytes_per_weight = num_bytes_per_weight;
+    comp.op.affine.num_bytes_per_bias = num_bytes_per_bias;
+    comp.op.affine.weight_scale_factor = weight_scale_factor;
+    comp.output_scale_factor = output_scale_factor;
+    if (!postInitMem) {
+        comp.op.affine.ptr_weights = ptr_weights;
+        comp.op.affine.ptr_biases = ptr_biases;
+        comp.ptr_inputs = ptr_inputs;
+        comp.ptr_outputs = ptr_outputs;
+    } else {
+        ptr_weights = &comp.op.affine.ptr_weights;
+        ptr_biases = &comp.op.affine.ptr_biases;
+        ptr_inputs = &comp.ptr_inputs;
+        ptr_outputs = &comp.ptr_outputs;
+    }
+}
+
+void AmIntelDnn::InitDiagonalComponent(uint32_t component_index,
+                                       uint32_t num_rows_in,
+                                       uint32_t num_columns,
+                                       uint32_t num_rows_out,
+                                       uint32_t num_bytes_per_input,
+                                       uint32_t num_bytes_per_output,
+                                       uint32_t num_bytes_per_weight,
+                                       uint32_t num_bytes_per_bias,
+                                       float weight_scale_factor,
+                                       float output_scale_factor,
+                                       void *ptr_inputs,
+                                       void *ptr_outputs,
+                                       void *ptr_weights,
+                                       void *ptr_biases) {
+    component[component_index].num_rows_in = num_rows_in;
+    component[component_index].num_columns_in = num_columns;
+    component[component_index].num_rows_out = num_rows_out;
+    component[component_index].num_columns_out = num_columns;
+    component[component_index].num_bytes_per_input = num_bytes_per_input;
+    component[component_index].num_bytes_per_output = num_bytes_per_output;
+    component[component_index].operation = kDnnDiagonalOp;
+    component[component_index].macro_operation = kDnnMacroOpNone;
+    component[component_index].orientation_in = kDnnInterleavedOrientation;
+    component[component_index].orientation_out = kDnnInterleavedOrientation;
+    component[component_index].ptr_inputs = ptr_inputs;
+    component[component_index].ptr_outputs = ptr_outputs;
+    component[component_index].op.affine.num_bytes_per_weight = num_bytes_per_weight;
+    component[component_index].op.affine.num_bytes_per_bias = num_bytes_per_bias;
+    component[component_index].op.affine.weight_scale_factor = weight_scale_factor;
+    component[component_index].output_scale_factor = output_scale_factor;
+    component[component_index].op.affine.ptr_weights = ptr_weights;
+    component[component_index].op.affine.ptr_biases = ptr_biases;
+}
+
+void AmIntelDnn::InitConvolutional1DComponentPrivate(intel_dnn_component_t &comp,
+                                              uint32_t num_rows_in,
+                                              uint32_t num_columns_in,
+                                              uint32_t num_rows_out,
+                                              uint32_t num_columns_out,
+                                              uint32_t num_bytes_per_input,
+                                              uint32_t num_bytes_per_output,
+                                              uint32_t num_bytes_per_weight,
+                                              uint32_t num_bytes_per_bias,
+                                              uint32_t num_filters,
+                                              uint32_t num_filter_rows,
+                                              uint32_t num_filter_coefficients,
+                                              uint32_t num_feature_maps,
+                                              uint32_t num_feature_map_rows,
+                                              uint32_t num_feature_map_columns,
+                                              float weight_scale_factor,
+                                              float output_scale_factor,
+                                              void *&ptr_inputs,
+                                              void *&ptr_outputs,
+                                              void *&ptr_filters,
+                                              void *&ptr_biases,
+                                              bool postInitMem) {
+    comp.num_rows_in = num_rows_in;
+    comp.num_columns_in = num_columns_in;
+    comp.num_rows_out = num_rows_out;
+    comp.num_columns_out = num_columns_out;
+    comp.num_bytes_per_input = num_bytes_per_input;
+    comp.num_bytes_per_output = num_bytes_per_output;
+    comp.operation = kDnnConvolutional1dOp;
+    comp.macro_operation = kDnnMacroOpNone;
+    comp.orientation_in = kDnnNonInterleavedOrientation;
+    comp.orientation_out = kDnnNonInterleavedOrientation;
+    comp.ptr_inputs = ptr_inputs;
+    comp.ptr_outputs = ptr_outputs;
+    comp.op.conv1D.num_bytes_per_weight = num_bytes_per_weight;
+    comp.op.conv1D.num_bytes_per_bias = num_bytes_per_bias;
+    comp.op.conv1D.num_filters = num_filters;
+    comp.op.conv1D.num_filter_rows = num_filter_rows;
+    comp.op.conv1D.num_filter_coefficients = num_filter_coefficients;
+    comp.op.conv1D.num_feature_maps = num_feature_maps;
+    comp.op.conv1D.num_feature_map_rows = num_feature_map_rows;
+    comp.op.conv1D.num_feature_map_columns = num_feature_map_columns;
+    comp.op.conv1D.weight_scale_factor = weight_scale_factor;
+    comp.output_scale_factor = output_scale_factor;
+
+    if (!postInitMem) {
+        comp.op.conv1D.ptr_filters = ptr_filters;
+        comp.op.conv1D.ptr_biases  = ptr_biases;
+        comp.ptr_inputs = ptr_inputs;
+        comp.ptr_outputs = ptr_outputs;
+    } else {
+        ptr_filters = &comp.op.conv1D.ptr_filters;
+        ptr_biases  = &comp.op.conv1D.ptr_biases;
+        ptr_inputs  = &comp.ptr_inputs;
+        ptr_outputs = &comp.ptr_outputs;
+    }
+}
+
+void AmIntelDnn::InitMaxpoolComponentPrivate(intel_dnn_component_t &comp,
+                                      uint32_t num_rows_in,
+                                      uint32_t num_columns_in,
+                                      uint32_t num_rows_out,
+                                      uint32_t num_columns_out,
+                                      uint32_t num_bytes_per_input,
+                                      uint32_t num_bytes_per_output,
+                                      uint32_t num_pool_size,
+                                      uint32_t num_pool_step,
+                                      uint32_t num_pool_stride,
+                                      bool do_sum_not_max,
+                                      float output_scale_factor,
+                                      void *&ptr_inputs,
+                                      void *&ptr_outputs,
+                                      bool postInitMem) {
+    comp.num_rows_in = num_rows_in;
+    comp.num_columns_in = num_columns_in;
+    comp.num_rows_out = num_rows_out;
+    comp.num_columns_out = num_columns_out;
+    comp.num_bytes_per_input = num_bytes_per_input;
+    comp.num_bytes_per_output = num_bytes_per_output;
+    comp.operation = kDnnMaxPoolOp;
+    comp.macro_operation = kDnnMacroOpNone;
+    comp.orientation_in = kDnnNonInterleavedOrientation;
+    comp.orientation_out = kDnnNonInterleavedOrientation;
+    comp.op.maxpool.num_inputs = num_pool_size;
+    comp.op.maxpool.num_inputs_step = num_pool_step;
+    comp.op.maxpool.num_inputs_stride = num_pool_stride;
+    comp.op.maxpool.do_sum_not_max = do_sum_not_max;
+    comp.output_scale_factor = output_scale_factor;
+
+    if (!postInitMem) {
+        comp.ptr_inputs = ptr_inputs;
+        comp.ptr_outputs = ptr_outputs;
+    } else {
+        ptr_inputs  = &comp.ptr_inputs;
+        ptr_outputs = &comp.ptr_outputs;
+    }
+}
+
+void AmIntelDnn::InitCopyComponentPrivate(intel_dnn_component_t &comp,
+                                          intel_dnn_orientation_t orientation,
+                                          uint32_t num_rows_in,
+                                          uint32_t num_columns_in,
+                                          uint32_t num_rows_out,
+                                          uint32_t num_columns_out,
+                                          uint32_t num_bytes_per_input,
+                                          uint32_t num_bytes_per_output,
+                                          float output_scale_factor,
+                                          uint32_t num_copy_rows,
+                                          uint32_t num_copy_columns,
+                                          void *&ptr_inputs,
+                                          void *&ptr_outputs,
+                                          bool postInitMem) {
+    comp.num_rows_in = num_rows_in;
+    comp.num_columns_in = num_columns_in;
+    comp.num_rows_out = num_rows_out;
+    comp.num_columns_out = num_columns_out;
+    comp.num_bytes_per_input = num_bytes_per_input;
+    comp.num_bytes_per_output = num_bytes_per_output;
+    comp.operation = kDnnCopyOp;
+    comp.macro_operation = kDnnMacroOpNone;
+    comp.orientation_in = orientation;
+    comp.orientation_out = orientation;
+    comp.ptr_inputs = ptr_inputs;
+    comp.ptr_outputs = ptr_outputs;
+    comp.output_scale_factor = output_scale_factor;
+    comp.op.copy.num_copy_rows = num_copy_rows;
+    comp.op.copy.num_copy_columns = num_copy_columns;
+
+    if (!postInitMem) {
+        comp.ptr_inputs = ptr_inputs;
+        comp.ptr_outputs = ptr_outputs;
+    } else {
+        ptr_inputs  = &comp.ptr_inputs;
+        ptr_outputs = &comp.ptr_outputs;
+    }
+}
+
+void AmIntelDnn::InitPiecewiseLinearComponentPrivate(intel_dnn_component_t &comp,
+                                                     DnnActivation function_id,
+                                                     intel_dnn_orientation_t orientation,
+                                                     uint32_t num_rows,
+                                                     uint32_t num_columns,
+                                                     uint32_t num_bytes_per_input,
+                                                     uint32_t num_bytes_per_output,
+                                                     uint32_t num_segments,
+                                                     float output_scale_factor,
+                                                     void *&ptr_inputs,
+                                                     void *&ptr_outputs,
+                                                     intel_pwl_segment_t *ptr_segments,
+                                                     bool postInitMem) {
+    comp.num_rows_in = num_rows;
+    comp.num_columns_in = num_columns;
+    comp.num_rows_out = num_rows;
+    comp.num_columns_out = num_columns;
+    comp.num_bytes_per_input = num_bytes_per_input;
+    comp.num_bytes_per_output = num_bytes_per_output;
+    comp.operation = kDnnPiecewiselinearOp;
+    comp.macro_operation = kDnnMacroOpNone;
+    comp.orientation_in = orientation;
+    comp.orientation_out = orientation;
+    comp.op.pwl.func_id = function_id;
+    comp.op.pwl.num_segments = num_segments;
+    comp.output_scale_factor = output_scale_factor;
+
+    if (!postInitMem) {
+        comp.ptr_inputs = ptr_inputs;
+        comp.ptr_outputs = ptr_outputs;
+        comp.op.pwl.ptr_segments = ptr_segments;
+    } else {
+        ptr_inputs = &comp.ptr_inputs;
+        ptr_outputs = &comp.ptr_outputs;
+        if (ptr_segments != nullptr) {
+            *reinterpret_cast<intel_pwl_segment_t **>(ptr_segments) =
+                reinterpret_cast<intel_pwl_segment_t *>(& comp.op.pwl.ptr_segments);
+        }
+    }
+}
+
+void AmIntelDnn::InitRecurrentComponent(uint32_t component_index,
+                                        uint32_t num_rows,
+                                        uint32_t num_columns_in,
+                                        uint32_t num_columns_out,
+                                        uint32_t num_bytes_per_input,
+                                        uint32_t num_bytes_per_output,
+                                        uint32_t num_vector_delay,
+                                        uint32_t num_bytes_per_weight,
+                                        uint32_t num_bytes_per_bias,
+                                        float weight_scale_factor,
+                                        float output_scale_factor,
+                                        void *ptr_inputs,
+                                        void *ptr_feedbacks,
+                                        void *ptr_outputs,
+                                        void *ptr_weights,
+                                        void *ptr_biases) {
+    component[component_index].num_rows_in = num_rows;
+    component[component_index].num_columns_in = num_columns_in;
+    component[component_index].num_rows_out = num_rows;
+    component[component_index].num_columns_out = num_columns_out;
+    component[component_index].num_bytes_per_input = num_bytes_per_input;
+    component[component_index].num_bytes_per_output = num_bytes_per_output;
+    component[component_index].operation = kDnnRecurrentOp;
+    component[component_index].macro_operation = kDnnMacroOpNone;
+    component[component_index].orientation_in = kDnnNonInterleavedOrientation;
+    component[component_index].orientation_out = kDnnNonInterleavedOrientation;
+    component[component_index].ptr_inputs = ptr_inputs;
+    component[component_index].ptr_outputs = ptr_outputs;
+    component[component_index].op.recurrent.num_vector_delay = num_vector_delay;
+    component[component_index].op.recurrent.num_bytes_per_weight = num_bytes_per_weight;
+    component[component_index].op.recurrent.num_bytes_per_bias = num_bytes_per_bias;
+    component[component_index].op.recurrent.weight_scale_factor = weight_scale_factor;
+    component[component_index].output_scale_factor = output_scale_factor;
+    component[component_index].op.recurrent.ptr_feedbacks = ptr_feedbacks;
+    component[component_index].op.recurrent.ptr_weights = ptr_weights;
+    component[component_index].op.recurrent.ptr_biases = ptr_biases;
+}
+
+void AmIntelDnn::InitInterleaveComponent(uint32_t component_index, uint32_t num_rows, uint32_t num_columns,
+                                         uint32_t num_bytes_per_input, uint32_t num_bytes_per_output,
+                                         float output_scale_factor, void *ptr_inputs, void *ptr_outputs) {
+    component[component_index].num_rows_in = num_rows;
+    component[component_index].num_columns_in = num_columns;
+    component[component_index].num_rows_out = num_columns;
+    component[component_index].num_columns_out = num_rows;
+    component[component_index].num_bytes_per_input = num_bytes_per_input;
+    component[component_index].num_bytes_per_output = num_bytes_per_output;
+    component[component_index].operation = kDnnInterleaveOp;
+    component[component_index].macro_operation = kDnnMacroOpNone;
+    component[component_index].orientation_in = kDnnNonInterleavedOrientation;
+    component[component_index].orientation_out = kDnnInterleavedOrientation;
+    component[component_index].ptr_inputs = ptr_inputs;
+    component[component_index].ptr_outputs = ptr_outputs;
+    component[component_index].output_scale_factor = output_scale_factor;
+}
+
+void AmIntelDnn::InitDeinterleaveComponent(uint32_t component_index, uint32_t num_rows, uint32_t num_columns,
+                                           uint32_t num_bytes_per_input, uint32_t num_bytes_per_output,
+                                           float output_scale_factor, void *ptr_inputs, void *ptr_outputs) {
+    component[component_index].num_rows_in = num_rows;
+    component[component_index].num_columns_in = num_columns;
+    component[component_index].num_rows_out = num_columns;
+    component[component_index].num_columns_out = num_rows;
+    component[component_index].num_bytes_per_input = num_bytes_per_input;
+    component[component_index].num_bytes_per_output = num_bytes_per_output;
+    component[component_index].operation = kDnnDeinterleaveOp;
+    component[component_index].macro_operation = kDnnMacroOpNone;
+    component[component_index].orientation_in = kDnnInterleavedOrientation;
+    component[component_index].orientation_out = kDnnNonInterleavedOrientation;
+    component[component_index].ptr_inputs = ptr_inputs;
+    component[component_index].ptr_outputs = ptr_outputs;
+    component[component_index].output_scale_factor = output_scale_factor;
+}
+
+__inline void ApplyAffineTransform(intel_dnn_component_t *component, uint32_t *list, uint32_t listsize) {
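+    // Affine layer as a GEMM: the bias is first broadcast into C, then
+    // C(m x n) += A(m x k) * B(k x n), where m = output rows, n = batch
+    // columns and k = input rows; 'list' optionally restricts the output rows.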
+    auto transform = &component->op.affine;
+    int m = component->num_rows_out;
+    int n = component->num_columns_in;
+    int k = component->num_rows_in;
+    int lda = component->num_rows_in;
+    int ldb = component->num_columns_in;
+    int ldc = component->num_columns_out;
+
+    switch (component->num_bytes_per_input) {
+#ifdef INTEGER_REF
+        case 2:
+            if (component->op.affine.num_bytes_per_weight == 1) {
+                int8_t *A = reinterpret_cast<int8_t*>(transform->ptr_weights);
+                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
+                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
+                intel_compound_bias_t *bias = reinterpret_cast<intel_compound_bias_t*>(transform->ptr_biases);
+                if (list == nullptr) {
+                    //  PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor);
+                    //  PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor);
+                    //  PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor);
+                    igemm8_gna(m, n, k, A, lda, B, ldb, bias, C, ldc);
+                } else {
+                    //  PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor);
+                    //  PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor);
+                    //  PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor);
+                    igemm8_gna_subset(m, n, k, A, lda, B, ldb, bias, C, ldc, list, listsize);
+                }
+                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
+            } else if (component->op.affine.num_bytes_per_weight == 2) {
+                int16_t *A = reinterpret_cast<int16_t*>(transform->ptr_weights);
+                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
+                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
+                int32_t *bias = reinterpret_cast<int32_t*>(transform->ptr_biases);
+                if (list == nullptr) {
+                    for (uint32_t i = 0; i < m; i++) {
+                        for (uint32_t j = 0; j < n; j++) {
+                            C[i*ldc+j] = bias[i];
+                        }
+                    }
+                    //  PrintMatrixInt16("A int16", A, k, m, lda, component->op.affine.weight_scale_factor);
+                    //  PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.weight_scale_factor);
+                    //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
+                    cblas_igemm16(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc);
+                } else {
+                    for (int l = 0; l < listsize; l++) {
+                        int i = list[l];
+                        for (uint32_t j = 0; j < n; j++) {
+                            C[l*ldc+j] = bias[i];
+                        }
+                    }
+                    //  PrintMatrixInt16("A int16", A, k, m, lda, component->op.affine.scale_factor);
+                    //  PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.scale_factor);
+                    //  PrintMatrixInt32("C int32", C, m, n, ldc, component->op.affine.scale_factor * component->op.affine.scale_factor);
+                    cblas_igemm16_subset(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc, list, listsize);
+                }
+                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
+            } else {
+                fprintf(stderr, "Bad weight width in ApplyAffineTransform!\n");
+                throw -1;
+            }
+            break;
+#endif  // #ifdef INTEGER_REF
+        case 4: {
+            auto A = reinterpret_cast<float *>(transform->ptr_weights);
+            auto B = reinterpret_cast<float *>(component->ptr_inputs);
+            auto C = reinterpret_cast<float *>(component->ptr_outputs);
+            auto bias = reinterpret_cast<float *>(transform->ptr_biases);
+            if (list == nullptr) {
+                for (uint32_t i = 0; i < m; i++) {
+                    for (uint32_t j = 0; j < n; j++) {
+                        C[i * ldc + j] = bias[i];
+                    }
+                }
+                //  if (global_debug) PrintMatrixFloat32("A float", A, m, k, lda);
+                //  if (global_debug) PrintMatrixFloat32("B float", B, k, n, ldb);
+                //  if (global_debug) PrintMatrixFloat32("C float before", C, m, n, ldc);
+                cblas_sgemm1(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc);
+                //  if (global_debug) PrintMatrixFloat32("C float after", C, m, n, ldc);
+            } else {
+                for (int l = 0; l < listsize; l++) {
+                    int i = list[l];
+                    for (uint32_t j = 0; j < n; j++) {
+                        C[l * ldc + j] = bias[i];
+                    }
+                }
+                //  PrintMatrixFloat32("A float", A, k, m, lda);
+                //  PrintMatrixFloat32("trans(B) float", B, k, n, ldb);
+                //  PrintMatrixFloat32("C float before", C, listsize, n, ldc);
+                cblas_sgemm_subset(CblasRowMajor,
+                                   CblasNoTrans,
+                                   CblasNoTrans,
+                                   m,
+                                   n,
+                                   k,
+                                   1.0,
+                                   A,
+                                   lda,
+                                   B,
+                                   ldb,
+                                   1.0,
+                                   C,
+                                   ldc,
+                                   list,
+                                   listsize);
+                //  PrintMatrixFloat32("C float after", C, listsize, n, ldc);
+            }
+        }
+            break;
+        default:fprintf(stderr, "Bad data width in ApplyAffineTransform!\n");
+            throw -1;
+    }
+}
+
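+// ApplyDiagonalTransform: elementwise (diagonal) affine layer, C = diag(A) * B + bias.
+// The float path broadcasts the bias into C and then calls cblas_ssbmv with a
+// bandwidth of zero, so the symmetric band matrix degenerates to the pure
+// diagonal held in A; one call per input vector.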
+__inline void ApplyDiagonalTransform(intel_dnn_component_t *component) {
+    auto transform = &component->op.affine;
+    int m = component->num_rows_out;
+    int n = component->num_columns_in;
+    int ldb = component->num_columns_in;
+    int ldc = component->num_columns_out;
+
+    switch (component->num_bytes_per_input) {
+#ifdef INTEGER_REF
+        case 2:
+            if (component->op.affine.num_bytes_per_weight == 1) {
+                int8_t *A = reinterpret_cast<int8_t*>(transform->ptr_weights);
+                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
+                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
+                intel_compound_bias_t *bias = reinterpret_cast<intel_compound_bias_t*>(transform->ptr_biases);
+                int lda = 1;  // assumed stride for the single-row diagonal weight vector; lda was previously undeclared in this reference path
+                //  PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor);
+                //  PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor);
+                //  PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor);
+                isbmm8_gna(m, n, A, lda, B, ldb, bias, C, ldc);
+                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
+            } else if (component->op.affine.num_bytes_per_weight == 2) {
+                int16_t *A = reinterpret_cast<int16_t*>(transform->ptr_weights);
+                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
+                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
+                int32_t *bias = reinterpret_cast<int32_t*>(transform->ptr_biases);
+                int lda = 1;  // assumed stride for the single-row diagonal weight vector; lda was previously undeclared in this reference path
+                for (uint32_t i = 0; i < m; i++) {
+                    for (uint32_t j = 0; j < n; j++) {
+                        C[i*ldc+j] = bias[i];
+                    }
+                }
+                //  PrintMatrixInt16("A int16", A, 1, m, lda, component->op.affine.weight_scale_factor);
+                //  PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.weight_scale_factor);
+                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
+                cblas_isbmm16(m, n, A, lda, B, ldb, C, ldc);
+                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
+            } else {
+                fprintf(stderr, "Bad weight width in ApplyDiagonalTransform!\n");
+                throw -1;
+            }
+            break;
+#endif  // #ifdef INTEGER_REF
+        case 4: {
+            auto A = reinterpret_cast<float *>(transform->ptr_weights);
+            auto B = reinterpret_cast<float *>(component->ptr_inputs);
+            auto C = reinterpret_cast<float *>(component->ptr_outputs);
+            auto bias = reinterpret_cast<float *>(transform->ptr_biases);
+            for (uint32_t i = 0; i < m; i++) {
+                for (uint32_t j = 0; j < n; j++) {
+                    C[i * ldc + j] = bias[i];
+                }
+            }
+            //  PrintMatrixFloat32("A float", A, 1, m, lda);
+            //  PrintMatrixFloat32("B float", B, k, n, ldb);
+            //  PrintMatrixFloat32("C float before", C, m, n, ldc);
+            for (uint32_t j = 0; j < n; j++) {
+                float *Bcol = B + j * ldb;
+                float *Ccol = C + j * ldc;
+                cblas_ssbmv1(CblasRowMajor, CblasLower, m, 0, 1.0, A, 1, Bcol, 1, 1.0, Ccol, 1);
+            }
+            //  PrintMatrixFloat32("C float after", C, m, n, ldc);
+        }
+            break;
+        default:fprintf(stderr, "Bad data width in ApplyDiagonalTransform!\n");
+            throw -1;
+    }
+}
+
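+// ApplyRecurrentTransform: computes a single output row of a recurrent layer.
+// Each *_split kernel consumes two input segments, the current input row
+// (length k1 = num_columns_in) and the feedback vector from the previous step
+// (length k2 = num_columns_out), against the weights X and biases B.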
+__inline void ApplyRecurrentTransform(intel_dnn_component_t *component, uint32_t row, void *ptr_feedbacks) {
+    intel_recurrent_t *transform = &component->op.recurrent;
+    int k1 = component->num_columns_in;
+    int k2 = component->num_columns_out;
+    int n = k2;
+
+    if (component->op.recurrent.ptr_feedbacks == nullptr) {
+        fprintf(stderr, "nullptr feedback pointer in ApplyRecurrentTransform()!\n");
+        throw -1;
+    }
+
+    switch (component->num_bytes_per_input) {
+#ifdef INTEGER_REF
+        case 2:
+            if (component->op.recurrent.num_bytes_per_weight == 1) {
+                int16_t *A1 = reinterpret_cast<int16_t*>(component->ptr_inputs) + row * component->num_columns_in;
+                int16_t *A2 = reinterpret_cast<int16_t*>(ptr_feedbacks);
+                int8_t *X = reinterpret_cast<int8_t*>(transform->ptr_weights);
+                intel_compound_bias_t *B = reinterpret_cast<intel_compound_bias_t*>(transform->ptr_biases);
+                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs) + row * component->num_columns_out;
+                //  PrintMatrixInt16("A1 int", A1, 1, k1, k1, component->op.recurrent.weight_scale_factor);
+                //  PrintMatrixInt16("A2 int", A2, 1, k2, k2);
+                //  PrintMatrixInt8("X int", X, k, n, n, component->op.recurrent.weight_scale_factor);
+                //  PrintMatrixInt32("B int", B, 1, 2*n, 2*n, component->output_scale_factor);
+                igemv8_gna_split(n, k1, k2, A1, A2, X, B, C);
+                //  PrintMatrixInt32("C int", C, 1, n, n, component->output_scale_factor);
+            } else if (component->op.recurrent.num_bytes_per_weight == 2) {
+                int16_t *A1 = reinterpret_cast<int16_t*>(component->ptr_inputs) + row * component->num_columns_in;
+                int16_t *A2 = reinterpret_cast<int16_t*>(ptr_feedbacks);
+                int16_t *X = reinterpret_cast<int16_t*>(transform->ptr_weights);
+                int32_t *B = reinterpret_cast<int32_t*>(transform->ptr_biases);
+                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs) + row * component->num_columns_out;
+                //  PrintMatrixInt16("A1 int", A1, 1, k1, k1, component->op.recurrent.weight_scale_factor);
+                //  PrintMatrixInt16("A2 int", A2, 1, k2, k2, component->op.recurrent.weight_scale_factor);
+                //  PrintMatrixInt16("X int", X, k, n, n, component->op.recurrent.weight_scale_factor);
+                //  PrintMatrixInt32("B int", B, 1, n, n, component->output_scale_factor);
+                igemv16_split(n, k1, k2, A1, A2, X, B, C);
+                //  PrintMatrixInt32("C int", C, 1, n, n, component->output_scale_factor);
+            } else {
+                fprintf(stderr, "Weight width not supported in ApplyRecurrentTransform!\n");
+                throw -1;
+            }
+            break;
+#endif  // #ifdef INTEGER_REF
+        case 4: {
+            auto A1 = reinterpret_cast<float *>(component->ptr_inputs) + row * component->num_columns_in;
+            auto A2 = reinterpret_cast<float *>(ptr_feedbacks);
+            auto X = reinterpret_cast<float *>(transform->ptr_weights);
+            auto B = reinterpret_cast<float *>(transform->ptr_biases);
+            auto C = reinterpret_cast<float *>(component->ptr_outputs) + row * component->num_columns_out;
+            //  PrintMatrixFloat32("A1 float", A1, 1, k1, k1);
+            //  PrintMatrixFloat32("A2 float", A2, 1, k2, k2);
+            //  PrintMatrixFloat32("X float", X, k, n, n);
+            //  PrintMatrixFloat32("B float", B, 1, n, n);
+            sgemv_split(n, k1, k2, A1, A2, X, B, C);
+            //  PrintMatrixFloat32("C float", C, 1, n, n);
+        }
+            break;
+        default:fprintf(stderr, "Bad data width in ApplyRecurrentTransform!\n");
+            throw -1;
+    }
+}
+
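+// ApplyConvolutional1DTransform: dispatches on the input width to the 1D
+// convolution kernels (CNNFilter16 in the INTEGER_REF reference build,
+// CNNFilter32 for the 32-bit float path).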
+__inline void ApplyConvolutional1DTransform(intel_dnn_component_t *component) {
+    switch (component->num_bytes_per_input) {
+#ifdef INTEGER_REF
+        case 2:
+            CNNFilter16(component);
+            break;
+#endif  // #ifdef INTEGER_REF
+        case 4:
+            //  PrintMatrixFloat32("Input float", reinterpret_cast<float*>(component->ptr_inputs),
+            //  component->num_rows_in, component->num_columns_in, component->num_columns_in);
+            //  PrintMatrixFloat32("Filt float", reinterpret_cast<float*>(component->op.conv1D.ptr_filters),
+            //  component->op.conv1D.num_filters,
+            //  component->op.conv1D.num_filter_rows*component->op.conv1D.num_feature_map_columns*component->op.conv1D.num_feature_maps,
+            //  component->op.conv1D.num_filter_rows*component->op.conv1D.num_feature_map_columns*component->op.conv1D.num_feature_maps);
+            //  PrintMatrixFloat32("Bias float", reinterpret_cast<float*>(component->op.conv1D.ptr_biases), 1,
+            // component->op.conv1D.num_filters, component->op.conv1D.num_filters);
+            CNNFilter32(component);
+            //  PrintMatrixFloat32("Output float", reinterpret_cast<float*>(component->ptr_outputs, component->num_rows_out,
+            // component->num_columns_out, component->num_columns_out);
+            break;
+        default:fprintf(stderr, "Bad data width in ApplyConvolutionalTransform!\n");
+            throw -1;
+    }
+}
+
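+// ApplyPiecewiseLinearTransform: applies the component's PWL activation to up
+// to `listsize` outputs. Only the float kernel (PwlApply32) is compiled in
+// unless INTEGER_REF enables the 16-bit reference kernel.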
+__inline void ApplyPiecewiseLinearTransform(intel_dnn_component_t *component,
+                                            intel_dnn_number_type_t number_type,
+                                            uint32_t listsize) {
+    if (number_type == kDnnFloat) {
+        // PrintMatrixFloat32("PWL Input float", reinterpret_cast<float*>(component->ptr_inputs), component->num_rows_in,
+        // component->num_columns_in, component->num_columns_in);
+        PwlApply32(component, listsize);
+        // PrintMatrixFloat32("PWL Output float", reinterpret_cast<float*>(component->ptr_outputs), component->num_rows_out,
+        // component->num_columns_out, component->num_columns_out);
+#ifdef INTEGER_REF
+    } else if (component->num_bytes_per_output == 2) {
+        PwlApply16(component, listsize);
+#endif  // #ifdef INTEGER_REF
+    } else {
+        fprintf(stderr, "Bad data width in ApplyPiecewiseLinearTransform!\n");
+        throw -1;
+    }
+}
+
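+// Row-restricted overload used by the recurrent path in Propagate: applies the
+// activation to row `num_row` only, over columns [0, listsize - 1] (the
+// PwlApply32 arguments are assumed to be row_begin, row_end, col_begin, col_end).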
+__inline void ApplyPiecewiseLinearTransform(intel_dnn_component_t *component,
+                                            intel_dnn_number_type_t number_type,
+                                            uint32_t listsize,
+                                            uint32_t num_row) {
+    if (number_type == kDnnFloat) {
+        PwlApply32(component, num_row, num_row, 0, listsize - 1);
+#ifdef INTEGER_REF
+    } else if (component->num_bytes_per_output == 2) {
+        PwlApply16(component, num_row, num_row, 0, listsize - 1);
+#endif  // #ifdef INTEGER_REF
+    } else {
+        fprintf(stderr, "Bad data width in ApplyPiecewiseLinearTransform!\n");
+        throw -1;
+    }
+}
+
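+// ApplyMaxPoolTransform: max pooling is implemented for 4-byte inputs only;
+// the number type is forwarded to CNNMaxPool.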
+__inline void ApplyMaxPoolTransform(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) {
+    if (component->num_bytes_per_input == 4) {
+        // PrintMatrixFloat32("Input float", reinterpret_cast<float*>(component->ptr_inputs), component->num_rows_in,
+        // component->num_columns_in, component->num_columns_in);
+        CNNMaxPool(component, number_type);
+        // PrintMatrixFloat32("Output float", reinterpret_cast<float*>(component->ptr_outputs), component->num_rows_out,
+        // component->num_columns_out, component->num_columns_out);
+    } else {
+        fprintf(stderr, "Bad data width in ApplyMaxPoolTransform!\n");
+        throw -1;
+    }
+}
+
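+// ApplyTranspose: both the interleave and deinterleave operations are realized
+// as a plain matrix transpose of the component input.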
+__inline void ApplyTranspose(intel_dnn_component_t *component) {
+    int m = component->num_rows_in;
+    int n = component->num_columns_in;
+    int lda = component->num_columns_in;
+    int ldb = component->num_columns_out;
+    // B = Transpose(A) where A is mxn and B is nxm
+    switch (component->num_bytes_per_input) {
+#ifdef INTEGER_REF
+        case 1:
+            {
+                int8_t *A = reinterpret_cast<int8_t*>(component->ptr_inputs);
+                int8_t *B = reinterpret_cast<int8_t*>(component->ptr_outputs);
+                for (uint32_t row = 0; row < m; row++) {
+                    for (uint32_t col = 0; col < n; col++) {
+                        B[col*ldb+row] = A[row*lda+col];
+                    }
+                }
+            }
+            break;
+        case 2:
+            {
+                int16_t *A = reinterpret_cast<int16_t*>(component->ptr_inputs);
+                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_outputs);
+                for (uint32_t row = 0; row < m; row++) {
+                    for (uint32_t col = 0; col < n; col++) {
+                        B[col*ldb+row] = A[row*lda+col];
+                    }
+                }
+            }
+            break;
+#endif  // #ifdef INTEGER_REF
+        case 4: {
+            auto A = reinterpret_cast<float *>(component->ptr_inputs);
+            auto B = reinterpret_cast<float *>(component->ptr_outputs);
+            for (uint32_t row = 0; row < m; row++) {
+                for (uint32_t col = 0; col < n; col++) {
+                    B[col * ldb + row] = A[row * lda + col];
+                }
+            }
+        }
+            break;
+        default:fprintf(stderr, "Bad data width in ApplyInterleave!\n");
+            throw -1;
+    }
+}
+
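+// ApplyCopy: copies an m x n block (num_copy_rows x num_copy_columns) from the
+// input matrix to the output matrix, honoring their respective leading
+// dimensions (num_columns_in / num_columns_out).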
+__inline void ApplyCopy(intel_dnn_component_t *component) {
+    auto src = reinterpret_cast<uint8_t *>(component->ptr_inputs);
+    auto dst = reinterpret_cast<uint8_t *>(component->ptr_outputs);
+    int32_t m = component->op.copy.num_copy_rows;
+    int32_t n = component->op.copy.num_copy_columns;
+    int32_t lda = component->num_columns_in;
+    int32_t ldb = component->num_columns_out;
+    if (m > component->num_rows_in) {
+        fprintf(stderr, "Error:  attempt to copy more columns than matrix has!\n");
+        throw -1;
+    } else {
+        switch (component->num_bytes_per_input) {
+#ifdef INTEGER_REF
+            case 2:
+                {
+                    int16_t *A = reinterpret_cast<int16_t*>(src);
+                    int16_t *B = reinterpret_cast<int16_t*>(dst);
+                    for (uint32_t row = 0; row < m; row++) {
+                        for (uint32_t col = 0; col < n; col++) {
+                            B[row*ldb + col] = A[row*lda + col];
+                        }
+                    }
+                }
+                break;
+#endif  // #ifdef INTEGER_REF
+            case 4: {
+                auto A = reinterpret_cast<float *>(src);
+                auto B = reinterpret_cast<float *>(dst);
+                for (uint32_t row = 0; row < m; row++) {
+                    for (uint32_t col = 0; col < n; col++) {
+                        B[row * ldb + col] = A[row * lda + col];
+                    }
+                }
+            }
+                break;
+            default:fprintf(stderr, "Bad data width in ApplyCopy!\n");
+                throw -1;
+        }
+    }
+}
+
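+// CopyActiveList: records which outputs of the final component are active for
+// scoring. An empty list means all outputs stay active; otherwise the selected
+// indices are copied into ptr_active_outputs_ and num_active_outputs_ is
+// updated accordingly.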
+uint32_t AmIntelDnn::CopyActiveList(std::vector<std::vector<uint32_t> > &active_list, uint32_t list_index) {
+    if (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) {
+        num_active_outputs_ = component[component.size() - 1].num_rows_out;
+    } else {
+        num_active_outputs_ = component[component.size() - 1].num_columns_out;
+    }
+
+    if (!active_list.empty()) {
+        if (list_index >= active_list.size()) {
+            fprintf(stderr, "Index %d beyond end of active list in CopyActiveList()\n", list_index);
+            throw -1;
+        }
+        if (active_list[list_index].size() > component[component.size() - 1].num_rows_out) {
+            fprintf(stderr, "Active list too large in CopyActiveList()\n");
+            throw -1;
+        }
+
+        if (ptr_active_outputs_ != nullptr) {
+            num_active_outputs_ = active_list[list_index].size();
+            memcpy(ptr_active_outputs_, active_list[list_index].data(), num_active_outputs_ * sizeof(uint32_t));
+        }
+    }
+
+    return (num_active_outputs_);
+}
+
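+// Propagate: runs one forward pass over all components in order. The active
+// output list (if any) applies to the final component, or to a final
+// affine + PWL pair; recurrent components are fused with the PWL component
+// that must immediately follow them and are processed one input row at a time.
+// A minimal usage sketch (setup calls are elided and names other than
+// CopyActiveList/Propagate are illustrative):
+//   AmIntelDnn dnn;
+//   ... build components and allocate dnn memory ...
+//   dnn.CopyActiveList(active_lists, list_index);
+//   dnn.Propagate();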
+void AmIntelDnn::Propagate() {
+    for (uint32_t i = 0; i < component.size(); i++) {
+        intel_dnn_component_t *comp = &component[i];
+        uint32_t *ptr_active_outputs = nullptr;
+        uint32_t num_active_outputs = (comp->orientation_out == kDnnInterleavedOrientation)
+                                      ? comp->num_rows_out : comp->num_columns_out;
+
+        if (i == component.size() - 1) {  // active list applies to last component
+            ptr_active_outputs = ptr_active_outputs_;
+            num_active_outputs = num_active_outputs_;
+        } else if (i == component.size() - 2) {  // also applies to last two components when last is PWL
+            if ((component[i].operation == kDnnAffineOp) && (component[i + 1].operation == kDnnPiecewiselinearOp)) {
+                ptr_active_outputs = ptr_active_outputs_;
+                num_active_outputs = num_active_outputs_;
+            }
+        }
+
+        switch (comp->operation) {
+            case kDnnAffineOp:
+                ApplyAffineTransform(comp, ptr_active_outputs, num_active_outputs);
+                break;
+            case kDnnDiagonalOp:
+                ApplyDiagonalTransform(comp);
+                break;
+            case kDnnRecurrentOp:
+                if ((i < component.size() - 1) && (component[i + 1].operation == kDnnPiecewiselinearOp)) {
+                    intel_dnn_component_t *comp_pwl = &component[i + 1];
+                    for (uint32_t j = 0; j < comp->num_rows_in; j++) {
+                        void *ptr_feedbacks =
+                            reinterpret_cast<void *>(reinterpret_cast<int32_t *>(comp->op.recurrent.ptr_feedbacks) + j * comp_pwl->num_columns_out);
+                        ApplyRecurrentTransform(comp, j, ptr_feedbacks);
+                        //  PrintOutputs(i);
+                        ApplyPiecewiseLinearTransform(comp_pwl, number_type_, num_active_outputs, j);
+                    }
+                    i++;  // skip next component
+                } else {
+                    fprintf(stderr, "Missing PiecewiseLinear component after Recurrent component in Propagate!\n");
+                    throw -1;
+                }
+                break;
+            case kDnnConvolutional1dOp:
+                ApplyConvolutional1DTransform(comp);
+                break;
+            case kDnnPiecewiselinearOp:
+                ApplyPiecewiseLinearTransform(comp, number_type_, num_active_outputs);
+                break;
+            case kDnnMaxPoolOp:
+                ApplyMaxPoolTransform(comp, number_type_);
+                break;
+            case kDnnInterleaveOp:
+            case kDnnDeinterleaveOp:
+                ApplyTranspose(comp);
+                break;
+            case kDnnCopyOp:
+                ApplyCopy(comp);
+                break;
+            default:
+                fprintf(stderr, "Bad operation in Propagate!\n");
+                throw -1;
+        }
+        //  PrintOutputs(i); fflush(stdout);
+    }
+}
+
+intel_dnn_macro_operation_t AmIntelDnn::MacroOperation(uint32_t component_index) {
+    return (component[component_index].macro_operation);
+}
+
+void AmIntelDnn::SetMacroOperation(uint32_t component_index, intel_dnn_macro_operation_t macro_operation) {
+    component[component_index].macro_operation = macro_operation;
+}
+
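+// InputScaleFactor: the scale factor applied to a component's input is the
+// output scale factor of its predecessor; the first component uses the global
+// input scale factor.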
+float AmIntelDnn::InputScaleFactor(uint32_t component_index) {
+    float scale_factor = 1.0;
+
+    if (component_index == 0) {
+        scale_factor = input_scale_factor_;
+    } else {
+        switch (component[component_index - 1].operation) {
+            case kDnnAffineOp:
+            case kDnnDiagonalOp:
+            case kDnnConvolutional1dOp:
+            case kDnnRecurrentOp:
+            case kDnnInterleaveOp:
+            case kDnnDeinterleaveOp:
+            case kDnnCopyOp:
+                scale_factor = component[component_index - 1].output_scale_factor;
+                break;
+            default:
+                break;  // other operations keep the default scale factor of 1.0
+        }
+    }
+
+    return (scale_factor);
+}
+
+float AmIntelDnn::WeightScaleFactor(uint32_t component_index) {
+    float scale_factor = 1.0;
+
+    if (component[component_index].operation == kDnnAffineOp) {
+        scale_factor = component[component_index].op.affine.weight_scale_factor;
+    } else if (component[component_index].operation == kDnnDiagonalOp) {
+        scale_factor = component[component_index].op.affine.weight_scale_factor;
+    } else if (component[component_index].operation == kDnnConvolutional1dOp) {
+        scale_factor = component[component_index].op.conv1D.weight_scale_factor;
+    } else if (component[component_index].operation == kDnnRecurrentOp) {
+        scale_factor = component[component_index].op.recurrent.weight_scale_factor;
+    }
+
+    return (scale_factor);
+}
+
+float AmIntelDnn::OutputScaleFactor(intel_dnn_component_t &comp) {
+    return comp.output_scale_factor;
+}
+
+void AmIntelDnn::SetOutputScaleFactor(uint32_t component_index, float scale_factor) {
+    component[component_index].output_scale_factor = scale_factor;
+}
+
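+// PrintOutputs: debug helper that prints every output element of the given
+// component, rescaled back to float by the component's output scale factor.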
+void AmIntelDnn::PrintOutputs(uint32_t component_index) {
+    float scale_factor = OutputScaleFactor(component_index);
+    uint32_t num_rows = component[component_index].num_rows_out;
+    uint32_t num_columns = component[component_index].num_columns_out;
+
+    printf("component %d : %s\n", component_index, intel_dnn_operation_name[component[component_index].operation]);
+    if (number_type_ == kDnnFloat) {
+        auto ptr_output = reinterpret_cast<float *>(component[component_index].ptr_outputs);
+        for (int i = 0; i < num_rows; i++) {
+            for (int j = 0; j < num_columns; j++) {
+                printf("%d %d : %e\n", i, j, ptr_output[i * num_columns + j] / scale_factor);
+            }
+        }
+    } else {
+        switch (component[component_index].num_bytes_per_output) {
+            case 1: {
+                auto ptr_output = reinterpret_cast<int8_t *>(component[component_index].ptr_outputs);
+                for (int i = 0; i < num_rows; i++) {
+                    for (int j = 0; j < num_columns; j++) {
+                        printf("%d %d : %e\n", i, j, static_cast<float>(ptr_output[i * num_columns + j]) / scale_factor);
+                    }
+                }
+            }
+                break;
+            case 2: {
+                auto ptr_output = reinterpret_cast<int16_t *>(component[component_index].ptr_outputs);
+                for (int i = 0; i < num_rows; i++) {
+                    for (int j = 0; j < num_columns; j++) {
+                        printf("%d %d : %e\n", i, j, static_cast<float>(ptr_output[i * num_columns + j]) / scale_factor);
+                    }
+                }
+            }
+                break;
+            case 4: {
+                auto ptr_output = reinterpret_cast<int32_t *>(component[component_index].ptr_outputs);
+                for (int i = 0; i < num_rows; i++) {
+                    for (int j = 0; j < num_columns; j++) {
+                        printf("%d %d : %e\n", i, j, static_cast<float>(ptr_output[i * num_columns + j]) / scale_factor);
+                    }
+                }
+            }
+                break;
+            default:
+                fprintf(stderr,
+                        "Bad num_bytes_per_output in component %u in AmIntelDnn::PrintOutputs()\n",
+                        component_index);
+                throw -1;
+        }
+    }
+}
+
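+// CompareScores: compares the final component's outputs against a float
+// reference score array, accumulating absolute and relative error statistics
+// in score_error and counting the scores whose absolute error exceeds the
+// configured threshold.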
+uint32_t AmIntelDnn::CompareScores(void *ptr_refscorearray, intel_score_error_t *score_error, uint32_t num_frames) {
+    intel_dnn_component_t *ptr_component = &component[component.size() - 1];
+    intel_dnn_orientation_t orientation = ptr_component->orientation_out;
+    float scale_factor = OutputScaleFactor(component.size() - 1);
+    uint32_t num_errors = 0;
+    uint32_t num_rows = (orientation == kDnnInterleavedOrientation) ? ptr_component->num_rows_out : num_frames;
+    uint32_t num_columns = (orientation == kDnnInterleavedOrientation) ? num_frames : ptr_component->num_columns_out;
+    uint32_t num_row_step_ref =
+        (orientation == kDnnInterleavedOrientation) ? ptr_component->num_rows_out : ptr_component->num_columns_out;
+    uint32_t num_row_step = ptr_component->num_columns_out;
+
+    if (ptr_component->operation == kDnnAffineOp) {
+        num_rows = num_active_outputs_;
+    }
+
+    ClearScoreError(score_error);
+
+    if (number_type_ == kDnnFloat) {
+        auto A = reinterpret_cast<float *>(ptr_component->ptr_outputs);
+        auto B = reinterpret_cast<float *>(ptr_refscorearray);
+        for (int i = 0; i < num_rows; i++) {
+            for (int j = 0; j < num_columns; j++) {
+                float score = A[i * num_row_step + j];
+                float refscore = (orientation == kDnnInterleavedOrientation)
+                                 ? B[j * num_row_step_ref + i]
+                                 : B[i * num_row_step_ref + j];
+                float scaled_score = score / scale_factor;
+                float error = fabs(refscore - scaled_score);
+                float rel_error = error / (fabs(refscore) + 1e-20);
+                float squared_error = error * error;
+                float squared_rel_error = rel_error * rel_error;
+                score_error->num_scores++;
+                score_error->sum_error += error;
+                score_error->sum_squared_error += squared_error;
+                if (error > score_error->max_error) {
+                    score_error->max_error = error;
+                }
+                score_error->sum_rel_error += rel_error;
+                score_error->sum_squared_rel_error += squared_rel_error;
+                if (rel_error > score_error->max_rel_error) {
+                    score_error->max_rel_error = rel_error;
+                }
+                if (error > score_error->threshold) {
+                    num_errors++;
+                }
+            }
+        }
+    } else if (number_type_ == kDnnInt) {
+        auto B = reinterpret_cast<float *>(ptr_refscorearray);
+        for (int i = 0; i < num_rows; i++) {
+            for (int j = 0; j < num_columns; j++) {
+                float score;
+                if (ptr_component->num_bytes_per_output == 4) {
+                    auto A = reinterpret_cast<int32_t *>(ptr_component->ptr_outputs);
+                    score = static_cast<float>(A[i * num_row_step + j]);
+                } else if (ptr_component->num_bytes_per_output == 2) {
+                    auto A = reinterpret_cast<int16_t *>(ptr_component->ptr_outputs);
+                    score = static_cast<float>(A[i * num_row_step + j]);
+                } else {
+                    fprintf(stderr,
+                            "Unsupported output width (%d) in AmIntelDnn::CompareScores()!\n",
+                            ptr_component->num_bytes_per_output);
+                    throw -1;
+                }
+                float refscore = (orientation == kDnnInterleavedOrientation)
+                                 ? B[j * num_row_step_ref + i]
+                                 : B[i * num_row_step_ref + j];
+                float scaled_score = score / scale_factor;
+                float error = fabs(refscore - scaled_score);
+                float rel_error = error / (fabs(refscore) + 1e-20);
+                float squared_error = error * error;
+                float squared_rel_error = rel_error * rel_error;
+                score_error->num_scores++;
+                score_error->sum_error += error;
+                score_error->sum_squared_error += squared_error;
+                if (error > score_error->max_error) {
+                    score_error->max_error = error;
+                }
+                score_error->sum_rel_error += rel_error;
+                score_error->sum_squared_rel_error += squared_rel_error;
+                if (rel_error > score_error->max_rel_error) {
+                    score_error->max_rel_error = rel_error;
+                }
+                if (error > score_error->threshold) {
+                    num_errors++;
+                }
+            }
+        }
+    } else {
+        fprintf(stderr, "Unknown number type in AmIntelDnn::CompareScores()!\n");
+        throw -1;
+    }
+
+    score_error->num_errors = num_errors;
+
+    return (num_errors);
+}
+
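+// WriteGraphWizModel: dumps the component graph in Graphviz dot format. Edges
+// are inferred from buffer overlap: a component's input is connected to
+// whichever component's output (or weight/bias block) occupies the same
+// memory. Note that despite the filename argument, the dot text is currently
+// written to the hard-coded "graph.dot".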
+void AmIntelDnn::WriteGraphWizModel(const char *filename) {
+    auto & components = component;
+
+#define IS_AFFINE(k)\
+    (components[k].operation == kDnnAffineOp ||\
+     components[k].operation == kDnnDiagonalOp)
+
+#define IS_CONV(k)\
+    (components[k].operation == kDnnConvolutional1dOp)
+
+#define IS_RELU(k)\
+    (components[k].operation == kDnnPiecewiselinearOp &&\
+     components[k].op.pwl.func_id == kActRelu)
+
+
+#define IS_DIAG(k)\
+    (components[k].operation == kDnnDiagonalOp)
+
+#define OUTPUTS(idx)\
+    components[idx].ptr_outputs, components[idx].num_rows_out*components[idx].num_columns_out * components[idx].num_bytes_per_output
+
+#define INPUTS(idx)\
+    components[idx].ptr_inputs, components[idx].num_rows_in*components[idx].num_columns_in * components[idx].num_bytes_per_input
+
+#define BIASES(idx)\
+    components[idx].op.affine.ptr_biases,  components[idx].num_rows_in*components[idx].num_columns_in * components[idx].op.affine.num_bytes_per_bias
+
+#define WEIGHTS(idx)\
+    components[idx].op.affine.ptr_weights, components[idx].op.affine.num_bytes_per_weight * components[idx].num_rows_in*components[idx].num_columns_in * \
+            (IS_DIAG(idx) ? 1 : components[idx].num_rows_out*components[idx].num_columns_out)
+
+    // True when the memory blocks [ptra, ptra + asize) and [ptrb, ptrb + bsize) overlap.
+    auto intersected = [](void * ptra, size_t asize, void * ptrb, size_t bsize) {
+        return !(((reinterpret_cast<char*>(ptra) + asize) <= ptrb) || ((reinterpret_cast<char*>(ptrb) + bsize) <= ptra));
+    };
+
+    // True when ptra points inside the block [ptrb, ptrb + bsize); asize is
+    // unused and kept only so both lambdas share a signature.
+    auto equals = [](void * ptra, size_t asize, void * ptrb, size_t bsize) {
+        return ptra >= ptrb  && ptra < reinterpret_cast<char*>(ptrb) + bsize;
+    };
+
+    std::fstream graph("graph.dot", std::ios::out);
+    graph << "strict digraph {";
+    std::set<void*> weights;
+    std::set<void*> biases;
+    std::set<void*> outputs;
+    std::set<std::string> layersNames;
+
+    auto generate_layer_name = [&](int k) {
+        std::string l;
+        if (components[k].operation == kDnnPiecewiselinearOp) {
+            l += intel_dnn_activation_name[components[k].op.pwl.func_id];
+        } else {
+            l += intel_dnn_operation_name[components[k].operation];
+        }
+        l += "_" + std::to_string(k);
+        if (components[k].operation == kDnnPiecewiselinearOp) {
+            graph << l << " [shape=box, style=filled, fillcolor=yellow";
+        } else {
+            graph << l << " [shape=box";
+        }
+
+        graph << ", label=<<TABLE BORDER=\"0\" CELLBORDER=\"1\" CELLSPACING=\"0\">\n"
+            "  <TR><TD  colspan=\"2\">" <<  l << "</TD></TR>\n"
+            "  <TR><TD  colspan=\"2\">" <<  components[k].num_rows_in << "x" <<  components[k].num_rows_out<< "</TD></TR>\n";
+        if (IS_AFFINE(k)) {
+            graph << "  <TR><TD> wscale</TD><TD>" <<  components[k].op.affine.weight_scale_factor<< "</TD></TR>\n";
+            graph << "  <TR><TD> wbit</TD><TD>" <<  components[k].op.affine.num_bytes_per_weight<< "</TD></TR>\n";
+            graph << "  <TR><TD> bbit</TD><TD>" <<  components[k].op.affine.num_bytes_per_bias<< "</TD></TR>\n";
+        }
+        if (IS_RELU(k)) {
+            graph << "  <TR><TD> negative_slope</TD><TD>" <<  components[k].op.pwl.func_id.negative_slope<< "</TD></TR>\n";
+        }
+        if (IS_CONV(k)) {
+            auto &conv = components[k].op.conv1D;
+            graph << "  <TR><TD> num_filters</TD><TD>" <<  conv.num_filters<< "</TD></TR>\n";
+            graph << "  <TR><TD> num_filter_rows</TD><TD>" <<  conv.num_filter_rows<< "</TD></TR>\n";
+            graph << "  <TR><TD> num_filter_coefficients</TD><TD>" <<  conv.num_filter_coefficients<< "</TD></TR>\n";
+            graph << "  <TR><TD> num_feature_maps</TD><TD>" <<  conv.num_feature_maps<< "</TD></TR>\n";
+            graph << "  <TR><TD> num_feature_map_rows</TD><TD>" <<  conv.num_feature_map_rows<< "</TD></TR>\n";
+            graph << "  <TR><TD> num_feature_map_columns</TD><TD>" <<  conv.num_feature_map_columns<< "</TD></TR>\n";
+            graph << "  <TR><TD> wscale</TD><TD>" <<  conv.weight_scale_factor<< "</TD></TR>\n";
+            graph << "  <TR><TD> wbit</TD><TD>" <<  conv.num_bytes_per_weight<< "</TD></TR>\n";
+            graph << "  <TR><TD> bbit</TD><TD>" <<  conv.num_bytes_per_bias<< "</TD></TR>\n";
+        }
+        graph<<   "  <TR><TD> num_rows_in</TD><TD>" <<  components[k].num_rows_in<< "</TD></TR>\n"
+                  "  <TR><TD> num_columns_in</TD><TD>" <<  components[k].num_columns_in<< "</TD></TR>\n"
+                  "  <TR><TD> num_rows_out</TD><TD>" <<  components[k].num_rows_out<< "</TD></TR>\n"
+                  "  <TR><TD> num_columns_out</TD><TD>" <<  components[k].num_columns_out<< "</TD></TR>\n"
+                  "  <TR><TD> oscale</TD><TD>" <<  components[k].output_scale_factor<< "</TD></TR>\n"
+                  "  <TR><TD> ibit</TD><TD>" <<  components[k].num_bytes_per_input<< "</TD></TR>\n"
+                  "  <TR><TD> obit</TD><TD>" <<  components[k].num_bytes_per_output<< "</TD></TR>\n"
+            "</TABLE>>];\n";
+
+        return l;
+    };
+
+
+    for (int k = 0; k < components.size(); ++k) {
+        std::string l = generate_layer_name(k);
+        layersNames.insert(l);
+        int lidx = std::distance(layersNames.begin(), layersNames.find(l));
+        int widx = 0;
+        int bidx = 0;
+
+        if (IS_AFFINE(k)) {
+            weights.insert(components[k].op.affine.ptr_weights);
+            biases.insert(components[k].op.affine.ptr_biases);
+
+            widx = std::distance(weights.begin(), weights.find(components[k].op.affine.ptr_weights));
+            bidx = std::distance(biases.begin(), biases.find(components[k].op.affine.ptr_biases));
+        }
+
+
+        auto lw =  "weights_" +  std::to_string(lidx) + "_" + std::to_string(widx);;
+        auto lb =  "biases_" +  std::to_string(lidx) + "_" + std::to_string(bidx);
+
+        if (IS_AFFINE(k)) {
+            graph << lw << " -> " << l << "[style=bold];";
+            graph << lb << " -> " << l << "[style=bold];";
+        }
+
+        graph << "\n";
+
+        bool inputConnected = false;
+
+        for (int k2 = 0; k2 < components.size(); ++k2) {
+            if (k2 == k) continue;
+
+
+            std::string r = generate_layer_name(k2);
+
+            int w2idx = 0;
+            int b2idx = 0;
+
+            if (IS_AFFINE(k2)) {
+                weights.insert(components[k2].op.affine.ptr_weights);
+                biases.insert(components[k2].op.affine.ptr_biases);
+
+                w2idx = std::distance(weights.begin(), weights.find(components[k2].op.affine.ptr_weights));
+                b2idx = std::distance(biases.begin(), biases.find(components[k2].op.affine.ptr_biases));
+            }
+
+            auto rw =  "weights_" + std::to_string(w2idx);
+            auto rb =  "biases_" + std::to_string(b2idx);
+
+            // ----------------------------------------------------------
+            // output to input connections
+            if (intersected(OUTPUTS(k2), INPUTS(k))) {
+                graph << r <<" -> "<< l << ";";
+                inputConnected = true;
+            }
+
+            // ----------------------------------------------------------
+            // output to biases connections
+            if (IS_AFFINE(k) && intersected(OUTPUTS(k2), BIASES(k))) {
+                graph << r << " -> " << lb << " [label=\"OB\", fontcolor=blue, color=blue, style=dashed];";
+            }
+
+            // ----------------------------------------------------------
+            // output to weights connections
+            if (IS_AFFINE(k) && equals(OUTPUTS(k2), WEIGHTS(k))) {
+                graph << r << " -> " << lw << " [label=\"OW\", fontcolor=magenta, color=magenta, style=dashed];";
+            }
+
+            // ----------------------------------------------------------
+            // weights to input connections
+            if (IS_AFFINE(k2) && equals(WEIGHTS(k2), INPUTS(k))) {
+                graph << rw << " -> " << l << " [label=\"WI\", fontcolor=red, color=red, style=dashed];";
+                inputConnected = true;
+            }
+
+            // ----------------------------------------------------------
+            // weights to bias connections
+            if (IS_AFFINE(k2) && IS_AFFINE(k) && equals(WEIGHTS(k2), BIASES(k))) {
+                graph << rw << " -> " << lb << " [label=\"WB\", fontcolor=darkgreen,color=darkgreen, style=dashed];";
+            }
+        }
+        if (!inputConnected) {
+            // drawing tmp connection
+            outputs.insert(components[k].ptr_inputs);
+            auto tidx = std::distance(outputs.begin(), outputs.find(components[k].ptr_inputs));
+            graph << tidx << " -> " << l
+                  << " [label=\"FROM_TMP\", fontcolor=darkgreen,color=orange, style=dashed];";
+        }
+    }
+
+    for (int k = 0; k < components.size(); ++k) {
+        std::string l = generate_layer_name(k);
+
+        int tidx = 0;
+        for (auto tmpOutPtrs : outputs) {
+            if (components[k].ptr_outputs == tmpOutPtrs) {
+                graph << l << " -> " << tidx << " [label=\"TO_TMP\", fontcolor=darkgreen,color=orange, style=dashed];";
+            }
+            tidx++;
+        }
+    }
+
+    graph << "}";
+}
+
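+// WriteDnnText: serializes the network topology (and, when DUMP_WB is defined,
+// the weights and biases) in a tagged text format. If the network is quantized
+// but kDnnFloat output is requested, values are rescaled back to float using
+// the recorded scale factors.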
+void AmIntelDnn::WriteDnnText(const char *filename, intel_dnn_number_type_t number_type) {
+    if ((number_type_ == kDnnFloat) && (number_type == kDnnInt)) {
+        fprintf(stderr, "Error trying to write floating point DNN as integer in AmIntelDnn::WriteDnnText().\n");
+        fprintf(stderr, "  Please convert to integer first.\n");
+        throw -1;
+    }
+#ifndef LIGHT_DUMP
+    std::ofstream out_file1(filename, std::ios::out);
+    std::ofstream &out_file = out_file1;
+#else
+    std::ofstream out_file((std::string(filename) + ".light").c_str(), std::ios::out);
+#endif
+    if (out_file.good()) {
+        uint32_t num_inputs = component[0].num_rows_in;
+        uint32_t num_outputs = (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation)
+                               ? component[component.size() - 1].num_rows_out
+                               : component[component.size() - 1].num_columns_out;
+        uint32_t num_layers = num_gna_layers();
+        uint32_t num_group = this->num_group_in();
+        uint32_t layer = 0;
+
+        out_file << "<intel_dnn_file>\n";
+        out_file << "<number_type> " << intel_dnn_number_type_name[number_type] << "\n";
+        out_file << "<softmax_type> " << intel_dnn_softmax_name[softmax_type] << "\n";
+        out_file << "<num_memory_bytes> " << std::dec << num_bytes_dnn_memory_ << "\n";
+        out_file << "<num_group> " << std::dec << num_group << "\n";
+        out_file << "<number_inputs> " << std::dec << num_inputs << "\n";
+        out_file << "<num_outputs> " << std::dec << num_outputs << "\n";
+        out_file << "<num_layers> " << std::dec << num_layers << "\n";
+        for (uint32_t i = 0; i < component.size(); i++) {
+            // Build the per-component dump file base name unconditionally: it is
+            // also needed below for the weight/bias dump files, which are created
+            // outside the LIGHT_DUMP guard.
+            std::stringstream out_file_name;
+            out_file_name << getDumpFolderName() << std::setfill('0') << std::setw(2) << i << "_"
+                          << intel_dnn_operation_name[component[i].operation]
+                          << "-" << component[i].num_rows_in
+                          << "-" << component[i].num_rows_out;
+            if (component[i].operation == kDnnPiecewiselinearOp) {
+                out_file_name << "-" << intel_dnn_activation_name[component[i].op.pwl.func_id.type];
+            }
+#ifdef LIGHT_DUMP
+            std::ofstream out_file((out_file_name.str() + ".txt").c_str(), std::ios::out);
+#endif
+
+            uint32_t num_rows_in = component[i].num_rows_in;
+            uint32_t num_columns_in = component[i].num_columns_in;
+            uint32_t num_rows_out = component[i].num_rows_out;
+            uint32_t num_columns_out = component[i].num_columns_out;
+            uint32_t num_bytes_per_input = component[i].num_bytes_per_input;
+            uint32_t num_bytes_per_output = component[i].num_bytes_per_output;
+            if ((component[i].operation == kDnnAffineOp)
+                || (component[i].operation == kDnnDiagonalOp)
+                || (component[i].operation == kDnnRecurrentOp)
+                || (component[i].operation == kDnnConvolutional1dOp)
+                || (component[i].operation == kDnnInterleaveOp)
+                || (component[i].operation == kDnnDeinterleaveOp)
+                || (component[i].operation == kDnnCopyOp)) {
+                out_file << "<layer_index> " << std::dec << layer << "\n";
+                layer++;
+            }
+            out_file << "<component_operation> " << intel_dnn_operation_name[component[i].operation] << "\n";
+            out_file << "<macro_operation> " << intel_dnn_macro_operation_name[component[i].macro_operation] << "\n";
+            out_file << "<num_rows_in> " << std::dec << num_rows_in << "\n";
+            out_file << "<num_columns_in> " << std::dec << num_columns_in << "\n";
+            out_file << "<num_rows_out> " << std::dec << num_rows_out << "\n";
+            out_file << "<num_columns_out> " << std::dec << num_columns_out << "\n";
+            out_file << "<orientation_in> " << std::dec << (component[i].orientation_in == kDnnInterleavedOrientation ?
+            "interleaved" : "deinterleaved") << "\n";
+            out_file << "<orientation_out> " << std::dec << (component[i].orientation_out == kDnnInterleavedOrientation ?
+                                                            "interleaved" : "deinterleaved") << "\n";
+
+            if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
+                out_file << "<num_bytes_per_input> " << std::dec << sizeof(float) << "\n";
+                out_file << "<num_bytes_per_output> " << std::dec << sizeof(float) << "\n";
+            } else {
+                out_file << "<num_bytes_per_input> " << std::dec << num_bytes_per_input << "\n";
+                out_file << "<num_bytes_per_output> " << std::dec << num_bytes_per_output << "\n";
+            }
+            out_file << "<input_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                     << MemoryOffset(component[i].ptr_inputs, ptr_dnn_memory_) << "\n";
+            out_file << "<output_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                     << MemoryOffset(component[i].ptr_outputs, ptr_dnn_memory_) << "\n";
+            switch (component[i].operation) {
+                case kDnnAffineOp:
+                case kDnnDiagonalOp: {
+                    uint32_t num_bytes_per_weight = component[i].op.affine.num_bytes_per_weight;
+                    uint32_t num_bytes_per_bias = component[i].op.affine.num_bytes_per_bias;
+                    float weight_scale_factor = component[i].op.affine.weight_scale_factor;
+                    float output_scale_factor = component[i].output_scale_factor;
+                    uint32_t num_weight_rows = (component[i].operation == kDnnDiagonalOp) ? 1 : num_rows_out;
+                    uint32_t num_weight_columns = num_rows_in;
+                    if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
+                        out_file << "<num_bytes_per_weight> " << std::dec << 4 << "\n";
+                        out_file << "<num_bytes_per_bias> " << std::dec << 4 << "\n";
+                    } else {
+                        out_file << "<num_bytes_per_weight> " << std::dec << num_bytes_per_weight << "\n";
+                        out_file << "<num_bytes_per_bias> " << std::dec << num_bytes_per_bias << "\n";
+                    }
+                    if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
+                        out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> " << 1.0 << "\n";
+                        out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
+                    } else {
+                        out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> "
+                                 << weight_scale_factor << "\n";
+                        out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
+                                 << output_scale_factor << "\n";
+                    }
+                    out_file << "<weight_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                             << MemoryOffset(component[i].op.affine.ptr_weights, ptr_dnn_memory_) << "\n";
+                    out_file << "<bias_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                             << MemoryOffset(component[i].op.affine.ptr_biases, ptr_dnn_memory_) << "\n";
+
+                    std::ofstream out_wfile((out_file_name.str() + "_weights.txt").c_str(), std::ios::out);
+                    std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out);
+
+                    if (num_bytes_per_weight == 1) {
+                        int8_t *ptr_weight = reinterpret_cast<int8_t *>(component[i].op.affine.ptr_weights);
+                        intel_compound_bias_t *ptr_bias = reinterpret_cast<intel_compound_bias_t *>(component[i].op.affine.ptr_biases);
+#ifdef DUMP_WB
+                        for (uint32_t row = 0; row < num_weight_rows; row++) {
+                            for (uint32_t col = 0; col < num_weight_columns; col++) {
+                                if (number_type == kDnnFloat) {
+                                    float val =
+                                        static_cast<float>(ptr_weight[row * num_weight_columns + col]) * ptr_bias[row].multiplier
+                                            / weight_scale_factor;
+                                    out_wfile << std::setprecision(4) << val << " ";
+                                } else {
+                                    out_wfile << int(ptr_weight[row * num_weight_columns + col]) << " ";
+                                }
+                                out_wfile << "\n";
+                            }
+                        }
+#endif
+                    } else if (num_bytes_per_weight == 2) {
+                        int16_t *ptr_weight = reinterpret_cast<int16_t *>(component[i].op.affine.ptr_weights);
+#ifdef DUMP_WB
+                        for (uint32_t row = 0; row < num_weight_rows; row++) {
+                            for (uint32_t col = 0; col < num_weight_columns; col++) {
+                                if (number_type == kDnnFloat) {
+                                    out_wfile << std::setprecision(12)
+                                              << ptr_weight[row * num_weight_columns + col] / weight_scale_factor << " ";
+                                } else {
+                                    out_wfile << ptr_weight[row * num_weight_columns + col] << " ";
+                                }
+                                out_wfile << "\n";
+                            }
+                        }
+#endif
+                    } else if (number_type_ == kDnnFloat) {
+                        float *ptr_weight = reinterpret_cast<float *>(component[i].op.affine.ptr_weights);
+#ifdef DUMP_WB
+                        for (uint32_t row = 0; row < num_weight_rows; row++) {
+                            for (uint32_t col = 0; col < num_weight_columns; col++) {
+                                out_wfile << std::setprecision(5)
+                                          << ptr_weight[row * num_weight_columns + col] << " ";
+                                out_wfile << "\n";
+                            }
+                        }
+#endif
+                    } else {
+                        fprintf(stderr, "Unsupported weight type in WriteDnnText!\n");
+                        throw -1;
+                    }
+                    if (number_type_ == kDnnInt) {
+                        if (num_bytes_per_weight == 1) {
+                            intel_compound_bias_t
+                                *ptr_biases = reinterpret_cast<intel_compound_bias_t *>(component[i].op.affine.ptr_biases);
+#ifdef DUMP_WB
+                            for (uint32_t row = 0; row < num_rows_out; row++) {
+                                out_bfile << std::setw(8) << ptr_biases[row].bias << ", ";
+                                out_bfile << std::setw(8) << int(ptr_biases[row].multiplier) << "\n";
+                            }
+#endif
+                        } else {
+                            int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.affine.ptr_biases);
+#ifdef DUMP_WB
+                            for (uint32_t row = 0; row < num_rows_out; row++) {
+                                if (number_type == kDnnInt) {
+                                    out_bfile << std::setw(8) << ptr_biases[row] << "\n";
+                                } else {
+                                    out_bfile << std::setw(8) << ptr_biases[row] / output_scale_factor << "\n";
+                                }
+                            }
+#endif
+                        }
+
+                    } else {
+                        float *ptr_biases = reinterpret_cast<float *>(component[i].op.affine.ptr_biases);
+#ifdef DUMP_WB
+
+                        for (uint32_t row = 0; row < num_rows_out; row++) {
+                            out_bfile << std::setprecision(5) << ptr_biases[row] << "\n";
+                        }
+#endif
+                    }
+                }
+                break;
+                case kDnnConvolutional1dOp: {
+                    uint32_t num_filters = component[i].op.conv1D.num_filters;
+                    uint32_t num_filter_rows = component[i].op.conv1D.num_filter_rows;
+                    uint32_t num_filter_coefficients = component[i].op.conv1D.num_filter_coefficients;
+                    uint32_t num_feature_maps = component[i].op.conv1D.num_feature_maps;
+                    uint32_t num_feature_map_rows = component[i].op.conv1D.num_feature_map_rows;
+                    uint32_t num_feature_map_columns = component[i].op.conv1D.num_feature_map_columns;
+                    uint32_t num_filter_outputs =
+                        component[i].op.conv1D.num_feature_map_rows - component[i].op.conv1D.num_filter_rows + 1;
+                    uint32_t num_bytes_per_weight = component[i].op.conv1D.num_bytes_per_weight;
+                    uint32_t num_bytes_per_bias = component[i].op.conv1D.num_bytes_per_bias;
+                    float weight_scale_factor = component[i].op.conv1D.weight_scale_factor;
+                    float output_scale_factor = component[i].output_scale_factor;
+                    out_file << "<num_filters> " << std::dec << num_filters << "\n";
+                    out_file << "<num_filter_coefficients> " << std::dec << num_filter_coefficients << "\n";
+                    out_file << "<num_filter_rows> " << std::dec << num_filter_rows << "\n";
+                    out_file << "<num_feature_maps> " << std::dec << num_feature_maps << "\n";
+                    out_file << "<num_feature_map_rows> " << std::dec << num_feature_map_rows << "\n";
+                    out_file << "<num_feature_map_columns> " << std::dec << num_feature_map_columns << "\n";
+                    if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
+                        out_file << "<num_bytes_per_weight> " << std::dec << 4 << "\n";
+                        out_file << "<num_bytes_per_bias> " << std::dec << 4 << "\n";
+                    } else {
+                        out_file << "<num_bytes_per_weight> " << std::dec << num_bytes_per_weight << "\n";
+                        out_file << "<num_bytes_per_bias> " << std::dec << num_bytes_per_bias << "\n";
+                    }
+                    if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
+                        out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> " << 1.0 << "\n";
+                        out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
+                    } else {
+                        out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> "
+                                 << weight_scale_factor << "\n";
+                        out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
+                                 << output_scale_factor << "\n";
+                    }
+                    out_file << "<filter_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                             << MemoryOffset(component[i].op.conv1D.ptr_filters, ptr_dnn_memory_) << "\n";
+                    out_file << "<bias_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                             << MemoryOffset(component[i].op.conv1D.ptr_biases, ptr_dnn_memory_) << "\n";
+
+
+                    std::ofstream out_wfile((out_file_name.str() + "_weights.txt").c_str(), std::ios::out);
+                    std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out);
+
+
+                    if (num_bytes_per_weight == 1) {
+                        int8_t *ptr_weight = reinterpret_cast<int8_t *>(component[i].op.conv1D.ptr_filters);
+                        intel_compound_bias_t *ptr_bias = reinterpret_cast<intel_compound_bias_t *>(component[i].op.conv1D.ptr_biases);
+#ifdef DUMP_WB
+                        for (uint32_t row = 0; row < num_filters; row++) {
+                            for (uint32_t col = 0; col < num_filter_coefficients; col++) {
+                                if (number_type == kDnnFloat) {
+                                    float val = static_cast<float>(ptr_weight[row * num_filter_coefficients + col])
+                                        * ptr_bias[row].multiplier / weight_scale_factor;
+                                    out_wfile << std::setprecision(12) << val << "\n";
+                                } else {
+                                    out_wfile << "0x" << std::setfill('0') << std::setw(2) << std::hex
+                                             << int((uint8_t) ptr_weight[row * num_filter_coefficients + col]) << "\n";
+                                }
+                            }
+                        }
+#endif
+                    } else if (num_bytes_per_weight == 2) {
+                        int16_t *ptr_weight = reinterpret_cast<int16_t *>(component[i].op.conv1D.ptr_filters);
+#ifdef DUMP_WB
+                        for (uint32_t row = 0; row < num_filters; row++) {
+                            for (uint32_t col = 0; col < num_filter_coefficients; col++) {
+                                if (number_type == kDnnFloat) {
+                                    out_wfile << std::setprecision(12)
+                                             << ptr_weight[row * num_filter_coefficients + col] / weight_scale_factor
+                                             << "\n";
+                                } else {
+                                    out_wfile << "0x" << std::setfill('0') << std::setw(4) << std::hex
+                                             << ptr_weight[row * num_filter_coefficients + col] << "\n";
+                                }
+                            }
+                        }
+#endif
+                    } else if (number_type_ == kDnnFloat) {
+                        float *ptr_weight = reinterpret_cast<float *>(component[i].op.conv1D.ptr_filters);
+#ifdef DUMP_WB
+                        for (uint32_t row = 0; row < num_filters; row++) {
+                            for (uint32_t col = 0; col < num_filter_coefficients; col++) {
+                                out_wfile << std::setprecision(12)
+                                         << ptr_weight[row * num_filter_coefficients + col] << "\n";
+                            }
+                            out_wfile << "\n";
+                        }
+#endif
+                    } else {
+                        fprintf(stderr, "Unsupported filter weight type in WriteDnnText!\n");
+                        throw -1;
+                    }
+
+                    if (number_type_ == kDnnInt) {
+                        if (number_type == kDnnInt) {
+                            if (num_bytes_per_weight == 1) {
+                                intel_compound_bias_t
+                                    *ptr_biases = reinterpret_cast<intel_compound_bias_t *>(component[i].op.conv1D.ptr_biases);
+#ifdef DUMP_WB
+                                for (uint32_t row = 0; row < num_filters; row++) {
+                                    out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                                             << ptr_biases[row].bias << " ";
+                                    out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                                             << int(ptr_biases[row].multiplier) << "\n";
+                                }
+#endif
+                            } else {
+                                int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.conv1D.ptr_biases);
+#ifdef DUMP_WB
+                                for (uint32_t row = 0; row < num_filters; row++) {
+                                    out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex << ptr_biases[row]
+                                             << "\n";
+                                }
+#endif
+                            }
+                        } else {
+                            int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.conv1D.ptr_biases);
+#ifdef DUMP_WB
+                            for (uint32_t row = 0; row < num_filters; row++) {
+                                out_bfile << std::setprecision(12)
+                                         << ptr_biases[row] / output_scale_factor << "\n";
+                            }
+#endif
+                        }
+                    } else {
+                        float *ptr_biases = reinterpret_cast<float *>(component[i].op.conv1D.ptr_biases);
+#ifdef DUMP_WB
+                        for (uint32_t row = 0; row < num_filters; row++) {
+                            out_bfile << std::setprecision(12) << ptr_biases[row] << "\n";
+                        }
+#endif
+                    }
+                    out_file << "\n";
+                }
+                    break;
+                case kDnnRecurrentOp: {
+                    float weight_scale_factor = component[i].op.recurrent.weight_scale_factor;
+                    float output_scale_factor = component[i].output_scale_factor;
+                    uint32_t num_vector_delay = component[i].op.recurrent.num_vector_delay;
+                    uint32_t num_bytes_per_weight = component[i].op.recurrent.num_bytes_per_weight;
+                    uint32_t num_bytes_per_bias = component[i].op.recurrent.num_bytes_per_bias;
+                    uint32_t num_weight_rows = num_columns_out;
+                    uint32_t num_weight_columns = num_columns_in + num_columns_out;
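+                    // The recurrent weight matrix has one row per output element and
+                    // columns for both the layer input and the fed-back output, hence
+                    // num_columns_in + num_columns_out columns.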
+                    out_file << "<num_vector_delay> " << std::dec << num_vector_delay << "\n";
+                    if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
+                        out_file << "<num_bytes_per_weight> " << std::dec << 4 << "\n";
+                        out_file << "<num_bytes_per_bias> " << std::dec << 4 << "\n";
+                    } else {
+                        out_file << "<num_bytes_per_weight> " << std::dec << num_bytes_per_weight << "\n";
+                        out_file << "<num_bytes_per_bias> " << std::dec << num_bytes_per_bias << "\n";
+                    }
+                    if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
+                        out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> " << 1.0 << "\n";
+                        out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
+                    } else {
+                        out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> "
+                                 << weight_scale_factor << "\n";
+                        out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
+                                 << output_scale_factor << "\n";
+                    }
+                    out_file << "<weight_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                             << MemoryOffset(component[i].op.recurrent.ptr_weights, ptr_dnn_memory_) << "\n";
+                    out_file << "<bias_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                             << MemoryOffset(component[i].op.recurrent.ptr_biases, ptr_dnn_memory_) << "\n";
+                    out_file << "<feedback_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                             << MemoryOffset(component[i].op.recurrent.ptr_feedbacks, ptr_dnn_memory_) << "\n";
+                    if (num_bytes_per_weight == 1) {
+                        int8_t *ptr_weight = reinterpret_cast<int8_t *>(component[i].op.recurrent.ptr_weights);
+                        intel_compound_bias_t
+                            *ptr_bias = reinterpret_cast<intel_compound_bias_t *>(component[i].op.recurrent.ptr_biases);
+#ifdef DUMP_WB
+                        for (uint32_t row = 0; row < num_weight_rows; row++) {
+                            out_file << "<weight_row> ";
+                            for (uint32_t col = 0; col < num_weight_columns; col++) {
+                                if (number_type == kDnnFloat) {
+                                    float val =
+                                        static_cast<float>(ptr_weight[row * num_weight_columns + col]) * ptr_bias[col].multiplier
+                                            / weight_scale_factor;
+                                    out_file << std::setprecision(12) << std::scientific << val << " ";
+                                } else {
+                                    out_file << "0x" << std::setfill('0') << std::setw(2) << std::hex
+                                             << int((uint8_t) ptr_weight[row * num_weight_columns + col]) << " ";
+                                }
+                            }
+                            out_file << "\n";
+                        }
+#endif
+                    } else if (num_bytes_per_weight == 2) {
+                        int16_t *ptr_weight = reinterpret_cast<int16_t *>(component[i].op.recurrent.ptr_weights);
+#ifdef DUMP_WB
+                        for (uint32_t row = 0; row < num_weight_rows; row++) {
+                            out_file << "<weight_row> ";
+                            for (uint32_t col = 0; col < num_weight_columns; col++) {
+                                if (number_type == kDnnFloat) {
+                                    out_file << std::setprecision(12) << std::scientific
+                                             << ptr_weight[row * num_weight_columns + col] / weight_scale_factor << " ";
+                                } else {
+                                    out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex
+                                             << ptr_weight[row * num_weight_columns + col] << " ";
+                                }
+                            }
+                            out_file << "\n";
+                        }
+#endif
+                    } else if (number_type_ == kDnnFloat) {
+                        float *ptr_weight = reinterpret_cast<float *>(component[i].op.recurrent.ptr_weights);
+#ifdef DUMP_WB
+                        for (uint32_t row = 0; row < num_weight_rows; row++) {
+                            out_file << "<weight_row> ";
+                            for (uint32_t col = 0; col < num_weight_columns; col++) {
+                                out_file << std::setprecision(12) << std::scientific
+                                         << ptr_weight[row * num_weight_columns + col] << " ";
+                            }
+                            out_file << "\n";
+                        }
+#endif
+                    } else {
+                        fprintf(stderr, "Unsupported weight type in WriteDnnText!\n");
+                        throw -1;
+                    }
+                    if (number_type_ == kDnnInt) {
+                        if (number_type == kDnnInt) {
+                            if (num_bytes_per_weight == 1) {
+                                intel_compound_bias_t
+                                    *ptr_biases = reinterpret_cast<intel_compound_bias_t *>(component[i].op.recurrent.ptr_biases);
+                                out_file << "<compound_bias>" << " ";
+#ifdef DUMP_WB
+                                for (uint32_t col = 0; col < num_columns_out; col++) {
+                                    out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                                             << ptr_biases[col].bias << " ";
+                                    out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                                             << int(ptr_biases[col].multiplier) << " ";
+                                }
+#endif
+                            } else {
+                                int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.recurrent.ptr_biases);
+                                out_file << "<bias>" << " ";
+#ifdef DUMP_WB
+                                for (uint32_t col = 0; col < num_columns_out; col++) {
+                                    out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex << ptr_biases[col]
+                                             << " ";
+                                }
+#endif
+                            }
+                        } else {
+                            int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.recurrent.ptr_biases);
+                            out_file << "<bias>" << " ";
+#ifdef DUMP_WB
+                            for (uint32_t col = 0; col < num_columns_out; col++) {
+                                out_file << std::setprecision(12) << std::scientific
+                                         << ptr_biases[col] / output_scale_factor << " ";
+                            }
+#endif
+                        }
+                    } else {
+                        float *ptr_biases = reinterpret_cast<float *>(component[i].op.recurrent.ptr_biases);
+                        out_file << "<bias>" << " ";
+#ifdef DUMP_WB
+                        for (uint32_t col = 0; col < num_columns_out; col++) {
+                            out_file << std::setprecision(12) << std::scientific << ptr_biases[col] << " ";
+                        }
+#endif
+                    }
+                    out_file << "\n";
+                }
+                    break;
+                case kDnnMaxPoolOp: {
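+                    // <pool_type> encoding: 1 = max pooling, 2 = sum pooling (mirrors the
+                    // INTEL_MAX_POOLING / INTEL_SUM_POOLING selection in InitGNAStruct).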
+                    uint32_t num_pool_type = (component[i].op.maxpool.do_sum_not_max) ? 2 : 1;
+                    out_file << "<pool_type> " << std::dec << num_pool_type << "\n";
+                    out_file << "<pool_size> " << std::dec << component[i].op.maxpool.num_inputs << "\n";
+                    out_file << "<pool_step> " << std::dec << component[i].op.maxpool.num_inputs_step << "\n";
+                    out_file << "<pool_num_rows> " << std::dec << component[i].op.maxpool.num_inputs_stride << "\n";
+                    out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
+                             << component[i].output_scale_factor << "\n";
+                }
+                    break;
+                case kDnnPiecewiselinearOp: {
+                    intel_pwl_segment_t *ptr_segment = component[i].op.pwl.ptr_segments;
+                    DnnActivationType func_id = component[i].op.pwl.func_id.type;
+                    uint32_t num_segments = component[i].op.pwl.num_segments;
+                    float output_scale_factor = component[i].output_scale_factor;
+                    out_file << "<func_id> " << intel_dnn_activation_name[func_id] << "\n";
+                    out_file << "<num_bytes_per_slope> " << std::dec << sizeof(int16_t) << "\n";
+                    out_file << "<num_bytes_per_intercept> " << std::dec << sizeof(int16_t) << "\n";
+                    out_file << "<num_bytes_per_offset> " << std::dec << sizeof(int32_t) << "\n";
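+                    // Float models apply the activation exactly, so no PWL segments are
+                    // serialized for them (<num_segments> is written as 0); segment data
+                    // exists only for quantized models.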
+                    if (number_type == kDnnFloat) {
+                        out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
+                        out_file << "<num_segments> " << std::dec << 0 << "\n";
+                        out_file << "<segment_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                                 << MemoryOffset(component[i].op.pwl.ptr_segments, ptr_dnn_memory_) << "\n";
+                    } else {
+                        out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
+                                 << output_scale_factor << "\n";
+                        out_file << "<num_segments> " << std::dec << num_segments << "\n";
+                        out_file << "<segment_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                                 << MemoryOffset(component[i].op.pwl.ptr_segments, ptr_dnn_memory_) << "\n";
+                        if (number_type_ == kDnnInt) {
+                            out_file << "<slope> ";
+                            for (uint32_t segment = 0; segment < num_segments; segment++) {
+                                out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex
+                                         << ptr_segment[segment].slope << " ";
+                            }
+                            out_file << "\n";
+                            out_file << "<intercept> ";
+                            for (uint32_t segment = 0; segment < num_segments; segment++) {
+                                out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex
+                                         << ptr_segment[segment].yBase << " ";
+                            }
+                            out_file << "\n";
+                            out_file << "<offset> ";
+                            for (uint32_t segment = 0; segment < num_segments; segment++) {
+                                out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                                         << ptr_segment[segment].xBase << " ";
+                            }
+                            out_file << "\n";
+                        } else if (num_segments > 0) {
+                            fprintf(stderr,
+                                    "Number of segments must be zero in floating point model in WriteDnnText!\n");
+                            throw -1;
+                        }
+                    }
+                }
+                    break;
+                case kDnnInterleaveOp:
+                    out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
+                             << component[i].output_scale_factor << "\n";
+                    break;
+                case kDnnDeinterleaveOp:
+                    out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
+                             << component[i].output_scale_factor << "\n";
+                    break;
+                case kDnnCopyOp:
+                    out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
+                             << component[i].output_scale_factor << "\n";
+                    out_file << "<num_copy_rows> " << std::dec << component[i].op.copy.num_copy_rows << "\n";
+                    out_file << "<num_copy_columns> " << std::dec << component[i].op.copy.num_copy_columns << "\n";
+                    break;
+                default:
+                    out_file << "<Error!!!> Unsupported Component :  "
+                             << intel_dnn_operation_name[component[i].operation] << "\n";
+                    //  fprintf(stderr, "Component type %s not yet supported in AmIntelDnn::WriteDnnText()!\n",
+                    //    intel_dnn_operation_name[component[i].operation]);
+                    //  throw -1;
+                    break;
+            }
+        }
+        if (ptr_active_outputs() != nullptr) {
+            out_file << "<activelist_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
+                     << MemoryOffset(ptr_active_outputs(), ptr_dnn_memory_) << "\n";
+        }
+        out_file << "<end_of_file>\n";
+        out_file.close();
+    } else {
+        fprintf(stderr, "Failed to open %s for writing!\n", filename);
+        throw -1;
+    }
+}
+
+void AmIntelDnn::InitGNAStruct(intel_nnet_type_t *ptr_nnet) {
+    intel_nnet_layer_t *pLayer;
+
+    if (ptr_nnet == nullptr)
+        THROW_GNA_EXCEPTION << "Invalid input parameter";
+    if (component.empty())
+        THROW_GNA_EXCEPTION << "empty model in AmIntelDnn::InitGNAStruct()";
+
+    ptr_nnet->nLayers = 0;
+    for (auto && c : component) {
+        if (c.operation == kDnnAffineOp
+            || (c.operation == kDnnDiagonalOp)
+            || (c.operation == kDnnConvolutional1dOp)
+            || (c.operation == kDnnDeinterleaveOp)
+            || (c.operation == kDnnInterleaveOp)
+            || (c.operation == kDnnRecurrentOp)
+            || (c.operation == kDnnCopyOp)
+            ) {
+            ptr_nnet->nLayers++;
+        }
+    }
+    ptr_nnet->nGroup = num_group_in();
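+    // Layer descriptors are allocated with 64-byte alignment via _mm_malloc and
+    // released with _mm_free in DestroyGNAStruct, matching the alignment used for
+    // the other GNA structures in this file.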
+    ptr_nnet->pLayers = reinterpret_cast<intel_nnet_layer_t *>(_mm_malloc(ptr_nnet->nLayers * sizeof(intel_nnet_layer_t), 64));
+    if (ptr_nnet->pLayers == nullptr)
+        THROW_GNA_EXCEPTION << "out of memory in AmIntelDnn::InitGNAStruct()";
+    pLayer = ptr_nnet->pLayers;
+
+    for (int i = 0; i < component.size(); i++) {
+        // std::cout << "Component + " << i <<"=GNA_" << std::distance(ptr_nnet->pLayers, pLayer) << "\n";
+        switch (component[i].operation) {
+            case kDnnAffineOp:
+                pLayer->nInputRows = component[i].num_rows_in;
+                pLayer->nInputColumns = component[i].num_columns_in;
+                pLayer->nOutputRows = component[i].num_rows_out;
+                pLayer->nOutputColumns = component[i].num_columns_out;
+                pLayer->nBytesPerInput = component[i].num_bytes_per_input;
+                pLayer->nBytesPerOutput = component[i].num_bytes_per_output;  //  will be overwritten if PWL op is needed
+                pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
+                pLayer->pInputs = component[i].ptr_inputs;
+                pLayer->pOutputsIntermediate = component[i].ptr_outputs;
+                pLayer->pOutputs = component[i].ptr_outputs;  //  will be overwritten if PWL op is needed
+                pLayer->nLayerKind = INTEL_AFFINE;
+                {
+                    pLayer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64);
+                    if (pLayer->pLayerStruct == nullptr) {
+                        THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_AFFINE layer structure.";
+                    }
+                    auto pAffineLayer = reinterpret_cast<intel_affine_layer_t *>(pLayer->pLayerStruct);
+                    pAffineLayer->pwl.pSegments = nullptr;
+                    pAffineLayer->pwl.nSegments = 0;
+
+                    pAffineLayer->affine.nBytesPerBias = component[i].op.affine.num_bytes_per_bias;
+                    pAffineLayer->affine.nBytesPerWeight = component[i].op.affine.num_bytes_per_weight;
+                    pAffineLayer->affine.pBiases = component[i].op.affine.ptr_biases;
+                    pAffineLayer->affine.pWeights = component[i].op.affine.ptr_weights;
+                }
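+                // A following PWL component is fused into this layer (see the
+                // kDnnPiecewiselinearOp case below), so pLayer only advances when the
+                // next component is not piecewise-linear.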
+                if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) {
+                    pLayer++;
+                }
+                break;
+            case kDnnDiagonalOp:
+                pLayer->nInputRows = component[i].num_rows_in;
+                pLayer->nInputColumns = component[i].num_columns_in;
+                pLayer->nOutputRows = component[i].num_rows_out;
+                pLayer->nOutputColumns = component[i].num_columns_out;
+                pLayer->nBytesPerInput = component[i].num_bytes_per_input;
+                pLayer->nBytesPerOutput = component[i].num_bytes_per_output;  //  will be overwritten if PWL op is needed
+                pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
+                pLayer->pInputs = component[i].ptr_inputs;
+                pLayer->pOutputsIntermediate = component[i].ptr_outputs;
+                pLayer->pOutputs = component[i].ptr_outputs;  //  will be overwritten if PWL op is needed
+                pLayer->nLayerKind = INTEL_AFFINE_DIAGONAL;
+                {
+                    pLayer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64);
+                    if (pLayer->pLayerStruct == nullptr) {
+                        THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_AFFINE_DIAGONAL layer structure.";
+                    }
+                    auto pDiagonalLayer = reinterpret_cast<intel_affine_layer_t *>(pLayer->pLayerStruct);
+                    pDiagonalLayer->pwl.pSegments = nullptr;
+                    pDiagonalLayer->pwl.nSegments = 0;
+
+                    pDiagonalLayer->affine.nBytesPerBias = component[i].op.affine.num_bytes_per_bias;
+                    pDiagonalLayer->affine.nBytesPerWeight = component[i].op.affine.num_bytes_per_weight;
+                    pDiagonalLayer->affine.pBiases = component[i].op.affine.ptr_biases;
+                    pDiagonalLayer->affine.pWeights = component[i].op.affine.ptr_weights;
+                }
+                if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) {
+                    pLayer++;
+                }
+                break;
+            case kDnnRecurrentOp:
+                pLayer->nInputRows = component[i].num_rows_in;
+                pLayer->nInputColumns = component[i].num_columns_in;
+                pLayer->nOutputRows = component[i].num_rows_out;
+                pLayer->nOutputColumns = component[i].num_columns_out;
+                pLayer->nBytesPerInput = component[i].num_bytes_per_input;
+                pLayer->nBytesPerOutput = component[i].num_bytes_per_output;  //  will be overwritten if PWL op is needed
+                pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
+                pLayer->pInputs = component[i].ptr_inputs;
+                pLayer->pOutputsIntermediate = component[i].ptr_outputs;
+                pLayer->pOutputs = component[i].ptr_outputs;  //  will be overwritten if PWL op is needed
+                pLayer->nLayerKind = INTEL_RECURRENT;
+                {
+                    pLayer->pLayerStruct = _mm_malloc(sizeof(intel_recurrent_layer_t), 64);
+                    if (pLayer->pLayerStruct == nullptr) {
+                        THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_RECURRENT layer structure.";
+                    }
+                    auto pRecurrentLayer = reinterpret_cast<intel_recurrent_layer_t *>(pLayer->pLayerStruct);
+                    pRecurrentLayer->pFeedbackBuffer = component[i].op.recurrent.ptr_feedbacks;
+                    pRecurrentLayer->pwl.pSegments = nullptr;
+                    pRecurrentLayer->pwl.nSegments = 0;
+
+                    pRecurrentLayer->affine.nBytesPerBias = component[i].op.recurrent.num_bytes_per_bias;
+                    pRecurrentLayer->affine.nBytesPerWeight = component[i].op.recurrent.num_bytes_per_weight;
+                    pRecurrentLayer->affine.pBiases = component[i].op.recurrent.ptr_biases;
+                    pRecurrentLayer->affine.pWeights = component[i].op.recurrent.ptr_weights;
+                }
+                if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) {
+                    pLayer++;
+                }
+                break;
+            case kDnnConvolutional1dOp:
+                pLayer->nInputRows = component[i].num_rows_in;
+                pLayer->nInputColumns = component[i].num_columns_in;
+                pLayer->nOutputRows = component[i].num_rows_out;
+                pLayer->nOutputColumns = component[i].num_columns_out;
+                pLayer->nBytesPerInput = component[i].num_bytes_per_input;
+                pLayer->nBytesPerOutput = component[i].num_bytes_per_output;  //  will be overwritten
+                pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
+                pLayer->pInputs = component[i].ptr_inputs;
+                pLayer->pOutputsIntermediate = component[i].ptr_outputs;
+                pLayer->pOutputs = component[i].ptr_outputs;  //  will be overwritten
+                pLayer->nLayerKind = INTEL_CONVOLUTIONAL;
+                {
+                    pLayer->pLayerStruct = _mm_malloc(sizeof(intel_convolutional_layer_t), 64);
+                    if (pLayer->pLayerStruct == nullptr) {
+                        THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_CONVOLUTIONAL layer structure.";
+                    }
+                    auto pConvolutionalLayer = reinterpret_cast<intel_convolutional_layer_t *>(pLayer->pLayerStruct);
+                    pConvolutionalLayer->nBytesBias = component[i].op.conv1D.num_bytes_per_bias;
+                    pConvolutionalLayer->nBytesFilterCoefficient = component[i].op.conv1D.num_bytes_per_weight;
+                    pConvolutionalLayer->nFilters = component[i].op.conv1D.num_filters;
+                    pConvolutionalLayer->nFilterRows = component[i].op.conv1D.num_filter_rows;
+                    pConvolutionalLayer->nFilterCoefficients = component[i].op.conv1D.num_filter_coefficients;
+                    pConvolutionalLayer->nFeatureMaps = component[i].op.conv1D.num_feature_maps;
+                    pConvolutionalLayer->nFeatureMapRows = component[i].op.conv1D.num_feature_map_rows;
+                    pConvolutionalLayer->nFeatureMapColumns = component[i].op.conv1D.num_feature_map_columns;
+                    pConvolutionalLayer->poolType = INTEL_NO_POOLING;  //  will be overwritten
+                    pConvolutionalLayer->nPoolSize = 0;  //  will be overwritten
+                    pConvolutionalLayer->nPoolStride = 0;  //  will be overwritten
+                    pConvolutionalLayer->pwl.nSegments = 0;  //  will be overwritten
+                    pConvolutionalLayer->pwl.pSegments = nullptr;  //  will be overwritten
+                    pConvolutionalLayer->pBiases = component[i].op.conv1D.ptr_biases;
+                    pConvolutionalLayer->pFilters = component[i].op.conv1D.ptr_filters;
+                }
+                if (i == component.size() - 1 || ((component[i + 1].operation != kDnnMaxPoolOp)
+                        && (component[i + 1].operation != kDnnPiecewiselinearOp))) {
+                    pLayer++;
+                }
+                break;
+            case kDnnMaxPoolOp:
+                if (i == 0) {
+                    THROW_GNA_EXCEPTION << "Pooling component with no preceding component";
+                } else if (pLayer->nLayerKind == INTEL_CONVOLUTIONAL) {
+                    if (pLayer->pLayerStruct == nullptr) {
+                        THROW_GNA_EXCEPTION << "INTEL_CONVOLUTIONAL layer structure was not initialized.";
+                    }
+                    auto pConvolutionalLayer = reinterpret_cast<intel_convolutional_layer_t *>(pLayer->pLayerStruct);
+                    // it is possible to have an activation component preceding the maxpool
+                    if (pConvolutionalLayer->pwl.nSegments != 0) {
+                        THROW_GNA_EXCEPTION << "Encountered activation component before pooling component at " << i;
+                    } else {
+                        pConvolutionalLayer->poolType =
+                            (component[i].op.maxpool.do_sum_not_max) ? INTEL_SUM_POOLING : INTEL_MAX_POOLING;
+                        pConvolutionalLayer->nPoolSize = component[i].op.maxpool.num_inputs;
+                        pConvolutionalLayer->nPoolStride = component[i].op.maxpool.num_inputs_step;
+
+                        // number of output columns correction - based on GNA-library expectations
+                        auto nFltSize = pConvolutionalLayer->nFilterCoefficients;
+                        auto fltStrideSz = pConvolutionalLayer->nFeatureMaps * pConvolutionalLayer->nFeatureMapColumns;  // always move 1 "row"
+                        auto maxNCOE = (pLayer->nInputColumns - nFltSize) / fltStrideSz + 1;
+                        // FLAT input matrix, pooled outputs per filter
+                        pLayer->nOutputColumns = pConvolutionalLayer->nFilters * ((maxNCOE - 1) / pConvolutionalLayer->nPoolStride + 1);
+
+                        // old code
+                        // pLayer->nOutputColumns /= pConvolutionalLayer->nPoolStride;
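+                        // Illustrative (hypothetical) numbers: with nInputColumns = 48,
+                        // nFltSize = 16 and fltStrideSz = 8, maxNCOE = (48 - 16) / 8 + 1 = 5;
+                        // with nFilters = 4 and nPoolStride = 3 the corrected
+                        // nOutputColumns = 4 * ((5 - 1) / 3 + 1) = 8.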
+                    }
+                } else {
+                    THROW_GNA_EXCEPTION << "Pooling component applied to non-convolutional layer";
+                }
+                break;
+            case kDnnPiecewiselinearOp:
+                pLayer->pOutputs = component[i].ptr_outputs;
+                pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
+                if (pLayer->pLayerStruct == nullptr) {
+                    THROW_GNA_EXCEPTION << pLayer->nLayerKind << " layer structure was not initialized.";
+                }
+                if (i == 0) {
+                    THROW_GNA_EXCEPTION << "PWL component with no preceding component.";
+                } else if ((component[i - 1].operation == kDnnAffineOp)
+                    || (component[i - 1].operation == kDnnDiagonalOp)) {
+                    auto pAffineLayer = reinterpret_cast<intel_affine_layer_t *>(pLayer->pLayerStruct);
+                    pAffineLayer->pwl.nSegments = component[i].op.pwl.num_segments;
+                    pAffineLayer->pwl.pSegments = component[i].op.pwl.ptr_segments;
+                } else if (component[i - 1].operation == kDnnRecurrentOp) {
+                    auto pRecurrentLayer = reinterpret_cast<intel_recurrent_layer_t *>(pLayer->pLayerStruct);
+                    pRecurrentLayer->pwl.nSegments = component[i].op.pwl.num_segments;
+                    pRecurrentLayer->pwl.pSegments = component[i].op.pwl.ptr_segments;
+                } else if ((component[i - 1].operation == kDnnConvolutional1dOp)
+                    || ((component[i - 1].operation == kDnnMaxPoolOp)
+                        && (component[i - 2].operation == kDnnConvolutional1dOp))) {
+                    auto pConvolutionalLayer = reinterpret_cast<intel_convolutional_layer_t *>(pLayer->pLayerStruct);
+                    pConvolutionalLayer->pwl.nSegments = component[i].op.pwl.num_segments;
+                    pConvolutionalLayer->pwl.pSegments = component[i].op.pwl.ptr_segments;
+                    if (component[i - 1].operation != kDnnMaxPoolOp) {
+                        pLayer->nOutputColumns = component[i].num_columns_out;
+                    }
+                }
+                pLayer++;
+
+                break;
+            case kDnnInterleaveOp:
+                pLayer->nInputRows = component[i].num_rows_in;
+                pLayer->nInputColumns = component[i].num_columns_in;
+                pLayer->nOutputRows = component[i].num_rows_out;
+                pLayer->nOutputColumns = component[i].num_columns_out;
+                pLayer->nBytesPerInput = component[i].num_bytes_per_input;
+                pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
+                pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
+                pLayer->pInputs = component[i].ptr_inputs;
+                pLayer->pOutputsIntermediate = nullptr;
+                pLayer->pOutputs = component[i].ptr_outputs;
+                pLayer->nLayerKind = INTEL_INTERLEAVE;
+                pLayer->pLayerStruct = nullptr;
+                pLayer++;
+                break;
+            case kDnnDeinterleaveOp:
+                pLayer->nInputRows = component[i].num_rows_in;
+                pLayer->nInputColumns = component[i].num_columns_in;
+                pLayer->nOutputRows = component[i].num_rows_out;
+                pLayer->nOutputColumns = component[i].num_columns_out;
+                pLayer->nBytesPerInput = component[i].num_bytes_per_input;
+                pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
+                pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
+                pLayer->pInputs = component[i].ptr_inputs;
+                pLayer->pOutputsIntermediate = nullptr;
+                pLayer->pOutputs = component[i].ptr_outputs;
+                pLayer->nLayerKind = INTEL_DEINTERLEAVE;
+                pLayer->pLayerStruct = nullptr;
+                pLayer++;
+                break;
+            case kDnnCopyOp:
+                pLayer->nInputRows = component[i].num_rows_in;
+                pLayer->nInputColumns = component[i].num_columns_in;
+                pLayer->nOutputRows = component[i].num_rows_out;
+                pLayer->nOutputColumns = component[i].num_columns_out;
+                pLayer->nBytesPerInput = component[i].num_bytes_per_input;
+                pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
+                pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
+                pLayer->pInputs = component[i].ptr_inputs;
+                pLayer->pOutputsIntermediate = nullptr;
+                pLayer->pOutputs = component[i].ptr_outputs;
+                pLayer->nLayerKind = INTEL_COPY;
+                {
+                    pLayer->pLayerStruct = _mm_malloc(sizeof(intel_copy_layer_t), 64);
+                    if (pLayer->pLayerStruct == nullptr) {
+                        THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_COPY layer structure.";
+                    }
+                    auto *pCopyLayer = reinterpret_cast<intel_copy_layer_t *>(pLayer->pLayerStruct);
+                    pCopyLayer->nCopyRows = component[i].op.copy.num_copy_rows;
+                    pCopyLayer->nCopyCols = component[i].op.copy.num_copy_columns;
+                }
+                pLayer++;
+                break;
+            default: {
+                THROW_GNA_EXCEPTION << "GNA does not yet support " << intel_dnn_operation_name[component[i].operation];
+            }
+        }
+    }
+    // recompute nLayers from the descriptors actually written; this also enables debugging of a partial array of components
+    ptr_nnet->nLayers = std::distance(ptr_nnet->pLayers, pLayer);
+}
+
+void AmIntelDnn::DestroyGNAStruct(intel_nnet_type_t *ptr_nnet) {
+    ptr_nnet->nGroup = 0;
+    if (ptr_nnet->pLayers != nullptr) {
+        for (int i = 0; i < ptr_nnet->nLayers; i++) {
+            if (ptr_nnet->pLayers[i].pLayerStruct != nullptr) {
+                _mm_free(ptr_nnet->pLayers[i].pLayerStruct);
+            }
+        }
+        _mm_free(ptr_nnet->pLayers);
+    }
+    ptr_nnet->nLayers = 0;
+}
+
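+// Dequantizes the outputs of a component into ptr_output: raw values (float,
+// int16 or int32, depending on the model) are divided by the component's
+// output scale factor.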
+void AmIntelDnn::GetScaledOutput(float *ptr_output, uint32_t component_index) {
+    if (component_index >= num_components()) {
+        fprintf(stderr, "Illegal component index %u in GetScaledOutput\n", component_index);
+        throw -1;
+    }
+    if (ptr_output != nullptr) {
+        float scale_factor = OutputScaleFactor(component_index);
+        uint32_t num_elements = component[component_index].num_rows_out * component[component_index].num_columns_out;
+        if (number_type_ == kDnnFloat) {
+            float *ptr_input = reinterpret_cast<float *>(component[component_index].ptr_outputs);
+            for (uint32_t i = 0; i < num_elements; i++) {
+                ptr_output[i] = ptr_input[i] / scale_factor;
+            }
+        } else if (component[component_index].num_bytes_per_output == 2) {
+            int16_t *ptr_input = reinterpret_cast<int16_t *>(component[component_index].ptr_outputs);
+            for (uint32_t i = 0; i < num_elements; i++) {
+                ptr_output[i] = static_cast<float>(ptr_input[i]) / scale_factor;
+            }
+        } else {
+            int32_t *ptr_input = reinterpret_cast<int32_t *>(component[component_index].ptr_outputs);
+            for (uint32_t i = 0; i < num_elements; i++) {
+                ptr_output[i] = static_cast<float>(ptr_input[i]) / scale_factor;
+            }
+        }
+    } else {
+        fprintf(stderr, "Output pointer is nullptr in GetScaledOutput\n");
+        throw -1;
+    }
+}
+
+void AmIntelDnn::WriteInputAndOutputTextGNA(intel_nnet_type_t * nnet) {
+#ifdef LIGHT_DUMP
+    if (nnet) {
+        for (int i = 0; i < nnet->nLayers; i++) {
+            auto component = nnet->pLayers;
+            std::stringstream out_file_name;
+            auto getLayerType = [](intel_layer_kind_t kind){
+                switch (kind){
+                    case INTEL_AFFINE : return "affine";
+                    case INTEL_AFFINE_DIAGONAL : return "diag";
+                    case INTEL_RECURRENT : return "recurrent";
+                    case INTEL_CONVOLUTIONAL : return "convolution";
+                    case INTEL_INTERLEAVE : return "interleave";
+                    case INTEL_DEINTERLEAVE : return "deinterleave";
+                    case INTEL_COPY : return "copy";
+                    default: return "unknown";
+                }
+            };
+            out_file_name << std::setfill('0') << std::setw(2) << i << "_"
+                          << getLayerType(component[i].nLayerKind)
+                          << "-" << nnet->pLayers[i].nInputRows
+                          << "-" << nnet->pLayers[i].nOutputRows;
+
+            auto inputfileName = getDumpFolderNameGNA() + out_file_name.str() + "_input.txt";
+            auto outFileName = getDumpFolderNameGNA() + out_file_name.str() + "_output.txt";
+            auto pwlFileName = getDumpFolderNameGNA() + out_file_name.str() + "_pwl.txt";
+            auto refOutputFileName = getRefFolderName() + out_file_name.str() + "_output.txt";
+
+            std::ofstream out_file(outFileName.c_str(), std::ios::out);
+            std::ofstream pwl_file(pwlFileName.c_str(), std::ios::out);
+            std::ifstream ref_out_file(refOutputFileName.c_str(), std::ios::in);
+            std::ofstream in_file(inputfileName.c_str(), std::ios::out);
+
+            float  summOfDiff = 0.f;
+            float  summOfSqDiff = 0.f;
+            float  maxD = 0.0f;
+            int    numItems = 0;
+
+            auto write_pwl = [&pwl_file](intel_pwl_func_t & pwl) {
+                for (int k = 0; k < pwl.nSegments; k++) {
+                    pwl_file << pwl.pSegments[k].slope << ", " << pwl.pSegments[k].xBase << ", " << pwl.pSegments[k].yBase << "\n";
+                }
+            };
+            if (nnet->pLayers[i].nLayerKind == INTEL_AFFINE || nnet->pLayers[i].nLayerKind == INTEL_AFFINE_DIAGONAL) {
+                auto affine = reinterpret_cast<intel_affine_layer_t*>(nnet->pLayers[i].pLayerStruct);
+                write_pwl(affine->pwl);
+            }
+            if (nnet->pLayers[i].nLayerKind == INTEL_CONVOLUTIONAL) {
+                auto conv = reinterpret_cast<intel_convolutional_layer_t*>(nnet->pLayers[i].pLayerStruct);
+                write_pwl(conv->pwl);
+            }
+
+            for (int k = 0; k < component[i].nOutputRows; k++) {
+                for (int j = 0; j < component[i].nOutputColumns; j++) {
+                    float floatValue = 0.f;
+                    if (component[i].nBytesPerOutput == 4) {
+                        auto value = (reinterpret_cast<int32_t *>(component[i].pOutputs)[k * component[i].nOutputColumns + j]);
+                        floatValue = (static_cast<float>(value) / 1.0);
+                    } else {
+                        auto value = reinterpret_cast<int16_t *>(component[i].pOutputs)[k * component[i].nOutputColumns + j];
+                        floatValue = (static_cast<float>(value) / 1.0);
+                    }
+                    out_file << std::setw(8) << floatValue << "\n";
+                    if (ref_out_file) {
+                        float ref_value = 0.f;
+                        ref_out_file >> ref_value;
+                        float diff = (ref_value - floatValue);
+                        diff = diff  < 0 ? -diff : diff;
+                        summOfDiff += diff;
+                        summOfSqDiff += diff * diff;
+                        maxD = std::max(maxD, diff);
+                        numItems++;
+                    }
+                }
+            }
+            if (numItems) {
+                auto rmse = sqrt(summOfSqDiff / numItems);
+                auto avg = summOfDiff / numItems;
+            std::cout << std::left << std::setw(55) << out_file_name.str()
+                            << " RMSE="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << rmse
+                            << " avg=" << std::fixed << std::setprecision(5) << std::right << std::setw(8) << avg
+                            << " maxD="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << maxD << std::endl;
+            }
+
+            for (int k = 0; k < component[i].nInputRows; k++) {
+                for (int j = 0; j < component[i].nInputColumns; j++) {
+                    if (component[i].nBytesPerInput == 4) {
+                        in_file << std::setw(8)
+                                << (reinterpret_cast<int32_t *>(component[i].pInputs)[k * component[i].nInputColumns + j]);
+                    } else {
+                        in_file << std::setw(8)
+                                << (reinterpret_cast<int16_t *>(component[i].pInputs)[k * component[i].nInputColumns + j]);
+                    }
+                    in_file << "\n";
+                }
+            }
+        }
+    }
+#endif
+}
+
+void AmIntelDnn::WriteInputAndOutputText() {
+#ifdef LIGHT_DUMP
+    for (int i = 0; i < num_components(); i++) {
+        std::stringstream out_file_name;
+        out_file_name << std::setfill('0') << std::setw(2) << i << "_"
+                      << intel_dnn_operation_name[component[i].operation]
+                      << "-" << component[i].num_rows_in
+                      << "-" << component[i].num_rows_out;
+        if (component[i].operation == kDnnPiecewiselinearOp) {
+            out_file_name << "-" << intel_dnn_activation_name[component[i].op.pwl.func_id];
+        }
+        auto inputfileName = getDumpFolderName() + out_file_name.str() + "_input.txt";
+        auto outFileName = getDumpFolderName() + out_file_name.str() + "_output.txt";
+        auto refOutputFileName = getRefFolderName() + out_file_name.str() + "_output.txt";
+
+        std::ofstream out_file(outFileName.c_str(), std::ios::out);
+        std::ifstream ref_out_file(refOutputFileName.c_str(), std::ios::in);
+        std::ofstream in_file(inputfileName.c_str(), std::ios::out);
+
+        float  summOfDiff = 0.f;
+        float  summOfSqDiff = 0.f;
+        float  maxD = 0.0f;
+        int    numItems = 0;
+
+        for (int k = 0; k < component[i].num_rows_out; k++) {
+            for (int j = 0; j < component[i].num_columns_out; j++) {
+                float floatValue = 0.f;
+                if (component[i].num_bytes_per_output == 4) {
+                    if (number_type_ == kDnnInt) {
+                        auto value = (reinterpret_cast<int32_t *>(component[i].ptr_outputs)[k * component[i].num_columns_out+ j]);
+                    //    out_file << std::setw(8) << value << "\n";
+                        floatValue = (static_cast<float>(value) / component[i].output_scale_factor);
+
+                    } else {
+                        floatValue = (reinterpret_cast<float*>(component[i].ptr_outputs)[
+                            k * component[i].num_columns_out+ j]) / component[i].output_scale_factor;
+                    }
+                } else {
+                    auto value = reinterpret_cast<int16_t *>(component[i].ptr_outputs)[k * component[i].num_columns_out+ j];
+                 //   out_file << std::setw(8) << value << "\n";
+                    floatValue = (static_cast<float>(value) / component[i].output_scale_factor);
+                }
+                out_file << std::setw(8) << floatValue << "\n";
+                if (ref_out_file) {
+                    float ref_value = 0.f;
+                    ref_out_file >> ref_value;
+                    float diff = (ref_value - floatValue);
+                    diff = diff < 0.f ? -diff : diff;
+                    summOfDiff += diff;
+                    summOfSqDiff += diff * diff;
+                    maxD = std::max(maxD, diff);
+                    numItems++;
+                }
+            }
+        }
+        if (numItems) {
+            auto rmse = sqrt(summOfSqDiff / numItems);
+            auto avg = summOfDiff / numItems;
+            std::cout << std::left << std::setw(55) << out_file_name.str()
+                        << " RMSE="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << rmse
+                        << " avg=" << std::fixed << std::setprecision(5) << std::right << std::setw(8) << avg
+                        << " maxD="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << maxD << std::endl;
+        }
+
+        for (int k = 0; k < component[i].num_rows_in; k++) {
+            for (int j = 0; j < component[i].num_columns_in; j++) {
+                if (component[i].num_bytes_per_input == 4) {
+                    if (number_type_ == kDnnInt) {
+                        in_file << std::setw(8)
+                                << (reinterpret_cast<int32_t *>(component[i].ptr_inputs)[k * component[i].num_columns_in
+                                    + j]);
+                    } else {
+                        in_file << std::setw(8)
+                                << (reinterpret_cast<float *>(component[i].ptr_inputs)[k * component[i].num_columns_in
+                                    + j]);
+                    }
+                } else {
+                    in_file << std::setw(8)
+                            << (reinterpret_cast<int16_t *>(component[i].ptr_inputs)[k * component[i].num_columns_in
+                                + j]);
+                }
+                in_file << "\n";
+            }
+        }
+#endif
+    }
+}
+
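+// Two models are considered compatible when they contain the same number of
+// components with matching shapes and operations; weights, biases and scale
+// factors are not compared.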
+bool isCompatibleDnn(AmIntelDnn dnn1, AmIntelDnn dnn2) {
+    bool isCompatible = true;
+
+    // compare basic structures to see if they are compatible
+    if (dnn1.num_components() != dnn2.num_components()) isCompatible = false;
+    for (int i = 0; i < dnn1.num_components(); i++) {
+        if (dnn1.component[i].num_rows_in != dnn2.component[i].num_rows_in) isCompatible = false;
+        if (dnn1.component[i].num_columns_in != dnn2.component[i].num_columns_in) isCompatible = false;
+        if (dnn1.component[i].num_rows_out != dnn2.component[i].num_rows_out) isCompatible = false;
+        if (dnn1.component[i].num_columns_out != dnn2.component[i].num_columns_out) isCompatible = false;
+        if (dnn1.component[i].operation != dnn2.component[i].operation) isCompatible = false;
+    }
+
+    return (isCompatible);
+}
+
+void ClearScoreError(intel_score_error_t *error) {
+    error->num_scores = 0;
+    error->num_errors = 0;
+    error->max_error = 0.0;
+    error->sum_error = 0.0;
+    error->sum_squared_error = 0.0;
+    error->max_rel_error = 0.0;
+    error->sum_rel_error = 0.0;
+    error->sum_squared_rel_error = 0.0;
+}
+
+void UpdateScoreError(intel_score_error_t *error, intel_score_error_t *total_error) {
+    total_error->num_errors += error->num_errors;
+    total_error->num_scores += error->num_scores;
+    total_error->sum_error += error->sum_error;
+    total_error->sum_squared_error += error->sum_squared_error;
+    if (error->max_error > total_error->max_error) {
+        total_error->max_error = error->max_error;
+    }
+    total_error->sum_rel_error += error->sum_rel_error;
+    total_error->sum_squared_rel_error += error->sum_squared_rel_error;
+    if (error->max_rel_error > total_error->max_rel_error) {
+        total_error->max_rel_error = error->max_rel_error;
+    }
+}
+
+void SoftmaxGoogle(float *ptr_output, float *ptr_input, const uint32_t num_outputs, const uint32_t num_inputs) {
+    // Assumes input vector contains log likelihoods
+    // This computes x[i] = x[i] - log(sum_j exp(x[j]))
+    // This normalizes the likelihoods by the sum of likelihoods but stores them as log likelihoods
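+    // Worked example (illustrative): for inputs {1, 2, 3}, max_score = 3,
+    // sum = e^-2 + e^-1 + e^0 ~= 1.5032, diff = 3 + log(1.5032) ~= 3.4076, and the
+    // outputs are ~{-2.4076, -1.4076, -0.4076}, i.e. the log of the softmax
+    // probabilities.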
+
+    float max_score = ptr_input[0];
+    float sum = 0.0;
+    float diff;
+    // find max score for normalization to [0,1]
+    for (uint32_t i = 0; i < num_inputs; i++) {
+        if (ptr_input[i] > max_score) {
+            max_score = ptr_input[i];
+        }
+    }
+    for (uint32_t i = 0; i < num_inputs; i++) {
+        sum += exp(ptr_input[i] - max_score);
+    }
+    if (sum < 1.0e-20) {
+        fprintf(stderr, "Warning:  attempt to take log(0) in SoftmaxGoogle()!\n");
+        sum = 1.0e-20;
+    }
+    diff = max_score + log(sum);
+    for (uint32_t i = 0; i < num_outputs; i++) {
+        ptr_output[i] = ptr_input[i] - diff;
+    }
+}
diff --git a/inference-engine/src/gna_plugin/dnn.h b/inference-engine/src/gna_plugin/dnn.h
new file mode 100644 (file)
index 0000000..8a1506d
--- /dev/null
@@ -0,0 +1,823 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <malloc.h>
+#include <memory.h>
+#include <xmmintrin.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <iomanip>
+#include <type_traits>
+#include <vector>
+#include "gna-api.h"
+
+#define DNN_MAX_BATCH_SIZE 8
+#define DNN_MAX_INPUTS 3072
+#define DNN_MAX_OUTPUTS 8192
+#define DNN_MAX_ERROR 1.0e-4f
+#define DNN_NUM_BYTES_INT_BIAS 4
+#define DNN_NUM_BYTES_INT_AFFINE_OUT 4
+#define DNN_RAND_INT8_AMPLITUDE 127.0f
+#define DNN_RAND_INT16_AMPLITUDE 16384.0f
+#define DNN_RAND_INT32_AMPLITUDE 1048576.0f
+#define DNN_RAND_FLOAT32_AMPLITUDE 8.0f
+
+enum DnnActivationType {
+    kActNone,
+    kActSigmoid,
+    kActTanh,
+    kActRelu,
+    kActLeakyRelu,
+    kActIdentity,
+    kActKaldiLstmClipping,
+    kActCustom,
+    kActNumType
+};
+struct DnnActivation {
+    // for prelu
+    DnnActivationType type;
+    float negative_slope;
+    operator DnnActivationType () const noexcept {
+        return type;
+    }
+    static DnnActivation fromType(DnnActivationType type) {
+        DnnActivation activation;
+        activation.type = type;
+        activation.negative_slope = 0.0f;
+        return activation;
+    }
+};
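+// DnnActivation converts implicitly to DnnActivationType, so lookups such as
+// intel_dnn_activation_name[component.op.pwl.func_id] work without spelling
+// out .type.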
+
+static_assert(std::is_trivial<DnnActivation>::value, "DnnActivation is not trivial type");
+
+static const char *intel_dnn_activation_name[kActNumType] = {
+    "kActNone",
+    "kActSigmoid",
+    "kActTanh",
+    "kActRelu",
+    "kActLeakyRelu",
+    "kActIdentity",
+    "kActKaldiLstmClipping",
+    "kActCustom"
+};
+
+typedef enum DnnSoftmaxType {
+    kSoftmaxNone,
+    kSoftmaxKaldiSumgroup,
+    kSoftmaxEesen,
+    kSoftmaxGoogle,
+    kSoftmaxNumType
+} intel_dnn_softmax_type_t;
+
+static const char *intel_dnn_softmax_name[kSoftmaxNumType] = {
+    "kSoftmaxNone",
+    "kSoftmaxKaldiSumGroup",
+    "kSoftmaxEesen",
+    "kSoftmaxGoogle"
+};
+
+typedef enum {
+    kDnnUnknownOrientation,
+    kDnnInterleavedOrientation,
+    kDnnNonInterleavedOrientation,
+    kDnnNumOrientation
+} intel_dnn_orientation_t;
+
+typedef enum {
+    kDnnNullOp,
+    kDnnAffineOp,
+    kDnnDiagonalOp,
+    kDnnConvolutional1dOp,
+    kDnnPiecewiselinearOp,
+    kDnnMaxPoolOp,
+    kDnnRecurrentOp,
+    kDnnInterleaveOp,
+    kDnnDeinterleaveOp,
+    kDnnCopyOp,
+    kDnnNumOp
+} intel_dnn_operation_t;
+
+static const char *intel_dnn_operation_name[kDnnNumOp] = {
+    "kDnnNullOp",
+    "kDnnAffineOp",
+    "kDnnDiagonalOp",
+    "kDnnConvolutional1dOp",
+    "kDnnPiecewiselinearOp",
+    "kDnnMaxPoolOp",
+    "kDnnRecurrentOp",
+    "kDnnInterleaveOp",
+    "kDnnDeinterleaveOp",
+    "kDnnCopyOp"
+};
+
+typedef enum {
+    kDnnMacroOpNone,
+    kDnnMacroOpLstm,
+    kDnnMacroOpBiLstm,
+    kDnnNumMacroOp
+} intel_dnn_macro_operation_t;
+
+static const char *intel_dnn_macro_operation_name[kDnnNumMacroOp] = {
+    "kDnnMacroOpNone",
+    "kDnnMacroOpLstm",
+    "kDnnMacroOpBiLstm"
+};
+
+typedef enum {
+    kDnnFloat,
+    kDnnInt,
+    kDnnNumNumberType
+} intel_dnn_number_type_t;
+
+static const char *intel_dnn_number_type_name[kDnnNumNumberType] = {
+    "kDnnFloat",
+    "kDnnInt"
+};
+
+typedef struct {
+    uint32_t num_bytes_per_weight;
+    uint32_t num_bytes_per_bias;
+    float weight_scale_factor;
+    void *ptr_weights;
+    void *ptr_biases;
+} intel_affine_t;
+
+typedef struct {
+    uint32_t num_bytes_per_weight;
+    uint32_t num_bytes_per_bias;
+    uint32_t num_filters;
+    uint32_t num_filter_rows;
+    uint32_t num_filter_coefficients;
+    uint32_t num_feature_maps;
+    uint32_t num_feature_map_rows;
+    uint32_t num_feature_map_columns;
+    float weight_scale_factor;
+    void *ptr_filters;     // filters stored one after the other
+    void *ptr_biases;
+} intel_convolutionalD_t;
+
+typedef struct {
+    uint32_t num_inputs;         // pool size
+    uint32_t num_inputs_step;     // pool step
+    uint32_t num_inputs_stride;  // pool stride (number of convolution filters)
+    bool do_sum_not_max;
+} intel_maxpool_t;
+
+typedef struct {
+    DnnActivation func_id;       // identifies function being approximated
+    uint32_t num_segments;
+    intel_pwl_segment_t *ptr_segments;
+} intel_piecewiselinear_t;
+
+typedef struct {
+    uint32_t num_vector_delay;
+    uint32_t num_bytes_per_weight;
+    uint32_t num_bytes_per_bias;
+    float weight_scale_factor;
+    void *ptr_feedbacks;
+    void *ptr_weights;
+    void *ptr_biases;
+} intel_recurrent_t;
+
+typedef struct {
+} intel_interleave_t;
+
+typedef struct {
+} intel_deinterleave_t;
+
+typedef struct {
+    uint32_t num_copy_columns;        // number of columns to copy
+    uint32_t num_copy_rows;            // number of rows to copy
+} intel_copy_t;
+
+typedef struct {
+    uint32_t num_rows_in;
+    uint32_t num_columns_in;
+    uint32_t num_rows_out;
+    uint32_t num_columns_out;
+    uint32_t num_bytes_per_input;
+    uint32_t num_bytes_per_output;
+    intel_dnn_operation_t operation;
+    intel_dnn_macro_operation_t macro_operation;
+    intel_dnn_orientation_t orientation_in;
+    intel_dnn_orientation_t orientation_out;
+    union operation_struct_t {
+        intel_affine_t affine;
+        intel_convolutionalD_t conv1D;
+        intel_maxpool_t maxpool;
+        intel_piecewiselinear_t pwl;
+        intel_recurrent_t recurrent;
+        intel_interleave_t interleave;
+        intel_deinterleave_t deinterleave;
+        intel_copy_t copy;
+    } op;
+    void *ptr_inputs;
+    void *ptr_outputs;
+    float output_scale_factor;
+} intel_dnn_component_t;
+
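+// Running error statistics accumulated while comparing two sets of scores
+// (see AmIntelDnn::CompareScores and UpdateScoreError below).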
+typedef struct {
+    uint32_t num_scores;
+    uint32_t num_errors;
+    float threshold;
+    float max_error;
+    float rms_error;
+    float sum_error;
+    float sum_rms_error;
+    float sum_squared_error;
+    float max_rel_error;
+    float sum_rel_error;
+    float sum_squared_rel_error;
+} intel_score_error_t;
+
+class AmIntelDnn {
+ public:
+    AmIntelDnn()
+        : ptr_active_outputs_(NULL),
+          num_active_outputs_(0),
+          input_scale_factor_(1.0),
+          num_left_context(0),
+          num_right_context(0),
+          do_rotate_input(false),
+          num_rotate_rows(0),
+          num_rotate_columns(0),
+          softmax_type(kSoftmaxNone),
+          ptr_sumgroup_sizes(NULL),
+          num_sumgroup_sizes(0),
+          ptr_priors(NULL) {
+    }
+
+    ~AmIntelDnn() {
+        component.clear();
+        if (ptr_sumgroup_sizes != NULL) {
+            _mm_free(ptr_sumgroup_sizes);
+        }
+        if (ptr_priors != NULL) {
+            _mm_free(ptr_priors);
+        }
+    }
+
+    uint32_t num_components() { return (uint32_t) component.size(); }
+
+    void Init(void *ptr_memory, uint32_t num_memory_bytes, intel_dnn_number_type_t number_type, float scale_factor);
+    void InitActiveList(uint32_t *ptr_active_list);
+
+    template<class A, class B, class C, class D>
+    static void InitAffineComponent(intel_dnn_component_t &comp,
+                             uint32_t num_rows_in,
+                             uint32_t num_columns,
+                             uint32_t num_rows_out,
+                             uint32_t num_bytes_per_input,
+                             uint32_t num_bytes_per_output,
+                             uint32_t num_bytes_per_weight,
+                             uint32_t num_bytes_per_bias,
+                             float weight_scale_factor,
+                             float output_scale_factor,
+                             A *&ptr_inputs,
+                             B *&ptr_outputs,
+                             C *&ptr_weights,
+                             D *&ptr_biases,
+                             bool isDiag = false) {
+        InitAffineComponentPrivate(comp,
+                                   num_rows_in,
+                                   num_columns,
+                                   num_rows_out,
+                                   num_bytes_per_input,
+                                   num_bytes_per_output,
+                                   num_bytes_per_weight,
+                                   num_bytes_per_bias,
+                                   weight_scale_factor,
+                                   output_scale_factor,
+                                   (void *&) ptr_inputs,
+                                   (void *&) ptr_outputs,
+                                   (void *&) ptr_weights,
+                                   (void *&) ptr_biases,
+                                   isDiag,
+                                   true);
+    }
+
+    template<class A, class B, class C, class D>
+    void InitAffineComponent(uint32_t component_index,
+                             uint32_t num_rows_in,
+                             uint32_t num_columns,
+                             uint32_t num_rows_out,
+                             uint32_t num_bytes_per_input,
+                             uint32_t num_bytes_per_output,
+                             uint32_t num_bytes_per_weight,
+                             uint32_t num_bytes_per_bias,
+                             float weight_scale_factor,
+                             float output_scale_factor,
+                             A *&ptr_inputs,
+                             B *&ptr_outputs,
+                             C *&ptr_weights,
+                             D *&ptr_biases,
+                             bool isDiag = false) {
+        InitAffineComponentPrivate(component[component_index],
+                                   num_rows_in,
+                                   num_columns,
+                                   num_rows_out,
+                                   num_bytes_per_input,
+                                   num_bytes_per_output,
+                                   num_bytes_per_weight,
+                                   num_bytes_per_bias,
+                                   weight_scale_factor,
+                                   output_scale_factor,
+                                   (void *&) ptr_inputs,
+                                   (void *&) ptr_outputs,
+                                   (void *&) ptr_weights,
+                                   (void *&) ptr_biases,
+                                   isDiag,
+                                   false);
+    }
+
+    void InitDiagonalComponent(uint32_t component_index,
+                               uint32_t num_rows_in,
+                               uint32_t num_columns,
+                               uint32_t num_rows_out,
+                               uint32_t num_bytes_per_input,
+                               uint32_t num_bytes_per_output,
+                               uint32_t num_bytes_per_weight,
+                               uint32_t num_bytes_per_bias,
+                               float weight_scale_factor,
+                               float output_scale_factor,
+                               void *ptr_inputs,
+                               void *ptr_outputs,
+                               void *ptr_weights,
+                               void *ptr_biases);
+
+    template<class A, class B, class C, class D>
+    void InitConvolutional1DComponent(uint32_t component_index,
+                                      uint32_t num_rows_in,
+                                      uint32_t num_columns_in,
+                                      uint32_t num_rows_out,
+                                      uint32_t num_columns_out,
+                                      uint32_t num_bytes_per_input,
+                                      uint32_t num_bytes_per_output,
+                                      uint32_t num_bytes_per_weight,
+                                      uint32_t num_bytes_per_bias,
+                                      uint32_t num_filters,
+                                      uint32_t num_filter_rows,
+                                      uint32_t num_filter_coefficients,
+                                      uint32_t num_feature_maps,
+                                      uint32_t num_feature_map_rows,
+                                      uint32_t num_feature_map_columns,
+                                      float weight_scale_factor,
+                                      float output_scale_factor,
+                                      A *& ptr_inputs,
+                                      B *& ptr_outputs,
+                                      C *& ptr_filters,
+                                      D *& ptr_biases) {
+        InitConvolutional1DComponentPrivate(component[component_index],
+                                            num_rows_in,
+                                            num_columns_in,
+                                            num_rows_out,
+                                            num_columns_out,
+                                            num_bytes_per_input,
+                                            num_bytes_per_output,
+                                            num_bytes_per_weight,
+                                            num_bytes_per_bias,
+                                            num_filters,
+                                            num_filter_rows,
+                                            num_filter_coefficients,
+                                            num_feature_maps,
+                                            num_feature_map_rows,
+                                            num_feature_map_columns,
+                                            weight_scale_factor,
+                                            output_scale_factor,
+                                            (void *&) ptr_inputs,
+                                            (void *&) ptr_outputs,
+                                            (void *&) ptr_filters,
+                                            (void *&) ptr_biases,
+                                            false);
+    }
+
+    template<class A, class B, class C, class D>
+    static void InitConvolutional1DComponent(intel_dnn_component_t &comp,
+                                      uint32_t num_rows_in,
+                                      uint32_t num_columns_in,
+                                      uint32_t num_rows_out,
+                                      uint32_t num_columns_out,
+                                      uint32_t num_bytes_per_input,
+                                      uint32_t num_bytes_per_output,
+                                      uint32_t num_bytes_per_weight,
+                                      uint32_t num_bytes_per_bias,
+                                      uint32_t num_filters,
+                                      uint32_t num_filter_rows,
+                                      uint32_t num_filter_coefficients,
+                                      uint32_t num_feature_maps,
+                                      uint32_t num_feature_map_rows,
+                                      uint32_t num_feature_map_columns,
+                                      float weight_scale_factor,
+                                      float output_scale_factor,
+                                      A *& ptr_inputs,
+                                      B *& ptr_outputs,
+                                      C *& ptr_filters,
+                                      D *& ptr_biases) {
+        InitConvolutional1DComponentPrivate(comp,
+                                            num_rows_in,
+                                            num_columns_in,
+                                            num_rows_out,
+                                            num_columns_out,
+                                            num_bytes_per_input,
+                                            num_bytes_per_output,
+                                            num_bytes_per_weight,
+                                            num_bytes_per_bias,
+                                            num_filters,
+                                            num_filter_rows,
+                                            num_filter_coefficients,
+                                            num_feature_maps,
+                                            num_feature_map_rows,
+                                            num_feature_map_columns,
+                                            weight_scale_factor,
+                                            output_scale_factor,
+                                            (void *&) ptr_inputs,
+                                            (void *&) ptr_outputs,
+                                            (void *&) ptr_filters,
+                                            (void *&) ptr_biases,
+                                            true);
+    }
+
+    // TODO: the overloads that accept component_index are only used by legacy code
+    void InitMaxpoolComponent(uint32_t component_index,
+                              uint32_t num_rows_in,
+                              uint32_t num_columns_in,
+                              uint32_t num_rows_out,
+                              uint32_t num_columns_out,
+                              uint32_t num_bytes_per_input,
+                              uint32_t num_bytes_per_output,
+                              uint32_t num_pool_size,
+                              uint32_t num_pool_step,
+                              uint32_t num_pool_stride,
+                              bool do_sum_not_max,
+                              float output_scale_factor,
+                              void * ptr_inputs,
+                              void * ptr_outputs) {
+        InitMaxpoolComponentPrivate(component[component_index],
+            num_rows_in,
+            num_columns_in,
+            num_rows_out,
+            num_columns_out,
+            num_bytes_per_input,
+            num_bytes_per_output,
+            num_pool_size,
+            num_pool_step,
+            num_pool_stride,
+            do_sum_not_max,
+            output_scale_factor,
+            (void *&) ptr_inputs,
+            (void *&) ptr_outputs,
+            false);
+    }
+
+    template<class A, class B>
+    static void InitMaxpoolComponent(intel_dnn_component_t &cmp,
+                              uint32_t num_rows_in,
+                              uint32_t num_columns_in,
+                              uint32_t num_rows_out,
+                              uint32_t num_columns_out,
+                              uint32_t num_bytes_per_input,
+                              uint32_t num_bytes_per_output,
+                              uint32_t num_pool_size,
+                              uint32_t num_pool_step,
+                              uint32_t num_pool_stride,
+                              bool do_sum_not_max,
+                              float output_scale_factor,
+                              A *&ptr_inputs,
+                              B *&ptr_outputs) {
+        InitMaxpoolComponentPrivate(cmp,
+                                    num_rows_in,
+                                    num_columns_in,
+                                    num_rows_out,
+                                    num_columns_out,
+                                    num_bytes_per_input,
+                                    num_bytes_per_output,
+                                    num_pool_size,
+                                    num_pool_step,
+                                    num_pool_stride,
+                                    do_sum_not_max,
+                                    output_scale_factor,
+                                    (void *&) ptr_inputs,
+                                    (void *&) ptr_outputs,
+                                    true);
+    }
+
+    void InitPiecewiseLinearComponent(uint32_t component_index,
+                                      DnnActivation function_id,
+                                      intel_dnn_orientation_t orientation,
+                                      uint32_t num_rows,
+                                      uint32_t num_columns,
+                                      uint32_t num_bytes_per_input,
+                                      uint32_t num_bytes_per_output,
+                                      uint32_t num_segments,
+                                      float output_scale_factor,
+                                      void * ptr_inputs,
+                                      void * ptr_outputs,
+                                      intel_pwl_segment_t *ptr_segments) {
+        InitPiecewiseLinearComponentPrivate(component[component_index],
+                                            function_id,
+                                            orientation,
+                                            num_rows,
+                                            num_columns,
+                                            num_bytes_per_input,
+                                            num_bytes_per_output,
+                                            num_segments,
+                                            output_scale_factor,
+                                            ptr_inputs,
+                                            ptr_outputs,
+                                            ptr_segments,
+                                            false);
+    }
+    template<class A, class B>
+    static void InitPiecewiseLinearComponent(intel_dnn_component_t &cmp,
+                                      DnnActivation function_id,
+                                      intel_dnn_orientation_t orientation,
+                                      uint32_t num_rows,
+                                      uint32_t num_columns,
+                                      uint32_t num_bytes_per_input,
+                                      uint32_t num_bytes_per_output,
+                                      uint32_t num_segments,
+                                      float output_scale_factor,
+                                      A *&ptr_inputs,
+                                      B *&ptr_outputs,
+                                      intel_pwl_segment_t *ptr_segments) {
+        InitPiecewiseLinearComponentPrivate(cmp,
+                                            function_id,
+                                            orientation,
+                                            num_rows,
+                                            num_columns,
+                                            num_bytes_per_input,
+                                            num_bytes_per_output,
+                                            num_segments,
+                                            output_scale_factor,
+                                            (void *&) ptr_inputs,
+                                            (void *&) ptr_outputs,
+                                            ptr_segments,
+                                            true);
+    }
+
+    void InitRecurrentComponent(uint32_t component_index,
+                                uint32_t num_rows,
+                                uint32_t num_columns_in,
+                                uint32_t num_columns_out,
+                                uint32_t num_bytes_per_input,
+                                uint32_t num_bytes_per_output,
+                                uint32_t num_vector_delay,
+                                uint32_t num_bytes_per_weight,
+                                uint32_t num_bytes_per_bias,
+                                float weight_scale_factor,
+                                float output_scale_factor,
+                                void *ptr_inputs,
+                                void *ptr_feedbacks,
+                                void *ptr_outputs,
+                                void *ptr_weights,
+                                void *ptr_biases);
+    void InitInterleaveComponent(uint32_t component_index,
+                                 uint32_t num_rows,
+                                 uint32_t num_columns,
+                                 uint32_t num_bytes_per_input,
+                                 uint32_t num_bytes_per_output,
+                                 float output_scale_factor,
+                                 void *ptr_inputs,
+                                 void *ptr_outputs);
+    void InitDeinterleaveComponent(uint32_t component_index,
+                                   uint32_t num_rows,
+                                   uint32_t num_columns,
+                                   uint32_t num_bytes_per_input,
+                                   uint32_t num_bytes_per_output,
+                                   float output_scale_factor,
+                                   void *ptr_inputs,
+                                   void *ptr_outputs);
+    void InitCopyComponent(uint32_t component_index,
+                           intel_dnn_orientation_t orientation,
+                           uint32_t num_rows_in,
+                           uint32_t num_columns_in,
+                           uint32_t num_rows_out,
+                           uint32_t num_columns_out,
+                           uint32_t num_bytes_per_input,
+                           uint32_t num_bytes_per_output,
+                           float output_scale_factor,
+                           uint32_t num_copy_rows,
+                           uint32_t num_copy_columns,
+                           void *ptr_inputs,
+                           void *ptr_outputs) {
+        InitCopyComponentPrivate(component[component_index],
+                                 orientation,
+                                 num_rows_in,
+                                 num_columns_in,
+                                 num_rows_out,
+                                 num_columns_out,
+                                 num_bytes_per_input,
+                                 num_bytes_per_output,
+                                 output_scale_factor,
+                                 num_copy_rows,
+                                 num_copy_columns,
+                                 ptr_inputs,
+                                 ptr_outputs,
+                                 false);
+    }
+
+    template<class A, class B>
+    static void InitCopyComponent(intel_dnn_component_t &cmp,
+                                   intel_dnn_orientation_t orientation,
+                                   uint32_t num_rows_in,
+                                   uint32_t num_columns_in,
+                                   uint32_t num_rows_out,
+                                   uint32_t num_columns_out,
+                                   uint32_t num_bytes_per_input,
+                                   uint32_t num_bytes_per_output,
+                                   float output_scale_factor,
+                                   uint32_t num_copy_rows,
+                                   uint32_t num_copy_columns,
+                                   A *&ptr_inputs,
+                                   B *&ptr_outputs) {
+        InitCopyComponentPrivate(cmp,
+                                 orientation,
+                                 num_rows_in,
+                                 num_columns_in,
+                                 num_rows_out,
+                                 num_columns_out,
+                                 num_bytes_per_input,
+                                 num_bytes_per_output,
+                                 output_scale_factor,
+                                 num_copy_rows,
+                                 num_copy_columns,
+                                 (void *&) ptr_inputs,
+                                 (void *&) ptr_outputs,
+                                 true);
+    }
+    void AddComponents(uint32_t num_components_to_add);
+    void ClearComponent(uint32_t component_index);
+    void ClearState();
+    uint32_t CopyActiveList(std::vector<std::vector<uint32_t> > &active_list, uint32_t list_index);
+    void Propagate();
+    intel_dnn_macro_operation_t MacroOperation(uint32_t component_index);
+    void SetMacroOperation(uint32_t component_index, intel_dnn_macro_operation_t macro_operation);
+    float InputScaleFactor(uint32_t component_index);
+    float WeightScaleFactor(uint32_t component_index);
+    float OutputScaleFactor(uint32_t component_index) {
+        return OutputScaleFactor(component[component_index]);
+    }
+    float OutputScaleFactor(intel_dnn_component_t &comp);
+    void SetInputScaleFactor(float scale_factor) { input_scale_factor_ = scale_factor; }
+    void SetOutputScaleFactor(uint32_t component_index, float scale_factor);
+    void PrintOutputs(uint32_t component_index);
+    uint32_t CompareScores(void *ptr_scores, intel_score_error_t *score_error, uint32_t num_frames);
+    void WriteGraphWizModel(const char *filename);
+    void WriteDnnText(const char *filename, intel_dnn_number_type_t number_type);
+    uint32_t MemoryRequiredToReadDnnText(const char *filename);
+    void ReadDnnText(const char *filename, void *ptr_memory, uint32_t num_memory_bytes, float *ptr_scale_in);
+
+    void InitGNAStruct(intel_nnet_type_t *ptr_nnet);
+    void DestroyGNAStruct(intel_nnet_type_t *ptr_nnet);
+    void GetScaledOutput(float *ptr_output, uint32_t component_index);
+    uint32_t *ptr_active_outputs() { return (ptr_active_outputs_); }
+    uint32_t num_active_outputs() { return (num_active_outputs_); }
+    uint32_t num_gna_layers() {
+        uint32_t num_layers = 0;
+        for (uint32_t i = 0; i < component.size(); i++) {
+            if ((component[i].operation == kDnnAffineOp) || (component[i].operation == kDnnDiagonalOp)
+                || (component[i].operation == kDnnConvolutional1dOp) || (component[i].operation == kDnnCopyOp)
+                || (component[i].operation == kDnnDeinterleaveOp) || (component[i].operation == kDnnInterleaveOp)
+                || (component[i].operation == kDnnRecurrentOp)) {
+                num_layers++;
+            }
+        }
+        return (num_layers);
+    }
+    uint32_t num_group_in() {
+        if (component.empty()) return 0;
+        auto &first = component.front();
+        return (first.orientation_in == kDnnInterleavedOrientation) ? first.num_columns_in
+                                                                    : first.num_rows_in;
+    }
+    uint32_t num_group_out() {
+        if (component.empty()) return 0;
+        auto &last = component.back();
+        return (last.orientation_out == kDnnInterleavedOrientation) ? last.num_columns_out
+                                                                    : last.num_rows_out;
+    }
+
+    std::vector<intel_dnn_component_t> component;
+    uint32_t num_left_context;
+    uint32_t num_right_context;
+    bool do_rotate_input;
+    uint32_t num_rotate_rows = 0;
+    uint32_t num_rotate_columns = 0;
+    DnnSoftmaxType softmax_type;
+    uint32_t *ptr_sumgroup_sizes;
+    uint32_t num_sumgroup_sizes;
+    float *ptr_priors;
+
+    void WriteInputAndOutputText();
+    static void WriteInputAndOutputTextGNA(intel_nnet_type_t * nnet);
+    void BeginNewWrite();
+
+ private:
+    void *ptr_dnn_memory_;
+    uint32_t num_bytes_dnn_memory_;
+    uint32_t *ptr_active_outputs_;
+    uint32_t num_active_outputs_;
+    intel_dnn_number_type_t number_type_;
+    float input_scale_factor_;
+
+    static void InitCopyComponentPrivate(intel_dnn_component_t &cmp,
+                                         intel_dnn_orientation_t orientation,
+                                         uint32_t num_rows_in,
+                                         uint32_t num_columns_in,
+                                         uint32_t num_rows_out,
+                                         uint32_t num_columns_out,
+                                         uint32_t num_bytes_per_input,
+                                         uint32_t num_bytes_per_output,
+                                         float output_scale_factor,
+                                         uint32_t num_copy_rows,
+                                         uint32_t num_copy_columns,
+                                         void *&ptr_inputs,
+                                         void *&ptr_outputs,
+                                         bool postInitMem);
+
+    static void InitMaxpoolComponentPrivate(intel_dnn_component_t &cmp,
+                                     uint32_t num_rows_in,
+                                     uint32_t num_columns_in,
+                                     uint32_t num_rows_out,
+                                     uint32_t num_columns_out,
+                                     uint32_t num_bytes_per_input,
+                                     uint32_t num_bytes_per_output,
+                                     uint32_t num_pool_size,
+                                     uint32_t num_pool_step,
+                                     uint32_t num_pool_stride,
+                                     bool do_sum_not_max,
+                                     float output_scale_factor,
+                                     void *&ptr_inputs,
+                                     void *&ptr_outputs,
+                                     bool   postInitMem);
+
+    static void InitPiecewiseLinearComponentPrivate(intel_dnn_component_t &cmp,
+                                             DnnActivation function_id,
+                                             intel_dnn_orientation_t orientation,
+                                             uint32_t num_rows,
+                                             uint32_t num_columns,
+                                             uint32_t num_bytes_per_input,
+                                             uint32_t num_bytes_per_output,
+                                             uint32_t num_segments,
+                                             float   output_scale_factor,
+                                             void *& ptr_inputs,
+                                             void *& ptr_outputs,
+                                             intel_pwl_segment_t *ptr_segments,
+                                             bool    postInitMem);
+
+    static void InitConvolutional1DComponentPrivate(intel_dnn_component_t &comp,
+                                             uint32_t num_rows_in,
+                                             uint32_t num_columns_in,
+                                             uint32_t num_rows_out,
+                                             uint32_t num_columns_out,
+                                             uint32_t num_bytes_per_input,
+                                             uint32_t num_bytes_per_output,
+                                             uint32_t num_bytes_per_weight,
+                                             uint32_t num_bytes_per_bias,
+                                             uint32_t num_filters,
+                                             uint32_t num_filter_rows,
+                                             uint32_t num_filter_coefficients,
+                                             uint32_t num_feature_maps,
+                                             uint32_t num_feature_map_rows,
+                                             uint32_t num_feature_map_columns,
+                                             float   weight_scale_factor,
+                                             float   output_scale_factor,
+                                             void *& ptr_inputs,
+                                             void *& ptr_outputs,
+                                             void *& ptr_filters,
+                                             void *& ptr_biases,
+                                             bool    postInitMem);
+
+    static void InitAffineComponentPrivate(intel_dnn_component_t &comp,
+                                           uint32_t num_rows_in,
+                                           uint32_t num_columns,
+                                           uint32_t num_rows_out,
+                                           uint32_t num_bytes_per_input,
+                                           uint32_t num_bytes_per_output,
+                                           uint32_t num_bytes_per_weight,
+                                           uint32_t num_bytes_per_bias,
+                                           float  weight_scale_factor,
+                                           float  output_scale_factor,
+                                           void *&ptr_inputs,
+                                           void *&ptr_outputs,
+                                           void *&ptr_weights,
+                                           void *&ptr_biases,
+                                           bool   isDiag,
+                                           bool   postInitMem);
+};
+
+void PlotFloatIntDnn(AmIntelDnn *dnn, AmIntelDnn *dnn_int);
+bool isCompatibleDnn(AmIntelDnn dnn1, AmIntelDnn dnn2);
+void ClearScoreError(intel_score_error_t *error);
+void UpdateScoreError(intel_score_error_t *error, intel_score_error_t *total_error);
+void SoftmaxGoogle(float *ptr_output, float *ptr_input, const uint32_t num_outputs, const uint32_t num_inputs);
diff --git a/inference-engine/src/gna_plugin/dnn_memory.cpp b/inference-engine/src/gna_plugin/dnn_memory.cpp
new file mode 100644 (file)
index 0000000..16496b5
--- /dev/null
@@ -0,0 +1,30 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <cstdio>
+#include <cstdlib>
+#include "dnn_memory.hpp"
+#include "gna-api.h"
+
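+// Carves the next chunk out of a pre-allocated pool: *ptr_dest receives the
+// current pool cursor, and both the cursor and the running byte counter are
+// advanced by num_bytes_needed rounded up to a 64-byte boundary (ALIGN).
+//
+// Illustrative use (names here are hypothetical), carving a weight buffer out
+// of one large block:
+//   uint32_t used = 0;
+//   void *pool = base_ptr, *weights = nullptr;
+//   MemoryAssign(&weights, &pool, weight_bytes, &used, pool_bytes, "weights");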
+void MemoryAssign(void **ptr_dest,
+                  void **ptr_memory,
+                  uint32_t num_bytes_needed,
+                  uint32_t *ptr_num_bytes_used,
+                  uint32_t num_memory_bytes,
+                  const char *name) {
+    if (*ptr_num_bytes_used + ALIGN(num_bytes_needed, 64) > num_memory_bytes) {
+        fprintf(stderr,
+                "Out of memory in %s (%d+ALIGN(%d)>%d)!\n",
+                name,
+                *ptr_num_bytes_used,
+                num_bytes_needed,
+                num_memory_bytes);
+        throw -1;
+    } else {
+        uint8_t *ptr_bytes = reinterpret_cast<uint8_t *>(*ptr_memory);
+        *ptr_dest = *ptr_memory;
+        *ptr_memory = ptr_bytes + ALIGN(num_bytes_needed, 64);
+        *ptr_num_bytes_used += ALIGN(num_bytes_needed, 64);
+    }
+}
diff --git a/inference-engine/src/gna_plugin/dnn_memory.hpp b/inference-engine/src/gna_plugin/dnn_memory.hpp
new file mode 100644 (file)
index 0000000..5ab2c96
--- /dev/null
@@ -0,0 +1,13 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cstdint>
+extern void MemoryAssign(void **ptr_dest,
+                         void **ptr_memory,
+                         uint32_t num_bytes_needed,
+                         uint32_t *ptr_num_bytes_used,
+                         uint32_t num_memory_bytes,
+                         const char *name);
diff --git a/inference-engine/src/gna_plugin/dnn_traits.hpp b/inference-engine/src/gna_plugin/dnn_traits.hpp
new file mode 100644 (file)
index 0000000..0a92bb3
--- /dev/null
@@ -0,0 +1,90 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "dnn.h"
+
+template<intel_dnn_operation_t layer>
+struct DnnTrait {};
+
+template<>
+struct DnnTrait<kDnnDiagonalOp> {
+    using Type = intel_affine_t;
+    static Type *getLayer(intel_dnn_component_t &component) {
+        return &component.op.affine;
+    }
+};
+
+template<>
+struct DnnTrait<kDnnPiecewiselinearOp> {
+    using Type = intel_piecewiselinear_t;
+    static Type *getLayer(intel_dnn_component_t &component) {
+        return &component.op.pwl;
+    }
+};
+
+template<>
+struct DnnTrait<kDnnAffineOp> {
+    using Type = intel_affine_t;
+    static Type *getLayer(intel_dnn_component_t &component) {
+        return &component.op.affine;
+    }
+};
+
+template<>
+struct DnnTrait<kDnnConvolutional1dOp> {
+    using Type = intel_convolutionalD_t;
+    static Type *getLayer(intel_dnn_component_t &component) {
+        return &component.op.conv1D;
+    }
+};
+
+template<>
+struct DnnTrait<kDnnMaxPoolOp> {
+    using Type = intel_maxpool_t;
+    static Type *getLayer(intel_dnn_component_t &component) {
+        return &component.op.maxpool;
+    }
+};
+
+template<>
+struct DnnTrait<kDnnRecurrentOp> {
+    using Type = intel_recurrent_t;
+    static Type *getLayer(intel_dnn_component_t &component) {
+        return &component.op.recurrent;
+    }
+};
+
+template<>
+struct DnnTrait<kDnnInterleaveOp> {
+    using Type = intel_interleave_t;
+    static Type *getLayer(intel_dnn_component_t &component) {
+        return &component.op.interleave;
+    }
+};
+
+template<>
+struct DnnTrait<kDnnDeinterleaveOp> {
+    using Type = intel_deinterleave_t;
+    static Type *getLayer(intel_dnn_component_t &component) {
+        return &component.op.deinterleave;
+    }
+};
+
+template<>
+struct DnnTrait<kDnnCopyOp> {
+    using Type = intel_copy_t;
+    static Type *getLayer(intel_dnn_component_t &component) {
+        return &component.op.copy;
+    }
+};
+
+template<>
+struct DnnTrait<kDnnNullOp> {
+    using Type = void;
+    static Type *getLayer(intel_dnn_component_t &component) {
+        return nullptr;
+    }
+};
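+// Illustrative use of the trait (hypothetical snippet): fetch the typed
+// payload of a component without switching over the operation kind by hand:
+//   auto *affine = DnnTrait<kDnnAffineOp>::getLayer(component);  // intel_affine_t *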
diff --git a/inference-engine/src/gna_plugin/floatmath.cpp b/inference-engine/src/gna_plugin/floatmath.cpp
new file mode 100644 (file)
index 0000000..3ea4112
--- /dev/null
@@ -0,0 +1,423 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "floatmath.h"
+#include "pwl.h"
+#include "gna_plugin_log.hpp"
+#include <cmath>
+
+
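+// Reference float32 implementation of the GNA 1D convolution: each output row
+// applies all num_filters filters to an input window that advances by
+// num_feature_maps * num_feature_map_columns elements per step, adding the
+// per-filter bias to each dot product.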
+void CNNFilter32(intel_dnn_component_t *component) {
+    float *ptr_filters = reinterpret_cast<float *>(component->op.conv1D.ptr_filters);
+    float *ptr_biases = reinterpret_cast<float *>(component->op.conv1D.ptr_biases);
+    float *ptr_inputs = reinterpret_cast<float *>(component->ptr_inputs);
+    float *ptr_outputs = reinterpret_cast<float *>(component->ptr_outputs);
+    uint32_t num_group = component->num_rows_in;
+    uint32_t num_filter_outputs = component->op.conv1D.num_feature_map_rows - component->op.conv1D.num_filter_rows + 1;
+    uint32_t
+        num_inputs_band_stride = component->op.conv1D.num_feature_maps * component->op.conv1D.num_feature_map_columns;
+    uint32_t num_filter_coefficients = component->op.conv1D.num_filter_coefficients;
+
+    if ((component->num_rows_in != 1) || (component->num_rows_out != 1)
+        || (component->num_columns_out != num_filter_outputs * component->op.conv1D.num_filters)) {
+        THROW_GNA_EXCEPTION << "Bad problem dimensions in CNNFilter32!";
+    }
+
+    for (uint32_t j = 0; j < num_filter_outputs; j++) {
+        float *ptr_in = ptr_inputs + j * num_inputs_band_stride;
+        for (uint32_t i = 0; i < component->op.conv1D.num_filters; i++) {
+            float *ptr_coef = ptr_filters + i * num_filter_coefficients;
+            float sum = ptr_biases[i];
+            for (uint32_t k = 0; k < num_filter_coefficients; k++) {
+                sum += ptr_in[k] * ptr_coef[k];
+            }
+            ptr_outputs[j * component->op.conv1D.num_filters + i] = sum;
+        }
+    }
+}
+
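+// Pooling over windows of num_inputs rows taken with step num_inputs_step,
+// applied independently to each of the num_inputs_stride columns; computes the
+// maximum per window, or the (saturating, in the integer path) sum when
+// do_sum_not_max is set.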
+void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) {
+    if (number_type == kDnnInt) {
+        int32_t *ptr_inputs = reinterpret_cast<int32_t *>(component->ptr_inputs);
+        int32_t *ptr_outputs = reinterpret_cast<int32_t *>(component->ptr_outputs);
+        uint32_t num_inputs = component->num_columns_in;
+        uint32_t num_columns = component->op.maxpool.num_inputs_stride;
+        uint32_t num_pool_size = component->op.maxpool.num_inputs;
+        uint32_t num_pool_step = component->op.maxpool.num_inputs_step;
+        uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride;
+        uint32_t num_rows_out = num_rows_in / num_pool_step;
+
+        for (uint32_t i = 0; i < num_columns; i++) {
+            int32_t m = 0;
+            if (component->op.maxpool.do_sum_not_max) {
+                uint32_t num_saturate = 0;
+                for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
+                    int64_t sum = 0;
+                    uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
+                    for (uint32_t k = j; k < num_end; k++) {
+                        sum += ptr_inputs[k * num_columns + i];
+                    }
+                    if (sum > 2147483647.0) {
+                        ptr_outputs[m * num_columns + i] = 2147483647L;
+                        num_saturate++;
+                    } else if (sum < -2147483648.0) {
+                        ptr_outputs[m * num_columns + i] = -2147483648L;
+                        num_saturate++;
+                    } else {
+                        ptr_outputs[m * num_columns + i] = (int32_t) sum;
+                    }
+                    m++;
+                }
+                if (num_saturate > 0) {
+                    fprintf(stderr, "Warning:  %d saturations in CNNMaxPool()\n", num_saturate);
+                }
+            } else {
+                for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
+                    int32_t max = INT32_MIN;
+                    uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
+                    for (uint32_t k = j; k < num_end; k++) {
+                        if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i];
+                    }
+                    ptr_outputs[m * num_columns + i] = max;
+                    m++;
+                }
+            }
+        }
+    } else {
+        float *ptr_inputs = reinterpret_cast<float *>(component->ptr_inputs);
+        float *ptr_outputs = reinterpret_cast<float *>(component->ptr_outputs);
+        uint32_t num_inputs = component->num_columns_in;
+        uint32_t num_columns = component->op.maxpool.num_inputs_stride;
+        uint32_t num_pool_size = component->op.maxpool.num_inputs;
+        uint32_t num_pool_step = component->op.maxpool.num_inputs_step;
+        uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride;
+        uint32_t num_rows_out = num_rows_in / num_pool_step;
+
+        for (uint32_t i = 0; i < num_columns; i++) {
+            int32_t m = 0;
+            if (component->op.maxpool.do_sum_not_max) {
+                for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
+                    float sum = 0.0;
+                    uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
+                    for (uint32_t k = j; k < num_end; k++) {
+                        sum += ptr_inputs[k * num_columns + i];
+                    }
+                    ptr_outputs[m * num_columns + i] = sum;
+                    m++;
+                }
+            } else {
+                for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
+                    float max = -1e20f;
+                    uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
+                    for (uint32_t k = j; k < num_end; k++) {
+                        if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i];
+                    }
+                    ptr_outputs[m * num_columns + i] = max;
+                    m++;
+                }
+            }
+        }
+    }
+}
+
+void PwlApply16(intel_dnn_component_t *component, uint32_t num_subset_size) {
+    if (component->orientation_in == kDnnInterleavedOrientation) {  // subsets only supported in interleaved orientation
+        PwlApply16(component, 0, num_subset_size - 1, 0, component->num_columns_in - 1);
+    } else {
+        PwlApply16(component, 0, component->num_rows_in - 1, 0, component->num_columns_in - 1);
+    }
+}
+
+void PwlApply16(intel_dnn_component_t *component,
+                uint32_t num_row_start,
+                uint32_t num_row_end,
+                uint32_t num_col_start,
+                uint32_t num_col_end) {
+    uint32_t num_saturate = 0;
+    uint32_t num_segments = component->op.pwl.num_segments;
+    if (num_segments > 0) {
+        intel_pwl_segment_t *ptr_segment = component->op.pwl.ptr_segments;
+        for (int i = num_row_start; i <= num_row_end; i++) {
+            int32_t *ptr_input = reinterpret_cast<int32_t *>(component->ptr_inputs) + i * component->num_columns_in;
+            int16_t *ptr_output = reinterpret_cast<int16_t *>(component->ptr_outputs) + i * component->num_columns_in;
+            for (int j = num_col_start; j <= num_col_end; j++) {
+                int32_t xbase = (int32_t) (ptr_segment[0].xBase & XBASEMASK);
+                int32_t input = ptr_input[j];
+                if (input <= xbase) {
+                    ptr_output[j] = ptr_segment[0].yBase;
+                } else {
+                    uint32_t slope_shift;
+                    int16_t slope, ybase;
+                    int64_t diff, prod, prod_shift, sum;
+                    uint32_t k = num_segments / 2;
+                    uint32_t k_upper = num_segments;
+                    uint32_t k_lower = 0;
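+                    // binary search for the segment whose [xBase, next xBase) interval contains 'input'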
+                    while (k_upper > k_lower + 1) {
+                        xbase = (int32_t) (ptr_segment[k].xBase & XBASEMASK);
+                        if (xbase > input) {
+                            k_upper = k;
+                            k = (k + k_lower) / 2;
+                        } else {
+                            k_lower = k;
+                            k = (k_upper + k) / 2;
+                        }
+                    }
+                    xbase = (int32_t) (ptr_segment[k].xBase & XBASEMASK);
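+                    // the bits masked off by XBASEMASK encode the slope scale: shift = (bits + 1) * 8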
+                    slope_shift = ((ptr_segment[k].xBase & ~XBASEMASK) + 1) * 8;
+                    slope = ptr_segment[k].slope;
+                    ybase = ptr_segment[k].yBase;
+                    diff = (int64_t) input - (int64_t) xbase;
+                    prod = diff * slope;
+                    prod_shift = prod >> slope_shift;
+                    sum = prod_shift + (int64_t) ybase;
+                    if (sum > 32767LL) {
+                        ptr_output[j] = 32767;
+                        num_saturate++;
+                    } else if (sum < -32768LL) {
+                        ptr_output[j] = -32768;
+                        num_saturate++;
+                    } else {
+                        ptr_output[j] = (int16_t) sum;
+                    }
+                }
+            }
+        }
+    }
+
+    if (num_saturate > 0) {
+        fprintf(stderr, "Warning:  %d saturations in PwlApply16!\n", num_saturate);
+    }
+}
+
+void PwlApply32(intel_dnn_component_t *component, uint32_t num_subset_size) {
+    if (component->orientation_in == kDnnInterleavedOrientation) {  // subsets only supported in interleaved orientation
+        PwlApply32(component, 0, num_subset_size - 1, 0, component->num_columns_in - 1);
+    } else {
+        PwlApply32(component, 0, component->num_rows_in - 1, 0, component->num_columns_in - 1);
+    }
+}
+
+void PwlApply32(intel_dnn_component_t *component,
+                uint32_t num_row_start,
+                uint32_t num_row_end,
+                uint32_t num_col_start,
+                uint32_t num_col_end) {
+    intel_piecewiselinear_t *transform = reinterpret_cast<intel_piecewiselinear_t *>(&component->op.pwl);
+    float *ptr_in = reinterpret_cast<float *>(component->ptr_inputs);
+    float *ptr_out = reinterpret_cast<float *>(component->ptr_outputs);
+    uint32_t num_columns = component->num_columns_in;
+    switch (transform->func_id.type) {
+        case kActSigmoid:
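+            // sigmoid(x) = 1 / (1 + exp(-x)), computed via the identity 0.5 * (1 + tanh(x / 2))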
+            for (uint32_t i = num_row_start; i <= num_row_end; i++) {
+                for (uint32_t j = num_col_start; j <= num_col_end; j++) {
+                    ptr_out[i * num_columns + j] = 0.5 * (1.0 + tanh(0.5 * ptr_in[i * num_columns + j]));
+                }
+            }
+            break;
+        case kActTanh:
+            for (uint32_t i = num_row_start; i <= num_row_end; i++) {
+                for (uint32_t j = num_col_start; j <= num_col_end; j++) {
+                    ptr_out[i * num_columns + j] = tanh(ptr_in[i * num_columns + j]);
+                }
+            }
+            break;
+        case kActRelu:
+            for (uint32_t i = num_row_start; i <= num_row_end; i++) {
+                for (uint32_t j = num_col_start; j <= num_col_end; j++) {
+                    ptr_out[i * num_columns + j] =
+                        (ptr_in[i * num_columns + j] < 0.0f) ? ptr_in[i * num_columns + j] * transform->func_id.negative_slope : ptr_in[i * num_columns + j];
+                }
+            }
+            break;
+        case kActIdentity:
+            for (uint32_t i = num_row_start; i <= num_row_end; i++) {
+                for (uint32_t j = num_col_start; j <= num_col_end; j++) {
+                    ptr_out[i * num_columns + j] = ptr_in[i * num_columns + j];
+                }
+            }
+            break;
+        case kActKaldiLstmClipping:
+            for (uint32_t i = num_row_start; i <= num_row_end; i++) {
+                for (uint32_t j = num_col_start; j <= num_col_end; j++) {
+                    float val = ptr_in[i * num_columns + j];
+                    if (val > KALDI_LSTM_CLIP_UPPER) {
+                        ptr_out[i * num_columns + j] = KALDI_LSTM_CLIP_UPPER;
+                    } else if (val < KALDI_LSTM_CLIP_LOWER) {
+                        ptr_out[i * num_columns + j] = KALDI_LSTM_CLIP_LOWER;
+                    } else {
+                        ptr_out[i * num_columns + j] = val;
+                    }
+                }
+            }
+            break;
+        case kActCustom:  // not implemented; fall through to the error path
+        default:
+            fprintf(stderr, "Unknown piecewise linear function type!\n");
+            throw -1;
+    }
+}
+
+#ifdef __cplusplus
+extern "C" {  // API uses C linkage so that it can be used by C and C++ applications
+#endif
+
+#ifdef _NO_MKL_
+void cblas_sgemm1(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
+                  const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N,
+                  const MKL_INT K, const float alpha, const float *A,
+                  const MKL_INT lda, const float *B, const MKL_INT ldb,
+                  const float beta, float *C, const MKL_INT ldc) {
+    int i, j, k;
+
+    if (Layout != CblasRowMajor) {
+        fprintf(stderr, "Only row major is supported in cblas_sgemm!\n");
+        throw -1;
+    }
+
+    if ((TransA == CblasNoTrans) && (TransB == CblasNoTrans)) {
+        for (i = 0; i < M; i++) {
+            for (j = 0; j < N; j++) {
+                float sum = (beta == 1.0) ? C[i * ldc + j] : 0;
+                for (k = 0; k < K; k++) {
+                    sum += A[i * lda + k] * B[k * ldb + j];
+                }
+                C[i * ldc + j] = sum;
+            }
+        }
+    } else if ((TransA == CblasNoTrans) && (TransB == CblasTrans)) {
+        for (i = 0; i < M; i++) {
+            for (j = 0; j < N; j++) {
+                float sum;
+                sum = beta * C[i * ldc + j];
+                for (k = 0; k < K; k++) {
+                    sum += alpha * A[i * lda + k] * B[j * ldb + k];
+                }
+                C[i * ldc + j] = sum;
+            }
+        }
+    } else if ((TransA == CblasTrans) && (TransB == CblasNoTrans)) {
+        for (i = 0; i < M; i++) {
+            for (j = 0; j < N; j++) {
+                float sum = (beta == 1.0) ? C[i * ldc + j] : 0;
+                for (k = 0; k < K; k++) {
+                    sum += A[k * lda + i] * B[k * ldb + j];
+                }
+                C[i * ldc + j] = sum;
+            }
+        }
+    } else {
+        fprintf(stderr, "Expected A not transposed in cblas_sgemm!\n");
+        throw -1;
+    }
+}
+void cblas_ssbmv1(const CBLAS_LAYOUT Layout, const CBLAS_UPLO Uplo,
+                  const MKL_INT N, const MKL_INT K, const float alpha, const float *A,
+                  const MKL_INT lda, const float *X, const MKL_INT incX,
+                  const float beta, float *Y, const MKL_INT incY) {
+    int i, j, k;
+
+    if (Layout != CblasRowMajor) {
+        fprintf(stderr, "Only row major is supported in cblas_ssbmv!\n");
+        throw -1;
+    }
+    if (Uplo != CblasLower) {
+        fprintf(stderr, "Only lower format is supported in cblas_ssbmv!\n");
+        throw -1;
+    }
+    if (K != 0) {
+        fprintf(stderr, "Only diagonal matrices supported in cblas_ssbmv at this time!\n");
+        throw -1;
+    }
+    if ((alpha == 1.0) && (beta == 1.0) && (incX == 1) && (incY == 1)) {
+        for (i = 0; i < N; i++) {
+            Y[i] += A[i] * X[i];
+        }
+    } else {
+        fprintf(stderr, "Only alpha=1, beta=1, incX=1, incY=1, LDA=1 supported in cblas_ssbmv at this time!\n");
+        throw -1;
+    }
+}
+#endif  // #ifdef _NO_MKL_
+
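+// GEMM restricted to an active list: only the L output rows (columns, in the
+// transposed-B case) whose indices appear in OutputList are computed, so
+// scoring against an active output list avoids the full matrix product.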
+void cblas_sgemm_subset(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
+                        const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N,
+                        const MKL_INT K, const float alpha, const float *A,
+                        const MKL_INT lda, const float *B, const MKL_INT ldb,
+                        const float beta, float *C, const MKL_INT ldc,
+                        const uint32_t *OutputList, const MKL_INT L) {
+    int i, j, k, l;
+
+    if (Layout != CblasRowMajor) {
+        fprintf(stderr, "Only row major is supported in cblas_sgemm_subset!\n");
+        throw -1;
+    }
+
+    if ((TransA == CblasNoTrans) && (TransB == CblasNoTrans)) {
+        for (l = 0; l < L; l++) {
+            i = OutputList[l];
+            for (j = 0; j < N; j++) {
+                float sum = (beta == 1.0) ? C[l * ldc + j] : 0;
+                for (k = 0; k < K; k++) {
+                    sum += A[i * lda + k] * B[k * ldb + j];
+                }
+                C[l * ldc + j] = sum;
+            }
+        }
+    } else if ((TransA == CblasNoTrans) && (TransB == CblasTrans)) {
+        for (i = 0; i < M; i++) {
+            for (l = 0; l < L; l++) {
+                float sum;
+                j = OutputList[l];
+                sum = beta * C[i * ldc + l];
+                for (k = 0; k < K; k++) {
+                    sum += alpha * A[i * lda + k] * B[j * ldb + k];
+                }
+                C[i * ldc + l] = sum;
+            }
+        }
+    } else if ((TransA == CblasTrans) && (TransB == CblasNoTrans)) {
+        for (l = 0; l < L; l++) {
+            i = OutputList[l];
+            for (j = 0; j < N; j++) {
+                float sum = (beta == 1.0) ? C[l * ldc + j] : 0;
+                for (k = 0; k < K; k++) {
+                    sum += A[k * lda + i] * B[k * ldb + j];
+                }
+                C[l * ldc + j] = sum;
+            }
+        }
+    } else {
+        fprintf(stderr, "Expected A not transposed in cblas_sgemm_subset!\n");
+        throw -1;
+    }
+}
+
+// C[i] = B[i] + dot([A1 A2], X[i]): the split weight vector (A1, A2) is applied to every row of X
+void sgemv_split(const uint32_t N,
+                 const uint32_t K1,
+                 const uint32_t K2,
+                 const float *A1,
+                 const float *A2,
+                 const float *X,
+                 const float *B,
+                 float *C) {
+    uint32_t num_columns = K1 + K2;
+    uint32_t num_rows = N;
+    uint32_t i, j;
+
+    for (i = 0; i < num_rows; i++) {
+        float sum = B[i];
+        for (j = 0; j < K1; j++) {
+            sum += A1[j] * X[i * num_columns + j];
+        }
+        for (j = K1; j < num_columns; j++) {
+            sum += A2[j - K1] * X[i * num_columns + j];
+        }
+        C[i] = sum;
+    }
+}
+
+#ifdef __cplusplus
+}  // end extern "C"
+#endif
diff --git a/inference-engine/src/gna_plugin/floatmath.h b/inference-engine/src/gna_plugin/floatmath.h
new file mode 100644 (file)
index 0000000..ff9bf99
--- /dev/null
@@ -0,0 +1,71 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <stdlib.h>
+#include <stdio.h>
+#ifndef _NO_MKL_
+#include <mkl_dnn.h>
+#include <mkl_cblas.h>
+#endif
+// #include "types.h"
+#include "dnn.h"
+
+#ifndef CBLAS_LAYOUT
+#define CBLAS_LAYOUT CBLAS_ORDER
+#endif
+
+#define CNN_MAX_POOL_SIZE 6
+
+void CNNFilter32(intel_dnn_component_t *component);
+void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type);
+
+#ifdef _NO_MKL_
+#ifndef _MKL_H_
+#define _MKL_H_
+typedef enum { CblasRowMajor = 101, CblasColMajor = 102 } CBLAS_LAYOUT;
+typedef enum { CblasNoTrans = 111, CblasTrans = 112, CblasConjTrans = 113 } CBLAS_TRANSPOSE;
+typedef enum { CblasUpper = 121, CblasLower = 122 } CBLAS_UPLO;
+typedef enum { CblasNonUnit = 131, CblasUnit = 132 } CBLAS_DIAG;
+typedef enum { CblasLeft = 141, CblasRight = 142 } CBLAS_SIDE;
+typedef CBLAS_LAYOUT CBLAS_ORDER; /* this for backward compatibility with CBLAS_ORDER */
+#define MKL_INT int
+#endif  // #ifndef _MKL_H_
+#endif  // #ifdef _NO_MKL_
+
+#ifdef __cplusplus
+extern "C" {  // API uses C linkage so that it can be used by C and C++ applications
+#endif
+
+#ifdef _NO_MKL_
+void cblas_sgemm1(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
+                  const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N,
+                  const MKL_INT K, const float alpha, const float *A,
+                  const MKL_INT lda, const float *B, const MKL_INT ldb,
+                  const float beta, float *C, const MKL_INT ldc);
+void cblas_ssbmv1(const CBLAS_LAYOUT Layout, const CBLAS_UPLO Uplo,
+                  const MKL_INT N, const MKL_INT K, const float alpha, const float *A,
+                  const MKL_INT lda, const float *X, const MKL_INT incX,
+                  const float beta, float *Y, const MKL_INT incY);
+#endif  // #ifdef _NO_MKL_
+void cblas_sgemm_subset(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
+                        const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N,
+                        const MKL_INT K, const float alpha, const float *A,
+                        const MKL_INT lda, const float *B, const MKL_INT ldb,
+                        const float beta, float *C, const MKL_INT ldc,
+                        const uint32_t *OutputList, const MKL_INT L);
+void sgemv_split(const uint32_t N,
+                 const uint32_t K1,
+                 const uint32_t K2,
+                 const float *A1,
+                 const float *A2,
+                 const float *X,
+                 const float *B,
+                 float *C);
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/inference-engine/src/gna_plugin/gna_allocator.hpp b/inference-engine/src/gna_plugin/gna_allocator.hpp
new file mode 100644 (file)
index 0000000..ae62b1f
--- /dev/null
@@ -0,0 +1,33 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <functional>
+#include "gna_device.hpp"
+#include "polymorh_allocator.hpp"
+
+/**
+ * wraps the GNA memory interface into a C++ allocator-friendly one
+ */
+class GNAAllocator {
+    std::reference_wrapper<GNADeviceHelper> _device;
+
+ public:
+    typedef uint8_t value_type;
+
+    explicit GNAAllocator(GNADeviceHelper &device) : _device(device) {
+    }
+    uint8_t *allocate(std::size_t n) {
+        uint32_t granted = 0;
+        auto result = _device.get().alloc(n, &granted);
+        if (result == nullptr || granted == 0) {
+            throw std::bad_alloc();
+        }
+        return result;
+    }
+    void deallocate(uint8_t *p, std::size_t n) {
+        // the GNA device manages a single arena, so free() releases it regardless of p and n
+        _device.get().free();
+    }
+};
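+
+// Usage sketch (illustrative only; `device` is a GNADeviceHelper created elsewhere):
+//
+//     GNAAllocator alloc(device);
+//     uint8_t *p = alloc.allocate(4096);   // throws std::bad_alloc on failure
+//     // ... fill p with model data ...
+//     alloc.deallocate(p, 4096);           // releases the whole GNA arena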
diff --git a/inference-engine/src/gna_plugin/gna_api_wrapper.hpp b/inference-engine/src/gna_plugin/gna_api_wrapper.hpp
new file mode 100644 (file)
index 0000000..fb9d2cc
--- /dev/null
@@ -0,0 +1,63 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <gna-api-types-xnn.h>
+#include "gna_plugin_log.hpp"
+namespace GNAPluginNS {
+
+/**
+ * a wrapper that enables exception-safe passing of C objects
+ * @tparam T
+ */
+template <class T>
+class CPPWrapper {
+};
+
+template <>
+class CPPWrapper<intel_nnet_type_t> {
+ public:
+    intel_nnet_type_t obj;
+
+    CPPWrapper() {
+        obj.nLayers = 0;
+        obj.pLayers = nullptr;
+        obj.nGroup = 0;
+    }
+
+    /**
+     * creates nnet structure of n layers
+     * @param n - number of layers
+     */
+    explicit CPPWrapper(size_t n) {
+        obj.pLayers = reinterpret_cast<intel_nnet_layer_t *>(_mm_malloc(n * sizeof(intel_nnet_layer_t), 64));
+        if (obj.pLayers == nullptr) {
+            THROW_GNA_EXCEPTION << "out of memory in while allocating "<< n << " GNA layers";
+        }
+        obj.nLayers = n;
+        for (int i = 0; i < obj.nLayers; i++) {
+            obj.pLayers[i].pLayerStruct = nullptr;
+        }
+    }
+    ~CPPWrapper() {
+        for (int i = 0; i < obj.nLayers; i++) {
+            if (obj.pLayers[i].pLayerStruct != nullptr) {
+                _mm_free(obj.pLayers[i].pLayerStruct);
+            }
+        }
+        _mm_free(obj.pLayers);
+    }
+    intel_nnet_type_t * operator ->() {
+        return &obj;
+    }
+    intel_nnet_type_t * operator *() {
+        return &obj;
+    }
+    operator intel_nnet_type_t &() {
+        return obj;  // returning *this would re-invoke this conversion and recurse
+    }
+};
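+
+// Usage sketch (a minimal illustration of the RAII behaviour above):
+//
+//     CPPWrapper<intel_nnet_type_t> nnet(3);   // allocates 3 layers, pLayerStruct zeroed
+//     nnet->nGroup = 1;                        // operator-> exposes the wrapped struct
+//     // pLayers (and any pLayerStruct set later) are _mm_free'd when nnet leaves scope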
+
+}  // namespace GNAPluginNS
\ No newline at end of file
diff --git a/inference-engine/src/gna_plugin/gna_device.cpp b/inference-engine/src/gna_plugin/gna_device.cpp
new file mode 100644 (file)
index 0000000..3936bc8
--- /dev/null
@@ -0,0 +1,125 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "gna_device.hpp"
+
+#include <map>
+#include <string>
+#include <cstring>
+
+#include "gna-api-status.h"
+#include "gna-api.h"
+
+#include "details/ie_exception.hpp"
+#include "gna_plugin_log.hpp"
+#include "gna/gna_config.hpp"
+
+uint8_t* GNADeviceHelper::alloc(uint32_t size_requested, uint32_t *size_granted) {
+    return reinterpret_cast<uint8_t *>(GNAAlloc(nGNAHandle, size_requested, size_granted));
+}
+
+void GNADeviceHelper::propagateSync(const intel_nnet_type_t *pNeuralNetwork,
+                                    const uint32_t *pActiveIndices,
+                                    uint32_t nActiveIndices) {
+    wait(propagate(pNeuralNetwork, pActiveIndices, nActiveIndices));
+}
+
+uint32_t GNADeviceHelper::propagate(const intel_nnet_type_t *pNeuralNetwork,
+                   const uint32_t *pActiveIndices,
+                   uint32_t nActiveIndices) {
+    uint32_t reqId;
+    nGNAStatus = GNAPropagateForward(nGNAHandle, pNeuralNetwork,
+                                     pActiveIndices, nActiveIndices, &reqId, nGNAProcType);
+    checkStatus();
+    return reqId;
+}
+
+void GNADeviceHelper::wait(uint32_t reqId) {
+    if (isPerformanceMeasuring) {
+        nGNAStatus = GNAWaitPerfRes(nGNAHandle, GNA_TIMEOUT, reqId, &nGNAPerfResults);
+        updateGnaPerfCounters();
+    } else {
+        nGNAStatus = GNAWait(nGNAHandle, 1000000, reqId);
+    }
+    checkStatus();
+}
+
+GNADeviceHelper::DumpResult GNADeviceHelper::dumpXnn(const intel_nnet_type_t *pNeuralNetwork,
+                                    const uint32_t *pActiveIndices,
+                                    uint32_t nActiveIndices) {
+    DumpResult r;
+    intel_gna_status_t gna_status;
+
+    if (!pNeuralNetwork) {
+        THROW_GNA_EXCEPTION << "GNADumpXnn got invalid NeuralNetwork parameter\n";
+    }
+    r.model.reset(GNADumpXnn(pNeuralNetwork,
+                             pActiveIndices,
+                             nActiveIndices,
+                             &r.header,
+                             &nGNAStatus,
+                             [](size_t count)-> void* {return ::operator new(count);}),
+                             [](void * ptr) {::operator delete(ptr);});
+
+    checkStatus();
+
+    if (r.model == nullptr) {
+        THROW_GNA_EXCEPTION << "GNADumpXnn returned nullptr";
+    }
+
+    return r;
+}
+
+void GNADeviceHelper::checkStatus() const {
+    if ((nGNAStatus != GNA_NOERROR) && (nGNAStatus != GNA_SSATURATE)) {
+        THROW_GNA_EXCEPTION << "Bad GNA status " << nGNAStatus << ", " << GNAStatusName[nGNAStatus];
+    }
+}
+
+void GNADeviceHelper::open(uint8_t n_threads) {
+    nGNAHandle = GNADeviceOpenSetThreads(&nGNAStatus, n_threads);
+
+    checkStatus();
+}
+
+void GNADeviceHelper::close() {
+    GNADeviceClose(nGNAHandle);
+    nGNAHandle = 0;
+}
+
+void GNADeviceHelper::setOMPThreads(uint8_t const n_threads) {
+    gmmSetThreads(n_threads);
+}
+
+void GNADeviceHelper::updateGnaPerfCounters() {
+    nGNAPerfResultsTotal.hw.stall = nGNAPerfResults.hw.stall;
+    nGNAPerfResultsTotal.hw.total = nGNAPerfResults.hw.total;
+
+    nGNAPerfResultsTotal.lib.submit = nGNAPerfResults.lib.submit;
+    nGNAPerfResultsTotal.lib.preprocess = nGNAPerfResults.lib.preprocess;
+    nGNAPerfResultsTotal.lib.process = nGNAPerfResults.lib.process;
+    nGNAPerfResultsTotal.lib.scoring = nGNAPerfResults.lib.scoring;
+    nGNAPerfResultsTotal.lib.total = nGNAPerfResults.lib.total;
+    nGNAPerfResultsTotal.lib.ioctlSubmit = nGNAPerfResults.lib.ioctlSubmit;
+    nGNAPerfResultsTotal.lib.ioctlWaitOn = nGNAPerfResults.lib.ioctlWaitOn;
+
+    nGNAPerfResultsTotal.total.start = nGNAPerfResults.total.start;
+    nGNAPerfResultsTotal.total.stop = nGNAPerfResults.total.stop;
+
+    nGNAPerfResultsTotal.drv.startHW = nGNAPerfResults.drv.startHW;
+    nGNAPerfResultsTotal.drv.scoreHW = nGNAPerfResults.drv.scoreHW;
+    nGNAPerfResultsTotal.drv.intProc = nGNAPerfResults.drv.intProc;
+}
+
+void GNADeviceHelper::getGnaPerfCounters(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo>& retPerfCounters) {
+    InferenceEngine::InferenceEngineProfileInfo info;
+    info.status = InferenceEngine::InferenceEngineProfileInfo::EXECUTED;
+
+    // Hardware
+    info.realTime_uSec = nGNAPerfResultsTotal.hw.total;
+    retPerfCounters["1.1 Total scoring time in HW"] = info;
+
+    info.realTime_uSec = nGNAPerfResultsTotal.hw.stall;
+    retPerfCounters["1.2 Stall scoring time in HW"] = info;
+}
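+
+// Retrieval sketch for the counters above (hedged; `device` is a live helper):
+//
+//     std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> perf;
+//     device.getGnaPerfCounters(perf);
+//     // perf["1.1 Total scoring time in HW"].realTime_uSec now holds hw.total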
diff --git a/inference-engine/src/gna_plugin/gna_device.hpp b/inference-engine/src/gna_plugin/gna_device.hpp
new file mode 100644 (file)
index 0000000..7828211
--- /dev/null
@@ -0,0 +1,91 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "gna-api-dumper.h"
+#include "gna-api-instrumentation.h"
+#include "ie_common.h"
+#include <memory>
+#include <string>
+#include <map>
+#include <thread>
+
+/**
+ * holds a GNA-style device handle in an RAII manner
+ */
+class GNADeviceHelper {
+    intel_gna_status_t nGNAStatus = GNA_NOERROR;
+    intel_gna_handle_t nGNAHandle = 0;
+    intel_gna_proc_t nGNAProcType = GNA_AUTO;
+    intel_gna_perf_t nGNAPerfResults;
+    intel_gna_perf_t nGNAPerfResultsTotal;
+    const uint32_t GNA_TIMEOUT = MAX_TIMEOUT;
+    bool isPerformanceMeasuring;
+
+ public:
+    explicit GNADeviceHelper(intel_gna_proc_t proc_type = GNA_AUTO,
+                            uint8_t lib_async_n_threads = 1,
+                            bool use_openmp = false,
+                            bool isPerformanceMeasuring = false) :
+                                    nGNAProcType(proc_type),
+                                    isPerformanceMeasuring(isPerformanceMeasuring) {
+        initGnaPerfCounters();
+        open(lib_async_n_threads);
+
+        if (use_openmp) {
+            uint8_t num_cores = std::thread::hardware_concurrency();
+            setOMPThreads((num_cores != 0) ? num_cores : 1);
+        }
+    }
+
+    ~GNADeviceHelper() {
+        close();
+    }
+
+    uint8_t *alloc(uint32_t size_requested, uint32_t *size_granted);
+
+    void propagateSync(const intel_nnet_type_t *pNeuralNetwork,
+                       const uint32_t *pActiveIndices,
+                       uint32_t nActiveIndices);
+
+    uint32_t propagate(const intel_nnet_type_t *pNeuralNetwork,
+                       const uint32_t *pActiveIndices,
+                       uint32_t nActiveIndices);
+
+    void wait(uint32_t id);
+
+
+    struct DumpResult {
+        intel_gna_model_header header;
+        std::shared_ptr<void> model;
+    };
+
+    DumpResult dumpXnn(const intel_nnet_type_t *pNeuralNetwork,
+                 const uint32_t *pActiveIndices,
+                 uint32_t nActiveIndices);
+
+
+    void free() {
+        GNAFree(nGNAHandle);
+    }
+    void updateGnaPerfCounters();
+    void getGnaPerfCounters(std::map<std::string,
+                        InferenceEngine::InferenceEngineProfileInfo>& retPerfCounters);
+
+ private:
+    void open(uint8_t const n_threads);
+
+    void close();
+
+    void checkStatus() const;
+
+    void setOMPThreads(uint8_t const n_threads);
+
+    void initGnaPerfCounters() {
+        nGNAPerfResults = {{0, 0, 0, 0, 0, 0, 0}, {0, 0}, {0, 0, 0}, {0, 0}};
+        nGNAPerfResultsTotal = {{0, 0, 0, 0, 0, 0, 0}, {0, 0}, {0, 0, 0}, {0, 0}};
+    }
+};
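+
+// RAII usage sketch (illustrative; `nnet` is a network built elsewhere):
+//
+//     GNADeviceHelper device(GNA_AUTO, /*lib_async_n_threads=*/1);
+//     uint32_t granted = 0;
+//     uint8_t *base = device.alloc(64 * 1024, &granted);  // granted may exceed the request
+//     device.propagateSync(&nnet, nullptr, 0);            // blocking scoring pass
+//     device.free();   // releases the arena; ~GNADeviceHelper() closes the handle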
+
diff --git a/inference-engine/src/gna_plugin/gna_executable_network.hpp b/inference-engine/src/gna_plugin/gna_executable_network.hpp
new file mode 100644 (file)
index 0000000..1230624
--- /dev/null
@@ -0,0 +1,53 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <memory>
+#include <string>
+#include <map>
+#include <vector>
+
+#include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp>
+#include "gna_infer_request.hpp"
+#include "gna_plugin.hpp"
+#include <cpp_interfaces/ie_executor_manager.hpp>
+#include <cpp_interfaces/impl/ie_executable_network_thread_safe_async_only.hpp>
+
+namespace GNAPluginNS {
+
+class GNAExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadSafeAsyncOnly {
+    std::shared_ptr<GNAPlugin> plg;
+
+ public:
+    GNAExecutableNetwork(const std::string &aotFileName, const std::map<std::string, std::string> &config) :
+        plg(std::make_shared<GNAPlugin>(config)) {
+        plg->ImportNetwork(aotFileName);
+        _networkInputs  = plg->GetInputs();
+        _networkOutputs = plg->GetOutputs();
+    }
+
+    GNAExecutableNetwork(InferenceEngine::ICNNNetwork &network, const std::map<std::string, std::string> &config)
+        : plg(std::make_shared<GNAPlugin>(config)) {
+        plg->LoadNetwork(network);
+    }
+
+    InferenceEngine::AsyncInferRequestInternal::Ptr
+        CreateAsyncInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
+                                    InferenceEngine::OutputsDataMap networkOutputs) override {
+        return std::make_shared<GNAInferRequest>(plg, networkInputs, networkOutputs);
+    }
+
+    std::vector<InferenceEngine::IMemoryStateInternal::Ptr> QueryState() override {
+        auto pluginStates = plg->QueryState();
+        std::vector<InferenceEngine::IMemoryStateInternal::Ptr> state(pluginStates.begin(), pluginStates.end());
+        return state;
+    }
+
+    void Export(const std::string &modelFileName) override {
+        plg->Export(modelFileName);
+    }
+};
+}  // namespace GNAPluginNS
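+
+// Usage sketch (a minimal illustration; `cnnNetwork` and `config` come from the caller):
+//
+//     GNAPluginNS::GNAExecutableNetwork exeNet(cnnNetwork, config);  // compiles for GNA
+//     exeNet.Export("model.aot");   // serialized form loadable via the AOT constructor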
diff --git a/inference-engine/src/gna_plugin/gna_helper.cpp b/inference-engine/src/gna_plugin/gna_helper.cpp
new file mode 100644 (file)
index 0000000..604828c
--- /dev/null
@@ -0,0 +1,449 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "lstm.hpp"
+
+#define USING_GCC
+#define PROFILE
+
+#include <cstdint>
+#include <cstdio>
+#include <fstream>
+#include <vector>
+#include <sstream>
+#include <string>
+#include "gna-api.h"
+
+#ifndef WIN32
+#include <profiler.h>
+
+void clearTimeB(timeb & tb) {
+    tb.time = 0;
+    tb.dstflag = 0;
+    tb.millitm = 0;
+    tb.timezone = 0;
+}
+//  dummy definitions to work around issue with Linux userspace library
+void profilerTscStart(intel_gna_profiler_tsc *p) {
+    if (nullptr == p) return;
+    p->stop = 0;
+    p->start = 0;
+}
+void profilerTscStop(intel_gna_profiler_tsc *p) {
+    if (nullptr == p) return;
+    p->stop = 0;
+    p->start = 0;
+}
+void profilerTscStartAccumulate(intel_gna_profiler_tsc *p) {
+    if (nullptr == p) return;
+    p->stop = 0;
+    p->start = 0;
+}
+void profilerTscStopAccumulate(intel_gna_profiler_tsc *p) {
+    if (nullptr == p) return;
+    p->stop = 0;
+}
+void profilerRtcClear(intel_gna_profiler_rtc *p) {
+    if (nullptr == p) return;
+    clearTimeB(p->passed);
+    clearTimeB(p->start);
+    clearTimeB(p->stop);
+}
+void profilerRtcStart(intel_gna_profiler_rtc *p) {
+    if (nullptr == p) return;
+    clearTimeB(p->passed);
+    clearTimeB(p->stop);
+    ftime(&p->start);
+}
+
+void profilerRtcStop(intel_gna_profiler_rtc *p) {
+    if (nullptr == p) return;
+    ftime(&p->stop);
+    /*if ((p->stop.tv_nsec - p->start.tv_nsec)<0) {
+        p->passed.tv_sec = p->stop.tv_sec - p->start.tv_sec - 1;
+        p->passed.tv_nsec = 1000000000 + p->stop.tv_nsec - p->start.tv_nsec;
+    }
+    else {
+        p->passed.tv_sec = p->stop.tv_sec - p->start.tv_sec;
+        p->passed.tv_nsec = p->stop.tv_nsec - p->start.tv_nsec;
+    }*/
+}
+void profilerRtcStartAccumulate(intel_gna_profiler_rtc *p) {
+    if (nullptr == p) return;
+    clearTimeB(p->stop);
+//    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &p->start);
+}
+void profilerRtcStopAccumulate(intel_gna_profiler_rtc *p) {
+    timespec diff;
+    if (nullptr == p) return;
+//    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &p->stop);
+//    if ((p->stop.tv_nsec - p->start.tv_nsec)<0) {
+//        diff.tv_sec = p->stop.tv_sec - p->start.tv_sec - 1;
+//        diff.tv_nsec = 1000000000 + p->stop.tv_nsec - p->start.tv_nsec;
+//    }
+//    else {
+//        diff.tv_sec = p->stop.tv_sec - p->start.tv_sec;
+//        diff.tv_nsec = p->stop.tv_nsec - p->start.tv_nsec;
+//    }
+//    p->passed.tv_sec += diff.tv_sec;
+//    p->passed.tv_nsec += diff.tv_nsec;
+//    if (p->passed.tv_nsec > 1000000000) {
+//        p->passed.tv_sec++;
+//        p->passed.tv_nsec -= 1000000000;
+//    }
+}
+
+#endif
+void PrintMatrixInt16(const char *ptr_name, int16_t *ptr_matrix, int num_rows, int num_cols, int lda, float scale) {
+    printf("%s:  %dx%d lda %d\n", ptr_name, num_rows, num_cols, lda);
+    for (int i = 0; i < num_rows; i++) {
+        for (int j = 0; j < num_cols; j++) {
+            printf("[%d,%d]: %e\n", i, j, *(ptr_matrix + i*lda + j) / scale);
+        }
+    }
+}
+
+void PrintMatrixInt32(char *ptr_name, int32_t *ptr_matrix, int num_rows, int num_cols, int lda, float scale) {
+    printf("%s:  %dx%d lda %d\n", ptr_name, num_rows, num_cols, lda);
+    for (int i = 0; i < num_rows; i++) {
+        for (int j = 0; j < num_cols; j++) {
+            printf("[%d,%d]: %e\n", i, j, *(ptr_matrix + i*lda + j) / scale);
+        }
+    }
+}
+
+void PrintMatrixFloat32(char *ptr_name, float *ptr_matrix, int num_rows, int num_cols, int lda) {
+#if (_WIN32 || _WIN64) && (_MSC_VER < 1900)
+    _set_output_format(_TWO_DIGIT_EXPONENT);
+#endif
+    printf("%s:  %dx%d lda %d\n", ptr_name, num_rows, num_cols, lda);
+    for (int i = 0; i < num_rows; i++) {
+        for (int j = 0; j < num_cols; j++) {
+            printf("[%d,%d]: %e\n", i, j, *(ptr_matrix + i*lda + j));
+        }
+    }
+}
+
+void PrintGnaNetwork(intel_nnet_type_t *ptr_nnet) {
+    PrintMatrixInt16("input", reinterpret_cast<int16_t*>(ptr_nnet->pLayers[0].pInputs),
+                     ptr_nnet->pLayers[0].nInputRows, ptr_nnet->pLayers[0].nInputColumns, ptr_nnet->pLayers[0].nInputColumns, 1.0);
+    for (uint32_t i = 0; i < ptr_nnet->nLayers; i++) {
+        char name[256];
+        snprintf(name, sizeof(name), "output %d", i);
+        if (ptr_nnet->pLayers[i].nBytesPerOutput == 2) {
+            PrintMatrixInt16(name, reinterpret_cast<int16_t*>(ptr_nnet->pLayers[i].pOutputs),
+                             ptr_nnet->pLayers[i].nOutputRows, ptr_nnet->pLayers[i].nOutputColumns, ptr_nnet->pLayers[i].nOutputColumns, 1.0);
+        } else {
+            PrintMatrixInt32(name, reinterpret_cast<int32_t*>(ptr_nnet->pLayers[i].pOutputs),
+                             ptr_nnet->pLayers[i].nOutputRows, ptr_nnet->pLayers[i].nOutputColumns, ptr_nnet->pLayers[i].nOutputColumns, 1.0);
+        }
+    }
+}
+
+typedef struct {
+    std::string sName;
+    std::string sType;  //  if wgt/bias/filt/pwl is writeable, then do not write it to file
+    void *pAddress;
+    uint32_t nBytes;
+} intel_memory_region_t;
+
+void AddBufferEntry(std::vector<intel_memory_region_t> &vBuffer,
+                    const std::string &sName,
+                    const std::string &sType,
+                    void *pBuffer,
+                    uint32_t nBytes) {
+    if (pBuffer != NULL) {
+        intel_memory_region_t region;
+        region.sName = sName;
+        region.sType = sType;
+        region.pAddress = pBuffer;
+        region.nBytes = nBytes;
+        vBuffer.push_back(region);
+    }
+}
+
+std::string BufferNameFromAddress(std::vector<intel_memory_region_t> &vBuffer, void *pBuffer) {
+    std::stringstream ss;
+    std::string sAddr, sName;
+    void *pParentBuffer = pBuffer;
+    bool found = false;
+    bool found_persistent = false;
+    bool found_output = false;
+    for (uint32_t i = 0; i < vBuffer.size(); i++) {
+        uint8_t *pBufferStart = reinterpret_cast<uint8_t *>(pBuffer);
+        uint8_t *pEntryBufferStart = reinterpret_cast<uint8_t *>(vBuffer.at(i).pAddress);
+        uint8_t *pEntryBufferEnd = reinterpret_cast<uint8_t *>(vBuffer.at(i).pAddress) + vBuffer.at(i).nBytes;
+        if ((pBufferStart >= pEntryBufferStart) && (pBufferStart < pEntryBufferEnd)) {
+            found = true;
+            if (pBufferStart > pEntryBufferStart) {
+                pParentBuffer = pEntryBufferStart;
+            }
+            if ((vBuffer.at(i).sType.compare("pOutputs") == 0)
+                || (vBuffer.at(i).sType.compare("pOutputsIntermediate") == 0)) {
+                found_output = true;
+            } else if (vBuffer.at(i).sType.compare("pWeights") == 0) {
+                sName = "wgt_";
+                found_persistent = true;
+            } else if (vBuffer.at(i).sType.compare("pBiases") == 0) {
+                sName = "bias_";
+                found_persistent = true;
+            } else if (vBuffer.at(i).sType.compare("pSegments") == 0) {
+                sName = "pwl_";
+                found_persistent = true;
+            }
+        }
+    }
+    if (found) {
+        if ((found_output) || (!found_persistent)) {
+            sName = "buf_";
+        }
+        ss << (int64_t) pParentBuffer;
+        sAddr = ss.str();
+        sName.append(sAddr);
+    } else {
+        fprintf(stderr, "Error:  buffer address does not exist in BufferNameFromAddress!\n");
+        exit(EXIT_FAILURE);
+    }
+    return (sName);
+}
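+
+// Naming convention illustrated (addresses are hypothetical): a pointer inside a
+// pWeights region resolves to "wgt_<parent address>", pBiases to "bias_", pSegments
+// to "pwl_", and outputs or non-persistent regions to "buf_<parent address>".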
+
+uint32_t BufferOffsetFromAddress(std::vector<intel_memory_region_t> &vBuffer, void *pBuffer) {
+    uint32_t nOffsetBytes = 0;
+    for (uint32_t i = 0; i < vBuffer.size(); i++) {
+        uint8_t *pBufferStart = reinterpret_cast<uint8_t *>(pBuffer);
+        uint8_t *pEntryBufferStart = reinterpret_cast<uint8_t *>(vBuffer.at(i).pAddress);
+        uint8_t *pEntryBufferEnd = reinterpret_cast<uint8_t *>(vBuffer.at(i).pAddress) + vBuffer.at(i).nBytes;
+        if ((pBufferStart >= pEntryBufferStart) && (pBufferStart < pEntryBufferEnd)) {
+            if (pBufferStart > pEntryBufferStart) {
+                nOffsetBytes = (uint32_t) (pBufferStart - pEntryBufferStart);
+            }
+        }
+    }
+    return (nOffsetBytes);
+}
+
+std::string LayerName(intel_nnet_layer_t *pLayer) {
+    intel_layer_kind_t nKind = pLayer->nLayerKind;
+    std::string sKind;
+    if (nKind == INTEL_AFFINE) {
+        sKind = "affine";
+    } else if (nKind == INTEL_AFFINE_DIAGONAL) {
+        sKind = "diagonal";
+    } else if (nKind == INTEL_INTERLEAVE) {
+        sKind = "interleave";
+    } else if (nKind == INTEL_DEINTERLEAVE) {
+        sKind = "deinterleave";
+    } else {
+        fprintf(stderr, "Error:  nLayerKind not supported in LayerName()!\n");
+        exit(EXIT_FAILURE);
+    }
+    return (sKind);
+}
+
+uint32_t NumInputs(intel_nnet_layer_t *pLayer) {
+    intel_layer_kind_t nKind = pLayer->nLayerKind;
+    uint32_t nInputs;
+    if ((nKind == INTEL_AFFINE) || (nKind == INTEL_AFFINE_DIAGONAL)) {
+        nInputs = pLayer->nInputRows;
+    } else if (nKind == INTEL_INTERLEAVE) {
+        nInputs = pLayer->nInputColumns;
+    } else if (nKind == INTEL_DEINTERLEAVE) {
+        nInputs = pLayer->nInputRows;
+    } else {
+        fprintf(stderr, "Error:  nLayerKind not supported in NumInputs()!\n");
+        exit(EXIT_FAILURE);
+    }
+    return (nInputs);
+}
+
+uint32_t NumOutputs(intel_nnet_layer_t *pLayer) {
+    intel_layer_kind_t nKind = pLayer->nLayerKind;
+    uint32_t nOutputs;
+    if ((nKind == INTEL_AFFINE) || (nKind == INTEL_AFFINE_DIAGONAL)) {
+        nOutputs = pLayer->nOutputRows;
+    } else if (nKind == INTEL_INTERLEAVE) {
+        nOutputs = pLayer->nOutputRows;
+    } else if (nKind == INTEL_DEINTERLEAVE) {
+        nOutputs = pLayer->nOutputColumns;
+    } else {
+        fprintf(stderr, "Error:  nLayerKind not supported in NumInputs()!\n");
+        exit(EXIT_FAILURE);
+    }
+    return (nOutputs);
+}
+
+uint32_t NumGroupSize(intel_nnet_layer_t *pLayer) {
+    intel_layer_kind_t nKind = pLayer->nLayerKind;
+    uint32_t nGroupSize;
+    if ((nKind == INTEL_AFFINE) || (nKind == INTEL_AFFINE_DIAGONAL)) {
+        nGroupSize = pLayer->nOutputColumns;
+    } else if (nKind == INTEL_INTERLEAVE) {
+        nGroupSize = pLayer->nOutputColumns;
+    } else if (nKind == INTEL_DEINTERLEAVE) {
+        nGroupSize = pLayer->nOutputRows;
+    } else {
+        fprintf(stderr, "Error:  nLayerKind not supported in NumGroupSize()!\n");
+        exit(EXIT_FAILURE);
+    }
+    return (nGroupSize);
+}
+
+void ExportGnaNetworkAndrzej(const char *ptr_name, intel_nnet_type_t *pNeuralNetwork) {
+    std::string sXmlFileName;
+    sXmlFileName.append(ptr_name);
+    sXmlFileName.append("/model.xml");
+    std::ofstream xml_file(sXmlFileName.c_str(), std::ios::out);
+    if (xml_file.good()) {
+        std::vector<intel_memory_region_t> vBuffer;
+        //  find all the memory regions in the network
+        for (uint32_t layer = 0; layer < pNeuralNetwork->nLayers; layer++) {
+            intel_nnet_layer_t *pLayer = &pNeuralNetwork->pLayers[layer];
+            intel_affine_layer_t *pAffineLayer = reinterpret_cast<intel_affine_layer_t *>(pLayer->pLayerStruct);
+            uint32_t nPWLSegments = 0;
+            uint32_t nWeightWidth = 0;
+            AddBufferEntry(vBuffer,
+                           LayerName(pLayer),
+                           "pInputs",
+                           pLayer->pInputs,
+                           pLayer->nBytesPerInput * pLayer->nInputColumns * pLayer->nInputRows);
+            AddBufferEntry(vBuffer,
+                           LayerName(pLayer),
+                           "pOutputs",
+                           pLayer->pOutputs,
+                           pLayer->nBytesPerOutput * pLayer->nOutputColumns * pLayer->nOutputRows);
+            AddBufferEntry(vBuffer,
+                           LayerName(pLayer),
+                           "pOutputsIntermediate",
+                           pLayer->pOutputsIntermediate,
+                           pLayer->nBytesPerIntermediateOutput * pLayer->nOutputColumns * pLayer->nOutputRows);
+            if ((pLayer->nLayerKind == INTEL_AFFINE) || (pLayer->nLayerKind == INTEL_AFFINE_DIAGONAL)) {
+                uint32_t nBytesWeights =
+                    (pLayer->nLayerKind == INTEL_AFFINE) ? pAffineLayer->affine.nBytesPerWeight * pLayer->nInputRows
+                        * pLayer->nOutputRows : pAffineLayer->affine.nBytesPerWeight * pLayer->nOutputRows;
+                nPWLSegments = pAffineLayer->pwl.nSegments;
+                nWeightWidth = pAffineLayer->affine.nBytesPerWeight;
+                AddBufferEntry(vBuffer, LayerName(pLayer), "pWeights", pAffineLayer->affine.pWeights, nBytesWeights);
+                AddBufferEntry(vBuffer,
+                               LayerName(pLayer),
+                               "pBiases",
+                               pAffineLayer->affine.pBiases,
+                               pAffineLayer->affine.nBytesPerBias * pLayer->nOutputRows);
+                if (nPWLSegments > 0) {
+                    AddBufferEntry(vBuffer,
+                                   LayerName(pLayer),
+                                   "pSegments",
+                                   pAffineLayer->pwl.pSegments,
+                                   sizeof(intel_pwl_segment_t) * nPWLSegments);
+                }
+            } else if (pLayer->nLayerKind == INTEL_INTERLEAVE) {
+            } else if (pLayer->nLayerKind == INTEL_DEINTERLEAVE) {
+            } else {
+                fprintf(stderr, "Error:  layer kind not yet supported in ExportGnaNetworkAndrzej()!\n");
+                exit(EXIT_FAILURE);
+            }
+        }
+        //  write XML network description
+        xml_file << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+        xml_file << "<model>\n\n\n";
+        xml_file << "    <!--Neural network topology definition. -->\n";
+        xml_file << "    <network type=\"2b\" grouping=\"" << pNeuralNetwork->nGroup << "\">\n";
+        for (uint32_t layer = 0; layer < pNeuralNetwork->nLayers; layer++) {
+            intel_nnet_layer_t *pLayer = &pNeuralNetwork->pLayers[layer];
+            intel_affine_layer_t *pAffineLayer = reinterpret_cast<intel_affine_layer_t *>(pLayer->pLayerStruct);
+            //  below is hard-coded for the Google LSTM model -- it is only for debugging
+            std::string sClass = (layer < pNeuralNetwork->nLayers - 1) ? "LSTM_" : "DNN_";
+            std::string sName;
+            uint32_t nGoogleLayer;
+            if (pNeuralNetwork->nGroup == 1) {
+                sName = (layer < pNeuralNetwork->nLayers - 1) ? intel_lstm_projected_layer_name[layer % NUM_LSTM_LAYERS]
+                                                              : "final affine layer";
+                nGoogleLayer = layer / NUM_LSTM_LAYERS;
+            } else if (pNeuralNetwork->nGroup == 4) {
+                sName = (layer < pNeuralNetwork->nLayers - 1) ? intel_lstm_projected_layer_g4_name[layer
+                    % NUM_LSTM_G4_LAYERS] : "final affine layer";
+                nGoogleLayer = layer / NUM_LSTM_G4_LAYERS;
+            } else {
+                  sName = "affine"; sName + std::to_string(layer);
+                  nGoogleLayer = layer;
+                  // fprintf(stderr, "Error:  unsupported grouping factor in ExportGnaNetworkAndrzej()!\n");
+                  // exit(EXIT_FAILURE);
+            }
+            xml_file << "        <layer type=\"" << LayerName(pLayer) << "\" class=\"" << sClass.c_str()
+                     << nGoogleLayer;
+            xml_file << "\" name=\"" << sName.c_str() << "\" grouping=\"" << NumGroupSize(pLayer) << "\">\n";
+            xml_file << "            <input>\n";
+            xml_file << "                <count>" << NumInputs(pLayer) << "</count>\n";
+            xml_file << "                <buffer offset=\"" << BufferOffsetFromAddress(vBuffer, pLayer->pInputs)
+                     << "\">";
+            xml_file << BufferNameFromAddress(vBuffer, pLayer->pInputs) << "</buffer>\n";
+            xml_file << "            </input>\n";
+            xml_file << "            <output>\n";
+            xml_file << "                <count>" << NumOutputs(pLayer) << "</count>\n";
+            xml_file << "                <buffer offset=\"" << BufferOffsetFromAddress(vBuffer, pLayer->pOutputs)
+                     << "\">";
+            xml_file << BufferNameFromAddress(vBuffer, pLayer->pOutputs) << "</buffer>\n";
+            xml_file << "            </output>\n";
+            if (pLayer->pOutputsIntermediate != NULL) {
+                xml_file << "            <temp>\n";
+                xml_file << "                <count>" << NumOutputs(pLayer) << "</count>\n";
+                xml_file << "                <buffer offset=\""
+                         << BufferOffsetFromAddress(vBuffer, pLayer->pOutputsIntermediate) << "\">";
+                xml_file << BufferNameFromAddress(vBuffer, pLayer->pOutputsIntermediate) << "</buffer>\n";
+                xml_file << "            </temp>\n";
+            }
+            if ((pLayer->nLayerKind == INTEL_AFFINE) || (pLayer->nLayerKind == INTEL_AFFINE_DIAGONAL)) {
+                xml_file << "            <weights>" << BufferNameFromAddress(vBuffer, pAffineLayer->affine.pWeights)
+                         << "</weights>\n";
+                xml_file << "            <biases offset=\""
+                         << BufferOffsetFromAddress(vBuffer, pAffineLayer->affine.pBiases) << "\">";
+                xml_file << BufferNameFromAddress(vBuffer, pAffineLayer->affine.pBiases) << "</biases>\n";
+                if (pAffineLayer->pwl.nSegments > 0) {
+                    xml_file << "            <pwl>" << BufferNameFromAddress(vBuffer, pAffineLayer->pwl.pSegments)
+                             << "</pwl>\n";
+                }
+            }
+            xml_file << "        </layer>\n\n";
+        }
+        xml_file << "    </network>\n\n";
+        xml_file.flush();
+
+        //  write buffer list to XML and create data files
+        xml_file << "    <buffers>\n";
+        for (uint32_t i = 0; i < vBuffer.size(); i++) {
+            std::string sName = ptr_name;
+            sName.append("/");
+            sName.append(BufferNameFromAddress(vBuffer, vBuffer.at(i).pAddress));
+            bool found = false;
+            for (uint32_t j = 0; j < i; j++) {
+                std::string sPrevName = BufferNameFromAddress(vBuffer, vBuffer.at(j).pAddress);
+                if (sPrevName.compare(sName) == 0) found = true;
+            }
+            if (!found) {
+                xml_file << "        <buffer>\n";
+                xml_file << "            <name>" << sName << "</name>\n";
+                if (sName.compare(0, 4, "buf_") == 0) {
+                    xml_file << "            <size>" << vBuffer.at(i).nBytes << "</size>\n";
+                } else {
+                    std::string sFileName;
+                    sFileName.append(sName);
+                    sFileName.append(".dat");
+                    xml_file << "            <file>" << sFileName << "</file>\n";
+                    std::ofstream data_file(sFileName.c_str(), std::ios::binary);
+                    data_file.write(reinterpret_cast<char *>(vBuffer.at(i).pAddress), vBuffer.at(i).nBytes);
+                    data_file.close();
+                }
+                xml_file << "        </buffer>\n";
+            }
+        }
+        xml_file << "    </buffers>\n";
+        xml_file << "<!--Temporary scratch buffer is required but not used in this model definition. -->\n";
+        xml_file << "    <scratchpad>\n";
+        xml_file << "        <size>65536</size>\n";
+        xml_file << "    </scratchpad>\n";
+        xml_file << "</model>\n";
+        xml_file.close();
+    } else {
+        fprintf(stderr, "Failed to open %s for writing!\n", ptr_name);
+    }
+}
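+
+// Usage sketch (debug-only; the target directory must already exist):
+//
+//     ExportGnaNetworkAndrzej("dump_dir", &nnet);  // writes dump_dir/model.xml plus .dat blobs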
diff --git a/inference-engine/src/gna_plugin/gna_infer_request.hpp b/inference-engine/src/gna_plugin/gna_infer_request.hpp
new file mode 100644 (file)
index 0000000..ba8e99f
--- /dev/null
@@ -0,0 +1,73 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <map>
+
+#include "cpp_interfaces/impl/ie_infer_request_internal.hpp"
+#include "gna_plugin.hpp"
+
+namespace GNAPluginNS {
+
+class GNAInferRequest : public InferenceEngine::AsyncInferRequestInternal {
+    std::shared_ptr<GNAPlugin> plg;
+    uint32_t inferRequestIdx = -1;
+
+ public:
+    GNAInferRequest(const std::shared_ptr<GNAPlugin>& plg,
+                    InferenceEngine::InputsDataMap networkInputs,
+                    InferenceEngine::OutputsDataMap networkOutputs)
+        : InferenceEngine::AsyncInferRequestInternal(networkInputs, networkOutputs), plg(plg) {
+        // TODO: internal connection API - better to generalize
+        if (networkOutputs.empty()) {
+            THROW_GNA_EXCEPTION << "GNAInferRequest :: network has zero outputs";
+        }
+        if (networkInputs.empty()) {
+            THROW_GNA_EXCEPTION << "GNAInferRequest :: network has zero inputs";
+        }
+
+        // copy input blobs, since we need them in a separate address space to allow simultaneous infer requests
+        _outputs[_networkOutputs.begin()->first] = plg->GetOutputBlob(networkOutputs.begin()->second->getPrecision());
+        _inputs[_networkInputs.begin()->first] = plg->GetInputBlob(networkInputs.begin()->second->getInputPrecision());
+    }
+    /**
+     * @brief Infers specified input(s) in synchronous mode
+     * @note blocks all method of IInferRequest while request is ongoing (running or waiting in queue)
+     */
+    void InferImpl() override {
+        // execute input pre-processing.
+        execDataPreprocessing(_inputs);
+        plg->Infer(_inputs, _outputs);
+    }
+
+    /**
+     * @brief Queries per-layer performance measures to identify the most time-consuming layer.
+     *  Note: not all plugins may provide meaningful data
+     *  @param perfMap - a map of layer names to profiling information for that layer.
+     */
+    void GetPerformanceCounts(std::map<std::string,
+                                               InferenceEngine::InferenceEngineProfileInfo> &perfMap) const override {
+        plg->GetPerformanceCounts(perfMap);
+    }
+
+    /**
+     * @brief methods with the _ThreadUnsafe prefix are to be implemented in plugins
+     * or in the default wrapper (e.g. AsyncInferRequestThreadSafeDefault)
+     */
+    void StartAsyncImpl() override {
+        // execute input pre-processing.
+        execDataPreprocessing(_inputs);
+        inferRequestIdx = plg->QueueInference(_inputs, _outputs);
+    }
+
+    InferenceEngine::StatusCode Wait(int64_t millis_timeout) override {
+        if (inferRequestIdx == -1) return InferenceEngine::INFER_NOT_STARTED;
+        plg->Wait(inferRequestIdx);
+        return InferenceEngine::OK;
+    }
+};
+}  // namespace GNAPluginNS
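+
+// Async usage sketch (hedged; the request is created by GNAExecutableNetwork):
+//
+//     request->StartAsyncImpl();              // preprocesses inputs and queues inference
+//     request->Wait(/*millis_timeout=*/0);    // blocks until the queued id completes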
diff --git a/inference-engine/src/gna_plugin/gna_layer_info.hpp b/inference-engine/src/gna_plugin/gna_layer_info.hpp
new file mode 100644 (file)
index 0000000..7e6da43
--- /dev/null
@@ -0,0 +1,206 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+#include <memory>
+#include "inference_engine.hpp"
+#include "details/caseless.hpp"
+#include "ie_algorithm.hpp"
+
+
+namespace GNAPluginNS {
+
+/**
+ * @brief detects const pointers for dynamic_cast operations
+ * @tparam T
+ */
+template <class T>
+struct is_const_pointer : public std::false_type{
+};
+
+template <class T>
+struct is_const_pointer<const T *> : public std::true_type{
+};
+
+
+/**
+ * Similar to the type traits in the standard library, this class provides per-layer-type details, with some attributes specific to GNA.
+ * Compile-time performance is not needed for this yet.
+ */
+class LayerInfo {
+    InferenceEngine::CNNLayer * layer;
+
+#define IS_VALID() if (nullptr == layer) return false
+
+ public:
+    explicit LayerInfo(InferenceEngine::CNNLayer & layer)
+        : LayerInfo(&layer) {
+    }
+    explicit LayerInfo(const InferenceEngine::CNNLayerPtr & layer)
+        : LayerInfo(layer.get()) {
+    }
+    explicit LayerInfo(InferenceEngine::CNNLayer * layer)
+        : layer(layer) {
+    }
+    bool has16BOutput() const noexcept {
+        IS_VALID();
+        static InferenceEngine::details::caseless_set<std::string> layersWith16BOutputs = {"memory", "input", "split", "slice", "concat", "copy"};
+        return layersWith16BOutputs.find(layer->type) != layersWith16BOutputs.end() ||
+               isActivation() ||
+               (isCrop() && !isCropAffined());
+    }
+    bool has32BOutput() const noexcept {
+        IS_VALID();
+        static InferenceEngine::details::caseless_set<std::string> layersWith32BOutputs =
+                {"FullyConnected", "InnerProduct", "Eltwise", "ScaleShift", "Convolution", "Pooling"};
+        return (layersWith32BOutputs.find(layer->type) != layersWith32BOutputs.end()) ||
+               (isCrop() && isCropAffined());
+    }
+    static bool isBatchSizeConstrained(const std::string &name) {
+        static InferenceEngine::details::caseless_set<std::string> layersWithConstraints = {"memory", "convolution"};
+        return layersWithConstraints.find(name) != layersWithConstraints.end();
+    }
+    bool isActivation() const noexcept {
+        IS_VALID();
+        static InferenceEngine::details::caseless_set<std::string> activations = {"clamp", "sigmoid", "identity", "relu", "leakyrelu", "tanh", "prelu"};
+        return activations.find(layer->type) != activations.end();
+    }
+    bool isRelu() const noexcept {
+        IS_VALID();
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "relu");
+    }
+    bool isConvolution() const noexcept {
+        IS_VALID();
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "convolution");
+    }
+    bool isPower() const noexcept {
+        IS_VALID();
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "power");
+    }
+    bool has32BInput() const noexcept {
+        IS_VALID();
+        return isActivation() || isPooling();
+    }
+    bool isInput() const noexcept {
+        IS_VALID();
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "input");
+    }
+    bool isEltwise() const noexcept {
+        IS_VALID();
+        return nullptr != as<const InferenceEngine::EltwiseLayer*>();
+    }
+    bool isEltwiseSum() const noexcept {
+        IS_VALID();
+        if (!isEltwise()) return false;
+        return dynamic_cast<const InferenceEngine::EltwiseLayer*>(layer)->_operation ==
+            InferenceEngine::EltwiseLayer::Sum;
+    }
+    bool isEltwiseMul() const noexcept {
+        IS_VALID();
+        if (!isEltwise()) return false;
+        return dynamic_cast<const InferenceEngine::EltwiseLayer*>(layer)->_operation ==
+            InferenceEngine::EltwiseLayer::Prod;
+    }
+    bool isIdentity() const noexcept {
+        IS_VALID();
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "identity");
+    }
+    bool isFullyConnected() const noexcept {
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "FullyConnected") ||
+                InferenceEngine::details::CaselessEq<std::string>()(layer->type, "InnerProduct");
+    }
+    bool isConvolutional() const noexcept {
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "Convolution");
+    }
+    bool isSplit() const noexcept {
+        IS_VALID();
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "split");
+    }
+    bool isSlice() const noexcept {
+        IS_VALID();
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "slice");
+    }
+    bool isConcat() const noexcept {
+        IS_VALID();
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "concat");
+    }
+    bool isReshape() const noexcept {
+        IS_VALID();
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "reshape");
+    }
+    bool isPermute() const noexcept {
+        IS_VALID();
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "permute");
+    }
+    bool isPooling() const noexcept {
+        IS_VALID();
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "Pooling");
+    }
+    bool isMaxPooling() const noexcept {
+        IS_VALID();
+        if (!isPooling()) return false;
+        return as<const InferenceEngine::PoolingLayer*>()->_type == InferenceEngine::PoolingLayer::MAX;
+    }
+    bool isMemory() const noexcept {
+        IS_VALID();
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "memory");
+    }
+    bool isCrop() const noexcept {
+        IS_VALID();
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "crop");
+    }
+    bool isCropAffined() const noexcept {
+        auto cropLayer = dynamic_cast<InferenceEngine::CropLayer *> (layer);
+        if (cropLayer == nullptr) return false;  // guard: not a crop layer
+        size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size();
+        return (ALIGN(cropOffset, 8) != cropOffset);
+    }
+    bool isCopy() const noexcept {
+        IS_VALID();
+        return InferenceEngine::details::CaselessEq<std::string>()(layer->type, "copy");
+    }
+    size_t paddingSize() const noexcept {
+        static InferenceEngine::details::caseless_set<std::string> layersWithPossiblePadding = {"FullyConnected",
+                                                                        "InnerProduct",
+                                                                             "Pooling",
+                                                                         "Convolution"};
+        if (layersWithPossiblePadding.find(layer->type) != layersWithPossiblePadding.end()) {
+            size_t size_without_padding = 0;
+            auto inputs = layer->insData.begin()->lock();
+            if (inputs) {
+                size_without_padding = InferenceEngine::details::product(begin(inputs->dims),
+                                                                   end(inputs->dims));
+            }
+            return ALIGN(size_without_padding, 8) - size_without_padding;
+        }
+        return 0;
+    }
+    template <class T>
+    typename std::enable_if<!is_const_pointer<T>::value, T>::type as() noexcept {
+        return dynamic_cast<T>(layer);
+    }
+    template <class T>
+    typename std::enable_if<is_const_pointer<T>::value, T>::type as() const noexcept {
+        return dynamic_cast<T>(layer);
+    }
+    operator InferenceEngine::CNNLayer *() noexcept {
+        return layer;
+    }
+    operator const InferenceEngine::CNNLayer *() const noexcept {
+        return layer;
+    }
+    operator InferenceEngine::CNNLayerPtr () const noexcept {
+        return std::shared_ptr<InferenceEngine::CNNLayer>(layer, [] (InferenceEngine::CNNLayer * p) {});
+    }
+
+    #undef IS_VALID
+};
+
+inline std::ostream & operator <<(std::ostream &os, const LayerInfo & info) {
+    os << static_cast<const InferenceEngine::CNNLayer*>(info)->name;
+    return os;
+}
+
+}  // namespace GNAPluginNS
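+
+// Usage sketch (assumes a CNNLayerPtr `layer` taken from the network):
+//
+//     GNAPluginNS::LayerInfo info(layer);
+//     if (info.isActivation() || (info.isCrop() && !info.isCropAffined())) {
+//         // 16-bit output path, mirroring has16BOutput() above
+//     }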
diff --git a/inference-engine/src/gna_plugin/gna_mem_requests.hpp b/inference-engine/src/gna_plugin/gna_mem_requests.hpp
new file mode 100644 (file)
index 0000000..24163dc
--- /dev/null
@@ -0,0 +1,175 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <list>
+#include <vector>
+#include <algorithm>
+#include <functional>
+
+namespace GNAPluginNS {
+
+enum rType {
+    REQUEST_STORE,
+    REQUEST_ALLOCATE,
+    REQUEST_BIND,
+    REQUEST_INITIALIZER,
+};
+/**
+ * @brief region of firmware data
+ */
+enum rRegion {
+    REGION_RO,
+    REGION_RW,
+    REGION_AUTO,
+};
+
+struct MemRequest {
+    rType _type;
+    rRegion  _region;
+    void *_ptr_out;
+    const void *_ptr_in = nullptr;
+    std::function<void(void * data, size_t size)> _initializer;
+    // holds arbitrary value
+    std::vector<uint8_t> _data;
+    uint8_t _element_size;
+    size_t _num_elements;
+    size_t _alignment;
+    size_t _offset;
+    // expansion in bytes due to large dependent layers
+    size_t _padding = 0;
+    MemRequest(rRegion region,
+                rType req,
+                void *ptr_out,
+                const void *ptr_in,
+                uint8_t element_size = 0,
+                size_t num_elements = 0,
+                size_t alignment = 1,
+                size_t offset = 0) : _region(region),
+                                     _type(req),
+                                     _ptr_out(ptr_out),
+                                     _ptr_in(ptr_in),
+                                     _element_size(element_size),
+                                     _num_elements(num_elements),
+                                     _alignment(alignment),
+                                     _offset(offset) {}
+
+    /**
+     * Store-value-only request
+     * @tparam T
+     * @param region
+     * @param ptr_out
+     * @param element
+     * @param num_elements
+     * @param alignment
+     */
+    template<class T>
+    MemRequest(rRegion region,
+                void *ptr_out,
+                T element,
+                size_t num_elements,
+                size_t alignment = 1) : _region(region),
+                                        _type(REQUEST_STORE),
+                                        _ptr_out(ptr_out),
+                                        _element_size(sizeof(T)),
+                                        _num_elements(num_elements),
+                                        _alignment(alignment) {
+        _data.resize(sizeof(T));
+        std::copy(reinterpret_cast<uint8_t *>(&element), reinterpret_cast<uint8_t *>(&element) + sizeof(T), _data.begin());
+    }
+    /**
+     * Store-initializer request
+     * @param region
+     * @param ptr_out
+     * @param regionSize
+     * @param initializer
+     * @param alignment
+     */
+    MemRequest(rRegion region,
+               void   *ptr_out,
+               size_t  regionSize,
+               std::function<void(void * data, size_t size)> initializer,
+               size_t  alignment = 1) : _region(region),
+                                        _type(REQUEST_INITIALIZER),
+                                        _ptr_out(ptr_out),
+                                        _element_size(1),
+                                        _num_elements(regionSize),
+                                        _alignment(alignment),
+                                        _initializer(initializer) {
+    }
+};
+
+/**
+ * Adapter for requests submission and actual request queue
+ */
+class GNAMemRequestsQueue {
+ public:
+    virtual ~GNAMemRequestsQueue() {}
+
+    /**
+     * @brief registers an initializer that fills the memory once it is actually allocated
+     * @param ptr_out
+     * @param num_bytes
+     * @param initializer
+     * @param alignment
+     */
+    void push_initializer(void *ptr_out, size_t num_bytes, std::function<void(void * data, size_t size)> initializer, size_t alignment = 1) {
+        futureHeap().push_back({regionType(), ptr_out, num_bytes, initializer, alignment});
+    }
+
+    void push_ptr(void *ptr_out, const void *ptr_in, size_t num_bytes, size_t alignment = 1) {
+        futureHeap().push_back({regionType(), REQUEST_STORE, ptr_out, ptr_in, 1, num_bytes, alignment});
+    }
+
+    /**
+     * copy input to intermediate buffer
+     * @param ptr_out
+     * @param ptr_in
+     * @param num_bytes
+     */
+    void push_local_ptr(void *ptr_out, const void *ptr_in, size_t num_bytes, size_t alignment = 1) {
+        localStorage().emplace_back(reinterpret_cast<const uint8_t *>(ptr_in),
+                                    reinterpret_cast<const uint8_t *>(ptr_in) + num_bytes);
+        futureHeap().push_back({regionType(), REQUEST_STORE, ptr_out, &localStorage().back().front(), 1, num_bytes, alignment});
+    }
+
+    /**
+     * @brief reserves a buffer; *ptr_out is patched to the allocated address on commit()
+     * @param ptr_out
+     * @param num_bytes
+     */
+    void reserve_ptr(void *ptr_out, size_t num_bytes)  {
+        futureHeap().push_back({regionType(), REQUEST_ALLOCATE, ptr_out, nullptr, 1, num_bytes});
+    }
+
+    /**
+     *
+     * @param source
+     * @param dest - source is bound to the dest pointer after allocation
+     * @param offset - offset in bytes within source that will be set in dest
+     * @param num_bytes - a bind can request a bigger buffer than originally allocated via reserve();
+     *      if that happens, the reserved request parameters are updated before committing memory
+     */
+    void bind_ptr(void *source, const void *dest, size_t offset = 0, size_t num_bytes = 0)  {
+        futureHeap().push_back({regionType(), REQUEST_BIND, source, dest, 1, num_bytes, 1, offset});
+    }
+    /**
+     * @brief allocates a buffer and sets all its elements to the given T value
+     */
+    template<class T>
+    void push_value(void *ptr_out, T value, size_t num_elements, size_t alignment = 1) {
+        futureHeap().push_back({regionType(), ptr_out, value, num_elements, alignment});
+    }
+
+    /**
+     * @brief interface for actual queue storage
+     */
+    virtual rRegion regionType() const = 0;
+    virtual std::vector<MemRequest> & futureHeap()  = 0;
+    virtual std::list<std::vector<char>> &localStorage() = 0;
+};
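+
+// Request-recording sketch (pointers are placeholders; `queue` is any
+// GNAMemRequestsQueue, e.g. GNAMemory's read/write front end; nothing is
+// allocated until GNAMemory::commit() runs):
+//
+//     void *weights = nullptr, *view = nullptr;
+//     queue.push_value<int16_t>(&weights, 0, 256, 64);  // 256 zeroed int16 elements, 64-byte aligned
+//     queue.bind_ptr(&view, &weights, /*offset=*/128);  // view resolves after commit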
+
+
+
+}  // namespace GNAPluginNS
\ No newline at end of file
diff --git a/inference-engine/src/gna_plugin/gna_memory.hpp b/inference-engine/src/gna_plugin/gna_memory.hpp
new file mode 100644 (file)
index 0000000..d1c9650
--- /dev/null
@@ -0,0 +1,227 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include "gna_mem_requests.hpp"
+#include <memory>
+#include <vector>
+#include <list>
+#include <algorithm>
+#include <functional>
+
+/**
+ * Pads a memory size up to a multiple of the given number of bytes
+ *
+ * Please always use this padding macro for consistency
+ *
+ * @memSize size (in bytes) of memory to be padded
+ * @pad     number of bytes to pad to (the alignment)
+ * @return  memory size (in bytes) padded to the given value
+ */
+#ifndef ALIGN
+# define ALIGN(memSize, pad)   (static_cast<int>(((memSize) + (pad) - 1) / (pad)) * (pad))
+#endif
+
+namespace GNAPluginNS {
+
+
+
+/**
+ * @brief encapsulate various request to allocate GNA specific memory,
+ * in order to issue single allocation call and configure actual pointers in requests
+ * @tparam Allocator - a GNAAllocator in case of actual HW offloads
+ */
+template<class Allocator = std::allocator<uint8_t>>
+class GNAMemory : public GNAMemRequestsQueue {
+    std::vector<MemRequest> _future_heap;
+    std::list<std::vector<char>> _local_storage;
+    size_t _total = 0;
+    size_t _rw_section_size = 0;
+    size_t _ro_section_size = 0;
+    Allocator _allocator;
+    std::shared_ptr<uint8_t> heap;
+    size_t _page_alignment = 1;
+
+    class GNAMemRequestsReadOnlyQueue : public GNAMemRequestsQueue {
+        std::reference_wrapper<GNAMemRequestsQueue> _that;
+     public:
+        explicit GNAMemRequestsReadOnlyQueue(GNAMemory & that) : _that(that) {
+        }
+        rRegion regionType() const override {
+            return REGION_RO;
+        };
+        std::vector<MemRequest> & futureHeap()  override {
+            return _that.get().futureHeap();
+        }
+        std::list<std::vector<char>> &localStorage() override {
+            return _that.get().localStorage();
+        }
+    };
+
+    GNAMemRequestsReadOnlyQueue readOnlyFrontEnd;
+
+ public:
+    explicit GNAMemory(size_t pageAlignment = 1)
+        : readOnlyFrontEnd(*this), _page_alignment(pageAlignment) {}
+
+    explicit GNAMemory(const Allocator &a, size_t pageAlignment = 1)
+        : _allocator(a), readOnlyFrontEnd(*this), _page_alignment(pageAlignment) {}
+
+    GNAMemRequestsQueue & readonly() {
+        return readOnlyFrontEnd;
+    }
+
+    /**
+     * @brief calculates size required for all requests, allocates memory and updates pointers
+     */
+    void commit() {
+        // 1st stage -- looking for expandable bind requests:
+        for (auto &originated : _future_heap) {
+            if (originated._type == REQUEST_BIND) continue;
+            size_t offset = 0;
+            iterate_binded(originated, [&](MemRequest & reference, MemRequest & binded) {
+                if (&originated == &reference) {
+                    offset = 0;
+                }
+                offset += binded._offset;
+                auto current = offset + ALIGN(binded._num_elements * binded._element_size, binded._alignment);
+                auto original_no_pad = ALIGN(originated._num_elements * originated._element_size, originated._alignment);
+                auto original_with_pad = ALIGN(originated._num_elements * originated._element_size + originated._padding, originated._alignment);
+
+                originated._padding = ALIGN(std::max(original_with_pad, current), originated._alignment) - original_no_pad;
+            });
+        }
+
+        updateSectionsSizes();
+
+        _total = _rw_section_size + _ro_section_size;
+
+        // single allocation; allocate() zero-fills the memory internally
+        heap = allocate(_total);
+        auto setupOffsets = [&](std::function<bool(MemRequest & request)> filter, size_t offset) {
+            for (auto &re : _future_heap) {
+                if (re._type == REQUEST_BIND) continue;
+                if (filter(re)) continue;
+
+                auto sz = re._element_size * re._num_elements;
+
+                if (re._ptr_out != nullptr) {
+                    auto cptr = heap.get() + offset;
+                    *reinterpret_cast<void **>(re._ptr_out) = cptr;
+                    // std::cout << "ALLOCATED=" << cptr << ", size=" << re._element_size * re._num_elements << "\n";
+                    iterate_binded(re, [](MemRequest & reference, MemRequest & binded) {
+                        *reinterpret_cast<void **>(binded._ptr_out) =
+                            binded._offset + reinterpret_cast<uint8_t *>(*reinterpret_cast<void **>(reference._ptr_out));
+                    });
+
+                    // std::cout << "size=" << ALIGN(sz, re._alignment) << "\n" << std::flush;
+
+                    switch (re._type) {
+                        case REQUEST_ALLOCATE :break;
+                        case REQUEST_STORE : {
+                            if (re._ptr_in != nullptr) {
+                                memcpy(cptr, re._ptr_in, sz);
+                            } else {
+                                size_t of = 0;
+                                for (int i = 0; i < re._num_elements; i++, of += re._element_size) {
+                                    std::copy(std::begin(re._data), std::end(re._data), cptr + of);
+                                }
+                            }
+                            break;
+                        }
+                        case REQUEST_INITIALIZER : {
+                            re._initializer(cptr, sz);
+                            break;
+                        }
+                    }
+                }
+
+                offset += ALIGN(sz + re._padding, re._alignment);
+            }
+        };
+
+        setupOffsets([](MemRequest & request) {
+            return request._region != REGION_RW;
+        }, 0);
+
+        setupOffsets([](MemRequest & request) {
+            return request._region != REGION_RO;
+        }, _rw_section_size);
+    }
+
+    void *getBasePtr() {
+        return heap.get();
+    }
+
+    size_t getRWBytes() {
+        updateSectionsSizes();
+        return _rw_section_size;
+    }
+
+    size_t getTotalBytes() {
+        updateSectionsSizes();
+        return _total;
+    }
+
+ protected:
+    rRegion regionType() const override {
+        return REGION_RW;
+    }
+    std::vector<MemRequest> & futureHeap()  override {
+        return _future_heap;
+    }
+    std::list<std::vector<char>> &localStorage() override {
+        return _local_storage;
+    }
+
+    template<class T>
+    void iterate_binded(MemRequest & reference, const T & visitor) {
+        for (auto &re : _future_heap) {
+            if (re._type == REQUEST_BIND && re._ptr_in == reference._ptr_out) {
+                // std::cout << "  [binded=" << re._ptr_out <<"]\n";
+                visitor(reference, re);
+                // TODO: no circular-dependency checking; only tree-style dependencies are supported
+                iterate_binded(re, visitor);
+            }
+        }
+    }
+
+
+    std::shared_ptr<uint8_t> allocate(size_t bytes) {
+        std::shared_ptr<uint8_t> sp(_allocator.allocate(bytes), [=](uint8_t *p) {
+            _allocator.deallocate(p, bytes);
+        });
+        std::fill(sp.get(), sp.get() + bytes, 0);
+        return sp;
+    }
+
+ protected:
+    void updateSectionsSizes() {
+        // count total size and size of read/write regions
+        _rw_section_size = 0;
+        _ro_section_size = 0;
+        for (auto &re : _future_heap) {
+            auto current = ALIGN(re._num_elements * re._element_size + re._padding, re._alignment);
+#ifdef GNA_HEAP_PROFILER
+            std::cout << "chunk: " << " region: " << re._region << ", " <<
+                    "type: " << (re._type  == REQUEST_STORE ? "store " : re._type == REQUEST_BIND ? "bind  " : "alloc ") <<
+                    std::setw(10) << re._num_elements << ", " <<
+                    static_cast<int>(re._element_size) << ", " <<
+                    re._padding << ", " <<
+                    re._offset << ", " <<
+                    re._alignment << std::endl;
+#endif
+            if (re._type == REQUEST_BIND) continue;
+
+            if (re._region == REGION_RW) {
+                _rw_section_size += current;
+            } else {
+                _ro_section_size += current;
+            }
+        }
+        _rw_section_size = ALIGN(_rw_section_size, _page_alignment);
+        _ro_section_size = ALIGN(_ro_section_size, _page_alignment);
+    }
+};
+}  // namespace GNAPluginNS
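The allocator above follows a two-phase protocol: layers enqueue requests against the read/write or read-only front end, and commit() then sizes both regions, allocates a single zero-filled heap, and patches every registered pointer slot (including bound requests). A minimal usage sketch, assuming GNAMemory is parameterized over an std-style allocator and using the push_value() request API seen in gna_plugin.cpp later in this patch (the exact signature is inferred from that usage and may differ):

#include <memory>   // std::allocator

void gnaMemorySketch() {
    GNAPluginNS::GNAMemory<std::allocator<uint8_t>> mem(std::allocator<uint8_t>(), 4096);
    void *biases = nullptr;                              // slot that commit() will patch
    mem.readonly().push_value(&biases, 0.0f, 1024, 64);  // request: 1024 zero floats, 64-byte aligned, RO region
    mem.commit();                                        // sizes RW+RO sections, allocates, patches 'biases'
    // 'biases' now points inside [mem.getBasePtr(), mem.getBasePtr() + mem.getTotalBytes())
}

In the plugin code below, the first argument is usually a void* that InitAffineComponent has already aimed at the component field to patch; taking the address of a local slot, as here, is the same idea.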
diff --git a/inference-engine/src/gna_plugin/gna_memory_state.hpp b/inference-engine/src/gna_plugin/gna_memory_state.hpp
new file mode 100644 (file)
index 0000000..7edcb02
--- /dev/null
@@ -0,0 +1,25 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <cpp_interfaces/impl/ie_memory_state_internal.hpp>
+#include "gna_plugin.hpp"
+
+namespace  GNAPluginNS {
+
+class GNAMemoryState : public InferenceEngine::MemoryStateInternal {
+    std::shared_ptr<GNAPlugin> plg;
+ public:
+    using Ptr = InferenceEngine::MemoryStateInternal::Ptr;
+
+    explicit GNAMemoryState(std::shared_ptr<GNAPlugin> plg)
+        : InferenceEngine::MemoryStateInternal("GNAResetState"), plg(plg) {}
+    void Reset() override {
+        plg->Reset();
+    }
+};
+
+}  // namespace GNAPluginNS
\ No newline at end of file
diff --git a/inference-engine/src/gna_plugin/gna_model_serial.cpp b/inference-engine/src/gna_plugin/gna_model_serial.cpp
new file mode 100644 (file)
index 0000000..3b14b8c
--- /dev/null
@@ -0,0 +1,320 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include <array>
+#include <details/ie_exception.hpp>
+#include <ios>
+#include <iomanip>
+#ifndef _WIN32
+#include <mm_malloc.h>
+#endif
+#include <gna-api-types-xnn.h>
+#include "gna_model_serial.hpp"
+#include "gna_plugin_log.hpp"
+
+template <class T>
+inline void writeBits(const T & obj, std::ostream & os) {
+    os.write(reinterpret_cast<const char *>(&obj), sizeof(T));
+}
+
+template <class T>
+inline void readBits(T & obj, std::istream & is) {
+    is.read(reinterpret_cast<char *>(&obj), sizeof(T));
+}
+
+template <int nBits, class T>
+inline void readNBits(T & obj, std::istream & is) {
+    std::array<uint8_t, nBits / 8> tmp;
+    is.read(reinterpret_cast<char *>(&tmp), nBits / 8);
+
+    obj = * reinterpret_cast<T*>(&tmp.front());
+}
+
+template <class T>
+inline void readOffset(T & ptr, void *base,  std::istream & is) {
+    uint64_t offset = 0ull;
+    readBits(offset, is);
+    ptr = reinterpret_cast<T>(reinterpret_cast<uint8_t *>(base) + offset);
+}
+
+union {
+    uint16_t s;
+    uint8_t  c[2];
+} constexpr static  LECheck {1};
+
+bool is_little_endian() {
+    return LECheck.c[0] == 1;
+}
+
+const int gna_header_magic = is_little_endian() ?  0x4d414e47 : 0x474e414d;
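The magic check is byte-order aware: the stream always carries the four ASCII bytes 'G', 'N', 'A', 'M', and gna_header_magic is precomputed so a single integer comparison works on either endianness. A standalone illustration (not part of the patch) of why the little-endian constant is 0x4d414e47:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    const char gnam[4] = {'G', 'N', 'A', 'M'};  // file bytes: 0x47 0x4e 0x41 0x4d
    uint32_t magic = 0;
    std::memcpy(&magic, gnam, sizeof(magic));
    // a little-endian host puts 'G' in the low byte -> 0x4d414e47;
    // a big-endian host reads the same bytes back as 0x474e414d
    assert(magic == 0x4d414e47u || magic == 0x474e414du);
    return 0;
}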
+
+ModelHeader GNAModelSerial::ReadHeader(std::istream &is) {
+    is.exceptions(std::istream::failbit);
+
+    ModelHeader header;
+    readBits(header, is);
+    if (*reinterpret_cast<int*>(header.gnam) != gna_header_magic) {
+        THROW_GNA_EXCEPTION << "Imported file unsupported: magic number should be GNAM(0x474e414d), but was 0x"
+                           << std::setfill('0') <<
+                           std::hex << std::setw(2) << static_cast<short>(header.gnam[0]) <<
+                           std::hex << std::setw(2) << static_cast<short>(header.gnam[1]) <<
+                           std::hex << std::setw(2) << static_cast<short>(header.gnam[2]) <<
+                           std::hex << std::setw(2) << static_cast<short>(header.gnam[3]);
+    }
+    if (header.version.major < 1) {
+        THROW_GNA_EXCEPTION << "Imported file unsupported: major version sould be > 1";
+    }
+    if (header.headerSize < sizeof(header)) {
+        THROW_GNA_EXCEPTION << "Unsupported header size minimal value is : " << sizeof (header) << ", but read: " << header.headerSize;
+    }
+    /*
+     * any extra data needs to be added to a new header version, with this check adjusted accordingly
+     */
+
+    // forward compatibility: skip any extra header bytes appended by newer exporters
+    if (header.headerSize > sizeof(header)) {
+        is.seekg(header.headerSize - sizeof(header), std::ios_base::cur);
+    }
+    return header;
+}
+
+void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize,  std::istream & is) {
+    is.exceptions(std::istream::failbit);
+
+    auto readPwl = [&is, basePointer] (intel_pwl_func_t & value) {
+        readBits(value.nSegments, is);
+        if (value.nSegments != 0) {
+            readOffset(value.pSegments, basePointer, is);
+        } else {
+            value.pSegments = nullptr;
+        }
+    };
+
+    for (auto layer = ptr_nnet->pLayers; layer != ptr_nnet->pLayers + ptr_nnet->nLayers; ++layer) {
+        readBits(layer->nInputColumns, is);
+        readBits(layer->nInputRows, is);
+        readBits(layer->nOutputColumns, is);
+        readBits(layer->nOutputRows, is);
+        readBits(layer->nBytesPerInput, is);
+        readBits(layer->nBytesPerOutput, is);
+        readBits(layer->nBytesPerIntermediateOutput, is);
+        readNBits<32>(layer->nLayerKind, is);
+
+        // reading layer structs
+        switch (layer->nLayerKind) {
+            case INTEL_AFFINE_DIAGONAL:
+            case INTEL_AFFINE: {
+                layer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64);
+                if (layer->pLayerStruct == nullptr) {
+                    THROW_GNA_EXCEPTION << "could not allocate memory for intel_affine_layer_t structure.";
+                }
+
+                auto &affine = *reinterpret_cast<intel_affine_layer_t *>(layer->pLayerStruct);
+                readBits(affine.affine.nBytesPerWeight, is);
+                readBits(affine.affine.nBytesPerBias, is);
+                readOffset(affine.affine.pWeights, basePointer, is);
+                readOffset(affine.affine.pBiases, basePointer, is);
+                readPwl(affine.pwl);
+                break;
+            }
+            case INTEL_CONVOLUTIONAL: {
+                layer->pLayerStruct = _mm_malloc(sizeof(intel_convolutional_layer_t), 64);
+                if (layer->pLayerStruct == nullptr) {
+                    THROW_GNA_EXCEPTION <<"could not allocate memory for intel_convolutional_layer_t structure.";
+                }
+
+                auto &convolution = *reinterpret_cast<intel_convolutional_layer_t *>(layer->pLayerStruct);
+                readBits(convolution.nFilterCoefficients, is);
+                readBits(convolution.nBytesFilterCoefficient, is);
+                readBits(convolution.nBytesBias, is);
+                readBits(convolution.nFilters, is);
+                readBits(convolution.nFeatureMaps, is);
+                readBits(convolution.nFeatureMapRows, is);
+                readBits(convolution.nFeatureMapColumns, is);
+                readBits(convolution.nFilterRows, is);
+                readOffset(convolution.pFilters, basePointer, is);
+                readOffset(convolution.pBiases, basePointer, is);
+                readBits(convolution.nPoolSize, is);
+                readBits(convolution.nPoolStride, is);
+                readBits(convolution.poolType, is);
+                readPwl(convolution.pwl);
+                break;
+            }
+
+            case INTEL_RECURRENT:
+                THROW_GNA_EXCEPTION << "Importing of recurrent layer not supported";
+            case INTEL_INTERLEAVE:
+                THROW_GNA_EXCEPTION << "Importing of interleave layer not supported";
+            case INTEL_DEINTERLEAVE:
+                THROW_GNA_EXCEPTION << "Importing of deinterleave layer not supported";
+            case INTEL_COPY:
+                THROW_GNA_EXCEPTION << "Importing of copy layer not supported";
+            default:
+                THROW_GNA_EXCEPTION << "Importing of unknown GNA layer kind(" << layer->nLayerKind << ")  not supported";
+        }
+
+        // reading offsets of inputs/outputs
+        readOffset(layer->pInputs, basePointer, is);
+        readOffset(layer->pOutputsIntermediate, basePointer, is);
+        readOffset(layer->pOutputs, basePointer, is);
+    }
+
+    // reading memory state information
+    uint32_t nStates = 0;
+    readBits(nStates, is);
+    if (pstates != nullptr) {
+        pstates->resize(nStates);
+    }
+
+    for (uint32_t i = 0; i != nStates; i++) {
+       void *pSegment;
+       readOffset(pSegment, basePointer, is);
+       uint32_t segmentSz;
+       readBits(segmentSz, is);
+       if (pstates) {
+           (*pstates)[i] = {pSegment, segmentSz};
+       }
+    }
+
+
+    // once the structure has been read, read the whole gna graph blob
+    is.read(reinterpret_cast<char*>(basePointer), gnaGraphSize);
+}
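Import expects the caller to have parsed the header and prepared both the graph region and the layer array. A hedged sketch of that driving sequence, with _mm_malloc standing in for the real GNA allocator and the intel_nnet_type_t field names taken from their usage in this patch:

#include <fstream>

void importSketch() {
    std::ifstream is("model.gna", std::ios::binary);
    ModelHeader header = GNAModelSerial::ReadHeader(is);  // also skips extra bytes of newer headers

    intel_nnet_type_t nnet;
    nnet.nGroup  = header.nGroup;
    nnet.nLayers = static_cast<uint32_t>(header.layersCount);
    nnet.pLayers = reinterpret_cast<intel_nnet_layer_t *>(
        _mm_malloc(nnet.nLayers * sizeof(intel_nnet_layer_t), 64));
    void *base = _mm_malloc(header.gnaMemSize, 4096);     // graph blob, page aligned

    GNAModelSerial::MemoryType states;
    GNAModelSerial serial(&nnet, states);
    serial.Import(base, header.gnaMemSize, is);           // patches every stored offset relative to 'base'
}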
+
+#define offsetFromBase(field)\
+getOffsetFromBase(field, #field)
+
+
+/**
+ * @brief serializes the GNA graph to a stream
+ * @param basePointer - base address returned from GNAAlloc(); all stored offsets are relative to it
+ * @param gnaGraphSize - it can be calculated based on the nnet, however that would overcomplicate export;
+ * relative to the base address it is easy to calculate
+ * @param os
+ */
+void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostream & os) const {
+    os.exceptions(std::ostream::failbit);
+
+    std::vector<intel_nnet_layer_t>
+        layers(ptr_nnet->pLayers, ptr_nnet->pLayers + ptr_nnet->nLayers);
+
+
+    // all offsets will be from this pointer
+    auto getOffsetFromBase = [basePointer, &gnaGraphSize](void * pointer, const char * name = nullptr) {
+        auto offset = static_cast<uint64_t >(std::distance(reinterpret_cast<uint8_t*>(basePointer), reinterpret_cast<uint8_t*>(pointer)));
+        if (offset > gnaGraphSize) {
+            THROW_GNA_EXCEPTION << "offset to " << (name == nullptr ? "" : name) << "(0x" << pointer
+                               << ") not in range segment retuned from GNAAlloc(0x" << basePointer << "-0x"
+                               << reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(basePointer) + gnaGraphSize) << ")";
+        }
+        return offset;
+    };
+
+    auto writePwl = [&os, getOffsetFromBase] (intel_pwl_func_t & value) {
+        writeBits(value.nSegments, os);
+        // export requires a valid offset; since an offset from base to nullptr cannot be correct, we do not store it at all
+        if (value.nSegments != 0) {
+            writeBits(offsetFromBase(value.pSegments), os);
+        }
+    };
+
+    auto convert_to_serial = [getOffsetFromBase](const GNAModelSerial::RuntimeEndPoint& ep){
+        ModelHeader::EndPoint out;
+        out.elements_count = ep.elements_count;
+        out.element_size = ep.element_size;
+        out.descriptor_offset = offsetFromBase(ep.descriptor_ptr);
+        out.scaleFactor = ep.scaleFactor;
+        return out;
+    };
+    /**
+     * writing header
+     */
+    ModelHeader header;
+    header.gnam[0] = 'G';
+    header.gnam[1] = 'N';
+    header.gnam[2] = 'A';
+    header.gnam[3] = 'M';
+    header.version.major = HEADER_MAJOR;
+    header.version.minor = HEADER_MINOR;
+    header.gnaMemSize = gnaGraphSize;
+    header.layersCount = layers.size();
+    header.nGroup = ptr_nnet->nGroup;
+    header.input  = convert_to_serial(input);
+    header.output = convert_to_serial(output);
+    header.headerSize = sizeof(ModelHeader);
+    header.nRotateRows = nRotateRows;
+    header.nRotateColumns = nRotateColumns;
+
+
+    writeBits(header, os);
+
+    for (auto & layer : layers) {
+        writeBits(layer.nInputColumns, os);
+        writeBits(layer.nInputRows, os);
+        writeBits(layer.nOutputColumns, os);
+        writeBits(layer.nOutputRows, os);
+        writeBits(layer.nBytesPerInput, os);
+        writeBits(layer.nBytesPerOutput, os);
+        writeBits(layer.nBytesPerIntermediateOutput, os);
+        writeBits(static_cast<uint32_t>(layer.nLayerKind), os);
+
+        // writing layer structs
+        switch (layer.nLayerKind) {
+            case INTEL_AFFINE_DIAGONAL:
+            case INTEL_AFFINE: {
+                auto &affine = *reinterpret_cast<intel_affine_layer_t *>(layer.pLayerStruct);
+                writeBits(affine.affine.nBytesPerWeight, os);
+                writeBits(affine.affine.nBytesPerBias, os);
+                writeBits(offsetFromBase(affine.affine.pWeights), os);
+                writeBits(offsetFromBase(affine.affine.pBiases), os);
+                writePwl(affine.pwl);
+                break;
+            }
+            case INTEL_CONVOLUTIONAL: {
+                auto &convolution = *reinterpret_cast<intel_convolutional_layer_t *>(layer.pLayerStruct);
+                writeBits(convolution.nFilterCoefficients, os);
+                writeBits(convolution.nBytesFilterCoefficient, os);
+                writeBits(convolution.nBytesBias, os);
+                writeBits(convolution.nFilters, os);
+                writeBits(convolution.nFeatureMaps, os);
+                writeBits(convolution.nFeatureMapRows, os);
+                writeBits(convolution.nFeatureMapColumns, os);
+                writeBits(convolution.nFilterRows, os);
+                writeBits(offsetFromBase(convolution.pFilters), os);
+                writeBits(offsetFromBase(convolution.pBiases), os);
+                writeBits(convolution.nPoolSize, os);
+                writeBits(convolution.nPoolStride, os);
+                writeBits(convolution.poolType, os);
+                writePwl(convolution.pwl);
+                break;
+            }
+
+            case INTEL_RECURRENT:
+                THROW_GNA_EXCEPTION << "Exporting of recurrent layer not supported";
+            case INTEL_INTERLEAVE:
+                THROW_GNA_EXCEPTION << "Exporting of interleave layer not supported";
+            case INTEL_DEINTERLEAVE:
+                THROW_GNA_EXCEPTION << "Exporting of deinterleave layer not supported";
+            case INTEL_COPY:
+                THROW_GNA_EXCEPTION << "Exporting of copy layer not supported";
+            default:
+                THROW_GNA_EXCEPTION << "Exporting of unknown GNA layer kind(" << layer.nLayerKind << ")  not supported";
+        }
+
+        // writing offsets from base.
+        writeBits(offsetFromBase(layer.pInputs), os);
+        writeBits(offsetFromBase(layer.pOutputsIntermediate), os);
+        writeBits(offsetFromBase(layer.pOutputs), os);
+    }
+    // writing memory information
+    writeBits(static_cast<uint32_t>(states.size()), os);
+    for (auto && state : states) {
+        writeBits(offsetFromBase(state.first), os);
+        writeBits(state.second, os);
+    }
+
+    // once the structure has been written, push the raw gna graph
+    os.write(reinterpret_cast<char*>(basePointer), gnaGraphSize);
+}
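The export side is symmetric: every pointer inside the GNAAlloc() region is stored as a 64-bit offset from the base (getOffsetFromBase throws if a pointer falls outside the region), followed by the raw graph blob. A hedged invocation sketch using only the API declared in gna_model_serial.hpp below; the descriptors, sizes, and nnet are placeholders, and all descriptors must lie within [basePtr, basePtr + gnaGraphSize):

#include <cstddef>
#include <fstream>

void exportSketch(intel_nnet_type_t &nnet, void *basePtr, size_t gnaGraphSize,
                  void *inDesc, void *outDesc, void *stateDesc, size_t stateSize) {
    GNAModelSerial::RuntimeEndPoint in(1.0f, inDesc, 2, 440);     // scale, descriptor, element size, count
    GNAModelSerial::RuntimeEndPoint out(1.0f, outDesc, 4, 1000);
    std::ofstream os("model.gna", std::ios::binary);
    GNAModelSerial(&nnet, in, out)
        .SetInputRotation(0, 0)
        .AddState(stateDesc, stateSize)       // one entry per memory layer
        .Export(basePtr, gnaGraphSize, os);   // header, per-layer records, then the raw blob
}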
diff --git a/inference-engine/src/gna_plugin/gna_model_serial.hpp b/inference-engine/src/gna_plugin/gna_model_serial.hpp
new file mode 100644 (file)
index 0000000..0ba5be5
--- /dev/null
@@ -0,0 +1,209 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <istream>
+#include <vector>
+#include <utility>
+#include "gna-api.h"
+
+#pragma pack(push, 1)
+
+/**
+ * version history
+ * 1.0 - basic support
+ * 1.1 - added memory information
+ */
+
+#define HEADER_MAJOR 1
+#define HEADER_MINOR 1
+
+/**
+ * @brief Header version 1.0
+ */
+struct ModelHeader {
+    /**
+     *@brief MagicNumber – GNAM in ASCII; the byte sequence reads as hex 0x474e414d
+     */
+    char gnam[4];
+    /**
+     * @brief if the header size is not equal to sizeof(ModelHeader), some reserved data is appended at the end of the header;
+     * usually this indicates a model version different from the one the current export function produces
+     */
+    uint32_t headerSize = 0u;
+    struct Version {
+        /**
+         * @details Major format version – unsigned int, e.g. 0x0001;
+         * every change in the header or in the layer definitions must be reflected in a version change;
+         * for backward compatibility, new parsers can read older model versions with certain restrictions
+         */
+        uint16_t major = 0u;
+        /**
+         * @details Minor format version – unsigned int, e.g. corresponding to a build revision;
+         * changes in the minor version do not affect the model layout
+         */
+        uint32_t minor = 0u;
+    } version;
+    /**
+     * @brief Memory required to be allocated using GNAAlloc()
+     */
+    uint64_t gnaMemSize = 0ull;
+    /**
+     * @brief Number of GNA Layers
+     */
+    uint64_t layersCount = 0ull;
+
+    /**
+     * @brief Grouping level
+     */
+    uint32_t nGroup = 0u;
+
+    /**
+     * Convolution-related settings – they affect the input transformation
+     */
+    uint32_t nRotateRows = 0u;
+    uint32_t nRotateColumns = 0u;
+
+
+    struct EndPoint {
+        /**
+         * if the scale factor differs from the one passed to infer, the network might need to be requantized
+         */
+        float scaleFactor = 0.f;
+        /**
+         * Offset in bytes of pointer descriptor
+         */
+        uint64_t descriptor_offset = 0ull;
+        /**
+         * Endpoint resolution in bytes.
+         */
+        uint32_t element_size = 0u;
+        /**
+         * Number of elements
+         */
+        uint32_t elements_count = 0u;
+    };
+    EndPoint input;
+    EndPoint output;
+
+    /**
+     * Reserved Data might be here
+     */
+};
+#pragma pack(pop)
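Because readBits/writeBits move the header as raw bytes, ModelHeader must stay trivially copyable and its packed layout must not change silently. A compile-time guard along these lines could catch regressions (a suggestion, not part of the patch):

#include <type_traits>

static_assert(std::is_trivially_copyable<ModelHeader>::value,
              "ModelHeader is (de)serialized via raw stream reads and writes");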
+
+/**
+ * @brief implements serialisation tasks for GNAGraph
+ */
+class GNAModelSerial {
+ public:
+    /*
+     * The runtime endpoint is mostly the same as the serialized version, except for the descriptor field
+     */
+    struct RuntimeEndPoint {
+        /**
+         * if the scale factor differs from the one passed to infer, the network might need to be requantized
+         */
+        float scaleFactor;
+        /**
+         * Pointer descriptor
+         */
+        void* descriptor_ptr;
+        /**
+         * Endpoint resolution in bytes.
+         */
+        uint32_t element_size;
+        /**
+         * Number of elements
+         */
+        uint32_t elements_count;
+
+        RuntimeEndPoint() = default;
+        RuntimeEndPoint(double scaleFactor,
+                    void* descriptor_ptr,
+                    uint32_t element_size,
+                    uint32_t elements_count) : scaleFactor(scaleFactor),
+                                    descriptor_ptr(descriptor_ptr),
+                                    element_size(element_size),
+                                    elements_count(elements_count) {
+        }
+    };
+    using MemoryType = std::vector<std::pair<void*, uint32_t>>;
+
+private:
+    intel_nnet_type_t *ptr_nnet;
+    RuntimeEndPoint input, output;
+    uint32_t nRotateRows = 0;
+    uint32_t nRotateColumns = 0;
+
+    MemoryType states, *pstates = nullptr;
+
+ public:
+    /**
+     *
+     * @brief Used for import/export
+     * @param ptr_nnet - network being serialized or deserialized
+     * @param states_holder - in/out container that receives the memory state descriptors
+     */
+    GNAModelSerial(intel_nnet_type_t *ptr_nnet, MemoryType &states_holder)
+        : ptr_nnet(ptr_nnet) , pstates(&states_holder) {
+    }
+
+    /**
+     * @brief used for export only since runtime params are not passed by pointer
+     * @param ptr_nnet
+     * @param input  - runtime endpoint describing the network input
+     * @param output - runtime endpoint describing the network output
+     */
+    GNAModelSerial(
+        intel_nnet_type_t *ptr_nnet,
+        RuntimeEndPoint input,
+        RuntimeEndPoint output) : ptr_nnet(ptr_nnet), input(input), output(output) {
+    }
+
+    GNAModelSerial & SetInputRotation(uint32_t nRotateRows, uint32_t nRotateColumns) {
+      this->nRotateColumns = nRotateColumns;
+      this->nRotateRows = nRotateRows;
+      return *this;
+    }
+
+    /**
+     * marks a certain part of the gna blob as a state (named states may be supported in the future)
+     * @param descriptor_ptr
+     * @param size
+     * @return
+     */
+    GNAModelSerial & AddState(void* descriptor_ptr, size_t size) {
+        states.emplace_back(descriptor_ptr, size);
+        return *this;
+    }
+
+    /**
+     * @brief reads the model header, from which the memory required to import the gna graph can be calculated
+     * @param is - opened input stream
+     * @return the parsed model header
+     */
+    static ModelHeader ReadHeader(std::istream &is);
+
+    /**
+     * @brief Imports the model from the file system into a preallocated buffer;
+     * buffers for pLayers and pStructs are allocated here and require manual deallocation using _mm_free
+     * @param basePointer - preallocated buffer for the gna graph
+     * @param gnaGraphSize - size of that buffer
+     * @param is - stream without the header structure - TBD the header might be needed
+     */
+    void Import(void *basePointer, size_t gnaGraphSize, std::istream &is);
+
+    /**
+     * saves the gna graph to an output stream
+     * @param basePtr
+     * @param gnaGraphSize
+     * @param os
+     */
+    void Export(void *basePtr,
+                size_t gnaGraphSize,
+                std::ostream &os) const;
+};
\ No newline at end of file
diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp
new file mode 100644 (file)
index 0000000..620aa48
--- /dev/null
@@ -0,0 +1,2274 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#define NOMINMAX
+#include "cpp_interfaces/base/ie_plugin_base.hpp"
+#include "gna_plugin.hpp"
+#include "ie_plugin_config.hpp"
+#include "debug.h"
+#include "blob_factory.hpp"
+#include "gna_plugin_log.hpp"
+#include "gna_layer_info.hpp"
+#include <utility>
+#include <limits>
+#include "ie_memcpy.h"
+
+#ifdef PLOT
+void ExportGnaNetworkAndrzej(const char *ptr_name, intel_nnet_type_t* pNeuralNetwork);
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <iostream>
+#include <fstream>
+#include <stdexcept>
+#include <vector>
+#include <malloc.h>
+#include <math.h>
+#include <string.h>
+#include <list>
+#include <algorithm>
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <memory>
+#include <dnn_memory.hpp>
+#include <ie_layers.h>
+#include "details/caseless.hpp"
+#include <gna-api-types-xnn.h>
+#include "gna-api.h"
+#include "gna-api-dumper.h"
+#include "dnn.h"
+#include "pwl.h"
+#include "util.h"
+#include "quantization/quantization.h"
+#include "lstm.hpp"
+#include "graph_tools.hpp"
+#include "gna_plugin_config.hpp"
+#include "gna/gna_config.hpp"
+#include "quantization/model_quantizer.hpp"
+#include "gna_model_serial.hpp"
+#include "gna_memory_state.hpp"
+#include "details/ie_cnn_network_tools.h"
+
+using namespace InferenceEngine;
+using namespace std;
+using namespace GNAPluginNS;
+using namespace InferenceEngine::details;
+
+#ifdef VERBOSE
+#define VERBOSE_LEVEL (1)
+#else
+#define VERBOSE_LEVEL (0)
+#endif
+
+#ifdef PLOT
+#define PLOT_LEVEL (1)
+#else
+#define PLOT_LEVEL (0)
+#endif
+
+
+#define PAGE_SIZE_BYTES 4096
+
+#define FROM_IR_DIM(mem, idx)\
+((mem->dims.size() > idx - 1) ? mem->dims[idx - 1] : 1)
+
+inline int16_t GNAPluginNS::ConvertFloatToInt16(float src) {
+        float rounding_value = (src > 0) ? 0.5f : -0.5f;
+        float value = src + rounding_value;
+        if (value > 32767.0) {
+            return 32767;
+        } else if (value < -32768.0) {
+            return -32768;
+        }
+        return (int16_t)value;
+}
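ConvertFloatToInt16 rounds half away from zero and saturates to the int16_t range. A few illustrative spot checks, as assertions one might place in a unit test:

#include <cassert>

void convertFloatToInt16Checks() {
    assert(GNAPluginNS::ConvertFloatToInt16(0.4f)      == 0);
    assert(GNAPluginNS::ConvertFloatToInt16(0.5f)      == 1);      // rounds half away from zero
    assert(GNAPluginNS::ConvertFloatToInt16(-0.5f)     == -1);
    assert(GNAPluginNS::ConvertFloatToInt16(40000.0f)  == 32767);  // saturates high
    assert(GNAPluginNS::ConvertFloatToInt16(-40000.0f) == -32768); // saturates low
}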
+
+void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst,
+                    const float *ptr_src,
+                    const uint32_t num_rows,
+                    const uint32_t num_columns,
+                    const float scale_factor) {
+    if (!ptr_dst || !ptr_src) {
+        return;
+    }
+    for (uint32_t i = 0; i < num_rows*num_columns; i++) {
+        ptr_dst[i] = GNAPluginNS::ConvertFloatToInt16(ptr_src[i]*scale_factor);
+    }
+}
+void GNAPluginNS::ConvertToFloat(float *ptr_dst,
+                    int32_t *ptr_src,
+                    const uint32_t num_rows,
+                    const uint32_t num_columns,
+                    const float scale_factor) {
+    if (!ptr_dst || !ptr_src) {
+        return;
+    }
+    for (uint32_t i = 0; i < num_rows; i++) {
+        int32_t *ptr_int_row = ptr_src + i * num_columns;
+        float *ptr_float_row = ptr_dst + i * num_columns;
+        for (uint32_t j = 0; j < num_columns; j++) {
+            ptr_float_row[j] = static_cast<float>(ptr_int_row[j]) / scale_factor;
+        }
+    }
+}
+
+template <typename T, typename U>
+void GNAPlugin::copyInputData(T *dst,
+                const U *src,
+                uint32_t num_frames,
+                uint32_t num_group,
+                uint32_t num_vector_elements,
+                uint32_t num_vector_stride,
+                intel_dnn_orientation_t orientation) {
+    if (!dst || !src) {
+        return;
+    }
+    if (orientation == kDnnInterleavedOrientation) {
+        for (uint32_t i = 0; i < num_frames; i++) {
+            for (uint32_t j = 0; j < num_vector_elements; j++) {
+                if (!std::is_same<T, U>::value) {
+                    dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * input_scale_factor);
+                } else {
+                    dst[j * num_group + i] = src[i * num_vector_elements + j];
+                }
+            }
+            // pad to meet weight matrix row length requirement
+            for (uint32_t j = num_vector_elements; j < num_vector_stride; j++) {
+                dst[j * num_group + i] = 0;
+            }
+        }
+        // pad partial group
+        for (uint32_t i = num_frames; i < num_group; i++) {
+            for (uint32_t j = 0; j < num_vector_stride; j++) {
+                dst[j * num_group + i] = 0;
+            }
+        }
+    } else {
+        if (!std::is_same<T, U>::value) {
+            for (uint32_t i = 0; i < num_frames; i++) {
+                T *ptr_dst_vec = const_cast<T *>(reinterpret_cast<const T *>(dst) + i * num_vector_stride);
+                U *ptr_src_vec = const_cast<U *>(reinterpret_cast<const U *>(src) + i * num_vector_elements);
+                std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T));
+                for (int j=0; j < num_vector_elements; j++) {
+                    ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * input_scale_factor);
+                }
+            }
+
+        } else {
+            for (uint32_t i = 0; i < num_frames; i++) {
+                void *ptr_dst_vec = const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(dst) + i * num_vector_stride * sizeof(T));
+                void *ptr_src_vec = const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(src) + i * num_vector_elements * sizeof(U));
+                std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T));
+                std::memcpy(ptr_dst_vec, ptr_src_vec, num_vector_elements * sizeof(T));
+            }
+        }
+
+        for (uint32_t i = num_frames; i < num_group; i++) {
+            void *ptr_dst_vec = const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(dst) + i * num_vector_stride * sizeof(T));
+            std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T));
+        }
+    }
+}
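In the interleaved branch above, frame-major input [frame][element] is transposed to element-major [element][frame], with zeros padding both the stride tail and any unused frames in the group. A tiny standalone example of the resulting layout (illustration only; the real member also converts precision):

#include <cassert>
#include <cstdint>

int main() {
    const int16_t src[2][3] = {{1, 2, 3}, {4, 5, 6}};  // 2 frames of 3 elements, frame-major
    int16_t dst[3][4] = {};                            // element-major, group of 4, zero padded
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 3; j++)
            dst[j][i] = src[i][j];                     // dst[j * num_group + i] in the flat form
    assert(dst[0][0] == 1 && dst[0][1] == 4);          // element 0 across frames
    assert(dst[2][0] == 3 && dst[2][3] == 0);          // unused frames stay zero
    return 0;
}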
+
+template <typename T, typename U>
+void GNAPlugin::copyInputDataWithSplit(T *const dst,
+                const U *src,
+                const GNASplitLayer& splitInfo,
+                size_t precision_size) {
+    if (!dst || !src) {
+        return;
+    }
+    T *dst_ptr = dst;
+    const U *src_ptr = src;
+    precision_size = sizeof(T);
+    // we found split/slice layer connected to Input
+    for (auto&& outputLayer : splitInfo.splitOutputLayers) {
+        uint32_t begin = outputLayer.offset/precision_size;
+        uint32_t end = (outputLayer.offset + outputLayer.pure_size)/precision_size;
+        for (uint32_t i = begin; i < end; ++i) {
+            if (!std::is_same<T, U>::value) {
+                *(dst_ptr++) = GNAPluginNS::ConvertFloatToInt16(*(src_ptr++) * input_scale_factor);
+            } else {
+                *(dst_ptr++) = *(src_ptr++);
+            }
+        }
+        begin = end;
+        end = (outputLayer.offset + ALIGN64(outputLayer.pure_size))/precision_size;
+        std::memset(dst_ptr, 0, (end - begin )* sizeof(uint16_t));
+        dst_ptr += end - begin;
+    }
+}
+
+void GNAPlugin::ExportScores(void *ptr_dst,
+                  void *ptr_src,
+                  intel_dnn_orientation_t orientation,
+                  uint32_t num_frames,
+                  uint32_t num_group,
+                  uint32_t num_vector_elements,
+                  uint32_t num_active_elements,
+                  uint32_t num_vector_stride,
+                  uint32_t num_bytes_per_element_input,
+                  uint32_t num_bytes_per_element) {
+    // source scores are possibly padded to multiple of 8 and possibly interleaved
+    // rotate if necessary and only copy actual scores (not padding)
+    if (orientation == kDnnInterleavedOrientation) {
+        if (num_bytes_per_element == 2) {
+            int16_t *dst = reinterpret_cast<int16_t *>(ptr_dst);
+            int16_t *src = reinterpret_cast<int16_t *>(ptr_src);
+            for (uint32_t i = 0; i < num_frames; i++) {
+                for (uint32_t j = 0; j < num_active_elements; j++) {
+                    dst[i * num_vector_elements + j] = src[j * num_group + i];
+                }
+                for (uint32_t j = num_active_elements; j < num_vector_elements; j++) {
+                    dst[i * num_vector_elements + j] = 0;
+                }
+            }
+        } else if (num_bytes_per_element == 4) {  // should work for both int and float
+            int32_t *dst = reinterpret_cast<int32_t *>(ptr_dst);
+            int8_t *src = reinterpret_cast<int8_t*>(ptr_src);
+            for (uint32_t i = 0; i < num_frames; i++) {
+                for (uint32_t j = 0; j < num_active_elements; j++) {
+                    auto input_ptr = src + (j * num_group + i) * num_bytes_per_element_input;
+                    auto dst_ptr = dst + (i * num_vector_elements + j);
+
+                    switch (num_bytes_per_element_input) {
+                        case 2 : {
+                            *dst_ptr  = static_cast<int32_t>(*reinterpret_cast<int16_t*>(input_ptr));
+                            break;
+                        }
+                        case 4 : {
+                            *dst_ptr  = *reinterpret_cast<int32_t*>(input_ptr);
+                            break;
+                        }
+                        default:
+                            THROW_GNA_EXCEPTION << "Unsupported output layer precision: " << num_bytes_per_element_input << "bytes";
+                    }
+                }
+                for (uint32_t j = num_active_elements; j < num_vector_elements; j++) {
+                    dst[i * num_vector_elements + j] = 0;
+                }
+            }
+        } else {
+            THROW_GNA_EXCEPTION << "Unsupported target precision for infer : " << num_bytes_per_element << "bytes";
+        }
+    } else {
+        if (num_bytes_per_element == 2) {
+            for (uint32_t i = 0; i < num_frames; i++) {
+                void *ptr_dst_vec = reinterpret_cast<void *> (reinterpret_cast<uint8_t *>(ptr_dst) + i * num_vector_elements * sizeof(int16_t));
+                void *ptr_src_vec = reinterpret_cast<void *> (reinterpret_cast<uint8_t *>(ptr_src) + i * num_vector_stride * sizeof(int16_t));
+                memset(ptr_dst_vec, 0, num_vector_elements * sizeof(int16_t));
+                memcpy(ptr_dst_vec, ptr_src_vec, num_active_elements * sizeof(int16_t));
+            }
+        } else if (num_bytes_per_element == 4) {  // should work for both int and float
+            for (uint32_t i = 0; i < num_frames; i++) {
+                void *ptr_dst_vec = reinterpret_cast<void *> (reinterpret_cast<uint8_t *>(ptr_dst) + i * num_vector_elements * sizeof(float));
+                void *ptr_src_vec = reinterpret_cast<void *> (reinterpret_cast<uint8_t *>(ptr_src) + i * num_vector_stride * sizeof(float));
+                memset(ptr_dst_vec, 0, num_vector_elements * sizeof(float));
+                memcpy(ptr_dst_vec, ptr_src_vec, num_active_elements * sizeof(float));
+            }
+        } else {
+            THROW_GNA_EXCEPTION << "Unsupported target precision for infer : " << num_bytes_per_element << "bytes";
+        }
+    }
+}
+
+void GNAPlugin::ImportFrames(
+                  void *ptr_dst,
+                  const void *ptr_src,
+                  Precision input_precision,
+                  intel_dnn_orientation_t orientation,
+                  uint32_t num_frames,
+                  uint32_t num_group,
+                  uint32_t num_vector_elements,
+                  uint32_t num_vector_stride) {
+    // special case: a split/slice layer connected
+    // to the Input has been detected
+    auto it = split_connection.end();
+    if (split_connection.size() != 0) {
+        it = std::find_if(split_connection.begin(), split_connection.end(), []
+                    (const std::pair<std::string, GNASplitLayer> &item) -> bool {
+                        return CaselessEq<std::string>()(item.second.splitInputLayer.name, "Input");
+                    });
+    }
+    if (orientation == kDnnInterleavedOrientation) {
+        // TODO : fix that as well
+        if (input_precision.size() == 2) {
+            int16_t *dst = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_dst));
+            int16_t *src = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_src));
+            if (it != split_connection.end()) {
+                copyInputDataWithSplit(dst, src, it->second, input_precision.size());
+            } else {
+                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
+            }
+        } else if (input_precision.size() == 4) {
+            if (!gnadevice) {
+                float *dst = const_cast<float *>(reinterpret_cast<const float *>(ptr_dst));
+                float *src = const_cast<float *>(reinterpret_cast<const float *>(ptr_src));
+                if (it != split_connection.end()) {
+                    copyInputDataWithSplit(dst, src, it->second, input_precision.size());
+                } else {
+                    copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
+                }
+            } else {
+                int16_t *dst = reinterpret_cast<int16_t *>(ptr_dst);
+                const float *src = reinterpret_cast<const float *>(ptr_src);
+                if (it != split_connection.end()) {
+                    copyInputDataWithSplit(dst, src, it->second, input_precision.size());
+                } else {
+                    copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
+                }
+            }
+        }
+    } else {
+        if (input_precision.size()== 2) {
+            int16_t *dst = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_dst));
+            int16_t *src = const_cast<int16_t *>(reinterpret_cast<const int16_t *>(ptr_src));
+            copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
+        } else if (input_precision.size() == 4) {
+            if (!gnadevice) {
+                float *dst = const_cast<float *>(reinterpret_cast<const float *>(ptr_dst));
+                float *src = const_cast<float *>(reinterpret_cast<const float *>(ptr_src));
+                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
+            } else {
+                uint16_t *dst = const_cast<uint16_t *>(reinterpret_cast<const uint16_t *>(ptr_dst));
+                float *src = const_cast<float *>(reinterpret_cast<const float *>(ptr_src));
+                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation);
+            }
+        }
+    }
+}
+
+void GNAPlugin::fillMemoryConnections(std::map<std::string,
+                                            std::vector<InferenceEngine::CNNLayerPtr>>&
+                                                                            memoryPairs) {
+    for (auto &memory : memoryPairs) {
+        auto inputLayer = memory.second[1];
+        auto outputLayer = memory.second[0];
+
+        IE_ASSERT(1 == outputLayer->insData.size());
+
+        // creating a connection for the layer outputs in the form of an extra map
+        memory_connection.emplace_back(memory.first, GNAMemoryLayer(inputLayer, outputLayer));
+    }
+}
+
+void GNAPlugin::fillConcatConnections(InferenceEngine::CNNLayerPtr layer) {
+    // creating a connection for each layer output in the form of an extra map
+    GNAPlugin::GNAConcatLayer layerInfoItem(layer);
+    size_t concat_size = 0;
+    std::string& id = layer->name;
+
+    for (size_t i = 0; i < layer->insData.size(); ++i) {
+        auto dataInput = layer->insData[i].lock();
+        if (!dataInput) {
+            THROW_GNA_EXCEPTION << "Input layer pointer for concat is unexpectedly absent";
+        }
+
+        auto ptrConcatLayerInput = dataInput->creatorLayer.lock();
+        if (!ptrConcatLayerInput) {
+            THROW_GNA_EXCEPTION << "Input layer for concat is unexpectedly absent";
+        }
+        layerInfoItem.concatInputLayers.emplace_back(
+                GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo({ptrConcatLayerInput->name, concat_size}));
+
+        size_t layer_size =
+                     InferenceEngine::details::product(begin(dataInput->dims),
+                                                      end(dataInput->dims)) * dataInput->precision.size();
+        concat_size += layer_size;
+    }
+    layerInfoItem.reserved_size = concat_size;
+    concat_connection.emplace(id, layerInfoItem);
+}
+
+void GNAPlugin::fillSplitConnections(InferenceEngine::CNNLayerPtr layer) {
+    // creating a connection for each layer input in the form of an extra map
+    GNAPlugin::GNASplitLayer layerInfoItem(layer);
+    size_t split_size = 0;
+    std::string& id = layer->name;
+    auto dataInput = layer->insData.begin()->lock();
+    if (!dataInput) {
+        THROW_GNA_EXCEPTION << "Input layer pointer for split/slice is unexpectedly absent";
+    }
+    auto ptrSplitLayerInput = dataInput->creatorLayer.lock();
+    if (!ptrSplitLayerInput) {
+        THROW_GNA_EXCEPTION << "Input layer for split/slice is unexpectedly absent";
+    }
+
+    LayerInfo ptrSplitLayerInputLayerInfo(ptrSplitLayerInput);
+    for (size_t i = 0; i < layer->outData.size(); ++i) {
+        size_t padding = 0;
+        size_t layer_size = 0;
+        auto& dataOutput = layer->outData[i];
+
+        if (!dataOutput || !dataInput) {
+            THROW_GNA_EXCEPTION << "Output layer pointer for split/slice is unexpectedly absent";
+        }
+
+        for (auto&& ptrSplitLayerOutputPair : dataOutput->getInputTo()) {
+            auto& ptrSplitLayerOutput = ptrSplitLayerOutputPair.second;
+            if (!ptrSplitLayerOutput) {
+                THROW_GNA_EXCEPTION << "Output layer for split/slice is unexpectedly absent";
+            }
+
+            padding = std::max(padding, LayerInfo(ptrSplitLayerOutput).paddingSize())
+                                                        * dataOutput->precision.size();
+            layer_size =
+                    InferenceEngine::details::product(begin(dataOutput->dims),
+                                                     end(dataOutput->dims)) * dataOutput->precision.size();
+
+            layerInfoItem.splitOutputLayers.emplace_back(ptrSplitLayerOutput->name, split_size, layer_size);
+        }
+
+        split_size += ptrSplitLayerInputLayerInfo.isInput() ?
+                                ALIGN64(padding + layer_size):
+                                        padding + layer_size;
+    }
+    layerInfoItem.reserved_size = split_size;
+    layerInfoItem.splitInputLayer =
+                    GNAPlugin::GNASplitLayer::SplitConnectedLayerInfo({ptrSplitLayerInput->type, 0,
+                                                                    InferenceEngine::details::product(begin(dataInput->dims),
+                                                                    end(dataInput->dims)) * dataInput->precision.size()});
+    split_connection.emplace(id, layerInfoItem);
+}
+
+void GNAPlugin::DiagonalPrimitive(InferenceEngine::CNNLayerPtr layer) {
+    AffinePrimitive(layer, true);
+}
+
+void GNAPlugin::ConvolutionPrimitive(InferenceEngine::CNNLayerPtr layer) {
+    auto &convolution = dynamic_cast<ConvolutionLayer &>(*layer.get());
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
+
+    auto inputs = layer->insData.begin()->lock();
+    auto outputs = *layer->outData.begin();
+
+    uint32_t num_feature_map_rows = FROM_IR_DIM(inputs, 1) / convolution._stride_x;
+    uint32_t num_feature_map_columns = FROM_IR_DIM(inputs, 3) * convolution._stride_x / num_feature_maps;
+
+    uint32_t num_rows_in = FROM_IR_DIM(inputs, 1);
+    uint32_t num_columns_in = FROM_IR_DIM(inputs, 3);
+    uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
+    uint32_t num_padding = ALIGN(convolution._kernel_x * num_feature_map_columns * num_feature_maps, 8)
+                                            - convolution._kernel_x * num_feature_map_columns * num_feature_maps;
+    void *ptr_inputs;
+    void *ptr_outputs;
+    void *ptr_weights;
+    void *ptr_biases;
+
+    // TODO: questionable why we invent a precision for biases that are not present in the IR
+    auto biasPrecision = convolution._biases ? convolution._biases->precision() : outputs->precision;
+
+    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
+    auto &currentComponent = dnnComponentsForLayer.back().second;
+
+#ifdef PLOT
+    cout << "IR layer : " << std::left << std::setw(20) << layer->name << dnnComponentsForLayer.size() - 1 << "\n";
+#endif
+    auto num_input_padding = ALIGN(num_feature_maps * num_feature_map_columns * num_feature_map_rows, 8)
+                                                        -  num_feature_maps * num_feature_map_columns * num_feature_map_rows;
+    auto num_filter_rows = convolution._kernel_x / convolution._stride_x;
+    dnn.InitConvolutional1DComponent(currentComponent,
+                            1,
+                            num_feature_maps *  num_feature_map_columns * num_feature_map_rows + num_input_padding,
+                            1,
+                            num_rows_out * convolution._out_depth,
+                            inputs->precision.size(),
+                            outputs->precision.size(),
+                            convolution._weights->precision().size(),
+                            biasPrecision.size(),
+                            convolution._out_depth,
+                            num_filter_rows,
+                            num_feature_maps * num_feature_map_columns * num_filter_rows + num_padding,
+
+                            num_feature_maps,  // interesting - why this is so in gna_example
+                            num_feature_map_rows,
+                            num_feature_map_columns,
+
+                            quantized == nullptr ? 1 : quantized->_weights_quant.scale,
+                            quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+                            ptr_inputs,
+                            ptr_outputs,
+                            ptr_weights,
+                            ptr_biases);
+
+    // update num_feature_maps for next convolutional layer
+    num_feature_maps = convolution._out_depth;  // = number of filters
+
+    size_t num_data_bytes_out =
+                        InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims))
+                                                                                * outputs->precision.size();
+
+    size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding) * inputs->precision.size();
+
+    auto connectedInputLayer = connectInput(layer, ptr_inputs, num_data_bytes_in).input;
+
+    // TODO: convolution might not be the first layer in sorted order but connected via split, for example - unclear how Kaldi will handle that
+    if (LayerInfo(connectedInputLayer).isInput()) {
+        //  Kaldi features are opposite orientation
+        dnn.num_rotate_rows = num_feature_map_columns;
+        dnn.num_rotate_columns = num_feature_map_rows;
+    }
+
+    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
+
+    // rotate
+    auto TransposeMatrix = [](uint8_t *ptr_matrix, size_t element_size, uint32_t num_rows, uint32_t num_cols) {
+        std::vector<uint8_t> temp_buffer(num_rows * num_cols * element_size);
+        for (uint32_t i = 0; i < num_rows; i++) {
+            for (uint32_t j = 0; j < num_cols; j++) {
+                    ie_memcpy(&temp_buffer.front() + (j*num_rows + i)*element_size,
+                          temp_buffer.size() - (i * num_cols + j) * element_size,
+                          ptr_matrix + (i*num_cols+j)*element_size,
+                          element_size);
+            }
+        }
+        return temp_buffer;
+    };
+
+    std::vector<uint8_t > transposedWeights;
+    for (uint32_t k = 0; k < convolution._out_depth; k++) {
+        uint8_t *ptr_filt_current
+            = convolution._weights->cbuffer().as<uint8_t *>() + k * num_columns_in * convolution._kernel[X_AXIS] * convolution.precision.size();
+        auto transposedPart = TransposeMatrix(ptr_filt_current, convolution.precision.size(), num_columns_in, convolution._kernel[X_AXIS]);
+        transposedWeights.insert(transposedWeights.end(), transposedPart.begin(), transposedPart.end());
+    }
+
+    if (num_padding == 0) {
+        gnamem->readonly().push_local_ptr(ptr_weights, transposedWeights.data(), convolution._weights->byteSize(), 64);
+    } else {
+        auto elementsIn = convolution._kernel_x * num_feature_map_columns + num_padding;
+        auto paddedWeights = elementsIn * convolution._out_depth;
+        auto paddedWeightsSize = paddedWeights * convolution.precision.size();
+        auto elements_in_row = convolution._kernel_x * num_feature_map_columns;
+        gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void * data, size_t size) {
+            for (int i = 0; i < convolution._out_depth; i++) {
+                memcpy(data,
+                       transposedWeights.data() + elements_in_row * i * convolution.precision.size(),
+                       elements_in_row * convolution.precision.size());
+
+                data = reinterpret_cast<uint8_t *>(data) + elementsIn * convolution.precision.size();
+            }
+        }, 64);
+    }
+
+    if (convolution._biases) {
+        gnamem->readonly().push_ptr(ptr_biases,
+                                    convolution._biases->cbuffer().as<const void *>(),
+                                    convolution._biases->byteSize(),
+                                    64);
+    } else {
+        gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
+    }
+}
+
+void GNAPlugin::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
+    auto &power = dynamic_cast<PowerLayer &>(*layer.get());
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
+
+    if (power.power != 1.0) {
+        THROW_IE_EXCEPTION << "[GNA plugin] unsupported power factor, expected 1 but was " << power.power;
+    }
+
+    auto input = layer->insData[0].lock();
+
+    auto outputs = *layer->outData.begin();
+
+    uint32_t num_rows_in = FROM_IR_DIM(input, 1);
+    uint32_t num_columns_in = FROM_IR_DIM(input, 2);
+    uint32_t num_rows_out = num_rows_in;
+
+    void *ptr_inputs;
+    void *ptr_outputs;
+    void *ptr_weights;
+    void *ptr_biases;
+
+    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
+    auto &currentComponent = dnnComponentsForLayer.back().second;
+    dnn.InitAffineComponent(currentComponent,
+                            num_rows_in,
+                            num_columns_in,
+                            num_rows_out,
+                            input->precision.size(),
+                            outputs->precision.size(),
+                            // TODO: only fp32 and Int16 tested
+                            quantized == nullptr ? input->precision.size() : 2,
+                            quantized == nullptr ? input->precision.size() : 4,
+                            quantized == nullptr ? 1 : quantized->_weights_quant.scale,
+                            quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+                            ptr_inputs,
+                            ptr_outputs,
+                            ptr_weights,
+                            ptr_biases,
+                            true);
+
+#ifdef PLOT
+    cout << "IR layer : " << std::left << std::setw(20) << layer->name << "diagonal_"<< dnnComponentsForLayer.size() - 1 << "\n";
+#endif
+
+    size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims))
+        * outputs->precision.size();
+
+    size_t num_data_bytes_in = InferenceEngine::details::product(begin(input->dims), end(input->dims))
+        * input->precision.size();
+
+    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
+    connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0);
+
+    if (power.scale != 1.0f) {
+        if (quantized == nullptr) {
+            gnamem->readonly().push_value(ptr_weights, power.scale, num_rows_out, 64);
+        } else {
+            auto scaledIdentity = quantized->_weights_quant.scale * power.scale;
+
+            #define FLOAT_TO_INT16(a) static_cast<int16_t>(((a) < 0)?((a) - 0.5):((a) + 0.5))
+
+            auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
+            gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
+        }
+    }
+
+    if (power.offset != 0.0f) {
+        if (quantized == nullptr) {
+            gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
+        } else {
+            gnamem->readonly().push_value<int32_t>(ptr_biases, 0, num_rows_out, 64);
+        }
+    } else {
+        gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
+    }
+}
+
+void GNAPlugin::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) {
+    auto &pooling = dynamic_cast<PoolingLayer &>(*layer.get());
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
+
+    auto inputs = layer->insData.begin()->lock();
+    auto outputs = *layer->outData.begin();
+
+    uint32_t num_rows_in = FROM_IR_DIM(inputs, 1);
+    uint32_t num_columns_in = FROM_IR_DIM(inputs, 3);
+    uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
+    uint32_t num_columns_out = FROM_IR_DIM(outputs, 3);
+    uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;
+
+    void *ptr_inputs;
+    void *ptr_outputs;
+
+    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
+    auto &currentComponent = dnnComponentsForLayer.back().second;
+
+#ifdef PLOT
+    cout << "IR layer : " << std::left << std::setw(20) << layer->name << dnnComponentsForLayer.size() - 1 << "\n";
+#endif
+    switch (pooling._type) {
+        case PoolingLayer::MAX: break;
+        // we are losing precision here
+        case PoolingLayer::AVG:
+        default:
+            // TODO: convert to SUM pooling
+            THROW_GNA_EXCEPTION << "Layer :" << layer->name << " not supported";
+    }
+
+    dnn.InitMaxpoolComponent(currentComponent,
+                            1,
+                            num_columns_in * num_rows_in ,
+                            1,
+                            num_columns_out * num_rows_out,
+                            inputs->precision.size(),
+                            outputs->precision.size(),
+                            pooling._kernel[X_AXIS],
+                            pooling._kernel[X_AXIS],
+                            num_columns_in,
+                            false,
+                            quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+                            ptr_inputs,
+                            ptr_outputs);
+
+    size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims))
+        * outputs->precision.size();
+
+    size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding) * inputs->precision.size();
+
+    connectInput(layer, ptr_inputs, num_data_bytes_in);
+    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
+}
+
+void GNAPlugin::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) {
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
+
+    auto inputs = layer->insData.begin()->lock();
+    auto outputs = *layer->outData.begin();
+
+    uint32_t num_rows_in = FROM_IR_DIM(inputs, 1);
+    uint32_t num_columns_in = FROM_IR_DIM(inputs, 2);
+    uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
+    uint32_t num_columns_out = FROM_IR_DIM(outputs, 2);
+    uint32_t num_padding_in = ALIGN(num_rows_in, 8) - num_rows_in;
+    uint32_t num_padding_out = ALIGN(num_rows_out, 8) - num_rows_out;
+    void *ptr_inputs;
+    void *ptr_outputs;
+    auto orientation = (num_cnn_rows_out > 0) ? kDnnNonInterleavedOrientation : kDnnInterleavedOrientation;
+
+    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
+    auto &currentComponent = dnnComponentsForLayer.back().second;
+    dnn.InitCopyComponent(currentComponent,
+                          orientation,
+                          num_rows_in + num_padding_in,
+                          num_columns_in,
+                          num_rows_out + num_padding_out,
+                          num_columns_out,
+                          inputs->precision.size(),
+                          outputs->precision.size(),
+                          quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+                          num_rows_out + num_padding_out,
+                          num_columns_out,
+                          ptr_inputs,
+                          ptr_outputs);
+
+    size_t num_data_bytes_out = ALIGN(InferenceEngine::details::product(
+                                                            begin(outputs->dims), end(outputs->dims)), 8)
+                                                                                * outputs->precision.size();
+    size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding_in) * inputs->precision.size();
+
+    connectInput(layer, ptr_inputs, num_data_bytes_in);
+    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
+}
+
+void GNAPlugin::ConcatPrimitive(InferenceEngine::CNNLayerPtr layer) {
+    auto concatLayer = dynamic_cast<InferenceEngine::ConcatLayer *> (layer.get());
+
+    if (concatLayer == nullptr) {
+        return;
+    }
+    if (concatLayer->insData.size() != 2) {
+        THROW_GNA_EXCEPTION << "Concat layer has unsupported number of incoming layers.";
+    }
+
+    auto prevInput0 = concatLayer->insData[0].lock();
+    auto prevInput1 = concatLayer->insData[1].lock();
+    if (!prevInput0 || !prevInput1) {
+        THROW_GNA_EXCEPTION << "Input layer for concat is unexpectedly absent";
+    }
+    if (prevInput0->precision.size() != prevInput1->precision.size()) {
+        THROW_GNA_EXCEPTION << "Different precision for Concat input layers are not supported";
+    }
+
+    for (auto &&outLayer : concatLayer->outData.front()->getInputTo()) {
+        if ( LayerInfo(outLayer.second).isConcat() ) {
+            auto& concatLayerInfo = concat_connection.find(concatLayer->name)->second;
+            connectOutput(layer, &concatLayerInfo.gna_ptr,
+                          &concatLayerInfo.gna_ptr, concatLayerInfo.reserved_size);
+        }
+    }
+}
+
+void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
+    auto cropLayer = dynamic_cast<InferenceEngine::CropLayer *> (layer.get());
+
+    if (cropLayer == nullptr) {
+        return;
+    }
+    if (cropLayer->axis.size() > 1) {
+        THROW_GNA_EXCEPTION <<
+        "Crop layer does not support the number of cropped dimentions = "
+        << cropLayer->axis.size() << ".";
+    }
+
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
+    size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size();
+    size_t cropSize = cropLayer->dim.back() * cropLayer->precision.size();
+
+    if (ALIGN(cropOffset, 8) == cropOffset) {
+        // leave crop as it is
+        GNAPlugin::GNACropLayer cropLayerInfoItem(layer);
+        std::string& id = layer->name;
+        crop_connection.emplace(id, cropLayerInfoItem);
+        auto cropLayerInfo = crop_connection.find(cropLayer->name);
+
+        if (cropLayerInfo == crop_connection.end()) {
+            THROW_GNA_EXCEPTION <<
+            "Item is not in the storage but it was added recently...\n";
+        }
+
+        // calculate the index (idx) for the last parameter of connectInput
+        connectInput(layer, &cropLayerInfo->second.gna_ptr, cropSize + cropOffset, cropOffset, 0);
+
+        // cases for certain output layers
+        for (auto &&outLayer : layer->outData.front()->getInputTo()) {
+            auto& nextLayer = outLayer.second;
+            if ( LayerInfo(nextLayer).isConcat() ) {
+                connectOutput(layer, &cropLayerInfo->second.gna_ptr, &cropLayerInfo->second.gna_ptr, cropSize);
+            }
+        }
+    } else {
+        gnalog() << "Crop " << layer->name << " is being replaced by Affine layer...\n";
+        auto outputs = *layer->outData.begin();
+        auto inputs = layer->insData.begin()->lock();
+
+        uint32_t num_rows_in = FROM_IR_DIM(inputs, 1);
+        uint32_t num_columns_in = FROM_IR_DIM(inputs, 2);
+        uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
+        uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;
+
+        void *ptr_inputs;
+        void *ptr_outputs;
+        void *ptr_weights;
+        void *ptr_biases;
+
+        dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
+        auto &currentComponent = dnnComponentsForLayer.back().second;
+        dnn.InitAffineComponent(currentComponent,
+                                num_rows_in + num_padding,
+                                num_columns_in,
+                                num_rows_out,
+                                inputs->precision.size(),
+                                4,
+                                quantized == nullptr ? inputs->precision.size() : 2,
+                                4,
+                                quantized == nullptr ? 1 : quantized->_weights_quant.scale,
+                                quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+                                ptr_inputs,
+                                ptr_outputs,
+                                ptr_weights,
+                                ptr_biases,
+                                false);
+
+        size_t num_data_bytes_out =
+        InferenceEngine::details::product(
+                                          begin(outputs->dims), end(outputs->dims)) * 4;
+
+        size_t num_data_bytes_in = num_columns_in *
+        (num_rows_in + num_padding) * inputs->precision.size();
+
+        connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0);
+        connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
+
+        gnamem->readonly().push_initializer(ptr_weights, num_rows_out * (num_rows_in + num_padding)*layer->precision.size(), [=](void * data, size_t size) {
+            int out = 0;
+            for (int input = cropLayer->offset.back(); input < num_rows_out + cropLayer->offset.back(); ++input) {
+                auto mem_ptr = reinterpret_cast<uint8_t *>(data) + input * layer->precision.size() + out * (num_rows_in+num_padding) * layer->precision.size();
+                if (quantized == nullptr) {
+                    auto float_ptr = reinterpret_cast<float *>(mem_ptr);
+                    *float_ptr = 1.0f;
+                } else {
+                    auto int_ptr = reinterpret_cast<uint16_t *>(mem_ptr);
+                    *int_ptr = 1;
+                }
+                ++out;
+            }
+        }, 64);
+        if (quantized == nullptr) {
+            gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
+        } else {
+            gnamem->readonly().push_value<int32_t>(ptr_biases, 0, num_rows_out, 64);
+        }
+    }
+}
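+
+// Note: the affine fallback above implements crop as a matrix multiply with an
+// identity-like weight matrix - row `out` holds a single 1 at column
+// `offset + out`, so y = W * x selects exactly the cropped slice. A minimal
+// float sketch of that weight layout (ignoring padding and quantization):
+//
+//   //  rows_in = 4, offset = 1, rows_out = 2  ->  W is 2 x 4:
+//   //  [0 1 0 0]
+//   //  [0 0 1 0]
+//   std::vector<float> W(rows_out * rows_in, 0.0f);
+//   for (int out = 0; out < rows_out; ++out) {
+//       W[out * rows_in + offset + out] = 1.0f;
+//   }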
+
+void GNAPlugin::SplitPrimitive(InferenceEngine::CNNLayerPtr layer) {
+//  Nothing to do
+}
+
+void GNAPlugin::SlicePrimitive(InferenceEngine::CNNLayerPtr layer) {
+//  Nothing to do
+}
+
+void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
+    auto &eltwise = dynamic_cast<EltwiseLayer &>(*layer.get());
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
+
+    // eltwise expects one 4-byte input and one 2-byte input - detect which is which
+    auto inputs2Bytes = layer->insData[0].lock();
+    auto inputs4Bytes = layer->insData[1].lock();
+
+    int biasesLayerIdx = 1;
+
+    if (quantized) {
+        if (eltwise._operation == EltwiseLayer::Sum) {
+            if (inputs4Bytes->precision.size() != 4) {
+                std::swap(inputs4Bytes, inputs2Bytes);
+                biasesLayerIdx = 0;
+            }
+            IE_ASSERT(inputs2Bytes->precision.size() == 2);
+            IE_ASSERT(inputs4Bytes->precision.size() == 4);
+        } else {
+            // for mul both inputs should be 2 bytes precision
+            IE_ASSERT(inputs2Bytes->precision.size() == 2);
+            IE_ASSERT(inputs4Bytes->precision.size() == 2);
+        }
+    }
+
+    auto outputs = *layer->outData.begin();
+
+    uint32_t num_rows_in = FROM_IR_DIM(inputs4Bytes, 1);
+    uint32_t num_columns_in = FROM_IR_DIM(inputs4Bytes, 2);
+    uint32_t num_rows_out = num_rows_in;
+
+    void *ptr_inputs;
+    void *ptr_outputs;
+    void *ptr_weights;
+    void *ptr_biases;
+
+    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
+    auto &currentComponent = dnnComponentsForLayer.back().second;
+    dnn.InitAffineComponent(currentComponent,
+                            num_rows_in,
+                            num_columns_in,
+                            num_rows_out,
+                            inputs2Bytes->precision.size(),
+                            outputs->precision.size(),
+                            // TODO: only fp32 and Int16 tested
+                            quantized == nullptr ? inputs2Bytes->precision.size() : 2,
+                            quantized == nullptr ? inputs4Bytes->precision.size() : 4,
+                            quantized == nullptr ? 1 : quantized->_weights_quant.scale,
+                            quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+                            ptr_inputs,
+                            ptr_outputs,
+                            ptr_weights,
+                            ptr_biases,
+                            true);
+
+#ifdef PLOT
+    cout << "IR layer : " << std::left << std::setw(20) << layer->name << "diagonal_"<< dnnComponentsForLayer.size() - 1 << "\n";
+#endif
+
+    size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims))
+        * outputs->precision.size();
+
+    size_t num_data_bytes_in = InferenceEngine::details::product(begin(inputs2Bytes->dims), end(inputs2Bytes->dims))
+        * inputs2Bytes->precision.size();
+
+    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
+    connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 1 - biasesLayerIdx);
+
+    switch (eltwise._operation) {
+        case EltwiseLayer::Sum:
+            if (quantized == nullptr) {
+                gnamem->readonly().push_value(ptr_weights, 1.0f, num_rows_out, 64);
+            } else {
+                auto scaledIdentity = quantized->_weights_quant.scale;
+
+                #define FLOAT_TO_INT16(a) static_cast<int16_t>(((a) < 0)?((a) - 0.5):((a) + 0.5))
+
+                auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
+                gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
+            }
+            connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx);
+            break;
+
+        case EltwiseLayer::Prod:
+            if (quantized == nullptr) {
+                gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
+            } else {
+                gnamem->readonly().push_value<int32_t>(ptr_biases, 0, num_rows_out, 64);
+            }
+            connectInput(layer, ptr_weights, num_data_bytes_in, 0, biasesLayerIdx);
+            break;
+
+        default:
+            THROW_GNA_EXCEPTION << "Unsupported eltwise operation: " << eltwise._operation;
+    }
+}
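+
+// Note: GNA has no dedicated elementwise primitive, so the code above maps
+// eltwise onto a diagonal affine layer y[i] = w[i] * x[i] + b[i].
+// Conceptually (float sketch):
+//
+//   //  Sum:  w = identity (scaled), b = the 4-byte operand  ->  y[i] = x[i] + b[i]
+//   //  Prod: b = 0, w = the other operand                   ->  y[i] = w[i] * x[i]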
+
+void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag) {
+    auto &weightable = dynamic_cast<WeightableLayer &>(*layer.get());
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
+
+    auto inputs = layer->insData.begin()->lock();
+    auto outputs = *layer->outData.begin();
+
+    uint32_t num_rows_in = FROM_IR_DIM(inputs, 1);
+    uint32_t num_columns_in = FROM_IR_DIM(inputs, 2);
+    uint32_t num_rows_out = isDiag ? num_rows_in : FROM_IR_DIM(outputs, 1);
+    uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;
+
+    void *ptr_inputs;
+    void *ptr_outputs;
+    void *ptr_weights;
+    void *ptr_biases;
+
+    // TODO: questionable why we invent a precision for biases that are not in the IR
+    auto biasPrecision = weightable._biases ? weightable._biases->precision() : outputs->precision;
+
+    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
+    auto &currentComponent = dnnComponentsForLayer.back().second;
+
+#ifdef PLOT
+    cout << "IR layer : " << std::left << std::setw(20) << layer->name << (isDiag ? "diagonal_" : "affine_") << dnnComponentsForLayer.size() - 1 << "\n";
+#endif
+
+    dnn.InitAffineComponent(currentComponent,
+                            num_rows_in + num_padding,
+                            num_columns_in,
+                            num_rows_out,
+                            inputs->precision.size(),
+                            outputs->precision.size(),
+                            weightable._weights->precision().size(),
+                            biasPrecision.size(),
+                            quantized == nullptr ? 1 : quantized->_weights_quant.scale,
+                            quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+                            ptr_inputs,
+                            ptr_outputs,
+                            ptr_weights,
+                            ptr_biases,
+                            isDiag);
+
+    size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims))
+        * outputs->precision.size();
+
+    size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding) * inputs->precision.size();
+
+    auto connectionInfo = connectInput(layer, ptr_inputs, num_data_bytes_in);
+    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
+
+    auto transpose = false;
+    auto transposedRows = 0;
+    auto transposedCols = 0;
+    /**
+     * TODO: enable transpose correction between Conv/Affine layers - implement a dedicated pass
+     * TF topologies have in-place permutes, so we don't care;
+     * Kaldi topologies did this internally
+     */
+    if (0 && connectionInfo.needTransposeWeights) {
+        gnalog() << "Transposing weights for layer: " << layer->name << "\n";
+        // direct order is 0, 1, 2, 3; the only supported order is 0, 3, 2, 1, where dim 2 usually equals 1
+        auto permuteOrder = connectionInfo.permute->GetParamAsInts("order");
+        if (permuteOrder != vector<int>({0, 3, 2, 1})) {
+            THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute order: was " << layer->GetParamAsString("order") <<
+                               ", but only support 0, 3, 2, 1";
+        }
+        transpose = !isDiag;
+        transposedRows = connectionInfo.permute->input()->getDims()[3];
+        transposedCols = connectionInfo.permute->input()->getDims()[1];
+    }
+
+    if (num_padding == 0) {
+        if (!transpose) {
+            gnamem->readonly().push_ptr(ptr_weights,
+                                        weightable._weights->cbuffer().as<const void *>(),
+                                        weightable._weights->byteSize(),
+                                        64);
+        } else {
+            // TODO: write unit tests for transpose
+            gnamem->readonly().push_initializer(ptr_weights, weightable._weights->byteSize(), [=](void * data, size_t size) {
+                for (int k = 0; k < (isDiag ? 1 : num_rows_out); k++) {
+                    auto rowOffset = k * transposedRows * transposedCols * weightable.precision.size();
+                    auto cbuffer = weightable._weights->cbuffer().as<const uint8_t *>() + rowOffset;
+                    auto u8Data = reinterpret_cast<uint8_t *>(data) + rowOffset;
+                    for (int j = 0; j < transposedCols; j++) {
+                        for (int i = 0; i < transposedRows; i++) {
+                            auto offsetWrite = (transposedRows * j + i) * weightable.precision.size();
+                            auto offsetRead = (i * transposedCols + j) * weightable.precision.size();
+                            memcpy(u8Data + offsetWrite, cbuffer + offsetRead, weightable.precision.size());
+                        }
+                    }
+                }
+            }, 64);
+        }
+    } else {
+        auto elementsIn = (num_rows_in + num_padding) * num_columns_in;
+        auto paddedWeights = isDiag ? elementsIn : elementsIn * num_rows_out;
+        auto paddedWeightsSize = paddedWeights * weightable.precision.size();
+
+        gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void * data, size_t size) {
+            for (int i = 0; i < (isDiag ? 1 : num_rows_out); i++) {
+                memcpy(data,
+                       weightable._weights->cbuffer().as<const uint8_t *>() + num_rows_in * i * weightable.precision.size(),
+                       num_rows_in * weightable.precision.size());
+                data = reinterpret_cast<uint8_t *>(data) + (num_rows_in + num_padding) * weightable.precision.size();
+            }
+        }, 64);
+    }
+
+    if (weightable._biases) {
+        gnamem->readonly().push_ptr(ptr_biases,
+                         weightable._biases->cbuffer().as<const void *>(),
+                         weightable._biases->byteSize(),
+                         64);
+    } else {
+        gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
+    }
+}
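+
+// Note: when num_rows_in is not a multiple of 8, each weight row above is
+// copied into a stride of (num_rows_in + num_padding) elements so the matrix
+// stays row-aligned for GNA. Illustrative layout for num_rows_in = 6
+// (stride 8):
+//
+//   row 0: w00 w01 w02 w03 w04 w05 [pad] [pad]
+//   row 1: w10 w11 w12 w13 w14 w15 [pad] [pad]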
+
+void GNAPlugin::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
+    auto *generic = dynamic_cast<GenericLayer *>(layer.get());
+    std::string type;
+    std::vector<intel_pwl_segment_t> ptr_pwl_segments;
+    uint32_t num_rows;
+    uint32_t num_columns;
+    void *ptr_inputs;
+    void *ptr_outputs;
+
+    do {
+        if (generic == nullptr) {
+            type = layer->type;
+            break;
+        }
+
+        if (CaselessEq<string>()(layer->type, "activation")) {
+            type = generic->GetParamAsString("type");
+            break;
+        } else {
+            type = layer->type;
+            break;
+        }
+    } while (false);
+
+    auto inputs = layer->insData.begin()->lock();
+    auto outputs = *layer->outData.begin();
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
+    float output_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
+
+    auto orientation = (num_cnn_rows_out > 0) ? kDnnNonInterleavedOrientation : kDnnInterleavedOrientation;
+
+    if (inputs->dims.size() == 4) {
+        num_columns = FROM_IR_DIM(inputs, 3) * FROM_IR_DIM(inputs, 1);
+        num_rows = 1;
+    } else {
+        num_columns = FROM_IR_DIM(inputs, 2);
+        num_rows = FROM_IR_DIM(inputs, 1);
+    }
+
+    size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims))
+        * outputs->precision.size();
+
+    size_t num_data_bytes_in = InferenceEngine::details::product(begin(inputs->dims), end(inputs->dims))
+        * inputs->precision.size();
+
+    static caseless_unordered_map<std::string, DnnActivationType> supportedActivations = {
+        {"sigmoid", kActSigmoid},
+        {"tanh", kActTanh},
+        {"relu", kActRelu},
+        {"leakyrelu", kActLeakyRelu},
+        {"clamp", kActKaldiLstmClipping},
+        {"identity", kActIdentity}
+    };
+
+    auto it = supportedActivations.find(type);
+    if (it == supportedActivations.end()) {
+        THROW_GNA_EXCEPTION << "Activation function type not yet supported: " << type;
+    }
+    auto activation_type = DnnActivation::fromType(it->second);
+    activation_type.negative_slope = (it->second == kActRelu) ? dynamic_cast<ReLULayer*>(layer.get())->negative_slope : 0.0f;
+
+    // TODO: need to follow the graph dependency instead of the linear order
+    auto &prevComponent = dnnComponentsForLayer.back().second;
+    dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t());
+    auto &currentComponent = dnnComponentsForLayer.back().second;
+
+    intel_pwl_segment_t *ptr_pwl_segments_target = nullptr;
+
+    if (!inputs->precision.is_float()) {
+        // TODO: generalize activation function code
+        // now that scale factors are known, create PWL approximations to activation functions
+        float input_scale_factor = dnn.OutputScaleFactor(prevComponent);
+        if (uniformPwlDesign) {
+            switch (activation_type) {
+                case kActSigmoid:
+                    ptr_pwl_segments.resize(SIGMOID_NUM_SEGMENTS);
+                    break;
+                case kActTanh:
+                    ptr_pwl_segments.resize(TANH_NUM_SEGMENTS);
+                    break;
+                case kActRelu:
+                    ptr_pwl_segments.resize(RELU_NUM_SEGMENTS);
+                    break;
+                case kActLeakyRelu:
+                    ptr_pwl_segments.resize(RELU_NUM_SEGMENTS);
+                    break;
+                case kActKaldiLstmClipping:
+                case kActIdentity:
+                    ptr_pwl_segments.resize(IDENTITY_NUM_SEGMENTS);
+                    break;
+                case kActCustom:
+                default:
+                    THROW_GNA_EXCEPTION << "Activation function type not yet supported " << activation_type;
+            }
+            PwlDesign16(activation_type,
+                        &*ptr_pwl_segments.begin(),
+                        static_cast<uint32_t>(ptr_pwl_segments.size()),
+                        input_scale_factor,
+                        output_scale_factor);
+        } else {
+            PwlDesignOpt16(activation_type,
+                           ptr_pwl_segments,
+                           input_scale_factor,
+                           output_scale_factor);
+        }
+        ptr_pwl_segments_target = reinterpret_cast<intel_pwl_segment_t *>(&ptr_pwl_segments_target);
+    }
+
+    dnn.InitPiecewiseLinearComponent(currentComponent,
+                                     activation_type,
+                                     orientation,
+                                     num_rows,
+                                     num_columns,
+                                     inputs->precision.size(),
+                                     outputs->precision.size(),
+                                     ptr_pwl_segments.size(),
+                                     output_scale_factor,
+                                     ptr_inputs,
+                                     ptr_outputs,
+                                     ptr_pwl_segments_target);
+#ifdef PLOT
+#define GET_ACTIVATION_NAME(name)\
+case name:\
+    actName = #name;\
+    break;
+    string actName = "unknown";
+    switch (activation_type) {
+        GET_ACTIVATION_NAME(kActSigmoid);
+        GET_ACTIVATION_NAME(kActTanh);
+        GET_ACTIVATION_NAME(kActRelu);
+        GET_ACTIVATION_NAME(kActLeakyRelu);
+        GET_ACTIVATION_NAME(kActKaldiLstmClipping);
+        GET_ACTIVATION_NAME(kActIdentity);
+    }
+    cout << "IR layer : " << std::left << std::setw(20) << layer->name <<  actName << "_" << dnnComponentsForLayer.size() - 1 <<"\n";
+#endif
+
+    connectInput(layer, ptr_inputs, num_data_bytes_in);
+    connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out);
+
+    if (ptr_pwl_segments_target != nullptr) {
+        gnamem->readonly().push_local_ptr(ptr_pwl_segments_target,
+                                          &ptr_pwl_segments.front(),
+                                          ptr_pwl_segments.size() * sizeof(intel_pwl_segment_t),
+                                          64);
+    }
+}
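+
+// Note: GNA evaluates activations as piecewise-linear (PWL) functions: each
+// segment has a base point and a slope, and contributes
+// y = yBase + slope * (x - xBase) on its interval. A rough float sketch of
+// evaluating such a table (assuming a simplified Seg struct for illustration,
+// not the actual intel_pwl_segment_t layout):
+//
+//   struct Seg { float xBase, yBase, slope; };
+//   float pwl_eval(const std::vector<Seg> &segs, float x) {
+//       size_t k = 0;  // segments sorted by xBase; pick the last one with xBase <= x
+//       while (k + 1 < segs.size() && segs[k + 1].xBase <= x) ++k;
+//       return segs[k].yBase + segs[k].slope * (x - segs[k].xBase);
+//   }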
+
+
+void GNAPlugin::PermutePrimitive(InferenceEngine::CNNLayerPtr layer) {
+    auto layerOrder = layer->GetParamAsInts("order");
+
+    if (layerOrder != vector<int>({0, 3, 2, 1})) {
+        THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute order: was " << layer->GetParamAsString("order") <<
+                           ", but only support 0,3,2,1";
+    }
+}
+
+class LayersBuilder {
+    using CreatorFnc = std::function<void(GNAPlugin*, CNNLayerPtr)>;
+
+ public:
+    LayersBuilder(const std::vector<std::string> &types, CreatorFnc callback) {
+        for (auto && str : types) {
+            getStorage()[str] = callback;
+        }
+    }
+    static caseless_unordered_map<std::string, CreatorFnc> &getStorage() {
+        static caseless_unordered_map<std::string, CreatorFnc> LayerBuilder;
+        return LayerBuilder;
+    }
+};
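+
+// Illustration of the registration pattern: constructing a LayersBuilder at
+// static-initialization time registers one creator callback per listed type
+// name, and CreateLayerPrimitive below dispatches through the same map. A
+// hypothetical entry (the type name is an example, not a supported layer):
+//
+//   static LayersBuilder exampleBuilder(
+//       {"SomeType"},
+//       [](GNAPlugin *p, CNNLayerPtr l) { /* create the primitive for l */ });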
+
+#define CREATE(name) [](GNAPlugin *p, CNNLayerPtr l) {p->name(l);}
+void SKIP(GNAPlugin*, CNNLayerPtr) {}
+
+void GNAPlugin::CreateLayerPrimitive(CNNLayerPtr layer) {
+    static const LayersBuilder layersBuilder[] = {
+        {{"Input"}, [](GNAPlugin*, CNNLayerPtr l) {}},  // skip input layers they are not used in GNA lib, only as a memory blobs
+        {{"FullyConnected", "InnerProduct"}, CREATE(AffinePrimitive)},
+        {{"ScaleShift"}, CREATE(DiagonalPrimitive)},
+        {{"Eltwise"},
+         CREATE(EltwisePrimitive)},  // same as diagonal, but weights are taken from another layer's output rather than from the network
+        {{"Split"}, SKIP},  // skip information about which part of prev layer need to consume handle during layer creation
+        {{"Slice"}, SKIP},
+        {{"clamp", "sigmoid", "relu", "tanh", "identity"}, CREATE(PWLPrimitive)},
+        {{"Convolution"}, CREATE(ConvolutionPrimitive)},
+        {{"Permute"}, CREATE(PermutePrimitive)},  // permute of certain form (2D transpose) can be assimilated in followed FC layer
+        {{"Pooling"}, CREATE(PoolingPrimitive)},
+        {{"Power"} , CREATE(PowerPrimitive)},
+        {{"Concat"}, CREATE(ConcatPrimitive)},
+        {{"Reshape"}, SKIP},  // TODO: handled not in GNA but rather in GNA plugin
+        {{"Crop"}, CREATE(CropPrimitive)},
+        {{"Copy"}, CREATE(CopyPrimitive)},
+    };
+    auto it = LayersBuilder::getStorage().find(layer->type);
+    if (it != LayersBuilder::getStorage().end()) {
+        it->second(this, layer);
+    } else {
+        THROW_GNA_EXCEPTION << "Unsupported layer: " << layer->name << ":" << layer->type;
+    }
+}
+
+
+GNAPlugin::GNAPlugin(const std::map<std::string, std::string>& configMap) {
+    // holds actual value of a found key
+    std::string value;
+    auto if_set = [&](std::string key, const std::function<void()> & handler) {
+        auto keyInMap = configMap.find(key);
+        if (keyInMap != configMap.end()) {
+            value = keyInMap->second;
+            handler();
+        }
+    };
+
+    if_set(GNA_CONFIG_KEY(SCALE_FACTOR), [&] {
+        input_scale_factor = std::stod(value);
+    });
+
+    if_set(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), [&] {
+        dumpXNNPath = value;
+    });
+
+    if_set(GNA_CONFIG_KEY(DEVICE_MODE), [&] {
+        static caseless_unordered_map <std::string, uint32_t> supported_values = {
+            {GNAConfigParams::GNA_AUTO, GNA_AUTO},
+            {GNAConfigParams::GNA_HW, GNA_HARDWARE},
+            {GNAConfigParams::GNA_SW, GNA_SOFTWARE},
+            {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE}
+        };
+        auto procType = supported_values.find(value);
+        if (procType == supported_values.end()) {
+            THROW_GNA_EXCEPTION << "GNA device mode unsupported: " << value;
+        }
+        gna_proc_type = static_cast<intel_gna_proc_t>(procType->second);
+    });
+
+    if_set(GNA_CONFIG_KEY(COMPACT_MODE), [&] {
+        if (value == PluginConfigParams::YES) {
+            compact_mode = true;
+        } else if (value == PluginConfigParams::NO) {
+            compact_mode = false;
+        } else {
+            THROW_GNA_EXCEPTION << "GNA compact mode should be YES/NO, but not" << value;
+        }
+    });
+
+    if_set(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), [&] {
+        if (value == PluginConfigParams::YES) {
+            exclusive_async_requests = true;
+        } else if (value == PluginConfigParams::NO) {
+            exclusive_async_requests = false;
+        } else {
+            THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not " << value;
+        }
+    });
+
+    if_set(GNA_CONFIG_KEY(PRECISION), [&] {
+        auto precision = Precision::FromStr(value);
+        if (precision != Precision::I8 && precision != Precision::I16) {
+            THROW_GNA_EXCEPTION << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
+        }
+        gnaPrecision = precision;
+    });
+
+    if_set(GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), [&] {
+        if (value == PluginConfigParams::YES) {
+            uniformPwlDesign = true;
+        } else if (value == PluginConfigParams::NO) {
+            uniformPwlDesign = false;
+        } else {
+            THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter "
+                                                            << "should be equal to YES/NO, but not" << value;
+        }
+    });
+
+    if_set(CONFIG_KEY(PERF_COUNT), [&] {
+        if (value == PluginConfigParams::YES) {
+            performance_counting = true;
+        } else if (value == PluginConfigParams::NO) {
+            performance_counting = false;
+        } else {
+            THROW_GNA_EXCEPTION << "GNA performance counter enabling parameter "
+                                                            << "should be equal to YES/NO, but not" << value;
+        }
+    });
+
+    if_set(GNA_CONFIG_KEY(LIB_N_THREADS), [&] {
+        uint64_t lib_threads = std::stoul(value, NULL, 10);
+        if (lib_threads == 0 || lib_threads > std::numeric_limits<uint8_t>::max()/2-1) {
+            THROW_GNA_EXCEPTION << "Unsupported accelerator lib number of threads: " << value
+                                                            << ", should be greateer than 0 and less than 127";
+        }
+        gna_lib_async_threads_num = lib_threads;
+    });
+
+    if_set(CONFIG_KEY(SINGLE_THREAD), [&] {
+        if (value == PluginConfigParams::YES) {
+            gna_openmp_multithreading = false;
+        } else if (value == PluginConfigParams::NO) {
+            gna_openmp_multithreading = true;
+        } else {
+            THROW_GNA_EXCEPTION << "SINGLE_THREAD should be YES/NO, but not " << value;
+        }
+    });
+}
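+
+// An illustrative configuration map for this constructor (the values are
+// examples, not defaults):
+//
+//   std::map<std::string, std::string> config = {
+//       {GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_SW_EXACT},
+//       {GNA_CONFIG_KEY(SCALE_FACTOR), "2048"},
+//       {GNA_CONFIG_KEY(PRECISION), "I16"},
+//       {GNA_CONFIG_KEY(LIB_N_THREADS), "4"},
+//   };
+//   GNAPlugin plugin(config);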
+
+GNAPluginNS::GNAPlugin::LayerType GNAPlugin::LayerTypeFromStr(const std::string &str) {
+    static const caseless_map<std::string, GNAPlugin::LayerType> LayerNameToType = {
+        { "Input" , Input },
+        { "Convolution" , Convolution },
+        { "ReLU" , ReLU },
+        { "Sigmoid" , Sigmoid },
+        { "TanH" , TanH },
+        { "Pooling" , Pooling },
+        { "FullyConnected" , FullyConnected },
+        { "InnerProduct" , InnerProduct},
+        { "Split" , Split },
+        { "Slice" , Slice },
+        { "Eltwise" , Eltwise },
+        { "Reshape" , Reshape },
+        { "ScaleShift" , ScaleShift },
+        { "Clamp" , Clamp },
+        { "Concat" , Concat },
+        { "Copy", Copy },
+        { "Permute" , Permute },
+        { "Power" , Power},
+        { "Memory" , Memory },
+        { "Crop" , Crop }
+    };
+    auto it = LayerNameToType.find(str);
+    if (it != LayerNameToType.end())
+        return it->second;
+    else
+        return NO_TYPE;
+}
+
+bool GNAPlugin::AreLayersSupported(ICNNNetwork& network, std::string& errMessage) {
+    CNNLayerSet inputLayers;
+    InferenceEngine::InputsDataMap inputs;
+    std::unordered_set<CNNLayer *> allLayers;
+    auto specifiedDevice = network.getTargetDevice();
+    auto network_precision = network.getPrecision();
+    network.getInputsInfo(inputs);
+    auto network_input_precision = inputs.begin()->second->getInputPrecision();
+    auto batch_size = network.getBatchSize();
+    if (network_precision != Precision::FP32) {
+        errMessage = "The plugin does not support networks with " + std::string(network_precision.name()) + " format.\n";
+        return false;
+    }
+    if (network_input_precision != Precision::FP32 &&
+        network_input_precision != Precision::I16) {
+        errMessage = "The plugin does not support input precision with " + std::string(network_input_precision.name()) + " format.\n";
+        return false;
+    }
+    if (specifiedDevice != InferenceEngine::TargetDevice::eCPU &&
+        specifiedDevice != InferenceEngine::TargetDevice::eGNA &&
+        specifiedDevice != InferenceEngine::TargetDevice::eDefault) {
+        errMessage = "The plugin does not support target device: " + std::string(getDeviceName(specifiedDevice)) + ".\n";
+        return false;
+    }
+
+    if (inputs.empty()) {
+        errMessage = "Network is empty (GNA)\n";
+        return false;
+    }
+
+    auto & secondLayers = inputs.begin()->second->getInputData()->getInputTo();
+    if (secondLayers.empty()) {
+        errMessage = "Network consists of input layer only (GNA)\n";
+        return false;
+    }
+
+    bool check_result = true;
+    InferenceEngine::details::UnorderedDFS(allLayers,
+                                           secondLayers.begin()->second,
+                                           [&](const CNNLayerPtr layer) {
+                                                if (LayerTypeFromStr(layer->type) == NO_TYPE) {
+                                                    errMessage = "Layer is unsupported by GNA: " + layer->name + ":" + layer->type + "\n";
+                                                    check_result = false;
+                                                }
+                                                if (batch_size != 1 && LayerInfo::isBatchSizeConstrained(layer->type)) {
+                                                    check_result = false;
+                                                }
+                                            }, false);
+
+    return check_result;
+}
+
+void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
+    //  Check the input network
+    std::string error;
+    if (!AreLayersSupported(network, error)) {
+        THROW_GNA_EXCEPTION << error.c_str();
+    }
+
+    // network optimisation phases
+    auto run_passes = [&] (CNNNetPtr network) {
+        auto layers = CNNNetSortTopologically(*network.get());
+        substitutePRelu(layers);
+        layers = CNNNetSortTopologically(*network.get());
+        reorderMaxPool(layers);
+        applyOrientations(layers);
+        insertIdentityLayer(layers);
+        insertDiagonalLayer(layers);
+    };
+
+    Config supported = Config({
+        {TargetDevice::eGNA, Precision::FP32, [&](InferenceEngine::ICNNNetwork &network) -> CNNNetworkPtr {
+            if (gnaPrecision == Precision::I16) {
+                ModelQuantizer<QuantI16> q;
+                return q.quantize(network, run_passes, input_scale_factor);
+            }
+
+            if (gnaPrecision == Precision::I8) {
+                ModelQuantizer<QuantI8> q;
+                return q.quantize(network, run_passes, input_scale_factor);
+            }
+            THROW_GNA_EXCEPTION << "no mans land for GNA precision";
+        }},
+        // TODO: need to have advanced precision matcher based on layers/biases
+        {TargetDevice::eGNA, Precision::MIXED},
+        {TargetDevice::eGNA, Precision::I16},
+        {TargetDevice::eCPU, Precision::FP32
+#define EMULATE_GNA_API_LAYERS
+#ifdef  EMULATE_GNA_API_LAYERS
+            , [&](InferenceEngine::ICNNNetwork & network) {
+            auto visitor = [&](InferenceEngine::CNNLayerPtr lp) {
+                return lp;
+            };
+            auto copiedNet = InferenceEngine::CNNNetCopy(network, visitor);
+            run_passes(copiedNet);
+
+            return copiedNet;
+        }
+#endif
+    }
+    });
+
+    supported.setDefaultDevice(TargetDevice::eGNA);
+    auto newNet = supported.find_configuration(network).convert(network);
+    auto networkPrecision = newNet->getPrecision();
+
+    if (!networkPrecision.is_float()) {
+        gnadevice.reset(new GNADeviceHelper(gna_proc_type,
+                                            gna_lib_async_threads_num,
+                                            gna_openmp_multithreading,
+                                            performance_counting));
+        gnamem.reset(new gna_memory_type(
+                    make_polymorph<GNAAllocator>(*gnadevice.get()), PAGE_SIZE_BYTES));
+    } else {
+        gnamem.reset(new gna_memory_type(make_polymorph<std::allocator<uint8_t>>()));
+    }
+
+    // creating intel dnn_t structures from network
+    auto sortedNet = CNNNetSortTopologically(*newNet);
+    std::vector<CNNLayerPtr> sortedNoMem;
+    std::map<std::string,
+                    std::vector<InferenceEngine::CNNLayerPtr>> memoryPairs;
+    // find all memory layers pairs and mark which one used as outputs
+    for (auto &layer : sortedNet) {
+        auto generic = dynamic_cast<GenericLayer *>(layer.get());
+        if (generic == nullptr) {
+            sortedNoMem.push_back(layer);
+            continue;
+        }
+        LayerInfo layerInfo(layer);
+        if (layerInfo.isMemory()) {
+            // collect all memory pairs
+            auto id = generic->GetParamAsString("id");
+            memoryPairs[id].resize(generic->GetParamAsInt("size"));
+            memoryPairs[id][generic->GetParamAsInt("index")] = layer;
+            continue;
+        } else if (layerInfo.isConcat()) {
+            fillConcatConnections(layer);
+        } else if (layerInfo.isSplit() || layerInfo.isSlice()) {
+            fillSplitConnections(layer);
+        }
+        sortedNoMem.push_back(layer);
+    }
+
+    // fill in extra storage with memory layers
+    fillMemoryConnections(memoryPairs);
+
+    // keep inputs information and create input primitives
+    newNet->getInputsInfo(inputsDataMap);
+    if (inputsDataMap.empty()) {
+        THROW_GNA_EXCEPTION << " No inputs for the topology";
+    }
+    if (inputsDataMap.size() != 1) {
+        THROW_GNA_EXCEPTION << " cannot infer topologies with more than one inputs";
+    }
+
+    inputDims = inputsDataMap.begin()->second->getDims();
+
+    // keep output dims
+    newNet->getOutputsInfo(outputsDataMap);
+    if (outputsDataMap.empty()) {
+        THROW_GNA_EXCEPTION << "No outputs for the topology";
+    }
+    if (outputsDataMap.size() != 1) {
+        THROW_GNA_EXCEPTION << "cannot infer topologies with more than one output";
+    }
+    outputDims = outputsDataMap.begin()->second->dims;
+
+    ptr_inputs_global.resize(gna_lib_async_threads_num);
+    ptr_outputs_global.resize(gna_lib_async_threads_num);
+    // Creating layer primitives
+    // TODO: solely gna_example convolution hack
+    num_feature_maps = 1;
+    for (auto layer = sortedNoMem.begin(); layer != sortedNoMem.end(); ++layer) {
+        CreateLayerPrimitive(*layer);
+    }
+    gnamem->bind_ptr(&ptr_outputs_global.front(), &dnnComponentsForLayer.back().second.ptr_outputs);
+
+    // make room for active list
+    auto &last_component = dnnComponentsForLayer.back().second;
+    gnamem->reserve_ptr(nullptr, ALIGN64(last_component.num_bytes_per_output * last_component.num_rows_out));
+
+    void *pParallelExecutionData = nullptr;
+
+    // reserving more bytes for intermediate data in the parallel case - TODO: this works incorrectly in compact mode at least
+    rwSegmentSize = gnamem->getRWBytes();
+    if (gna_lib_async_threads_num > 1) {
+        gnamem->reserve_ptr(&pParallelExecutionData, gnamem->getRWBytes() * (gna_lib_async_threads_num - 1));
+    }
+
+    gnamem->commit();
+
+    dnn.Init(gnamem->getBasePtr(),
+             gnamem->getTotalBytes(),
+             networkPrecision.is_float() ? kDnnFloat : kDnnInt,
+             1);
+
+    // TODO: this copy is unneeded - in fact we can create the GNA structs directly from the list
+    for (auto &element : dnnComponentsForLayer) {
+        dnn.component.push_back(element.second);
+    }
+
+    // in fp32 mode the last PWL cannot be computed without this
+    dnn.InitActiveList(NULL);
+
+    nnets.push_back(std::make_tuple(make_shared<CPPWrapper<intel_nnet_type_t>>(0), -1, InferenceEngine::BlobMap()));
+
+    if (!networkPrecision.is_float()) {
+        // the number of layers gets calculated inside InitGNAStruct
+        dnn.InitGNAStruct(&std::get<0>(nnets.front())->obj);
+    }
+
+    // creating the same GNA RW segment for parallel infer requests
+    for (int i = 1; i != gna_lib_async_threads_num; i++) {
+        nnets.push_back(std::make_tuple(make_shared<CPPWrapper<intel_nnet_type_t>>(0), -1, InferenceEngine::BlobMap()));
+
+        // this could be improved by just copying all the structures, but we are too lazy
+        dnn.InitGNAStruct(&std::get<0>(nnets.back())->obj);
+
+        // relocate rw pointers to new offset
+        auto basePtr = reinterpret_cast<uint8_t*>(pParallelExecutionData) + rwSegmentSize * (i - 1);
+
+        auto relocate = [basePtr, this](void *& ptr_out, void * ptr_in) {
+            if (ptr_in == nullptr) {
+                ptr_out = nullptr;
+            } else {
+                auto offset = reinterpret_cast<uint8_t *>(ptr_in) - reinterpret_cast<uint8_t *>(gnamem->getBasePtr());
+                ptr_out = basePtr + offset;
+            }
+        };
+
+        relocate(ptr_inputs_global[i], ptr_inputs_global[0]);
+        relocate(ptr_outputs_global[i], ptr_outputs_global[0]);
+        for (int j = 0; j != std::get<0>(nnets.front())->obj.nLayers; j++) {
+            auto & layer = std::get<0>(nnets[i])->obj.pLayers[j];
+
+            relocate(layer.pInputs, layer.pInputs);
+            relocate(layer.pOutputs, layer.pOutputs);
+            relocate(layer.pOutputsIntermediate, layer.pOutputsIntermediate);
+        }
+    }
+    orientation_in = dnn.component[0].orientation_in;
+    orientation_out = dnn.component[dnn.num_components()-1].orientation_out;
+    num_bytes_per_output = dnn.component[dnn.num_components()-1].num_bytes_per_output;
+
+    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(sortedNoMem.back());
+    output_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
+
+    num_rotate_rows = dnn.num_rotate_rows;
+    num_rotate_columns = dnn.num_rotate_columns;
+
+    DumpXNNToFile();
+
+#ifdef PLOT
+    dnn.WriteGraphWizModel("graph.dot");
+    // ExportGnaNetworkAndrzej("layers/loaded_from_ir", &nnet->obj);
+#endif
+}
+void GNAPlugin::DumpXNNToFile() const {
+    // TODO: output precision as well as pointer might be incorrect, for LSTM for sure
+    // GNA seems to automatically set layer 0 as output and adjust its pointer / precision / size accordingly
+    if (!dumpXNNPath.empty()) {
+        if (!gnadevice) {
+            THROW_GNA_EXCEPTION << "Cannot generate XNNDump for float network";
+        }
+        auto dump = gnadevice->dumpXnn(&std::get<0>(nnets.front())->obj, ptr_active_indices, num_active_indices);
+        dump.header.rw_region_size = gnamem->getRWBytes();
+        dump.header.input_scaling_factor = input_scale_factor;
+        dump.header.output_scaling_factor = output_scale_factor;
+        std::ofstream dumpStream(dumpXNNPath, std::ios::out | std::ios::binary);
+        dumpStream.write(reinterpret_cast<char*>(&dump.header), sizeof(intel_gna_model_header));
+        dumpStream.write(reinterpret_cast<char*>(dump.model.get()), dump.header.model_size);
+    }
+}
+
+void RotateFeatures(uint8_t *ptr_feat,
+                    size_t element_size,
+                    uint32_t num_feature_vectors,
+                    uint32_t num_feature_vector_elements,
+                    uint32_t num_rotate_rows,
+                    uint32_t num_rotate_columns) {
+    if (num_feature_vector_elements == num_rotate_rows * num_rotate_columns) {
+        std::vector<uint8_t> temp(num_feature_vector_elements * element_size);
+        for (uint32_t k = 0; k < num_feature_vectors; k++) {
+            uint8_t *ptr_in = ptr_feat + k * num_feature_vector_elements * element_size;
+            for (uint32_t i = 0; i < num_rotate_rows; i++) {
+                for (uint32_t j = 0; j < num_rotate_columns; j++) {
+                    ie_memcpy(&temp.front() + (j * num_rotate_rows + i)*element_size,
+                              temp.size() - (j * num_rotate_rows + i)*element_size,
+                              ptr_in + (i * num_rotate_columns + j)*element_size,
+                              element_size);
+                }
+            }
+            memcpy(ptr_in, &temp.front(), num_feature_vector_elements * element_size);
+        }
+    } else {
+        THROW_GNA_EXCEPTION << "Rotate dimensions (" << num_rotate_rows << "," << num_rotate_columns
+                           <<") do not match buffer length of "<< num_feature_vector_elements <<" in RotateFeatures()!";
+    }
+}
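+
+// Worked example: with num_rotate_rows = 2 and num_rotate_columns = 3, a
+// feature vector stored row-major as [a b c d e f] is rewritten as
+// [a d b e c f] - element (i, j) moves from index i * 3 + j to index
+// j * 2 + i, i.e. the transpose performed by the loops above.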
+
+uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) {
+    return QueueInference(*input.begin()->second.get(), result);
+
+    /*if (!syncPoints.empty()) {
+        syncPoints.back().second = result;
+    }*/
+}
+
+uint32_t GNAPlugin::QueueInference(const InferenceEngine::Blob &input, InferenceEngine::BlobMap &result) {
+    auto inputLayout = input.layout();
+    if (inputLayout != Layout::NC && inputLayout != Layout::CN && inputLayout != NCHW) {
+        THROW_GNA_EXCEPTION << "Expected input blob to have Layout::NC, Layout::CN, or Layout::NCHW, but was: " << input.layout();
+    }
+    if (inputLayout == NCHW) {
+        inputLayout = NC;
+    }
+    auto is2D = input.layout() == Layout::NC || input.layout() == Layout::CN;
+
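+    // each nnets entry is a tuple of (network wrapper, request id, result blobs);
+    // a request id of -1 marks the slot as free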
+    auto freeNnet = std::find_if(std::begin(nnets), std::end(nnets), [](decltype(nnets.front()) & item) {
+        return std::get<1>(item) == -1;
+    });
+
+    if (freeNnet == nnets.end()) {
+        THROW_IE_EXCEPTION << as_status << REQUEST_BUSY
+                           << "GNA executable network has max of " << static_cast<uint32_t >(gna_lib_async_threads_num)
+                           << " parallel infer requests, please sync one of already running";
+    }
+
+    auto nnet = std::get<0>(*freeNnet).get();
+    auto idx = static_cast<uint32_t>(std::distance(std::begin(nnets), freeNnet));
+
+    if (ptr_inputs_global[idx] == nullptr) {
+        // should not happen in user code, but might happen if a GNAPlugin instance is integrated without an executable network
+        THROW_GNA_EXCEPTION << "network not loaded : global input pointer not set";
+    }
+
+    if (orientation_in == kDnnUnknownOrientation) {
+        // should not happen in user code, but might happen if a GNAPlugin instance is integrated without an executable network
+        THROW_GNA_EXCEPTION << "network not loaded : input orientation not set";
+    }
+
+    if (orientation_out == kDnnUnknownOrientation) {
+        // should not happen in user code, but might happen if a GNAPlugin instance is integrated without an executable network
+        THROW_GNA_EXCEPTION << "network not loaded : output orientation not set";
+    }
+
+    ImportFrames(ptr_inputs_global[idx],
+                 input.cbuffer().as<float *>(),
+                 input.precision(),
+                 orientation_in,
+                 input.dims()[input.dims().size() - 1],
+                 is2D ? input.dims()[1] : input.dims()[input.dims().size() - 1],
+                 is2D ? input.dims()[0] : input.dims()[0] * input.dims()[2],
+                 is2D ? input.dims()[0] : input.dims()[0] * input.dims()[2]);
+
+    if ((inputLayout == Layout::NC || inputLayout == Layout::NCHW) != (orientation_in == kDnnInterleavedOrientation)) {
+        RotateFeatures(reinterpret_cast<uint8_t*>(ptr_inputs_global[idx]),
+                       gnadevice ? 2 : 4,
+                       // TODO: only works for cnn4a and google command so far
+                       input.dims()[input.dims().size() - 1],
+                       is2D ? input.dims()[0] : input.dims()[0] * input.dims()[2],  // num_feature_vectors - looks like the batch should be here
+                       num_rotate_rows,
+                       num_rotate_columns);
+    }
+
+    if (!gnadevice) {
+        dnn.Propagate();
+        std::get<1>(*freeNnet) = 1;
+    } else {
+        std::get<1>(*freeNnet) = gnadevice->propagate(&nnet->obj, ptr_active_indices, num_active_indices);
+    }
+    std::get<2>(*freeNnet) = result;
+    return idx;
+}
+
+void GNAPlugin::Wait(uint32_t idx) {
+    // already synced; TODO: might a copy be required?
+    if (std::get<1>(nnets[idx]) == -1) return;
+
+    if (gnadevice) {
+        gnadevice->wait(std::get<1>(nnets[idx]));
+    }
+
+    std::get<1>(nnets[idx]) = -1;
+    auto & output = *std::get<2>(nnets[idx]).begin()->second;
+#ifdef PLOT
+    dnn.BeginNewWrite();
+    if (dnn.num_components() != 0) {
+        dnn.WriteDnnText("Net_.txt", kDnnFloat);
+        dnn.WriteInputAndOutputText();
+    }
+    dnn.WriteInputAndOutputTextGNA(&std::get<0>(nnets.front())->obj);
+#endif
+
+    if (output.layout() == Layout::NC) {
+        // TODO: rotate can be incorporated with exporting - used only in unit tests so far
+        // TODO: restore:
+//        if (orientation_out != kDnnInterleavedOrientation) {
+//            RotateFeatures(reinterpret_cast<uint8_t*>(ptr_outputs_global),
+//                           gnadevice ? 2 : 4,
+//                           input.dims()[input.dims().size() - 1],
+//                           input.dims()[0],  // num_feature_vectors looks batch should be there
+//                           input.dims()[0],
+//                           input.dims()[input.dims().size() - 1]);
+//        }
+
+        ExportScores(output.buffer(),
+                     ptr_outputs_global[idx],
+                     orientation_out,
+                     output.dims()[output.dims().size() - 1],
+                     output.dims()[1],
+                     output.dims()[0],
+                     output.dims()[0],
+                     output.dims()[0],
+                     // TODO: create better getter consider multiple outputs case
+                     gnadevice ? std::get<0>(nnets[idx])->obj.pLayers[std::get<0>(nnets[idx])->obj.nLayers - 1].nBytesPerOutput : sizeof(float),
+                     sizeof(float));
+    } else if (output.layout() != Layout::CN) {
+        THROW_GNA_EXCEPTION << "Expected output blob to have Layout::NC or Layout::CN. But was " << output.layout();
+    }
+
+    if (gnadevice) {
+#ifdef PLOT
+        FILE *f = nullptr;
+        static int num_infers = 0;
+        {
+            f = fopen("ex_scores.txt", "w");
+        }
+        num_infers++;
+        if (f) {
+            for (int i = 0; i < output.dims()[1]; i++) {
+                for (int j = 0; j < output.dims()[0]; j++) {
+                    fprintf(f, "%d ", output.cbuffer().as<int32_t *>()[output.dims()[0] * i + j]);
+                }
+                fprintf(f, "\n");
+            }
+            fprintf(f, "\n\n");
+        }
+#endif
+        ConvertToFloat(output.buffer(),
+                       output.buffer(),
+                       output.dims()[0],
+                       output.dims()[1],
+                       output_scale_factor);
+#ifdef PLOT
+        if (f) {
+            for (int i = 0; i < output.dims()[1]; i++) {
+                for (int j = 0; j < output.dims()[0]; j++) {
+                    fprintf(f, "%.2f ", output.cbuffer().as<float *>()[output.dims()[0] * i + j]);
+                }
+                fprintf(f, "\n");
+            }
+            fclose(f);
+        }
+#endif
+    }
+}
+
+
+void GNAPlugin::Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &output) {
+    BlobMap result;
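+    // wrap the caller-owned output blob into a non-owning shared_ptr: the no-op
+    // deleter keeps the BlobMap from freeing a blob it does not own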
+    result["output"] = std::shared_ptr<Blob>(&output, [](Blob*){});
+    Wait(QueueInference(input, result));
+}
+
+void GNAPlugin::Reset() {
+    for (auto && memLayer : memory_connection) {
+        std::memset(memLayer.second.gna_ptr, 0, memLayer.second.reserved_size);
+    }
+    for (auto && concatLayer : concat_connection) {
+        std::memset(concatLayer.second.gna_ptr, 0, concatLayer.second.reserved_size);
+    }
+}
+
+void GNAPlugin::Infer(const BlobMap &inputs, BlobMap &result) {
+    auto &input = *inputs.begin()->second.get();
+    auto &output = *result.begin()->second.get();
+    Infer(input, output);
+}
+
+Blob::Ptr GNAPlugin::GetOutputBlob(InferenceEngine::Precision precision) {
+    // need to have intermediate blob for interleave conversion
+    InferenceEngine::Blob::Ptr outputBlob;
+    outputBlob = make_blob_with_precision(precision, NC, outputDims);
+    outputBlob->allocate();
+    return outputBlob;
+}
+
+Blob::Ptr GNAPlugin::GetInputBlob(InferenceEngine::Precision precision) {
+    InferenceEngine::Blob::Ptr inputBlob;
+    // need to have intermediate blob for interleave conversion
+    // TODO: NCHW format support is experimental = c++ MO did insert reshape, while TF mo - not
+    inputBlob = make_blob_with_precision(precision, inputDims.size() == 2 ? NC : NCHW, inputDims);
+    inputBlob->allocate();
+    return inputBlob;
+}
+
+std::vector<InferenceEngine::MemoryStateInternal::Ptr>  GNAPlugin::QueryState() {
+    if (memory_connection.empty()) {
+        return {};
+    }
+
+    return {std::make_shared<GNAMemoryState>(shared_from_this())};
+}
+
+InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::string &modelFileName) {
+    // no need to return anything due to the weird design of the internal base classes
+    std::fstream inputStream(modelFileName, ios_base::in | ios_base::binary);
+    if (inputStream.fail()) {
+        THROW_GNA_EXCEPTION << "Cannot open file to import model: " << modelFileName;
+    }
+
+    auto header = GNAModelSerial::ReadHeader(inputStream);
+
+    gnadevice.reset(new GNADeviceHelper(gna_proc_type,
+                                        gna_lib_async_threads_num,
+                                        gna_openmp_multithreading));
+    gnamem.reset(new gna_memory_type(make_polymorph<GNAAllocator>(*gnadevice.get()), PAGE_SIZE_BYTES));
+
+    void *basePtr = nullptr;
+    gnamem->reserve_ptr(&basePtr, header.gnaMemSize);
+    gnamem->commit();
+
+    nnets.push_back(std::make_tuple(make_shared<CPPWrapper<intel_nnet_type_t>>(header.layersCount), -1, InferenceEngine::BlobMap()));
+    std::get<0>(nnets.back())->obj.nGroup = header.nGroup;
+    GNAModelSerial::MemoryType  mt;
+    auto serial = GNAModelSerial(&std::get<0>(nnets.back())->obj, mt);
+    serial.Import(basePtr, header.gnaMemSize, inputStream);
+
+    ptr_inputs_global.push_back(reinterpret_cast<float*>(reinterpret_cast<uint8_t *> (basePtr) + header.input.descriptor_offset));
+    ptr_outputs_global.push_back(reinterpret_cast<float*>(reinterpret_cast<uint8_t *> (basePtr) + header.output.descriptor_offset));
+
+    auto getOrientation = [](intel_nnet_layer_t & layer) {
+        return layer.nLayerKind == INTEL_CONVOLUTIONAL ?
+           kDnnNonInterleavedOrientation : kDnnInterleavedOrientation;
+    };
+
+    orientation_in = getOrientation(std::get<0>(nnets.back())->obj.pLayers[0]);
+    orientation_out = getOrientation(std::get<0>(nnets.back())->obj.pLayers[std::get<0>(nnets.back())->obj.nLayers-1]);
+
+    num_bytes_per_output = header.output.element_size;
+
+
+    outputDims = SizeVector({header.output.elements_count / header.nGroup, header.nGroup});
+    inputDims = SizeVector({header.input.elements_count / header.nGroup, header.nGroup});
+
+    inputsDataMap["input"] = std::make_shared<InputInfo>();
+    inputsDataMap["input"]->setInputData(make_shared<Data>("input",
+                                                           inputDims,
+                                                           Precision::FP32,
+                                                           Layout::NC));
+    outputsDataMap["output"] = make_shared<Data>("output",
+                                                 outputDims,
+                                                 Precision::FP32,
+                                                 Layout::NC);
+
+    output_scale_factor = header.output.scaleFactor;
+    input_scale_factor = header.input.scaleFactor;
+
+    num_rotate_rows = header.nRotateRows;
+    num_rotate_columns = header.nRotateColumns;
+
+    for (auto && memory : mt) {
+        GNAMemoryLayer memoryLayer(nullptr, nullptr);
+        memoryLayer.gna_ptr = memory.first;
+        memoryLayer.reserved_size = memory.second;
+
+        memory_connection.emplace_back(make_pair(std::string("noname"), memoryLayer));
+    }
+
+    DumpXNNToFile();
+
+#ifdef PLOT
+    dnn.WriteGraphWizModel("graph.dot");
+    // ExportGnaNetworkAndrzej("layers/loaded_from_aot_file", &nnet->obj);
+#endif
+
+    return nullptr;
+}
+
+void GNAPlugin::Export(const std::string &fileName) {
+    if (ptr_inputs_global.empty() || ptr_outputs_global.empty()) {
+        THROW_GNA_EXCEPTION << " network not loaded";
+    }
+
+    std::fstream outStream(fileName, ios_base::out | ios_base::binary);
+
+    // TODO: the nnet group parameter looks to be used only by the application - can we move this line into LoadNetwork?
+    if (inputDims.size() == 2) {
+        std::get<0>(nnets.front())->obj.nGroup = inputDims[1];
+    }
+
+    auto serial = GNAModelSerial(&std::get<0>(nnets.front())->obj,
+                   {input_scale_factor,
+                    ptr_inputs_global[0],
+                    2,
+                    static_cast<uint32_t>(InferenceEngine::details::product(inputsDataMap.begin()->second->getDims()))},
+                   {output_scale_factor,
+                    ptr_outputs_global[0],
+                    num_bytes_per_output,
+                    static_cast<uint32_t>(InferenceEngine::details::product(outputsDataMap.begin()->second->getDims()))})
+        .SetInputRotation(dnn.num_rotate_rows, dnn.num_rotate_columns);
+
+    for (auto && memoryConnection : memory_connection) {
+        serial.AddState(memoryConnection.second.gna_ptr, memoryConnection.second.reserved_size);
+    }
+
+    serial.Export(gnamem->getBasePtr(), gnamem->getTotalBytes(), outStream);
+}
+
+void GNAPlugin::GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) {
+    if (performance_counting) {
+        gnadevice->getGnaPerfCounters(perfMap);
+    }
+}
+
+void GNAPlugin::AddExtension(InferenceEngine::IExtensionPtr extension) {}
+void GNAPlugin::SetConfig(const std::map<std::string, std::string> &config) {}
+
+intel_dnn_component_t * GNAPlugin::find_first_unused_input(InferenceEngine::CNNLayerPtr current) {
+    if (current->insData.empty()) return nullptr;
+
+    auto prev_layer = current->insData.front().lock()->creatorLayer.lock();
+
+    return findDnnLayer(prev_layer);
+}
+void GNAPlugin::connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr, void *ptr_inputs, size_t num_data_bytes_out) {
+    gnalog() << "Connecting output " << layer->name << " ...\n";
+    // in case of a Memory layer, its input is allocated in the memory-input layer
+    if (layer->outData.size() == 1) {
+        for (auto &&outLayer : layer->outData.front()->getInputTo()) {
+            auto& nextLayer = outLayer.second;
+            auto nextMemoryLayerIt =
+                std::find_if(begin(memory_connection), end(memory_connection),
+                                                        [&](MemoryConnection::value_type &comp) {
+                                                            return comp.second.getOutput()->name
+                                                                                == nextLayer->name;
+                                                        });
+            if (nextMemoryLayerIt != memory_connection.end()) {
+                auto &nextMemoryLayer = nextMemoryLayerIt->second;
+                // memory layer not yet initialized
+                if (nextMemoryLayer.reserved_size == 0) {
+                    gnamem->reserve_ptr(&nextMemoryLayer.gna_ptr, ALIGN64(num_data_bytes_out));
+                    gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, 0);
+
+                    nextMemoryLayer.reserved_offset = 0;
+                    nextMemoryLayer.reserved_size = ALIGN64(num_data_bytes_out);
+                } else {
+                    IE_ASSERT(nextMemoryLayer.reserved_size == ALIGN64(num_data_bytes_out));
+                    // same offsets
+                    gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, nextMemoryLayer.reserved_offset);
+                }
+                return;
+            }
+        }
+
+        // if one of the next layers is a concat...
+        for (auto &&outLayer : layer->outData.front()->getInputTo()) {
+            auto nextLayer = outLayer.second;
+            if ( LayerInfo(nextLayer).isConcat() ) {
+                auto& name = layer->name;
+                // look this concat layer up in the extra concat map
+                auto concatLayerInfo = concat_connection.find(
+                                nextLayer->name);
+
+                if (concatLayerInfo != concat_connection.end()) {
+                    auto &concatLayerInfoItem = concatLayerInfo->second;
+
+                    // find this layer among the concat's recorded input layers
+                    auto it = std::find_if(concatLayerInfoItem.concatInputLayers.begin(),
+                                            concatLayerInfoItem.concatInputLayers.end(),
+                                            [&name](GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo &item) {
+                                                return item.name == name;
+                                            });
+                    // reserve full size for concat
+                    if (!concatLayerInfoItem.output_allocation_flag) {
+                        // check whether this concat is itself an input of another concat
+                        // by walking each concat and checking its inputs
+                        auto included =
+                            std::find_if(concat_connection.begin(),
+                                           concat_connection.end(),
+                               [&concatLayerInfo]
+                                    (const std::pair<std::string, GNAPlugin::GNAConcatLayer> &concatItem) -> bool {
+                                        auto it = std::find_if(concatItem.second.concatInputLayers.begin(),
+                                                        concatItem.second.concatInputLayers.end(),
+                                                        [&concatLayerInfo]
+                                                            (const GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo &item) -> bool {
+                                                                            return item.name == concatLayerInfo->first;
+                                                            });
+                                        return it != concatItem.second.concatInputLayers.end();
+                                    });
+                        if (included == concat_connection.end()) {
+                            gnamem->reserve_ptr(&concatLayerInfoItem.gna_ptr, ALIGN64(concatLayerInfoItem.reserved_size));
+                        }
+                        concatLayerInfo->second.output_allocation_flag = true;
+                    }
+                    gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, it->offset);
+                } else {
+                    // error: no record for this concat layer in concat_connection
+                }
+                return;
+            }
+        }
+    }
+
+    intel_dnn_component_t * unused_input = nullptr;
+    if (compact_mode) {
+        unused_input = find_first_unused_input(layer);
+        if (unused_input != nullptr) {
+            gnamem->bind_ptr(ptr, &unused_input->ptr_inputs, 0, ALIGN64(num_data_bytes_out));
+        }
+    }
+    // no suitable input found to reuse
+    if (unused_input == nullptr) {
+        gnamem->reserve_ptr(ptr, ALIGN64(num_data_bytes_out));
+    }
+}
+
+intel_dnn_component_t * GNAPlugin::findDnnLayer(CNNLayerPtr __layer) {
+    auto component = std::find_if(begin(dnnComponentsForLayer),
+                        end(dnnComponentsForLayer),
+                        [&](DnnComponentsForLayer::value_type &comp) {
+                            return comp.first == __layer->name;
+                        });
+    // check for generic prev layer
+    if (component != dnnComponentsForLayer.end()) {
+        return &component->second;
+    }
+
+    return nullptr;
+}
+
+GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *ptr, size_t num_data_bytes_in, size_t offset, int idx) {
+    // selecting particular input layers
+    auto prevLayer = CNNNetPrevLayer(layer, idx);
+
+    gnalog() << "Connecting input " << layer->name << " to " << prevLayer->name << " ...\n";
+
+    // a real input, not a memory input
+    if (LayerInfo(prevLayer).isInput()) {
+        if (0 == bytes_alllocated_for_input) {
+            gnamem->push_value(&ptr_inputs_global.front(), static_cast<uint8_t>(0), num_data_bytes_in, 64);
+            bytes_alllocated_for_input = num_data_bytes_in;
+        }
+        if (ALIGN(num_data_bytes_in, 64) > ALIGN(bytes_alllocated_for_input, 64)) {
+            THROW_IE_EXCEPTION << "Layer: " << layer->name << " Cannot bind pointer to already allocated input, due to size_allocated="
+                                  << bytes_alllocated_for_input << ", and size_requested=" << num_data_bytes_in;
+        }
+        gnamem->bind_ptr(ptr, &ptr_inputs_global.front(), offset);
+        return prevLayer;
+    }
+
+    LayerInfo layerInfoObj(prevLayer);
+    LayerInfo thisLayerInfoObj(layer);
+    // connecting to split/slice splitting layers
+    if (layerInfoObj.isSplit() || layerInfoObj.isSlice()) {
+        auto& splittingLayer = prevLayer;
+        auto& splitName = splittingLayer->name;
+        auto& name = layer->name;
+
+        // look this split layer up in the extra split map
+        auto splitLayerInfo = split_connection.find(splitName);
+
+        if (splitLayerInfo != split_connection.end()) {
+            auto &splitLayerInfoItem = splitLayerInfo->second;
+            // find this layer among the split's connected output layers
+            auto it = std::find_if(splitLayerInfoItem.splitOutputLayers.begin(),
+                                    splitLayerInfoItem.splitOutputLayers.end(),
+                                            [&name](GNAPlugin::GNASplitLayer::SplitConnectedLayerInfo &item) {
+                                                return item.name == name;
+                                            });
+
+            if (it != splitLayerInfoItem.splitOutputLayers.end()) {
+                gnalog()  << "Connecting split/slice input \n";
+                auto res = connectInput(splittingLayer, ptr,
+                                            splitLayerInfoItem.reserved_size, it->offset, 0);
+                gnalog()  << "Connected \n";
+                return res;
+            }
+        }
+        THROW_GNA_EXCEPTION << "Split/Slice layer: " << splitName
+                                 << " is not included in extra map. Something wrong happened";
+    } else if (layerInfoObj.isConcat()) {
+        auto concatLayerInfo = concat_connection.find(
+                                                    prevLayer->name);
+        if (concatLayerInfo != concat_connection.end()) {
+            auto & concatLayerInfoItem = concatLayerInfo->second;
+            // bind to the concat layer's reserved memory
+            gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, offset);
+            // return the layer preceding the concat
+            return CNNNetPrevLayer(prevLayer);
+        }
+    } else if (layerInfoObj.isCrop()) {
+        auto cropLayerInfo = crop_connection.find(
+                                                    prevLayer->name);
+        if (cropLayerInfo != crop_connection.end()) {
+            auto & cropLayerInfoItem = cropLayerInfo->second;
+            gnamem->bind_ptr(ptr, &cropLayerInfoItem.gna_ptr, offset);
+            return CNNNetPrevLayer(prevLayer);
+        }
+    }
+    auto prevDnnLayer = findDnnLayer(prevLayer);
+
+    // check for generic prev layer
+    if (prevDnnLayer != nullptr) {
+        gnamem->bind_ptr(ptr, &prevDnnLayer->ptr_outputs, offset);
+        return prevLayer;
+    }
+
+    auto prevMemoryLayer =
+        std::find_if(begin(memory_connection), end(memory_connection), [&](MemoryConnection::value_type &comp) {
+            return comp.second.getInput()->name == prevLayer->name;
+        });
+    if (prevMemoryLayer != memory_connection.end()) {
+        // the previous layer feeds a memory output layer
+        auto& memoryLayer = prevMemoryLayer->second;
+        if (memoryLayer.reserved_size == 0) {
+            gnamem->reserve_ptr(&memoryLayer.gna_ptr, ALIGN64(num_data_bytes_in));
+            gnamem->bind_ptr(ptr, &memoryLayer.gna_ptr, offset);
+
+            memoryLayer.reserved_offset = offset;
+            memoryLayer.reserved_size = ALIGN64(num_data_bytes_in);
+        } else {
+            IE_ASSERT(memoryLayer.reserved_size == ALIGN64(num_data_bytes_in));
+            // same offsets
+            gnamem->bind_ptr(ptr, &memoryLayer.gna_ptr, memoryLayer.reserved_offset);
+        }
+
+        return prevLayer;
+    }
+
+    // several layer types are simply skipped for now
+    if (LayerInfo(prevLayer).isReshape()) {
+        gnalog()  << "Skipping reshape layer: " << prevLayer->name << "\n";
+        return connectInput(prevLayer, ptr, num_data_bytes_in, offset, 0);
+    }
+
+    if (LayerInfo(prevLayer).isPermute()) {
+        gnalog()  << "Skipping permute layer: " << prevLayer->name << "\n";
+        return {connectInput(prevLayer, ptr, num_data_bytes_in, offset, 0).input, true, prevLayer};
+    }
+
+
+    THROW_GNA_EXCEPTION << "Cannot connect input for: " << layer->name;
+}
+
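The two halves above form the GNA ahead-of-time (AOT) flow: ImportNetwork rebuilds the nnet, the I/O maps, the scale factors and the memory states from the serialized header, while Export writes out the whole RW memory segment plus the input/output descriptors. A minimal usage sketch, assuming a config map and an ICNNNetwork built elsewhere (all names below are illustrative, not taken from the sources):

    // export after a regular load
    GNAPluginNS::GNAPlugin plugin(configMap);   // configMap: hypothetical key/value config
    plugin.LoadNetwork(network);                // network: InferenceEngine::ICNNNetwork
    plugin.Export("model.gna");                 // serializes header + RW memory segment

    // re-import into a fresh plugin instance (the AOT path)
    GNAPluginNS::GNAPlugin aotPlugin;           // default ctor is the AOT entry point
    aotPlugin.ImportNetwork("model.gna");       // restores nnet, I/O maps, scale factors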
diff --git a/inference-engine/src/gna_plugin/gna_plugin.hpp b/inference-engine/src/gna_plugin/gna_plugin.hpp
new file mode 100644 (file)
index 0000000..53365d7
--- /dev/null
@@ -0,0 +1,488 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "cpp_interfaces/base/ie_plugin_base.hpp"
+#include "dnn.h"
+#include "gna_memory.hpp"
+#include "gna_device.hpp"
+#include <map>
+#include <list>
+#include <string>
+#include <utility>
+#include <memory>
+#include <vector>
+#include <tuple>
+#include <gna-api-status.h>
+#include <gna-api.h>
+#include <cpp_interfaces/interface/ie_iplugin_internal.hpp>
+#include <cpp_interfaces/impl/ie_plugin_internal.hpp>
+#include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp>
+#include <graph_tools.hpp>
+#include "gna_allocator.hpp"
+#include "gna_api_wrapper.hpp"
+
+namespace GNAPluginNS {
+
+void ConvertToInt16(int16_t *ptr_dst,
+                    const float *ptr_src,
+                    const uint32_t num_rows,
+                    const uint32_t num_columns,
+                    const float scale_factor);
+void ConvertToFloat(float *ptr_dst,
+                    int32_t *ptr_src,
+                    const uint32_t num_rows,
+                    const uint32_t num_columns,
+                    const float scale_factor);
+
+int16_t ConvertFloatToInt16(float src);
+
+class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std::enable_shared_from_this<GNAPlugin> {
+ protected:
+    AmIntelDnn dnn;
+    using dnn_ptr = std::shared_ptr<CPPWrapper<intel_nnet_type_t>>;
+
+    /**
+     * @brief copy of the nnet structure and an indicator that the related infer request is not yet synced
+     */
+    std::vector<std::tuple<dnn_ptr, int32_t, InferenceEngine::BlobMap>> nnets;
+
+    intel_dnn_orientation_t orientation_in = kDnnUnknownOrientation;
+    intel_dnn_orientation_t orientation_out = kDnnUnknownOrientation;
+    double input_scale_factor = 1.0;
+    double output_scale_factor = 1.0;
+    uint32_t num_rotate_rows = 0;
+    uint32_t num_rotate_columns = 0;
+
+
+    uint32_t num_feature_maps = 1;
+    uint32_t num_memory_bytes;
+
+    std::vector<void *> ptr_inputs_global;
+    std::vector<void *> ptr_outputs_global;
+
+    int16_t *ptr_int_inputs = NULL;
+    int32_t *ptr_int_outputs = NULL;
+    uint32_t *ptr_active_indices = NULL;
+    uint32_t num_active_indices = 0;
+    uint32_t num_group_in = 0;
+    uint32_t num_bytes_weight;
+    uint32_t num_bytes_per_output = 0;
+
+    bool use_dynamic_quantization = false;
+    bool compact_mode = true;
+    bool exclusive_async_requests = false;
+    bool uniformPwlDesign = false;
+    uint8_t gna_lib_async_threads_num = 1;
+    bool gna_openmp_multithreading = false;
+    // precision of GNA hardware model
+    InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16;
+
+    bool performance_counting = false;
+    int  bytes_alllocated_for_input = 0;
+    intel_dnn_number_type_t output_type = kDnnInt;
+    std::string utterance_name;
+
+    // internal types
+    enum LayerType {
+        Input,
+        Convolution,
+        ReLU,
+        LeakyReLU,
+        Sigmoid,
+        TanH,
+        Activation,
+        Pooling,
+        FullyConnected,
+        InnerProduct,
+        Reshape,
+        Split,
+        Slice,
+        Eltwise,
+        ScaleShift,
+        Clamp,
+        Concat,
+        Copy,
+        Permute,
+        Memory,
+        Power,
+        Crop,
+        NO_TYPE
+    };
+
+ public:
+    explicit GNAPlugin(const std::map<std::string, std::string>& configMap);
+    /**
+     * @brief construct from an AOT file rather than from a CNN network
+     */
+    GNAPlugin() = default;
+
+    void LoadNetwork(InferenceEngine::ICNNNetwork &network) override;
+    using InferenceEngine::IInferencePluginInternal::Infer;
+
+    void Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) override;
+    void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) override;
+    void AddExtension(InferenceEngine::IExtensionPtr extension) override;
+    void SetConfig(const std::map<std::string, std::string> &config) override;
+    void LoadNetwork(InferenceEngine::IExecutableNetwork::Ptr &executableNetwork,
+                     InferenceEngine::ICNNNetwork &network,
+                     const std::map<std::string, std::string> &config) override { THROW_GNA_EXCEPTION << "Not implemented"; }
+    void Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &result) override;
+    void SetLogCallback(InferenceEngine::IErrorListener &listener) override {}
+    void Reset();
+    /**
+     * @deprecated Use the version with config parameter
+     */
+    void QueryNetwork(const InferenceEngine::ICNNNetwork &network,
+                      InferenceEngine::QueryNetworkResult &res) const override { }
+    void QueryNetwork(const InferenceEngine::ICNNNetwork &network,
+                      const std::map<std::string, std::string>& config,
+                      InferenceEngine::QueryNetworkResult &res) const override { }
+    uint32_t QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result);
+    void Wait(uint32_t idx = 0);
+
+    uint32_t QueueInference(const InferenceEngine::Blob &input, InferenceEngine::BlobMap &result);
+    /**
+     * @param sync - gna sync point to wait on
+     * @param result - blob where the inference results are stored
+     */
+    void Wait(uint32_t sync, InferenceEngine::Blob &result);
+
+    void Export(const std::string &fileName);
+    InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName
+        , const std::map<std::string, std::string> &config) override { THROW_GNA_EXCEPTION << "Not implemented"; }
+    InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName);
+
+
+    bool IsExclusiveAsyncRequests() { return exclusive_async_requests; }
+
+    /**
+     * utility to provide input and output blobs externally to be used by InferenceEngine request API clients
+     */
+    InferenceEngine::Blob::Ptr GetInputBlob(InferenceEngine::Precision precision);
+    InferenceEngine::Blob::Ptr GetOutputBlob(InferenceEngine::Precision precision);
+    /**
+     * helpers to provide input/output info for an AOT network
+     */
+    InferenceEngine::InputsDataMap GetInputs() {return inputsDataMap;}
+    InferenceEngine::OutputsDataMap GetOutputs() {return outputsDataMap;}
+    /**
+     * QueryState API
+     * @return the memory states of the loaded network
+     */
+     std::vector<InferenceEngine::IMemoryStateInternal::Ptr>  QueryState();
+
+ protected:
+    uint32_t num_cnn_rows_out = 0;
+    bool done = false;
+    std::string dumpXNNPath;
+    intel_gna_proc_t gna_proc_type = static_cast<intel_gna_proc_t>(GNA_SOFTWARE & GNA_HARDWARE);
+
+    void DumpXNNToFile() const;
+    void CreateLayerPrimitive(InferenceEngine::CNNLayerPtr);
+    void AffinePrimitive(InferenceEngine::CNNLayerPtr, bool isDiag = false);
+    void DiagonalPrimitive(InferenceEngine::CNNLayerPtr);
+    void ConvolutionPrimitive(InferenceEngine::CNNLayerPtr);
+    void PermutePrimitive(InferenceEngine::CNNLayerPtr);
+    void PoolingPrimitive(InferenceEngine::CNNLayerPtr);
+    void PowerPrimitive(InferenceEngine::CNNLayerPtr);
+    void ConcatPrimitive(InferenceEngine::CNNLayerPtr);
+    void CropPrimitive(InferenceEngine::CNNLayerPtr);
+    void EltwisePrimitive(InferenceEngine::CNNLayerPtr);
+    void SplitPrimitive(InferenceEngine::CNNLayerPtr);
+    void SlicePrimitive(InferenceEngine::CNNLayerPtr);
+    void PWLPrimitive(InferenceEngine::CNNLayerPtr);
+    void CopyPrimitive(InferenceEngine::CNNLayerPtr);
+    bool AreLayersSupported(InferenceEngine::ICNNNetwork& network, std::string& errMessage);
+    LayerType LayerTypeFromStr(std::string const &str);
+    /**
+     * maps the type of connection to input and output layers, and also stores the gna pointer for the allocation request
+     */
+    class GNAMemoryLayer {
+        InferenceEngine::CNNLayerPtr inputLayer;
+        InferenceEngine::CNNLayerPtr outputLayer;
+     public:
+        GNAMemoryLayer(InferenceEngine::CNNLayerPtr inLayer, InferenceEngine::CNNLayerPtr outLayer) :
+            inputLayer(inLayer), outputLayer(outLayer) {
+        }
+
+        InferenceEngine::CNNLayerPtr getInput() { return inputLayer; }
+        InferenceEngine::CNNLayerPtr getOutput() { return outputLayer; }
+
+        /**
+         * pointer to gna memory request
+         */
+        void *gna_ptr = nullptr;
+        /**
+         * gna memory of this size is reserved
+         */
+        size_t  reserved_size = 0;
+        /**
+         * offset of the reserved gna memory from gna_ptr
+         */
+        size_t  reserved_offset = 0;
+    };
+
+    class GNAConcatLayer {
+        InferenceEngine::CNNLayerPtr concatLayer;
+
+     public:
+        explicit GNAConcatLayer(InferenceEngine::CNNLayerPtr layer) :
+                                        concatLayer(layer)
+                                        {}
+
+        InferenceEngine::CNNLayerPtr getConcat() { return concatLayer; }
+        /**
+         * pointer to gna memory request
+         */
+        void *gna_ptr = nullptr;
+        /**
+         * gna memory of this size is reserved for concat
+         */
+        size_t reserved_size = 0;
+        bool output_allocation_flag = false;
+        /**
+         * per-input connection info: layer name and its offset from gna_ptr
+         */
+        struct ConcatConnectedLayerInfo {
+            ConcatConnectedLayerInfo(const std::string& n,
+                                    size_t o) :
+                                     name(n),
+                                     offset(o) {}
+            std::string name = "";
+            size_t offset = 0;
+        };
+
+        std::vector<ConcatConnectedLayerInfo> concatInputLayers;
+    };
+
+    // Split, Slice
+    class GNASplitLayer {
+        InferenceEngine::CNNLayerPtr splitLayer;
+
+     public:
+        explicit GNASplitLayer(InferenceEngine::CNNLayerPtr layer) :
+                                        splitLayer(layer),
+                                        splitInputLayer()
+                                        {}
+
+        InferenceEngine::CNNLayerPtr getSplit() { return splitLayer; }
+        /**
+         * gna memory of this size is reserved for the split
+         */
+        size_t reserved_size = 0;
+        bool output_allocation_flag = false;
+        /**
+         * per-output connection info: layer name, offset from gna_ptr, and pure size
+         */
+        struct SplitConnectedLayerInfo {
+            SplitConnectedLayerInfo() {}
+            SplitConnectedLayerInfo(std::string& n,
+                                    size_t o,
+                                    size_t p) :
+                                     name(n),
+                                     offset(o),
+                                     pure_size(p) {}
+
+            SplitConnectedLayerInfo& operator=
+                    (SplitConnectedLayerInfo const& layerInfo) {
+                this->name      = layerInfo.name;
+                this->offset    = layerInfo.offset;
+                this->pure_size = layerInfo.pure_size;
+                return *this;
+            }
+            std::string name = "";
+            size_t offset    = 0;
+            size_t pure_size = 0;
+        };
+        SplitConnectedLayerInfo splitInputLayer;
+        std::vector<SplitConnectedLayerInfo> splitOutputLayers;
+    };
+
+    class GNACropLayer {
+        InferenceEngine::CNNLayerPtr cropLayer;
+
+    public:
+        explicit GNACropLayer(InferenceEngine::CNNLayerPtr layer) :
+        cropLayer(layer)
+        {}
+
+        InferenceEngine::CNNLayerPtr getCrop() { return cropLayer; }
+        /**
+         * pointer to the beginning of the gna cropped memory
+         */
+        void *gna_ptr = nullptr;
+    };
+    using MemoryConnection = std::list<std::pair<std::string, GNAMemoryLayer>>;
+    using ConcatConnection = std::map<std::string, GNAConcatLayer>;
+    using SplitConnection  = std::map<std::string, GNASplitLayer>;
+    using CropConnection  = std::map<std::string, GNACropLayer>;
+    // layers with extra storage for connections and additional
+    // non-trivial processing
+    MemoryConnection memory_connection;
+    ConcatConnection concat_connection;
+    SplitConnection  split_connection;
+    CropConnection   crop_connection;
+    void fillMemoryConnections(std::map<std::string,
+                                 std::vector<InferenceEngine::CNNLayerPtr>> &memoryPairs);
+
+    void fillConcatConnections(InferenceEngine::CNNLayerPtr layer);
+    void fillSplitConnections(InferenceEngine::CNNLayerPtr layer);
+    /**
+     * maps a layer name to its dnn component; in topological order, previous nodes will already be initialized
+     */
+    using DnnComponentsForLayer = std::list<std::pair<std::string, intel_dnn_component_t>>;
+    std::list<std::pair<std::string, intel_dnn_component_t>> dnnComponentsForLayer;
+
+    /**
+     * @brief returns corresponding dnn layer for topology layer
+     * @param __layer - topology layer to look up
+     * @return pointer to the matching dnn component, or nullptr if none was found
+     */
+    intel_dnn_component_t * findDnnLayer(InferenceEngine::CNNLayerPtr __layer);
+
+    using allocator_type = PolymorphAllocator<uint8_t>;
+    using gna_memory_type = GNAMemory<allocator_type>;
+
+    std::unique_ptr<GNADeviceHelper> gnadevice;
+    /**
+     * @brief size of RW segment without extra memory for parallel execution
+     */
+    uint32_t rwSegmentSize = 0;
+    std::unique_ptr<gna_memory_type> gnamem;
+
+    /**
+     * Connects either a memory output or a generic output to a layer
+     * @param layer - layer pointer
+     * @param ptr_outputs - pointer to the pointer where the output layer information is stored
+     * @param ptr_inputs - pointer to the layer's input data
+     * @param sz - size of the output blob in bytes
+     */
+    void connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr_outputs, void *ptr_inputs, size_t sz);
+    /**
+     * Connects a certain input to this layer
+     * @param layer - layer that we connect the input to
+     * @param pVoid - pointer that holds the current layer pointer in the gna_mem request
+     * @param num_data_bytes_in - size of the input data in bytes
+     * @param offset - num bytes to advance in buffer
+     * @param idx - index of input port that we are connecting
+     * @return layer used as input
+     */
+    struct ConnectionDetails {
+        InferenceEngine::CNNLayerPtr  input;
+        bool needTransposeWeights = false;
+        InferenceEngine::CNNLayerPtr permute;
+        ConnectionDetails(InferenceEngine::CNNLayerPtr input,
+                          bool bTranspose = false,
+                          InferenceEngine::CNNLayerPtr permute = nullptr)
+            : input(input)
+            , needTransposeWeights(bTranspose)
+            , permute(permute) {
+        }
+    };
+    ConnectionDetails connectInput(InferenceEngine::CNNLayerPtr layer,
+                      void *pVoid,
+                      size_t num_data_bytes_in,
+                      size_t offset = 0,
+                      int idx = 0);
+
+    void ImportFrames(void *ptr_dst,
+                     const void *ptr_src,
+                     InferenceEngine::Precision input_precision,
+                     intel_dnn_orientation_t orientation,
+                     uint32_t num_frames,
+                     uint32_t num_group,
+                     uint32_t num_vector_elements,
+                     uint32_t num_vector_stride);
+
+    void ExportScores(void *ptr_dst,
+                     void *ptr_src,
+                     intel_dnn_orientation_t orientation,
+                     uint32_t num_frames,
+                     uint32_t num_group,
+                     uint32_t num_vector_elements,
+                     uint32_t num_active_elements,
+                     uint32_t num_vector_stride,
+                     uint32_t num_bytes_per_element_input,
+                     uint32_t num_bytes_per_element);
+
+    friend void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst,
+                    const float *ptr_src,
+                    const uint32_t num_rows,
+                    const uint32_t num_columns,
+                    const float scale_factor);
+    friend void GNAPluginNS::ConvertToFloat(float *ptr_dst,
+                    int32_t *ptr_src,
+                    const uint32_t num_rows,
+                    const uint32_t num_columns,
+                    const float scale_factor);
+
+    friend int16_t GNAPluginNS::ConvertFloatToInt16(float src);
+
+    template <typename T, typename U>
+    void copyInputData(T *dst,
+                    const U *src,
+                    uint32_t num_frames,
+                    uint32_t num_group,
+                    uint32_t num_vector_elements,
+                    uint32_t num_vector_stride,
+                    intel_dnn_orientation_t orientation);
+
+    template <typename T, typename U>
+    void copyInputDataWithSplit(T *const dst,
+                    const U *src,
+                    const GNASplitLayer& splitInfo,
+                    size_t precision_size);
+    /**
+     * @brief GNA affine layers always have an activation attached, while IR ones do not
+     * @param layers - layers of the copied net, ready for quantization
+     */
+    void insertIdentityLayer(std::vector<InferenceEngine::CNNLayerPtr> &layers);
+
+    /**
+     * @brief GNA convolution layers have deinterleaved orientations, while affine ones don't,
+     * so permute layers need to be inserted between convolution and affine layers,
+     * or removed if they are already present in the topology
+     * @param layers
+     */
+    void applyOrientations(std::vector<InferenceEngine::CNNLayerPtr> &layers);
+
+
+    /**
+     * @brief searches for a specific pattern in the graph (6 layers are replaced by a single one)
+     * @param layers
+     */
+    void substitutePRelu(std::vector<InferenceEngine::CNNLayerPtr> &layers);
+
+    std::vector<InferenceEngine::CNNLayerPtr> getCandidatesForIdentityInsertion(const InferenceEngine::CNNLayerPtr layer);
+
+    /**
+     * diagonal layer insertion is required where an activation is followed by a split layer,
+     * or by any other topology-changing layer
+     */
+    void insertDiagonalLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);
+
+    /**
+     * @brief MaxPool can be reordered with an activation; on GNA the strategy is conv->maxpool->activation,
+     * which means maxpool receives 4 bytes and produces 4 bytes
+     */
+    void reorderMaxPool(std::vector<InferenceEngine::CNNLayerPtr> & layers);
+
+    /**
+     * copy layer insertion is required where the input layer does not have output memory
+     */
+    void insertCopyLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);
+
+    intel_dnn_component_t * find_first_unused_input(InferenceEngine::CNNLayerPtr current);
+
+    InferenceEngine::SizeVector inputDims;
+    InferenceEngine::InputsDataMap inputsDataMap;
+
+    InferenceEngine::SizeVector outputDims;
+    InferenceEngine::OutputsDataMap outputsDataMap;
+};
+}  // namespace GNAPluginNS
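The ConvertToInt16/ConvertToFloat helpers declared at the top of this header move float data in and out of the I16/I32 domain the GNA hardware works in. A round-trip sketch, assuming the scale factor multiplies on the way in and divides on the way out (the buffer sizes and the 2048.0f factor are illustrative):

    #include <vector>

    std::vector<float>   src(8, 0.5f);
    std::vector<int16_t> quantized(8);
    GNAPluginNS::ConvertToInt16(quantized.data(), src.data(),
                                /*num_rows*/ 1, /*num_columns*/ 8,
                                /*scale_factor*/ 2048.0f);      // 0.5f -> 1024 (assumed)

    std::vector<int32_t> accum(8, 1024);                        // e.g. 32-bit GNA outputs
    std::vector<float>   restored(8);
    GNAPluginNS::ConvertToFloat(restored.data(), accum.data(),
                                1, 8, 2048.0f);                 // 1024 -> 0.5f (assumed)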
diff --git a/inference-engine/src/gna_plugin/gna_plugin_config.hpp b/inference-engine/src/gna_plugin/gna_plugin_config.hpp
new file mode 100644 (file)
index 0000000..f82e443
--- /dev/null
@@ -0,0 +1,67 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <vector>
+#include <memory>
+#include <utility>
+#include <ie_icnn_network.hpp>
+#include "ie_common.h"
+#include "gna_plugin_log.hpp"
+
+namespace GNAPluginNS {
+
+using CNNNetworkPtr = std::shared_ptr<InferenceEngine::ICNNNetwork>;
+
+struct Endpoint {
+    InferenceEngine::TargetDevice device;
+    InferenceEngine::Precision networkPrec;
+    std::function<CNNNetworkPtr(InferenceEngine::ICNNNetwork &network)> convert;
+
+    Endpoint(InferenceEngine::TargetDevice device,
+             InferenceEngine::Precision networkPrec,
+             std::function<CNNNetworkPtr(InferenceEngine::ICNNNetwork &network)> converter = [](InferenceEngine::ICNNNetwork &network) {
+                 return CNNNetworkPtr(&network, [](InferenceEngine::ICNNNetwork *nodelete) {});
+             }) : device(device), networkPrec(networkPrec), convert(converter) {
+    }
+};
+
+class Config {
+ public:
+    using Desc = std::vector<Endpoint>;
+    Desc supported;
+    InferenceEngine::TargetDevice _defaultDevice = InferenceEngine::TargetDevice::eDefault;
+
+ public:
+    explicit Config(std::vector<Endpoint> &&config)
+        : supported(std::move(config)) {
+    }
+
+    /**
+     * @brief the default device value is plugin dependent, so it should also be set to allow fallback
+     */
+    void setDefaultDevice(InferenceEngine::TargetDevice d) {
+        _defaultDevice = d;
+    }
+
+    inline Endpoint find_configuration(InferenceEngine::ICNNNetwork &network) {
+        auto device = network.getTargetDevice();
+        auto targetDevice = device == InferenceEngine::TargetDevice::eDefault ? _defaultDevice : device;
+
+        auto res = std::find_if(std::begin(supported), std::end(supported), [&](Endpoint &e) {
+            return e.networkPrec == network.getPrecision() && (
+                e.device == device ||
+                    e.device == targetDevice);
+        });
+
+        if (res == std::end(supported)) {
+            THROW_GNA_EXCEPTION << "\"The plugin doesn't support target device: "
+                               << InferenceEngine::TargetDeviceInfo::name(network.getTargetDevice())
+                               << ".\nSupported target device: " << InferenceEngine::TargetDeviceInfo::name(InferenceEngine::TargetDevice::eGNA);
+        }
+
+        return *res;
+    }
+};
+}  // namespace GNAPluginNS
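find_configuration resolves the effective device (falling back to _defaultDevice when the network asks for eDefault) and returns the first Endpoint whose precision and device match, throwing otherwise. A sketch of how a plugin could populate this table; the endpoint list below is illustrative, not taken from the plugin sources:

    using namespace InferenceEngine;

    GNAPluginNS::Config cfg({
        // device, network precision; the default converter returns the network unchanged
        GNAPluginNS::Endpoint(TargetDevice::eGNA, Precision::FP32),
        GNAPluginNS::Endpoint(TargetDevice::eGNA, Precision::I16),
    });
    cfg.setDefaultDevice(TargetDevice::eGNA);   // lets eDefault networks fall back to GNA
    // auto endpoint = cfg.find_configuration(network);  // throws if nothing matches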
diff --git a/inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp b/inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp
new file mode 100644 (file)
index 0000000..d231274
--- /dev/null
@@ -0,0 +1,22 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <memory>
+#include <ie_plugin.hpp>
+#include <cpp_interfaces/base/ie_plugin_base.hpp>
+#include "gna_plugin_internal.hpp"
+
+using namespace InferenceEngine;
+using namespace std;
+using namespace GNAPluginNS;
+
+INFERENCE_PLUGIN_API(StatusCode) CreatePluginEngine(IInferencePlugin *&plugin, ResponseDesc *resp) noexcept {
+    try {
+        plugin = make_ie_compatible_plugin({1, 5, "GNAPlugin", "GNAPlugin"}, make_shared<GNAPluginInternal>());
+        return OK;
+    }
+    catch (std::exception &ex) {
+        return DescriptionBuffer(GENERAL_ERROR, resp) << ex.what();
+    }
+}
diff --git a/inference-engine/src/gna_plugin/gna_plugin_internal.hpp b/inference-engine/src/gna_plugin/gna_plugin_internal.hpp
new file mode 100644 (file)
index 0000000..3c2dcf0
--- /dev/null
@@ -0,0 +1,29 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <map>
+#include <cpp_interfaces/impl/ie_plugin_internal.hpp>
+#include <cpp_interfaces/impl/ie_executable_network_internal.hpp>
+#include "gna_executable_network.hpp"
+
+namespace GNAPluginNS {
+
+class GNAPluginInternal  : public InferenceEngine::InferencePluginInternal {
+ public:
+    InferenceEngine::ExecutableNetworkInternal::Ptr LoadExeNetworkImpl(InferenceEngine::ICNNNetwork &network,
+                                                                       const std::map<std::string, std::string> &config) override {
+        return std::make_shared<GNAExecutableNetwork>(network, config);
+    }
+    void SetConfig(const std::map<std::string, std::string> &config) override {}
+    InferenceEngine::IExecutableNetwork::Ptr  ImportNetwork(const std::string &modelFileName,
+                                                            const std::map<std::string, std::string> &config) override {
+        return make_executable_network(std::make_shared<GNAExecutableNetwork>(modelFileName, config));
+    }
+};
+
+}  // namespace GNAPluginNS
diff --git a/inference-engine/src/gna_plugin/gna_plugin_log.hpp b/inference-engine/src/gna_plugin/gna_plugin_log.hpp
new file mode 100644 (file)
index 0000000..08f45ad
--- /dev/null
@@ -0,0 +1,54 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <details/ie_exception.hpp>
+
+// #define GNA_DEBUG
+#ifdef GNA_DEBUG
+/**
+ * @brief used for creating graphviz charts, and layers dump
+ */
+# define PLOT
+# define gnalog() std::cout
+# define gnawarn() std::cerr
+#else
+
+class GnaLog {
+ public:
+    template <class T>
+    GnaLog & operator << (const T &obj) {
+        return *this;
+    }
+
+    GnaLog &  operator<< (std::ostream & (*manip)(std::ostream &)) {
+        return *this;
+    }
+};
+
+inline GnaLog & gnalog() {
+    static GnaLog l;
+    return l;
+}
+inline GnaLog & gnawarn() {
+    return gnalog();
+}
+
+#endif
+
+/**
+ * @brief gna_plugin exception unification
+ * (kept outside the GNA_DEBUG branch so THROW_GNA_EXCEPTION below compiles in both modes)
+ */
+#ifdef __PRETTY_FUNCTION__
+#undef __PRETTY_FUNCTION__
+#endif
+#if defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
+# define __PRETTY_FUNCTION__ __FUNCSIG__
+#else
+# define __PRETTY_FUNCTION__ __FUNCTION__
+#endif
+
+
+#define THROW_GNA_EXCEPTION THROW_IE_EXCEPTION << "[GNAPlugin] in function " << __PRETTY_FUNCTION__<< ": "
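With GNA_DEBUG undefined, gnalog() returns a no-op GnaLog sink whose templated operator<< discards its argument, so trace statements cost only an empty inline call; with GNA_DEBUG defined they stream straight to std::cout. A short usage sketch ('loaded' and the layer name are hypothetical):

    gnalog() << "Connecting output " << std::string("fc1") << " ...\n";  // discarded unless GNA_DEBUG is set
    if (!loaded) {
        THROW_GNA_EXCEPTION << "network not loaded";  // message is prefixed with the enclosing function name
    }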
diff --git a/inference-engine/src/gna_plugin/gna_plugin_passes.cpp b/inference-engine/src/gna_plugin/gna_plugin_passes.cpp
new file mode 100644 (file)
index 0000000..79d42d2
--- /dev/null
@@ -0,0 +1,338 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include <string>
+#include <memory>
+#include <utility>
+
+#include <quantization/quantized_layer_params.hpp>
+#include "gna_plugin.hpp"
+#include "gna_layer_info.hpp"
+
+
+using namespace InferenceEngine;
+using namespace std;
+using namespace GNAPluginNS;
+
+void GNAPlugin::insertDiagonalLayer(std::vector<CNNLayerPtr> & layers) {
+    int numOfDiagLayers = 0;
+    for (auto & l : layers) {
+        if (l->insData.empty()) continue;
+        auto prevLayer = CNNNetPrevLayer(l);
+        if (LayerInfo(l).isActivation()) {
+            if (LayerInfo(prevLayer).has32BOutput())
+                continue;
+        } else {
+            auto eltwise = dynamic_cast<InferenceEngine::EltwiseLayer *>(l.get());
+            if (!eltwise) {
+                continue;
+            }
+            // in case of eltwise sum, one input should be 4 bytes and the other 2 bytes
+            // in case of eltwise mul, both inputs should be 2 bytes
+            // for sum with 4-4 inputs we handle that by inserting an identity activation
+            // for sum with 4-2 inputs - OK
+            // for sum with 2-2 inputs we need to insert a diagonal -- handled here
+            // for mul with 2-2 inputs - OK
+            // for mul with 2-4 inputs we need to insert an identity to put the 4-byte input into weights
+            // for mul with 4-4 inputs we need to insert 2 identities to put both 4-byte inputs into weights
+
+            if (eltwise->_operation != EltwiseLayer::Sum)
+                continue;
+
+            auto prevLayer1 = CNNNetPrevLayer(l, 1);
+            if (!LayerInfo(prevLayer).has16BOutput() || !LayerInfo(prevLayer1).has16BOutput())
+                continue;
+        }
+
+#ifdef PLOT
+        std::cout << "Inserted Diagonal Layer between: " << prevLayer->name << " and " << l->name << "\n" << std::flush;
+#endif
+        // actual insertion
+        auto diagName = std::string("SyntheticScaleShift_") + std::to_string(numOfDiagLayers++);
+        auto diagLayer = make_shared<ScaleShiftLayer>(LayerParams({diagName, "ScaleShift", Precision::FP32}));
+
+        // TODO: diagonal size
+        std::vector<float> arrayOf1(l->outData[0]->dims[0], 1.f);
+        diagLayer->_weights = make_shared_blob<float>(l->outData[0]->precision, Layout::C, arrayOf1);
+        auto newDims = l->outData[0]->dims;
+        auto dataPtr = std::make_shared<Data>(diagName,
+                                              newDims,
+                                              l->outData[0]->precision,
+                                              l->outData[0]->layout);
+
+        auto diagonalWithQuant = InferenceEngine::injectData<QuantizedLayerParams>(diagLayer);
+
+        dataPtr->creatorLayer = diagonalWithQuant;
+        diagonalWithQuant->outData.push_back(dataPtr);
+        CNNNetworkInsertLayer(prevLayer, l, diagonalWithQuant);
+    }
+}
+
+void GNAPlugin::reorderMaxPool(std::vector<InferenceEngine::CNNLayerPtr> & layers) {
+    // detecting the following pattern:
+    // conv -> relu -> maxpooling
+    // and changing it to conv -> maxpooling -> relu
+    for (auto & l : layers) {
+        auto pool = LayerInfo(l);
+        if (!pool.isMaxPooling()) continue;
+
+        // checking prev layer type
+        auto activation = LayerInfo(CNNNetPrevLayer(l));
+        if (!activation.isActivation()) continue;
+
+        // if activation came from convolution
+        auto convolution = LayerInfo(CNNNetPrevLayer(static_cast<InferenceEngine::CNNLayer*>(activation)));
+        if (!convolution.isConvolution()) continue;
+
+        gnalog() << "MaxPooling: " << pool << ", reordered with activation: " << activation << "\n";
+
+        CNNNetSwapLayers(activation, pool);
+    }
+}
+
+std::vector<CNNLayerPtr> GNAPlugin::getCandidatesForIdentityInsertion(const CNNLayerPtr l) {
+    vector<CNNLayerPtr> prevLayers;
+
+    // skipping memory inputs and true inputs layers
+    if (l->insData.empty()) return {};
+
+    auto eltwise = dynamic_cast<InferenceEngine::EltwiseLayer *>(l.get());
+    auto concat = dynamic_cast<InferenceEngine::ConcatLayer *>(l.get());
+
+    // eltwise
+    if (eltwise != nullptr) {
+        // an eltwise layer has 2 inputs, so depending on the situation an identity should or should not be inserted
+
+        // for sum with 4-4 inputs we handle that by inserting an identity activation, case (1)
+        // for sum with 4-2 inputs - OK
+        // for sum with 2-2 inputs we need to insert a diagonal
+
+        // for mul with 2-2 inputs - OK
+        // for mul with 2-4 inputs we need to insert an identity activation to make a 2-byte input
+        // for mul with 4-4 inputs we need to insert 2 identity activations to get 2-byte inputs and weights
+        auto prev0 = CNNNetPrevLayer(l, 0);
+        auto prev1 = CNNNetPrevLayer(l, 1);
+        switch (eltwise->_operation) {
+            case EltwiseLayer::Sum:
+                if (!LayerInfo(prev0).has32BOutput() || !LayerInfo(prev1).has32BOutput()) {
+                    return prevLayers;
+                }
+                // TODO: whether there is a possibility to select which layer to quantize
+                prevLayers.push_back(prev0);
+                break;
+            case EltwiseLayer::Prod:
+                if (LayerInfo(prev0).has16BOutput() && LayerInfo(prev1).has16BOutput()) {
+                    return prevLayers;
+                }
+
+                if (LayerInfo(prev0).has32BOutput()) {
+                    prevLayers.push_back(prev0);
+                }
+
+                if (LayerInfo(prev1).has32BOutput()) {
+                    prevLayers.push_back(prev1);
+                }
+
+                break;
+            default :
+                THROW_GNA_EXCEPTION << "Eltwise Layer of type: " << eltwise->_operation << " not supported";
+        }
+    } else if (concat != nullptr) {
+        for (int i = 0; CNNNetHasPrevLayer(l.get(), i); ++i) {
+            auto prev = CNNNetPrevLayer(l, i);
+            if (LayerInfo(prev).has32BOutput()) {
+                prevLayers.push_back(prev);
+            }
+        }
+    } else {  // not eltwise or concat
+        // other layers have a single input - the situation is easier
+        // e.g. activation or pooling - no need to insert an identity activation.
+        if (LayerInfo(l).has32BInput())
+            return prevLayers;
+
+        auto prevLayer = CNNNetPrevLayer(l);
+        if (!LayerInfo(prevLayer).has32BOutput())
+            return prevLayers;
+
+        prevLayers.push_back(prevLayer);
+    }
+    return prevLayers;
+}
+
+void GNAPlugin::substitutePRelu(std::vector<InferenceEngine::CNNLayerPtr> &layers) {
+    auto getScale = [](CNNLayer* layer) {
+        auto powerCandidate = LayerInfo(layer);
+        if (!powerCandidate.isPower()) return 0.0f;
+        auto power = powerCandidate.as<PowerLayer*>();
+
+        return power->power == 1 && power->offset == 0.0f ? power->scale : 0.0f;
+    };
+
+    auto isScale = [getScale](CNNLayer* layer) {
+        return getScale(layer) != 0.0f;
+    };
+
+    auto isNegate = [getScale](CNNLayer* layer) {
+        return getScale(layer) == -1.0f;
+    };
+
+    auto getNext = [](CNNLayer* layer) {
+        CNNLayer* next = nullptr;
+        if (layer == nullptr) return next;
+        if (layer->outData.size() != 1) return next;
+        return layer->outData[0]->inputTo.begin()->second.get();
+    };
+
+    // TODO: unit tests for bad cases
+    for (auto & l : layers) {
+        // assume l is the starting layer, followed by eltwise_sum(relu, negate/relu/scale/negate)
+        if (l->outData.size() != 1) continue;
+        auto &outputLayers = l->outData[0]->inputTo;
+        if (outputLayers.size() != 2) continue;
+
+        // one of the following layers needs to be a generic relu
+        auto first = LayerInfo(outputLayers.begin()->second);
+        auto second = LayerInfo((++outputLayers.begin())->second);
+
+        auto relu1 = outputLayers.begin()->second;
+        auto neg1 = (++outputLayers.begin())->second;
+        if (second.isRelu()) {
+            swap(first, second);
+            swap(relu1, neg1);
+        }
+        if (!first.isRelu()) continue;
+        // now we have relu as the first layer, let's check the second
+        // negate
+        if (!isNegate(neg1.get())) continue;
+
+        // relu
+        auto relu2 = getNext(second);
+        if (!LayerInfo(relu2).isRelu()) continue;
+
+        // scale
+        auto scale = getNext(relu2);
+        if (!isScale(scale)) continue;
+
+        // negate2
+        auto negate = getNext(scale);
+        if (!isNegate(negate)) continue;
+
+        // sum
+        auto sum = getNext(negate);
+        if (!LayerInfo(sum).isEltwiseSum()) continue;
+        if (sum->insData.size() != 2) continue;
+
+        auto s1 = sum->insData[0].lock()->creatorLayer.lock().get();
+        auto s2 = sum->insData[1].lock()->creatorLayer.lock().get();
+
+        if (s1 != static_cast<InferenceEngine::CNNLayer *>(first) &&
+            s2 != static_cast<InferenceEngine::CNNLayer *>(first)) {
+            continue;
+        }
+
+        // we found a parametric relu group - fold it into a single relu with a negative slope
+        gnalog() << "PRelu with negative slope of " << -LayerInfo(scale).as<PowerLayer*>()->scale << " found" << std::endl;
+
+        // removing all layer references except the relu layer
+        outputLayers.clear();
+        outputLayers[relu1->name] = relu1;
+        // pointing relu to the output of eltwise_sum
+        relu1->outData = sum->outData;
+        // changing creator layer
+        relu1->outData[0]->creatorLayer = relu1;
+        // point consumers of the sum output back to relu, if any
+        if (!relu1->outData[0]->inputTo.empty()) {
+            auto summOutputLayer = relu1->outData[0]->inputTo.begin()->second;
+            summOutputLayer->insData.clear();
+            summOutputLayer->insData.push_back(relu1->outData[0]);
+        }
+
+        // changing negative slope
+        first.as<ReLULayer*>()->negative_slope = LayerInfo(scale).as<PowerLayer*>()->scale;
+    }
+}
+
+void GNAPlugin::applyOrientations(std::vector<CNNLayerPtr> & layers) {
+}
+
+void GNAPlugin::insertIdentityLayer(std::vector<CNNLayerPtr> &layers) {
+    int numOfIdentityLayers = 0;
+    for (auto & l : layers) {
+        for (auto && prev : getCandidatesForIdentityInsertion(l)) {
+            // actual insertion
+            auto activationName = std::string("identity_") + std::to_string(numOfIdentityLayers++);
+
+            gnalog() << "Inserted "<< activationName << " between: " << prev->name << " and " << l->name << "\n" << std::flush;
+
+            CNNLayerPtr activationLayer =
+                make_shared<GenericLayer>(LayerParams({activationName, "identity", Precision::FP32}));
+            auto inputData = l->insData[0].lock();
+            auto newDims = inputData->dims;
+            std::reverse(begin(newDims), end(newDims));
+
+            auto dataPtr = std::make_shared<Data>("FullyConnected",
+                                                  TensorDesc(inputData->precision,
+                                                             newDims,
+                                                             inputData->layout));
+
+            auto activationLayerWithQuant = InferenceEngine::injectData<QuantizedLayerParams>(activationLayer);
+            dataPtr->creatorLayer = activationLayerWithQuant;
+            activationLayerWithQuant->outData.push_back(dataPtr);
+            // whether 1 identity or all outputs; TODO: possible grouping here, need to implement a special grouped inserter
+            bool notAll = false;
+            for (auto && nextData  : prev->outData) {
+                for (auto && nextLayer : nextData->inputTo) {
+                    if (nextLayer.second.get() == l.get())
+                        continue;
+                    if (getCandidatesForIdentityInsertion(nextLayer.second).empty()) {
+                        notAll = true;
+                    }
+                }
+            }
+
+            CNNNetworkInsertLayer(prev, notAll ? l : CNNLayerPtr(nullptr), activationLayerWithQuant);
+        }
+    }
+}
+
+void GNAPlugin::insertCopyLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers) {
+    int numCopyLayers = 0;
+    for (auto & l : layers) {
+        if (l->insData.empty()) continue;
+        auto prevLayer = CNNNetPrevLayer(l);
+        if ((LayerInfo(l).isMemory() && LayerInfo(prevLayer).isConcat()) ||
+            (LayerInfo(l).isConcat() && LayerInfo(prevLayer).isCrop())) {
+            if (LayerInfo(prevLayer).isCrop()) {
+                auto cropLayer = dynamic_cast<InferenceEngine::CropLayer *> (prevLayer.get());
+                size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size();
+                if (ALIGN(cropOffset, 8) != cropOffset) {
+                    // The crop will be replaced by an affine layer.
+                    // Copy layer insertion is not required
+                    continue;
+                }
+            }
+            std::string copyName = std::string("copy_") + std::to_string(numCopyLayers++);
+            gnalog() << "Inserted "<< copyName << " between: " << l->name << " and " << prevLayer->name << "\n" << std::flush;
+
+            CNNLayerPtr copyLayer =
+            make_shared<GenericLayer>(LayerParams({copyName, "Copy", Precision::FP32}));
+
+            auto inputData = l->insData[0].lock();
+            auto newDims = inputData->dims;
+
+            std::reverse(begin(newDims), end(newDims));
+
+            auto dataPtr = std::make_shared<Data>(copyName,
+                                                  TensorDesc(inputData->precision,
+                                                             newDims,
+                                                             inputData->layout));
+
+            auto copyWithQuant = InferenceEngine::injectData<QuantizedLayerParams>(copyLayer);
+            dataPtr->creatorLayer = copyWithQuant;
+            copyWithQuant->outData.push_back(dataPtr);
+            CNNNetworkInsertLayer(prevLayer, l, copyWithQuant);
+        }
+    }
+}
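substitutePRelu above matches a six-layer decomposition of parametric ReLU: one branch keeps relu(x), the other runs negate -> relu -> scale(a) -> negate, and the eltwise sum recombines them. A standalone reference of the semantics being folded (a sketch, not plugin code):

    // prelu(x) = x for x > 0, a * x otherwise;
    // the matched subgraph computes relu(x) - a * relu(-x), which is the same function
    float prelu_reference(float x, float a) {
        auto relu = [](float v) { return v > 0.f ? v : 0.f; };
        return relu(x) - a * relu(-x);
    }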
diff --git a/inference-engine/src/gna_plugin/lstm.cpp b/inference-engine/src/gna_plugin/lstm.cpp
new file mode 100644 (file)
index 0000000..53906e6
--- /dev/null
@@ -0,0 +1,69 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "lstm.hpp"
+
+const char *intel_lstm_projected_layer_name[NUM_LSTM_LAYERS] = {
+    "combined input transform",
+    "combined recurrent transform",
+    "input gate",
+    "forget gate",
+    "cell gate input part 1",
+    "cell gate input part 2",
+    "cell gate output part 1",
+    "cell gate output part 2",
+    "output gate",
+    "hidden gated output",
+    "projected output"
+};
+
+const char *intel_lstm_projected_layer_g4_name[NUM_LSTM_G4_LAYERS] = {
+    "combined input transform",
+    "deinterleave",
+    "interleave 1",
+    "interleave 2",
+    "interleave 3",
+    "interleave 4",
+    "combined recurrent transform - 1",
+    "input gate - 1",
+    "forget gate - 1",
+    "cell gate input part 1 - 1",
+    "cell gate input part 2 - 1",
+    "cell gate output part 1 - 1",
+    "cell gate output part 2 - 1",
+    "output gate - 1",
+    "hidden gated output - 1",
+    "projected output - 1",
+    "combined recurrent transform - 2",
+    "input gate - 2",
+    "forget gate - 2",
+    "cell gate input part 1 - 2",
+    "cell gate input part 2 - 2",
+    "cell gate output part 1 - 2",
+    "cell gate output part 2 - 2",
+    "output gate - 2",
+    "hidden gated output - 2",
+    "projected output - 2",
+    "combined recurrent transform - 3",
+    "input gate - 3",
+    "forget gate - 3",
+    "cell gate input part 1 - 3",
+    "cell gate input part 2 - 3",
+    "cell gate output part 1 - 3",
+    "cell gate output part 2 - 3",
+    "output gate - 3",
+    "hidden gated output - 3",
+    "projected output - 3",
+    "combined recurrent transform - 4",
+    "input gate - 4",
+    "forget gate - 4",
+    "cell gate input part 1 - 4",
+    "cell gate input part 2 - 4",
+    "cell gate output part 1 - 4",
+    "cell gate output part 2 - 4",
+    "output gate - 4",
+    "hidden gated output - 4",
+    "projected output - 4",
+    "interleave"
+};
\ No newline at end of file
diff --git a/inference-engine/src/gna_plugin/lstm.hpp b/inference-engine/src/gna_plugin/lstm.hpp
new file mode 100644 (file)
index 0000000..6ce8f10
--- /dev/null
@@ -0,0 +1,209 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#define LSTM_GIFO_X_C (component_index)
+#define LSTM_GIFO_R_C (component_index+1)
+#define LSTM_INPUT_GATE_C (component_index+2)
+#define LSTM_INPUT_SIGMOID_C (component_index+3)
+#define LSTM_FORGET_GATE_C (component_index+4)
+#define LSTM_FORGET_SIGMOID_C (component_index+5)
+#define LSTM_CELL_INPUT1_C (component_index+6)
+#define LSTM_CELL_INPUT1_TANH_C (component_index+7)
+#define LSTM_CELL_INPUT2_C (component_index+8)
+#define LSTM_CELL_OUTPUT1_C (component_index+9)
+#define LSTM_CELL_TANH_C (component_index+10)
+#define LSTM_CELL_OUTPUT2_C (component_index+11)
+#define LSTM_CELL_CLIPPING_C (component_index+12)
+#define LSTM_OUTPUT_GATE_C (component_index+13)
+#define LSTM_OUTPUT_SIGMOID_C (component_index+14)
+#define LSTM_HIDDEN_C (component_index+15)
+#define LSTM_HIDDEN_IDENTITY_C (component_index+16)
+#define LSTM_PROJECTED_C (component_index+17)
+#define LSTM_PROJECTED_IDENTITY_C (component_index+18)
+#define NUM_LSTM_COMPONENTS 19
+
+#define BILSTM_GIFO_X_FW_C (component_index)
+#define BILSTM_GIFO_R_FW_C (component_index+1)
+#define BILSTM_INPUT_GATE_FW_C (component_index+2)
+#define BILSTM_INPUT_SIGMOID_FW_C (component_index+3)
+#define BILSTM_FORGET_GATE_FW_C (component_index+4)
+#define BILSTM_FORGET_SIGMOID_FW_C (component_index+5)
+#define BILSTM_CELL_INPUT1_FW_C (component_index+6)
+#define BILSTM_CELL_INPUT1_TANH_FW_C (component_index+7)
+#define BILSTM_CELL_INPUT2_FW_C (component_index+8)
+#define BILSTM_CELL_GATE_FW_C (component_index+9)
+#define BILSTM_CELL_OUTPUT1_FW_C (component_index+10)
+#define BILSTM_CELL_TANH_FW_C (component_index+11)
+#define BILSTM_CELL_COPY_FW_C (component_index+12)
+#define BILSTM_OUTPUT_GATE_FW_C (component_index+13)
+#define BILSTM_OUTPUT_SIGMOID_FW_C (component_index+14)
+#define BILSTM_HIDDEN_FW_C (component_index+15)
+#define BILSTM_HIDDEN_IDENTITY_FW_C (component_index+16)
+#define BILSTM_GIFO_X_BW_C (component_index+17)
+#define BILSTM_GIFO_R_BW_C (component_index+18)
+#define BILSTM_INPUT_GATE_BW_C (component_index+19)
+#define BILSTM_INPUT_SIGMOID_BW_C (component_index+20)
+#define BILSTM_FORGET_GATE_BW_C (component_index+21)
+#define BILSTM_FORGET_SIGMOID_BW_C (component_index+22)
+#define BILSTM_CELL_INPUT1_BW_C (component_index+23)
+#define BILSTM_CELL_INPUT1_TANH_BW_C (component_index+24)
+#define BILSTM_CELL_INPUT2_BW_C (component_index+25)
+#define BILSTM_CELL_GATE_BW_C (component_index+26)
+#define BILSTM_CELL_OUTPUT1_BW_C (component_index+27)
+#define BILSTM_CELL_TANH_BW_C (component_index+28)
+#define BILSTM_CELL_COPY_BW_C (component_index+29)
+#define BILSTM_OUTPUT_GATE_BW_C (component_index+30)
+#define BILSTM_OUTPUT_SIGMOID_BW_C (component_index+31)
+#define BILSTM_HIDDEN_BW_C (component_index+32)
+#define BILSTM_HIDDEN_IDENTITY_BW_C (component_index+33)
+#define NUM_BILSTM_COMPONENTS 34
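All component macros above are relative to a caller-provided component_index, so a single LSTM cell occupies NUM_LSTM_COMPONENTS consecutive slots in the dnn component array. An indexing sketch (the components array and the advancing pattern are assumptions, not taken from this header):

    uint32_t component_index = 0;             // first free slot for this cell
    // components[LSTM_GIFO_X_C]        -> slot 0:  combined input transform
    // components[LSTM_INPUT_SIGMOID_C] -> slot 3:  input gate sigmoid
    // components[LSTM_PROJECTED_C]     -> slot 17: projected output
    component_index += NUM_LSTM_COMPONENTS;   // 19; the next cell starts here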
+
+#include "gna-api.h"
+
+#define ACTIVATION_SCALE_IG  1024.0f
+#define ACTIVATION_SCALE_CI1 1024.0f
+#define ACTIVATION_SCALE_CO1 2048.0f
+#define ACTIVATION_SCALE_OG  2048.0f
+#define ACTIVATION_SCALE_HID 2048.0f
+#define MAX_WEIGHT_IFO_GATE  1024.0f
+#define NUM_WEIGHT_BYTES_IN        2
+#define NUM_WEIGHT_BYTES_PROJ    2
+
+typedef struct {
+    float min;
+    float max;
+    float sum;
+    float sum_squared;
+    uint32_t num_saturations;
+    uint32_t num_elements;
+} intel_buffer_stats_t;
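+
+// Editorial sketch (not part of the original header): mean and biased variance
+// can be recovered from the accumulated fields; guard against empty buffers.
+inline float buffer_mean(const intel_buffer_stats_t &s) {
+    return (s.num_elements != 0) ? s.sum / s.num_elements : 0.0f;
+}
+inline float buffer_variance(const intel_buffer_stats_t &s) {
+    if (s.num_elements == 0) return 0.0f;
+    float mean = s.sum / s.num_elements;
+    return s.sum_squared / s.num_elements - mean * mean;
+}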
+
+typedef struct {
+    intel_nnet_layer_t in;        // combined input transform
+    intel_nnet_layer_t rec;        // combined recurrent transform
+    intel_nnet_layer_t ig;        // input gate
+    intel_nnet_layer_t fg;        // forget gate
+    intel_nnet_layer_t ci1;        // cell gate input part 1
+    intel_nnet_layer_t ci2;        // cell gate input part 2
+    intel_nnet_layer_t co1;        // cell gate output part 1
+    intel_nnet_layer_t co2;        // cell gate output part 2
+    intel_nnet_layer_t og;        // output gate
+    intel_nnet_layer_t hid;        // hidden gated output
+    intel_nnet_layer_t proj;    // projected output
+} intel_lstm_projected_layer_t;
+
+typedef struct {
+    intel_affine_layer_t *in;        // combined input transform
+    intel_affine_layer_t *rec;        // combined recurrent transform
+    intel_affine_layer_t *ig;        // input gate
+    intel_affine_layer_t *fg;        // forget gate
+    intel_affine_layer_t *ci1;        // cell gate input part 1
+    intel_affine_layer_t *ci2;        // cell gate input part 2
+    intel_affine_layer_t *co1;        // cell gate output part 1
+    intel_affine_layer_t *co2;        // cell gate output part 2
+    intel_affine_layer_t *og;        // output gate
+    intel_affine_layer_t *hid;        // hidden gated output
+    intel_affine_layer_t *proj;        // projected output
+} intel_lstm_projected_transform_t;
+
+typedef struct {
+    intel_buffer_stats_t in;        // combined input transform
+    intel_buffer_stats_t rec;        // combined recurrent transform
+    intel_buffer_stats_t ig;        // input gate
+    intel_buffer_stats_t fg;        // forget gate
+    intel_buffer_stats_t ci1;        // cell gate input part 1
+    intel_buffer_stats_t ci2;        // cell gate input part 2
+    intel_buffer_stats_t co1;        // cell gate output part 1
+    intel_buffer_stats_t co2;        // cell gate output part 2
+    intel_buffer_stats_t og;        // output gate
+    intel_buffer_stats_t hid;        // hidden gated output
+    intel_buffer_stats_t proj;    // projected output
+} intel_lstm_projected_stats_t;
+
+typedef struct {
+    intel_nnet_layer_t rec;        // combined recurrent transform
+    intel_nnet_layer_t ig;        // input gate
+    intel_nnet_layer_t fg;        // forget gate
+    intel_nnet_layer_t ci1;        // cell gate input part 1
+    intel_nnet_layer_t ci2;        // cell gate input part 2
+    intel_nnet_layer_t co1;        // cell gate output part 1
+    intel_nnet_layer_t co2;        // cell gate output part 2
+    intel_nnet_layer_t og;        // output gate
+    intel_nnet_layer_t hid;        // hidden gated output
+    intel_nnet_layer_t proj;    // projected output
+} intel_lstm_partial_layer_t;
+
+typedef struct {
+    intel_affine_layer_t *rec;        // combined recurrent transform
+    intel_affine_layer_t *ig;        // input gate
+    intel_affine_layer_t *fg;        // forget gate
+    intel_affine_layer_t *ci1;        // cell gate input part 1
+    intel_affine_layer_t *ci2;        // cell gate input part 2
+    intel_affine_layer_t *co1;        // cell gate output part 1
+    intel_affine_layer_t *co2;        // cell gate output part 2
+    intel_affine_layer_t *og;        // output gate
+    intel_affine_layer_t *hid;        // hidden gated output
+    intel_affine_layer_t *proj;        // projected output
+} intel_lstm_partial_transform_t;
+
+typedef struct {
+    intel_buffer_stats_t rec;        // combined recurrent transform
+    intel_buffer_stats_t ig;        // input gate
+    intel_buffer_stats_t fg;        // forget gate
+    intel_buffer_stats_t ci1;        // cell gate input part 1
+    intel_buffer_stats_t ci2;        // cell gate input part 2
+    intel_buffer_stats_t co1;        // cell gate output part 1
+    intel_buffer_stats_t co2;        // cell gate output part 2
+    intel_buffer_stats_t og;        // output gate
+    intel_buffer_stats_t hid;        // hidden gated output
+    intel_buffer_stats_t proj;    // projected output
+} intel_lstm_partial_stats_t;
+
+typedef struct {
+    intel_nnet_layer_t in;                // combined input transform
+    intel_nnet_layer_t dintl;            // interleave x8
+    intel_nnet_layer_t intl1;            // deinterleave x2
+    intel_nnet_layer_t intl2;            // deinterleave x2
+    intel_nnet_layer_t intl3;            // deinterleave x2
+    intel_nnet_layer_t intl4;            // deinterleave x2
+    intel_lstm_partial_layer_t part[4];    // unrolled part
+    intel_nnet_layer_t intl;            // interleave x4
+} intel_lstm_projected_layer_g4_t;
+
+typedef struct {
+    intel_affine_layer_t *in;                // combined input transform
+    intel_lstm_partial_transform_t part[4];  // unrolled part
+} intel_lstm_projected_transform_g4_t;
+
+typedef struct {
+    intel_buffer_stats_t in;            // combined input transform
+    intel_lstm_partial_stats_t part[4];    // unrolled part
+} intel_lstm_projected_stats_g4_t;
+
+#define NUM_LSTM_LAYERS 11
+#define NUM_LSTM_G4_LAYERS 47
+
+extern const char *intel_lstm_projected_layer_name[NUM_LSTM_LAYERS];
+extern const char *intel_lstm_projected_layer_g4_name[NUM_LSTM_G4_LAYERS];
+/*
+void GetLstmBufferStats(intel_lstm_projected_layer_t *ptr_layer, std::vector<intel_lstm_projected_stats_t> &stats);
+void UpdateLstmBufferStats(std::vector<intel_lstm_projected_stats_t> &accum, std::vector<intel_lstm_projected_stats_t> stats);
+void ClearLstmBufferStats(std::vector<intel_lstm_projected_stats_t> &stats);
+void PrintLstmBufferStats(std::string preamble, std::vector<intel_lstm_projected_stats_t> stats);
+uint32_t NumBytesLstmMacroLayer(uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells, uint32_t num_group_size, uint32_t layer_num, bool is_compact);
+void InitLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform, uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells);
+void InitLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform, uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells);
+void AllocateLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform, intel_shared_outputs scratch, uint8_t **ptr_memory, uint32_t *ptr_num_bytes_used, uint32_t num_memory_bytes, bool is_compact);
+void AllocateLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform, intel_shared_outputs scratch, uint8_t **ptr_memory, uint32_t *ptr_num_bytes_used, uint32_t num_memory_bytes, bool is_compact);
+void ConnectLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform);
+void ConnectLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform);
+void QuantizeLstmMacroLayerG1(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_transform_t *ptr_transform, float input_scale, gna_scale_factor_t *scale, uint32_t j);
+void QuantizeLstmMacroLayerG4(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_transform_g4_t *ptr_transform, float input_scale, gna_scale_factor_t *scale, uint32_t j);
+void ReQuantizeLstmMacroLayerG1(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_layer_t *ptr_layer, float input_scale, gna_scale_factor_t *scale, uint32_t j);
+void ReQuantizeLstmMacroLayerG4(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_layer_g4_t *ptr_layer, float input_scale, gna_scale_factor_t *scale, uint32_t j);
+void IntegrityCheckLstmMacroLayer(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_layer_t *ptr_layer, gna_scale_factor_t *scale, uint32_t j);
+
+*/
\ No newline at end of file
diff --git a/inference-engine/src/gna_plugin/polymorh_allocator.hpp b/inference-engine/src/gna_plugin/polymorh_allocator.hpp
new file mode 100644 (file)
index 0000000..d50d8a3
--- /dev/null
@@ -0,0 +1,68 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <utility>
+
+/**
+ * @brief c++17 concept simulation
+ */
+
+template<class T>
+class IPolymorhAllocator {
+ public:
+    virtual T *allocate(std::size_t n)  = 0;
+    virtual void deallocate(T *p, std::size_t n)  = 0;
+};
+
+template<class T>
+class allocator_polymorph;
+
+template<class T>
+class PolymorphAllocator {
+    std::shared_ptr<IPolymorhAllocator<T>> _impl;
+ public:
+    explicit PolymorphAllocator(const std::shared_ptr<IPolymorhAllocator<T>> &impl) : _impl(impl) {}
+
+    T *allocate(std::size_t n) {
+        return _impl->allocate(n);
+    }
+
+    void deallocate(T *p, std::size_t n) {
+        _impl->deallocate(p, n);
+    }
+};
+
+/**
+ * transforms any allocator into a polymorphic type
+ * @tparam origin - the allocator type to wrap
+ */
+
+template<class origin>
+class polymorph_adapter : public IPolymorhAllocator<typename origin::value_type> {
+    origin _impl;
+    using T = typename origin::value_type;
+
+ public:
+    template<class ...Args>
+    explicit polymorph_adapter(Args &&... args)
+        :_impl(std::forward<Args>(args)...) {
+    }
+    T *allocate(std::size_t n) override {
+        return _impl.allocate(n);
+    }
+    void deallocate(T *p, std::size_t n) override {
+        _impl.deallocate(p, n);
+    }
+};
+
+template<class T, class ...Args>
+inline PolymorphAllocator<typename T::value_type> make_polymorph(Args &&... args) {
+    auto sp = std::make_shared<polymorph_adapter<T>>(std::forward<Args>(args)...);
+    auto ipoly = std::static_pointer_cast<IPolymorhAllocator<typename T::value_type>>(sp);
+
+    return PolymorphAllocator<typename T::value_type>(ipoly);
+}
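+
+// Editorial usage sketch (not in the original header): any allocator exposing
+// a value_type can be wrapped behind the polymorphic interface, e.g.:
+//   auto alloc = make_polymorph<std::allocator<int>>();
+//   int *p = alloc.allocate(16);
+//   alloc.deallocate(p, 16);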
\ No newline at end of file
diff --git a/inference-engine/src/gna_plugin/pwl.h b/inference-engine/src/gna_plugin/pwl.h
new file mode 100644 (file)
index 0000000..fd45903
--- /dev/null
@@ -0,0 +1,70 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "dnn.h"
+#include <vector>
+
+#define SIGMOID_NUM_SEGMENTS 65
+#define SIGMOID_DOMAIN 10.0f  // portion of input to be approximated (-10,10)
+#define TANH_NUM_SEGMENTS 65
+#define TANH_DOMAIN 5.0f  // portion of input to be approximated (-5,5)
+#define RELU_NUM_SEGMENTS 2
+#define LEAKYRELU_SLOPE 0.01
+#define IDENTITY_NUM_SEGMENTS 3
+#define IDENTITY_DOMAIN 10.0f
+#define PWL_MAX_ERR_PERCENT 1.0f
+#define PWL_MAX_ITERATIONS 2000
+#define PWL_MAX_NUM_SEGMENTS 128
+#define PWL_DESIGN_THRESHOLD 0.1f
+#define PWL_DESIGN_SAMPLES 500
+#define ACTIVATION_SCALE_FACTOR 2048.0f
+#define IDENTITY_SCALE_FACTOR 2049.0f
+#define XBASEMASK 0xFFFFFFFC  // only top 30 bits are used
+#define KALDI_LSTM_CLIP_LOWER (-50.0)
+#define KALDI_LSTM_CLIP_UPPER (50.0)
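+
+// Editorial note: XBASEMASK zeroes the 2 LSBs of a segment's xBase so the
+// 2-bit slope_scale_index can be packed there, as done in pwl_design.cpp:
+//   seg.xBase = (static_cast<int32_t>(x) & XBASEMASK) | slope_scale_index;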
+
+typedef struct {
+    double t;
+    double alpha;
+    double beta;
+    double m;
+    double b;
+} pwl_t;
+
+typedef struct {
+    double slope;
+    uint64_t slope_scale = 0;
+    uint32_t slope_scale_index;
+} pwl_gna_slope_scale_t;
+
+double first_deriv_tanh(const double x);
+double sigmoid(const double x);
+double first_deriv_sigmoid(const double x);
+double relu(const double x);
+double leaky_relu(const double x);
+
+double clipping(const double x, const double lbound, const double ubound);
+void PwlApply16(intel_dnn_component_t *component, const uint32_t num_subset_size);
+void PwlApply16(intel_dnn_component_t *component,
+                const uint32_t num_row_start,
+                const uint32_t num_row_end,
+                const uint32_t num_col_start,
+                const uint32_t num_col_end);
+void PwlApply32(intel_dnn_component_t *component, const uint32_t num_subset_size);
+void PwlApply32(intel_dnn_component_t *component,
+                const uint32_t num_row_start,
+                const uint32_t num_row_end,
+                const uint32_t num_col_start,
+                const uint32_t num_col_end);
+void PwlDesign16(const DnnActivation activation_type,
+                 intel_pwl_segment_t *ptr_segment,
+                 const uint32_t num_segments,
+                 const float scale_in,
+                 const float scale_out);
+void PwlDesignOpt16(const DnnActivation activation_type,
+                std::vector<intel_pwl_segment_t> &ptr_segment,
+                const float scale_in,
+                const float scale_out);
diff --git a/inference-engine/src/gna_plugin/pwl_design.cpp b/inference-engine/src/gna_plugin/pwl_design.cpp
new file mode 100644 (file)
index 0000000..1f325ba
--- /dev/null
@@ -0,0 +1,681 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "pwl.h"
+#include "gna_plugin_log.hpp"
+#include <vector>
+#include <algorithm>
+#include <limits>
+
+#define FLOAT_TO_INT16(a) static_cast<int16_t>(((a) < 0)?((a) - 0.5):((a) + 0.5))
+#define FLOAT_TO_INT32(a) static_cast<int32_t>(((a) < 0)?((a)-0.5):((a)+0.5))
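+// Editorial note: both macros round half away from zero, e.g.
+// FLOAT_TO_INT16(2.5) == 3 and FLOAT_TO_INT16(-2.5) == -3, unlike a plain
+// truncating cast.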
+#ifdef _NO_MKL_
+#include <cmath>
+#include <details/ie_exception.hpp>
+#define SCOPY(num, in, inci, out, inco) for (int i_ = 0; i_ < *(num); i_++) *(out + i_ * *(inco)) = *(in + i_ * *(inci));
+#define SSCAL(num, scale, inout, inco)  for (int i_ = 0; i_ < *(num); i_++) *(inout + i_ * *(inco)) = *(scale) * *(inout + i_ * *(inco));
+#define TANH(num, in, out) for (int i_ = 0; i_ < num; i_++) *(out+i_) = tanh(*(in+i_))
+#else
+#include <mkl.h>
+#define SCOPY(num, in, incx, out, incy) scopy(num, in, incx, out, incy)
+#define SSCAL(num, scale, inout, incx) sscal(num, scale, inout, incx)
+#define TANH(num, in, out) vsTanh(num, in, out)
+#endif
+
+double first_deriv_tanh(const double x) { return(1.0 - tanh(x) * tanh(x)); }
+
+double sigmoid(const double x) { return(0.5 * (1.0 + tanh(x / 2))); }
+double first_deriv_sigmoid(const double x) { return(sigmoid(x) * (1.0 - sigmoid(x))); }
+double relu(const double x) { if (x < 0) { return(0.0); } else { return(x); } }
+double leaky_relu(const double x) { if (x < 0.0) { return(LEAKYRELU_SLOPE*x); } else { return(x); } }
+double clipping(const double x, const double lbound, const double ubound) { return((x < lbound)?lbound:((x > ubound)?ubound:x)); }
+
+double pivot_search(std::vector<pwl_t>& result, double(*f)(const double),
+                                    double(*first_deriv_f)(const double),
+                                    const uint32_t N,
+                                    const double alpha_0,
+                                    const double alpha_N,
+                                    const double threshold,
+                                    const bool negative) {
+    std::vector<std::vector<double>> t(N + 1);
+    std::vector<std::vector<double>> alpha(N + 1);
+    std::vector<std::vector<double>> epsilon(N + 1);
+    std::vector<std::vector<double>> d(N + 1);
+    bool same_epsilon = false;
+    double Delta;
+    double epsilon_final = 0.0;
+    double max_epsilon = 0.0;
+    double max_epsilon_prev;
+    double min_epsilon;
+    double sgn = (negative) ? -1.0 : 1.0;
+    int j;
+
+    if ( f == nullptr ||
+        first_deriv_f == nullptr ||
+        threshold < 0) {
+        return epsilon_final;
+    }
+    // Figure 4:  Box #1
+    j = 0;
+    Delta = 1.0;
+
+    for (int i = 0; i < N; i++) {
+        t[i].push_back(alpha_0 + (static_cast<double>((i + 1)) / static_cast<double>((N + 1))) * (alpha_N - alpha_0));
+    }
+
+    while (true) {
+        // Figure 4:  Box #2
+        alpha[0].resize(j + 1);
+        alpha[0][j] = alpha_0;
+        for (int i = 1; i < N; i++) {
+            alpha[i].resize(j + 1);
+            alpha[i][j] = (f(t[i - 1][j]) - f(t[i][j]) + first_deriv_f(t[i][j]) * t[i][j] - first_deriv_f(t[i - 1][j]) * t[i - 1][j])
+                / (first_deriv_f(t[i][j]) - first_deriv_f(t[i - 1][j]));
+        }
+        alpha[N].resize(j + 1);
+        alpha[N][j] = alpha_N;
+
+        // Figure 4:  Box #3
+        for (int i = 0; i < N; i++) {
+            epsilon[i].resize(j + 1);
+            epsilon[i][j] = sgn * (first_deriv_f(t[i][j]) * (alpha[i][j] - t[i][j]) + f(t[i][j]) - f(alpha[i][j]));
+        }
+        epsilon[N].resize(j + 1);
+        epsilon[N][j] = sgn * (first_deriv_f(t[N - 1][j]) * (alpha[N][j] - t[N - 1][j]) + f(t[N - 1][j]) - f(alpha[N][j]));
+
+        // Figure 4:  Test for completion
+        max_epsilon_prev = max_epsilon;
+        max_epsilon = fabs(epsilon[0][j]);
+        min_epsilon = fabs(epsilon[0][j]);
+        for (int i = 1; i < N + 1; i++) {
+            if (fabs(epsilon[i][j]) > max_epsilon) max_epsilon = fabs(epsilon[i][j]);
+            if (fabs(epsilon[i][j]) < min_epsilon) min_epsilon = fabs(epsilon[i][j]);
+        }
+        if ((j == PWL_MAX_ITERATIONS) || (max_epsilon - min_epsilon < threshold * min_epsilon)) {
+            pwl_t value;
+            result.resize(0);
+            epsilon_final = (max_epsilon + min_epsilon) / 4.0;  // Andrzej's modification
+            for (int i = 0; i < N; i++) {
+                double val, val_next;
+                value.t = t[i][j];
+                value.alpha = alpha[i][j];
+                val = sgn * first_deriv_f(value.t) * (value.alpha - value.t) + sgn * f(value.t) - epsilon_final;
+                val_next = sgn * first_deriv_f(value.t) * (alpha[i + 1][j] - value.t) + sgn * f(value.t) - epsilon_final;
+                value.beta = val;
+                value.m = (val_next - val) / (alpha[i + 1][j] - value.alpha);
+                value.b = (val - value.m * value.alpha);
+                result.push_back(value);
+            }
+            value.t = value.m = value.b = 0.0;
+            value.alpha = alpha[N][j];
+            value.beta = sgn * first_deriv_f(t[N - 1][j]) * (alpha[N][j] - t[N - 1][j]) + sgn * f(t[N - 1][j]) - epsilon_final;
+            result.push_back(value);
+            if (j == PWL_MAX_ITERATIONS) {
+                std::cerr << "Error:  failed to converge in pivot_search!" << std::endl;
+            }
+            return(epsilon_final);
+        }
+
+        if (j > 0) {
+            if (max_epsilon > max_epsilon_prev) {
+                j = j - 1;
+                Delta = Delta / 2;
+            } else if (max_epsilon == max_epsilon_prev) {
+                if (!same_epsilon) {
+                    same_epsilon = true;
+                } else {
+                    j = j - 1;
+                    Delta = Delta / 2;
+                    same_epsilon = false;
+                }
+            }
+        }
+
+        // Figure 4:  Box #4
+        for (int i = 0; i < N; i++) {
+            d[i].resize(j + 1);
+            d[i][j] = Delta * (epsilon[i + 1][j] - epsilon[i][j]) /
+                ((epsilon[i + 1][j] / (alpha[i + 1][j] - t[i][j])) + (epsilon[i][j] / (t[i][j] - alpha[i][j])));
+        }
+
+        // Figure 4:  Box #5
+        for (int i = 0; i < N; i++) {
+            t[i].resize(j + 2);
+            t[i][j + 1] = t[i][j] + d[i][j];
+        }
+        t[N].resize(j + 2);
+
+        j = j + 1;
+    }
+}
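+
+// Editorial usage sketch (illustrative): fit 4 segments to sigmoid on
+// [0, 10] (concave there, hence negative == false) with a 10% spread
+// threshold; the return value is the final approximation error:
+//   std::vector<pwl_t> seg;
+//   double eps = pivot_search(seg, sigmoid, first_deriv_sigmoid,
+//                             4, 0.0, 10.0, 0.1, false);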
+
+double calculate_error_pct(const DnnActivationType fun,
+                            const double l_bound,
+                            const double u_bound,
+                            const double offset,
+                            const int samples) {
+    double delta = (u_bound - l_bound) / (samples + 1);
+    double min_val = 0.0;
+    double max_val = 0.0;
+
+    if ( delta < 0 ) {
+        return 0.0;
+    }
+
+    switch (fun) {
+        case kActSigmoid:  min_val = max_val = sigmoid(l_bound); break;
+        case kActTanh:     min_val = max_val = tanh(l_bound); break;
+    }
+
+    for (int i = 0; i < samples; i++) {
+        double arg = l_bound + i * delta;
+        double val = 0.0;
+        switch (fun) {
+            case kActSigmoid:  val = sigmoid(arg); break;
+            case kActTanh:     val = tanh(arg); break;
+        }
+        if (val > max_val) max_val = val;
+        if (val < min_val) min_val = val;
+    }
+
+    return(100.0 * fabs(offset) / (max_val - min_val));
+}
+
+bool split_search(const DnnActivationType fun,
+                    const double l_bound,
+                    const double u_bound) {
+    bool is_split = false;
+    if (l_bound > u_bound) {
+        return is_split;
+    }
+
+    switch (fun) {
+        case kActSigmoid:
+        case kActTanh:
+            if ((l_bound < 0.0) && (u_bound > 0.0)) {
+                is_split = true;
+            }
+            break;
+        default:
+            is_split = false;
+    }
+    return(is_split);
+}
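+
+// Editorial note: pivot_search fits one convex (or, via `negative`, one
+// concave) piece at a time, so pwl_search below splits sigmoid and tanh at 0
+// and mirrors the left half with negative_pwl.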
+
+inline std::vector<pwl_t> negative_pwl(const std::vector<pwl_t>& pwl) {
+    std::vector<pwl_t> new_pwl;
+    new_pwl = pwl;
+    for (uint32_t i = 0; i < pwl.size(); i++) {
+        new_pwl[i].m = -pwl[i].m;
+        new_pwl[i].b = -pwl[i].b;
+        new_pwl[i].beta = -pwl[i].beta;
+    }
+
+    return(new_pwl);
+}
+
+std::vector<pwl_t> pwl_search(const DnnActivationType fun,
+                                const double l_bound,
+                                const double u_bound,
+                                const double threshold,
+                                const double allowed_err_pct,
+                                const int samples,
+                                double& err_pct) {
+    std::vector<pwl_t> pwl;
+    double err = 0.0;
+    int n_segments = 1;
+
+    if (l_bound > u_bound ||
+        threshold < 0) {
+        return pwl;
+    }
+
+    if (split_search(fun, l_bound, u_bound)) {
+        std::vector<pwl_t> pwl2;
+        double err_pct1 = 0.0, err_pct2 = 0.0;
+
+        pwl = pwl_search(fun, l_bound, 0.0, threshold, allowed_err_pct, samples, err_pct1);
+        pwl = negative_pwl(pwl);
+        pwl2 = pwl_search(fun, 0.0, u_bound, threshold, allowed_err_pct, samples, err_pct2);
+
+        // merge
+        pwl.pop_back();  // remove final alpha and beta from first half
+        pwl.insert(pwl.end(), pwl2.begin(), pwl2.end());  // concatenate the two halves
+        err_pct = (err_pct1 + err_pct2) / 2;  // this is not quite correct but should give an indication
+
+    } else {
+        if (fun == kActIdentity) {
+            pwl.resize(2);
+            pwl[0].alpha = pwl[0].t = pwl[0].beta = -std::numeric_limits<float>::infinity();
+            pwl[0].m = 1.0;
+            pwl[0].b = 0.0;
+            pwl[1].alpha = std::numeric_limits<float>::infinity();
+            pwl[1].beta = std::numeric_limits<float>::infinity();
+
+        } else if (fun == kActKaldiLstmClipping) {
+            pwl.resize(4);
+            pwl[0].alpha = pwl[0].t = pwl[0].beta = -std::numeric_limits<float>::infinity();
+            pwl[0].m = 0.0;
+            pwl[0].b = pwl[0].beta = KALDI_LSTM_CLIP_LOWER;
+            pwl[1].alpha = pwl[1].t = pwl[1].beta = KALDI_LSTM_CLIP_LOWER;
+            pwl[1].m = 1.0;
+            pwl[1].b = 0.0;
+            pwl[2].alpha = pwl[2].t = pwl[2].beta = KALDI_LSTM_CLIP_UPPER;
+            pwl[2].m = 0.0;
+            pwl[2].b = KALDI_LSTM_CLIP_UPPER;
+            pwl[3].alpha = pwl[3].beta = std::numeric_limits<float>::infinity();
+
+        } else {
+            bool negative = false;
+
+            switch (fun) {
+                case kActSigmoid:
+                    if (u_bound == 0) negative = true;  // make left half convex
+                    err = pivot_search(pwl, sigmoid, first_deriv_sigmoid, n_segments, l_bound, u_bound, threshold, negative);
+                    break;
+                case kActTanh:
+                    if (u_bound == 0) negative = true;  // make left half convex
+                    err = pivot_search(pwl, tanh, first_deriv_tanh, n_segments, l_bound, u_bound, threshold, negative);
+                    break;
+            }
+            err_pct = calculate_error_pct(fun, l_bound, u_bound, err, samples);
+
+            while ((n_segments < PWL_MAX_ITERATIONS) && (allowed_err_pct < err_pct)) {
+                n_segments += 1;
+                switch (fun) {
+                    case kActSigmoid:
+                        err = pivot_search(pwl, sigmoid, first_deriv_sigmoid, n_segments, l_bound, u_bound, threshold, negative);
+                        break;
+                    case kActTanh:
+                        err = pivot_search(pwl, tanh, first_deriv_tanh, n_segments, l_bound, u_bound, threshold, negative);
+                        break;
+                }
+                err_pct = calculate_error_pct(fun, l_bound, u_bound, err, samples);
+            }
+
+            if (n_segments >= PWL_MAX_ITERATIONS) {
+                std::cerr << "Error:  failed to converge in pwl_search!" << std::endl;
+            }
+        }
+    }
+    return(pwl);
+}
+
+pwl_gna_slope_scale_t gna_slope(const double slope,
+                                const double in_scale,
+                                const double out_scale) {
+    pwl_gna_slope_scale_t s;
+    s.slope = slope* out_scale / in_scale;
+
+    for (s.slope_scale_index = 3; s.slope_scale_index > 0; --s.slope_scale_index) {
+        s.slope_scale = static_cast<uint64_t>(1) << (8 * (1 + s.slope_scale_index));
+        if (((s.slope * s.slope_scale) <= std::numeric_limits<int16_t>::max()) &&
+                    ((s.slope * s.slope_scale) >= std::numeric_limits<int16_t>::min()))
+            break;
+    }
+    s.slope_scale = static_cast<uint64_t>(1) << (8 * (1 + s.slope_scale_index));
+
+    return(s);
+}
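+
+// Editorial worked example: slope = 1.0, in_scale = 1024, out_scale = 2048
+// gives s.slope = 2.0; slope_scale_index 3..1 (scales 2^32, 2^24, 2^16) all
+// overflow int16 when multiplied by 2.0, so the loop exits with
+// slope_scale_index == 0 and slope_scale = 2^8 = 256, and the encoded GNA
+// slope becomes FLOAT_TO_INT16(2.0 * 256) = 512.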
+
+void make_gna_pwl(const DnnActivation  fun,
+                    const std::vector<pwl_t>& pwl,
+                    const double l_bound,
+                    const double u_bound,
+                    const double in_scale,
+                    const double out_scale,
+                    std::vector<intel_pwl_segment_t> &gna_pwl) {
+    pwl_gna_slope_scale_t s;
+    uint32_t pwl_size = static_cast<uint32_t>(pwl.size());
+    switch (fun) {
+        case kActSigmoid:
+        case kActTanh: {
+            auto n_segments = static_cast<int32_t> (pwl_size) + 1;
+            gna_pwl.resize(n_segments);
+            // insert extra segment for x values < l_bound
+            gna_pwl[0].xBase = static_cast<int32_t> (INT32_MIN & XBASEMASK);  // zero out the 2 lsb
+            if (fun == kActSigmoid) {
+                gnalog() <<  "=========================== Sigmoid Segments ===========================\n";
+                gna_pwl[0].yBase = gna_pwl[1].yBase = 0;
+                gna_pwl[1].xBase = (static_cast<int32_t> (in_scale * (-pwl[0].b / pwl[0].m))) & XBASEMASK;
+            } else {
+                gnalog() <<  "=========================== Tanh Segments ===========================\n";
+                gna_pwl[0].yBase = gna_pwl[1].yBase = static_cast<int16_t>(-1.0 * out_scale);
+                gna_pwl[1].xBase = (static_cast<int32_t> (in_scale * (-1.0 - pwl[0].b) / pwl[0].m)) & XBASEMASK;
+            }
+            gna_pwl[0].slope = 0;
+
+            gnalog() << (gna_pwl[0].xBase) / in_scale
+                     << " " << (gna_pwl[0].yBase) / out_scale
+                     << " " << 0.0
+                     << "\n";
+
+            s = gna_slope(pwl[0].m, in_scale, out_scale);
+            gna_pwl[1].slope = FLOAT_TO_INT16(s.slope * s.slope_scale);
+            gna_pwl[1].xBase = gna_pwl[1].xBase | s.slope_scale_index;
+
+            gnalog() << (gna_pwl[1].xBase/in_scale)
+                     << " " << (gna_pwl[1].yBase) / out_scale
+                     << " " << pwl[0].m
+                     << "\n";
+
+            for (uint32_t i = 1; i < pwl_size - 1; ++i) {
+                s = gna_slope(pwl[i].m, in_scale, out_scale);
+                gna_pwl[i + 1].xBase = (static_cast<int32_t> (in_scale * pwl[i].alpha)) & XBASEMASK;
+                gna_pwl[i + 1].yBase = FLOAT_TO_INT16(pwl[i].beta * out_scale);
+                gna_pwl[i + 1].slope = FLOAT_TO_INT16(s.slope * s.slope_scale);
+                gna_pwl[i + 1].xBase = gna_pwl[i + 1].xBase | s.slope_scale_index;
+
+                gnalog() << (pwl[i].alpha)
+                         << " " << pwl[i].beta
+                         << " " << pwl[i].m
+                         << "\n";
+            }
+            // insert extra segment for x values > u_bound
+            gna_pwl[n_segments - 1].xBase =
+                ((uint32_t) (in_scale * (1.0 - pwl[pwl_size - 2].b) / pwl[pwl_size - 2].m)) & XBASEMASK;
+            gna_pwl[n_segments - 1].yBase = FLOAT_TO_INT16(1.0 * out_scale);
+            gna_pwl[n_segments - 1].slope = 0;
+
+            gnalog() << (gna_pwl[n_segments - 1].xBase / in_scale)
+                     << " " << 1.0
+                     << " " << 0.0
+                     << "\n";
+            break;
+        }
+        case kActRelu:
+        case kActLeakyRelu: {
+            auto n_segments = 2;
+            gna_pwl.resize(n_segments);
+
+            gnalog() << "=========================== ReLU Segments ===========================\n";
+            int32_t x_lower = INT32_MIN;
+            int16_t y_lower = INT16_MIN;
+            if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale);
+            if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale);
+            gna_pwl[0].yBase = y_lower * fun.negative_slope;
+            s = gna_slope(fun.negative_slope, in_scale, out_scale);
+            gna_pwl[0].xBase = (x_lower & XBASEMASK) | s.slope_scale_index;  // zero out the 2 lsb
+            gna_pwl[0].slope = FLOAT_TO_INT16(s.slope * s.slope_scale);
+
+            gnalog() << gna_pwl[0].xBase / in_scale
+                    << " " << gna_pwl[0].yBase / out_scale
+                    << " " << (gna_pwl[0].slope * in_scale) / (out_scale*s.slope_scale)
+                    << "\n";
+            gna_pwl[1].xBase = 0;
+            gna_pwl[1].yBase = 0;
+            s = gna_slope(1.0, in_scale, out_scale);
+            gna_pwl[1].slope = FLOAT_TO_INT16(s.slope * s.slope_scale);
+            gna_pwl[1].xBase = gna_pwl[1].xBase | s.slope_scale_index;
+            gnalog() << 0.0
+                    << " " << 0.0
+                    << " " << (gna_pwl[1].slope * in_scale) / (out_scale*s.slope_scale)
+                    << "\n";
+            break;
+        }
+        case kActIdentity:
+        case kActKaldiLstmClipping: {
+            int32_t x_lower = INT32_MIN;
+            int32_t x_upper = INT32_MAX;
+            int16_t y_lower = INT16_MIN;
+            int16_t y_upper = INT16_MAX;
+            auto n_segments = 2;
+            if (fun == kActKaldiLstmClipping) {
+                gnalog()  << "=========================== Clipping Segments ===========================\n";
+                if (x_lower < l_bound * in_scale) {
+                    if (y_lower < l_bound * out_scale) {
+                        x_lower = FLOAT_TO_INT32(l_bound * in_scale);
+                        y_lower = FLOAT_TO_INT16(l_bound * out_scale);
+                    } else {
+                        x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale);
+                    }
+                }
+                if (x_upper > u_bound * in_scale) {
+                    if (y_upper > u_bound * out_scale) {
+                        x_upper = FLOAT_TO_INT32(u_bound * in_scale);
+                        y_upper = FLOAT_TO_INT16(u_bound * out_scale);
+                    } else {
+                        x_upper = FLOAT_TO_INT32(y_upper  * in_scale / out_scale);
+                    }
+                }
+            } else {
+                gnalog() << "=========================== Identity Segments ===========================\n";
+                if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale);
+                if (x_upper > y_upper * in_scale / out_scale) x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale);
+                if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale);
+                if (y_upper > x_upper * out_scale / in_scale) y_upper = FLOAT_TO_INT16(x_upper * out_scale / in_scale);
+            }
+            gna_pwl.resize(n_segments);
+            gna_pwl[0].xBase = INT32_MIN & XBASEMASK;  // zero out the 2 lsb
+            gna_pwl[0].yBase = y_lower;
+            gna_pwl[0].slope = 0;
+            gnalog() << gna_pwl[0].xBase / in_scale
+                    << " " << gna_pwl[0].yBase / out_scale
+                    << " " << 0
+                    << "\n";
+            gna_pwl[1].xBase = x_lower & XBASEMASK;  // zero out the 2 lsb
+            gna_pwl[1].yBase = y_lower;
+            s = gna_slope(1.0, in_scale, out_scale);
+            gna_pwl[1].slope = FLOAT_TO_INT16(s.slope * s.slope_scale);
+            gna_pwl[1].xBase = gna_pwl[1].xBase | s.slope_scale_index;
+            gnalog() << gna_pwl[1].xBase / in_scale
+                    << " " << gna_pwl[1].yBase / out_scale
+                    << " " << 1.0
+                    << "\n";
+            if (INT32_MAX > x_upper) {  // need a right segment
+                gna_pwl.push_back({
+                    static_cast<int32_t>(x_upper & XBASEMASK),  // zero out the 2 lsb
+                    y_upper,
+                    0 });
+
+                gnalog() << gna_pwl[n_segments].xBase / in_scale
+                    << " " << gna_pwl[n_segments].yBase / out_scale
+                    << " " << 0
+                    << "\n";
+                n_segments += 1;
+            }
+            break;
+        }
+        default:
+            gnalog() << "Unexpected function activation!\n";
+            std::cerr << "Unexpected function activation!\n";
+    }
+}
+
+void PwlDesignOpt16(const DnnActivation activation_type,
+                    std::vector<intel_pwl_segment_t> &ptr_segment,
+                    const float scale_in,
+                    const float scale_out) {
+    std::vector<pwl_t> pwl;
+    double err_pct = 0.0;
+    switch (activation_type) {
+        case kActSigmoid:
+            pwl = pwl_search(kActSigmoid, -SIGMOID_DOMAIN, SIGMOID_DOMAIN, PWL_DESIGN_THRESHOLD, PWL_MAX_ERR_PERCENT, PWL_DESIGN_SAMPLES, err_pct);
+            make_gna_pwl(activation_type, pwl, -SIGMOID_DOMAIN, SIGMOID_DOMAIN, scale_in, scale_out, ptr_segment);
+            break;
+        case kActTanh:
+            pwl = pwl_search(kActTanh, -TANH_DOMAIN, TANH_DOMAIN, PWL_DESIGN_THRESHOLD, PWL_MAX_ERR_PERCENT, PWL_DESIGN_SAMPLES, err_pct);
+            make_gna_pwl(activation_type, pwl, -TANH_DOMAIN, TANH_DOMAIN, scale_in, scale_out, ptr_segment);
+            break;
+        case kActRelu:
+            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
+            break;
+        case kActLeakyRelu:
+            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
+            break;
+        case kActIdentity:
+            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
+            break;
+        case kActKaldiLstmClipping:
+            make_gna_pwl(activation_type, pwl, KALDI_LSTM_CLIP_LOWER, KALDI_LSTM_CLIP_UPPER, scale_in, scale_out, ptr_segment);
+            break;
+        default:
+            break;
+    }
+}
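+
+// Editorial usage sketch (illustrative; `act` stands for a DnnActivation of
+// kActSigmoid kind, constructed as dnn.h prescribes):
+//   std::vector<intel_pwl_segment_t> segments;
+//   PwlDesignOpt16(act, segments, 1024.0f, 2048.0f);  // scale_in, scale_out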
+
+void PwlDesign16(const DnnActivation activation_type,
+                 intel_pwl_segment_t *ptr_segment,
+                 const uint32_t num_segments,
+                 const float scale_in,
+                 const float scale_out) {
+    switch (activation_type) {
+        case kActSigmoid:
+           {
+                gnalog() <<  "=========================== Sigmoid Segments===========================\n";
+                uint32_t num_segment_size = 0;
+                int32_t offset = 0;
+                ptr_segment[0].xBase = static_cast<int32_t>(INT32_MIN & XBASEMASK);  // zero out the 2 lsb
+                num_segment_size = static_cast<int32_t>(SIGMOID_DOMAIN * scale_in / ((num_segments-2) / 2) + 0.5);
+                offset = -static_cast<int32_t>(num_segment_size * (num_segments-2) / 2);
+                for (uint32_t i = 1; i < num_segments; i++) {
+                    ptr_segment[i].xBase = static_cast<int32_t>(offset & XBASEMASK);  // zero out the 2 lsb
+                    offset += num_segment_size;
+                }
+                for (uint32_t i = 0; i < num_segments; i++) {
+                    int32_t xbase = static_cast<int32_t>(ptr_segment[i].xBase & XBASEMASK);
+                    int32_t xbasenext = (i < num_segments-1) ? static_cast<int32_t>(ptr_segment[i+1].xBase & XBASEMASK) : INT32_MAX;
+                    float floatarg = static_cast<float>(xbase / (2 * scale_in));
+                    float floatargnext = static_cast<float>(xbasenext / (2 * scale_in));
+                    float floatval, floatvalnext, slope;
+                    TANH(1, &floatarg, &floatval);
+                    floatval = 0.5f * (1.0f + floatval);
+                    TANH(1, &floatargnext, &floatvalnext);
+                    floatvalnext = 0.5f * (1.0f + floatvalnext);
+                    slope = scale_out*(floatvalnext - floatval) / static_cast<float>(xbasenext - xbase);
+                    {
+                        // find best scale factor
+                        uint64_t slope_scale;
+                        uint32_t slope_scale_index;
+                        for (slope_scale_index = 3; slope_scale_index > 0; slope_scale_index--) {
+                            slope_scale = static_cast<uint64_t>(1) << (8 * (1 + slope_scale_index));
+                            if (((slope * slope_scale) <= 32767.0) && ((slope * slope_scale) >= -32768.0))
+                                break;
+                        }
+                        slope_scale = static_cast<uint64_t>(1) << (8 * (1 + slope_scale_index));
+                        ptr_segment[i].slope = FLOAT_TO_INT16(slope * slope_scale);
+
+                        ptr_segment[i].xBase = ptr_segment[i].xBase | slope_scale_index;
+                    }
+                    ptr_segment[i].yBase = FLOAT_TO_INT16(floatval * scale_out);
+                    gnalog() << (static_cast<int32_t>((ptr_segment[i].xBase & XBASEMASK))/scale_in)
+                             << " "
+                             << (static_cast<float>((ptr_segment[i].yBase))/scale_out)
+                             << " "
+                             << (slope/scale_out)
+                             << "\n";
+                }
+            }
+            break;
+        case kActTanh:
+            {
+                gnalog() <<  "=========================== Tanh Segments===========================\n";
+                uint32_t num_segment_size = 0;
+                int32_t offset = 0;
+                ptr_segment[0].xBase = static_cast<int32_t>(INT32_MIN & XBASEMASK);  // zero out the 2 lsb
+                num_segment_size = static_cast<int32_t>(TANH_DOMAIN * scale_in / ((num_segments-2) / 2) + 0.5);
+                offset = -static_cast<int32_t>(num_segment_size * (num_segments-2) / 2);
+                for (uint32_t i = 1; i < num_segments; i++) {
+                    ptr_segment[i].xBase = static_cast<int32_t>(offset & XBASEMASK);  // zero out the 2 lsb
+                    offset += num_segment_size;
+                }
+                for (uint32_t i = 0; i < num_segments; i++) {
+                    int32_t xbase = static_cast<int32_t>(ptr_segment[i].xBase & XBASEMASK);
+                    int32_t xbasenext = (i < num_segments-1) ?
+                                                    static_cast<int32_t>(ptr_segment[i+1].xBase & XBASEMASK) :
+                                                    INT32_MAX;
+                    float floatarg = static_cast<float>(xbase / scale_in);
+                    float floatargnext = static_cast<float>(xbasenext / scale_in);
+                    float floatval, floatvalnext, slope;
+                    TANH(1, &floatarg, &floatval);
+                    TANH(1, &floatargnext, &floatvalnext);
+                    slope = scale_out * (floatvalnext - floatval) /
+                                                static_cast<float>(xbasenext - xbase);
+                    {
+                        // find best scale factor
+                        uint64_t slope_scale;
+                        uint32_t slope_scale_index;
+                        for (slope_scale_index = 3; slope_scale_index > 0; slope_scale_index--) {
+                            slope_scale = static_cast<uint64_t>(1) << (8 * (1 + slope_scale_index));
+                            if (((slope * slope_scale) <= 32767.0) && ((slope * slope_scale) >= -32768.0))
+                                break;
+                        }
+                        slope_scale = static_cast<uint64_t>(1) << (8 * (1 + slope_scale_index));
+                        ptr_segment[i].slope = FLOAT_TO_INT16(slope * slope_scale);
+                        ptr_segment[i].xBase = ptr_segment[i].xBase | slope_scale_index;
+                    }
+                    ptr_segment[i].yBase = FLOAT_TO_INT16(floatval * scale_out);
+                    gnalog() << (static_cast<int32_t>((ptr_segment[i].xBase & XBASEMASK))/scale_in)
+                             << " "
+                             << (static_cast<float>((ptr_segment[i].yBase))/scale_out)
+                             << " "
+                             << (slope/scale_out)
+                             << "\n";
+                }
+            }
+            break;
+        case kActRelu:
+            std::cerr << "Rectilinear activation function design not yet implemented!" << std::endl;
+            throw -1;
+            break;
+        case kActIdentity:
+        case kActKaldiLstmClipping:  // clipping of IDENTITY is more aggressive than Kaldi
+            {
+                float slope = 0.0;
+                int64_t x_lower_limit = static_cast<int64_t>((INT16_MIN / scale_out) * scale_in - 0.5);
+                int64_t x_upper_limit = static_cast<int64_t>((INT16_MAX / scale_out) * scale_in + 0.5);
+                int16_t y_lower_limit = INT16_MIN;
+                int16_t y_upper_limit = INT16_MAX;
+                if (activation_type == kActKaldiLstmClipping)
+                    gnalog() << "=========================== Clipping Segments ===========================\n";
+                else
+                    gnalog() << "=========================== Identity Segments ===========================\n";
+                if (x_lower_limit < INT32_MIN) {
+                    std::cerr << "Warning:  saturation in PwlDesign16! " << x_lower_limit  << " < INT32_MIN"<< std::endl;
+                    x_lower_limit = INT32_MIN;
+                    y_lower_limit = static_cast<int16_t>((scale_out / scale_in)*static_cast<float>(INT32_MIN) - 0.5);
+                }
+                if (x_upper_limit > INT32_MAX) {
+                    std::cerr << "Warning:  saturation in PwlDesign16! " << x_upper_limit  << " > INT32_MAX"<< std::endl;
+                    x_upper_limit = INT32_MAX;
+                    y_upper_limit = static_cast<int16_t>((scale_out / scale_in)*static_cast<float>(INT32_MAX) + 0.5);
+                }
+                slope =
+                    static_cast<float>(static_cast<uint64_t>(y_upper_limit) - static_cast<uint64_t>(y_lower_limit)) /
+                                               static_cast<float>(static_cast<uint64_t>(x_upper_limit) - static_cast<uint64_t>(x_lower_limit));
+                ptr_segment[0].xBase = static_cast<int32_t>(INT32_MIN & XBASEMASK);  // zero out the 2 lsb
+                ptr_segment[0].yBase = y_lower_limit;
+                ptr_segment[0].slope = 0;
+
+                gnalog() << ptr_segment[0].xBase / scale_in
+                    << " " << ptr_segment[0].yBase / scale_out
+                    << " " << 0
+                    << "\n";
+
+                ptr_segment[1].xBase = static_cast<int32_t>(x_lower_limit & XBASEMASK);
+                ptr_segment[1].yBase = y_lower_limit;
+                {
+                    // find best scale factor
+                    uint64_t slope_scale = 0;
+                    uint32_t slope_scale_index = 0;
+                    for (slope_scale_index = 3; slope_scale_index > 0; slope_scale_index--) {
+                        slope_scale = static_cast<uint64_t>(1) << (8 * (1 + slope_scale_index));
+                        if (((slope * slope_scale) <= std::numeric_limits<int16_t>::max()) &&
+                                    ((slope * slope_scale) >= std::numeric_limits<int16_t>::min()))
+                            break;
+                    }
+                    slope_scale = static_cast<uint64_t>(1) << (8 * (1 + slope_scale_index));
+                    ptr_segment[1].slope = FLOAT_TO_INT16(slope * slope_scale);
+                    ptr_segment[1].xBase = ptr_segment[1].xBase | slope_scale_index;
+                }
+                ptr_segment[2].xBase = static_cast<int32_t>(x_upper_limit & XBASEMASK);
+                ptr_segment[2].yBase = y_upper_limit;
+                ptr_segment[2].slope = 0;
+            }
+            break;
+        default:
+            fprintf(stderr, "Activation function design for %s not yet implemented!\n", intel_dnn_activation_name[activation_type]);
+            throw -1;
+    }
+}
diff --git a/inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp b/inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp
new file mode 100644 (file)
index 0000000..6c42d92
--- /dev/null
@@ -0,0 +1,488 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <string>
+#include <utility>
+#include <gna-api-types-xnn.h>
+#include "ie_layers.h"
+#include "quantized_layer_params.hpp"
+#include "quantization.h"
+#include "details/caseless.hpp"
+#include "graph_tools.hpp"
+#include "blob_factory.hpp"
+#include "precision_ex.hpp"
+#include "pwl.h"
+#include "gna_layer_info.hpp"
+
+namespace GNAPluginNS {
+namespace details {
+
+/**
+ * @brief description of quantisation precision
+ * @tparam Ip - input precision
+ * @tparam Op - output precision
+ * @tparam Wp - weights precision
+ * @tparam Bp - biases precision
+ * @tparam Np - network precision - may be auto-generated in the future
+ */
+template <class Ip, class Op, class Wp, class Bp, class Np>
+struct QuantDescTmpl {
+    using WeightsPrecision = Wp;
+    using BiasesPrecision = Bp;
+
+    InferenceEngine::TPrecision<Ip> _Ip;
+    InferenceEngine::TPrecision<Op> _Op;
+    InferenceEngine::TPrecision<Wp> _Wp;
+    InferenceEngine::TPrecision<Bp> _Bp;
+    InferenceEngine::TPrecision<Np> _Np;
+
+    QuantDescTmpl() = default;
+    QuantDescTmpl(InferenceEngine::TPrecision<Ip> _Ip,
+              InferenceEngine::TPrecision<Op> _Op,
+              InferenceEngine::TPrecision<Wp> _Wp,
+              InferenceEngine::TPrecision<Bp> _Bp,
+              InferenceEngine::TPrecision<Np> _Np) : _Op(_Op), _Ip(_Ip), _Wp(_Wp), _Bp(_Bp), _Np(_Np) {
+    }
+
+    InferenceEngine::Precision getInputPrecision() const {
+        return _Ip;
+    }
+    InferenceEngine::Precision getWeightsPrecision() const {
+        return _Wp;
+    }
+    InferenceEngine::Precision getBiasesPrecision() const {
+        return _Bp;
+    }
+    InferenceEngine::Precision getNetPrecision() const {
+        return _Np;
+    }
+    InferenceEngine::Precision getOutputPrecision() const {
+        return _Op;
+    }
+};
+
+#define P_TYPE(X)\
+typename InferenceEngine::PrecisionTrait<InferenceEngine::Precision::X>::value_type
+
+#define PRECISION_TYPE(A, B, C, D, E)\
+    P_TYPE(A), P_TYPE(B), P_TYPE(C), P_TYPE(D), P_TYPE(E)
+
+
+struct QuantI16 : public QuantDescTmpl<PRECISION_TYPE(I16, I32, I16, I32, MIXED)> {
+    QuantI16() {
+        _Np = InferenceEngine::Precision::MIXED;
+    }
+};
+struct QuantI8  : public QuantDescTmpl<P_TYPE(I16), P_TYPE(I32), P_TYPE(I8), intel_compound_bias_t, P_TYPE(MIXED)> {
+    QuantI8() {
+        _Np = InferenceEngine::Precision::MIXED;
+    }
+};
+
+template <class A, class B>
+struct QuantPair {
+    using MandatoryType = A;
+    using OptionalType = B;
+    static A mandatory () { return A();}
+    static B optional () { return B();}
+};
+
+/**
+ * @brief tells whether a blob must always be allocated for the given data type, even when the source blob is nullptr
+ * @tparam T - the data type of the blob
+ * @return true if the blob must be allocated regardless of source data
+ */
+template <class T>
+inline bool shouldAlwaysAllocate() {
+    return false;
+}
+
+template <>
+inline bool shouldAlwaysAllocate<intel_compound_bias_t>() {
+    return true;
+}
+
+
+#undef P_TYPE
+#undef PRECISION_TYPE
+
+/**
+ * @brief  designate actual data quantisation functions trait
+ */
+template <class T>
+class Quant {
+ public:
+    template<class ...Args>
+    void operator()(Args && ... args) const { }
+};
+
+template<>
+class Quant<QuantI16> {
+ public:
+    template<class ...Args>
+    void operator()(Args && ... args) const {
+        QuantizeAffine16(std::forward<Args>(args)...);
+    }
+};
+
+template<>
+class Quant<QuantI8> {
+ public:
+    template<class ...Args>
+    void operator()(Args && ... args) const {
+        QuantizeAffine8(std::forward<Args>(args)...);
+    }
+};
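+
+// Editorial note: the Quant trait picks the quantisation routine from the
+// descriptor type alone; quantizeWeightsBiases below invokes it as
+// (schematically):
+//   Quant<QuantI16>()(srcWeights, srcBiases, dstWeights, dstBiases,
+//                     input_scale_factor, &weights_scale, &dst_scale,
+//                     num_rows, num_columns, num_rows_padded, num_columns_padded);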
+
+template<class QuantDesc, class QuantFunc>
+inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
+                                  InferenceEngine::WeightableLayer *wl,
+                                  const QuantFunc &fnc,
+                                  bool isDiagonal = false) {  // for a diagonal layer the number of weights and biases is significantly smaller
+    // for quantized weights
+    auto intWeights =
+        make_custom_blob<typename QuantDesc::WeightsPrecision>(InferenceEngine::C, InferenceEngine::SizeVector({wl->_weights->size()}));
+    intWeights->allocate();
+    if (intWeights->buffer() == nullptr) {
+        THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED
+                            << "cannot copy weights for layer :"<< wl->name << " of size" << intWeights->byteSize();
+    }
+
+
+    auto getBiasSizeForLayer = [](InferenceEngine::WeightableLayer *wl) {
+        if (wl->_biases) {
+            return wl->_biases->size();
+        }
+        // calculate the bias length from the output dims
+        auto & dims = wl->outData.front()->getDims();
+        return dims[1];
+    };
+
+    using BiasesPrecision = typename QuantDesc::BiasesPrecision;
+    auto biasMaker = [&] () {
+        InferenceEngine::Blob::Ptr zero;
+        if (!wl->_biases && !shouldAlwaysAllocate<BiasesPrecision>()) {
+            return zero;
+        }
+        auto bias = make_custom_blob<BiasesPrecision>(InferenceEngine::C, InferenceEngine::SizeVector({
+            getBiasSizeForLayer(wl)
+        }));
+        bias->allocate();
+        if (bias->buffer() == nullptr) {
+            THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED
+                                << "cannot copy bias for layer :"<< wl->name <<"of size" << bias->byteSize();
+        }
+
+        memset(bias->buffer(), 0, bias->byteSize());
+
+        return bias;
+    };
+    auto intBiases = biasMaker();
+
+    float input_scale_factor = 1.f;
+    if (InferenceEngine::CNNNetHasPrevLayer(wl)) {
+        auto quantDataForInputLayer =
+            InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());
+        input_scale_factor = quantDataForInputLayer->_dst_quant.scale;
+        if (std::isnan(input_scale_factor) ||
+            std::isinf(input_scale_factor)) {
+            THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor;
+        }
+    }
+    if (wl->outData[0]->getDims().size() < 2) {
+        THROW_IE_EXCEPTION << "Unsupported output dims size for " << wl->name <<", should be > 1, but " << wl->outData[0]->getDims().size();
+    }
+    if (wl->insData[0].lock().get()->getDims().size() < 2) {
+        THROW_IE_EXCEPTION << "Unsupported input dims size for " << wl->name << ", should be > 1, but " << wl->insData[0].lock().get()->getDims().size();
+    }
+    uint32_t num_rows = isDiagonal ? 1 : wl->outData[0]->getDims()[1];
+    uint32_t num_columns = wl->insData[0].lock().get()->getDims()[1];
+
+    if (isDiagonal) {
+        std::swap(num_rows, num_columns);
+    }
+
+    uint32_t num_rows_padded = num_rows;
+    uint32_t num_columns_padded = num_columns;
+
+    // TODO: replace this with a fixed-scale quantizer
+
+    auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
+    {
+        fnc(wl->_weights->buffer().as<float *>(),
+            wl->_biases ? wl->_biases->buffer().as<float *>() : nullptr,
+            intWeights->buffer(),
+            intBiases ? intBiases->buffer() : static_cast<BiasesPrecision *>(nullptr),
+            input_scale_factor,
+            &quantData->_weights_quant.scale,
+            &quantData->_dst_quant.scale,
+            num_rows,
+            num_columns,
+            num_rows_padded,
+            num_columns_padded);
+    }
+    wl->_weights = intWeights;
+    wl->_biases = intBiases;
+
+    /**
+     * correcting precision for outdata
+     */
+    wl->precision = quantDesc.getWeightsPrecision();
+    for (auto &&outData : wl->outData) {
+        outData->setPrecision(quantDesc.getOutputPrecision());
+    }
+}
+
+
+template<class QuantDesc, class QuantFunc>
+inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc,
+                                  InferenceEngine::WeightableLayer *conv,
+                                  const QuantFunc &fnc) {
+    // for quantized weights
+    auto intWeights = make_custom_blob<typename QuantDesc::WeightsPrecision>(InferenceEngine::C, InferenceEngine::SizeVector({conv->_weights->size()}));
+    intWeights->allocate();
+    if (intWeights->buffer() == nullptr) {
+        THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED
+                            << "cannot copy weights for layer :"<< conv->name << " of size" << intWeights->byteSize();
+    }
+
+
+    auto getBiasSizeForLayer = [](InferenceEngine::WeightableLayer *wl) {
+        if (wl->_biases) {
+            return wl->_biases->size();
+        }
+        // calculate the bias length from the output dims
+        auto & dims = wl->outData.front()->getDims();
+        return dims[1];
+    };
+
+    using BiasesPrecision = typename QuantDesc::BiasesPrecision;
+    auto biasMaker = [&] () {
+        InferenceEngine::Blob::Ptr zero;
+        if (!conv->_biases && !shouldAlwaysAllocate<BiasesPrecision>()) {
+            return zero;
+        }
+        auto bias = make_custom_blob<BiasesPrecision>(InferenceEngine::C, InferenceEngine::SizeVector({
+                                                                                                          getBiasSizeForLayer(conv)
+                                                                                                      }));
+        bias->allocate();
+        if (bias->buffer() == nullptr) {
+            THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED
+                                << "cannot copy bias for layer :"<< conv->name <<"of size" << bias->byteSize();
+        }
+        memset(bias->buffer(), 0, bias->byteSize());
+
+        return bias;
+    };
+    auto intBiases = biasMaker();
+
+    float input_scale_factor = 1.f;
+    if (InferenceEngine::CNNNetHasPrevLayer(conv)) {
+        auto quantDataForInputLayer =
+            InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(conv).get());
+        input_scale_factor = quantDataForInputLayer->_dst_quant.scale;
+        if (std::isnan(input_scale_factor) ||
+            std::isinf(input_scale_factor)) {
+            THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor;
+        }
+    }
+    if (conv->outData[0]->getDims().size() < 2) {
+        THROW_IE_EXCEPTION << "Unsupported output dims size for " << conv->name <<", should be > 1, but " << conv->outData[0]->getDims().size();
+    }
+    if (conv->insData[0].lock().get()->getDims().size() < 2) {
+        THROW_IE_EXCEPTION << "Unsupported input dims size for " << conv->name << ", should be > 1, but " << conv->insData[0].lock().get()->getDims().size();
+    }
+    auto inputData = conv->insData[0].lock();
+
+    uint32_t num_rows = getBiasSizeForLayer(conv);
+    uint32_t num_columns = conv->_weights->size() / num_rows;
+
+    uint32_t num_rows_padded = num_rows;
+    uint32_t num_columns_padded = num_columns;
+
+    // TODO: replace this with a fixed-scale quantizer later
+
+    auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*conv);
+    {
+        fnc(conv->_weights->buffer().as<float *>(),
+            conv->_biases ? conv->_biases->buffer().as<float *>() : nullptr,
+            intWeights->buffer(),
+            intBiases ? intBiases->buffer() : static_cast<BiasesPrecision *>(nullptr),
+            input_scale_factor,
+            &quantData->_weights_quant.scale,
+            &quantData->_dst_quant.scale,
+            num_rows,
+            num_columns,
+            num_rows_padded,
+            num_columns_padded);
+    }
+    conv->_weights = intWeights;
+    conv->_biases = intBiases;
+
+    /**
+     * correct the precision of the output data
+     */
+    conv->precision = quantDesc.getWeightsPrecision();
+    for (auto &&outData : conv->outData) {
+        outData->setPrecision(quantDesc.getOutputPrecision());
+    }
+}
+
+
+class DataQuantizerBase {
+ public:
+    explicit DataQuantizerBase(float scaleFactor) : scaleFactor(scaleFactor) {
+    }
+ protected:
+    float scaleFactor = 1.0;
+};
+/**
+ * Helper class that uses partial specialisation on the Layer type
+ * @tparam Desc
+ * @tparam Layer
+ */
+template<class Desc, class Layer>
+class DataQuantizer : public DataQuantizerBase {
+ public:
+    explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
+    bool operator()(Layer cnnLayer) const {
+        return false;
+    }
+};
+
+template<class Desc>
+class DataQuantizer<Desc, InferenceEngine::CNNLayer *> : public DataQuantizerBase {
+ public:
+    explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
+
+    bool operator()(InferenceEngine::CNNLayer *cnnLayer) const {
+        for (auto &&outData : cnnLayer->outData) {
+            outData->setPrecision(Desc::mandatory().getOutputPrecision());
+        }
+        // set scale factor for input layers
+        auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
+        if (cnnLayer->insData.empty()) {
+            for (auto &&outData : cnnLayer->outData) {
+                outData->setPrecision(Desc::mandatory().getInputPrecision());
+            }
+        } else {
+            if (LayerInfo(*cnnLayer).isActivation() ||
+                    LayerInfo(*cnnLayer).isCopy()) {
+                // the precision of activation and copy layers always equals the input precision
+                for (auto &&outData : cnnLayer->outData) {
+                    outData->setPrecision(Desc::mandatory().getInputPrecision());
+                }
+            }
+        }
+        cnnLayer->precision = Desc::mandatory().getInputPrecision();
+
+        return true;
+    }
+};
+
+
+template<class Desc>
+class DataQuantizer<Desc, InferenceEngine::SplitLayer *> : public DataQuantizer<Desc, InferenceEngine::CNNLayer *> {
+    using base = DataQuantizer<Desc, InferenceEngine::CNNLayer *>;
+ public:
+    explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {}
+    bool operator()(InferenceEngine::SplitLayer *splitLayer) const {
+        base::operator()(splitLayer);
+        // a split layer doesn't change its data at all
+        for (auto &&outData : splitLayer->outData) {
+            outData->setPrecision(Desc::mandatory().getInputPrecision());
+        }
+        return true;
+    }
+};
+
+template<class Desc>
+class DataQuantizer<Desc, InferenceEngine::ConcatLayer *> : public DataQuantizer<Desc, InferenceEngine::CNNLayer *> {
+    using base = DataQuantizer<Desc, InferenceEngine::CNNLayer *>;
+ public:
+    explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {}
+    bool operator()(InferenceEngine::ConcatLayer *concatLayer) const {
+        base::operator()(concatLayer);
+        for (auto &&outData : concatLayer->outData) {
+            outData->setPrecision(Desc::mandatory().getInputPrecision());
+        }
+        return true;
+    }
+};
+
+template<class Desc>
+class DataQuantizer<Desc, InferenceEngine::CropLayer *> : public DataQuantizer<Desc, InferenceEngine::CNNLayer *> {
+    using base = DataQuantizer<Desc, InferenceEngine::CNNLayer *>;
+ public:
+    explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {}
+    bool operator()(InferenceEngine::CropLayer *cropLayer) const {
+        base::operator()(cropLayer);
+        for (auto &&outData : cropLayer->outData) {
+            outData->setPrecision(Desc::mandatory().getInputPrecision());
+        }
+        return true;
+    }
+};
+
+template<class Desc>
+class DataQuantizer<Desc, InferenceEngine::ReshapeLayer *> : public DataQuantizer<Desc, InferenceEngine::CNNLayer *> {
+    using base = DataQuantizer<Desc, InferenceEngine::CNNLayer *>;
+ public:
+    explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {}
+    bool operator()(InferenceEngine::ReshapeLayer *reshapeLayer) const {
+        base::operator()(reshapeLayer);
+        // a reshape layer doesn't change its data at all
+        for (auto &&outData : reshapeLayer->outData) {
+            outData->setPrecision(Desc::mandatory().getInputPrecision());
+        }
+        return true;
+    }
+};
+
+template<class Desc>
+class DataQuantizer<Desc, InferenceEngine::WeightableLayer *> : public DataQuantizerBase {
+ public:
+    explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
+    bool operator()(InferenceEngine::WeightableLayer *wl) const {
+        quantizeWeightsBiases<typename Desc::MandatoryType>(Desc::mandatory(), wl, Quant<typename Desc::MandatoryType>());
+        return true;
+    }
+};
+
+template<class Desc>
+class DataQuantizer<Desc, InferenceEngine::ConvolutionLayer *> : public DataQuantizerBase {
+ public:
+    explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
+    bool operator()(InferenceEngine::WeightableLayer *wl) const {
+        quantizeWeightsBiasesConv<typename Desc::OptionalType>(Desc::optional(), wl, Quant<typename Desc::OptionalType>());
+        return true;
+    }
+};
+
+template<class Desc>
+class DataQuantizer<Desc, InferenceEngine::ScaleShiftLayer *> : public DataQuantizerBase {
+ public:
+    explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
+    bool operator()(InferenceEngine::ScaleShiftLayer *wl) const {
+        quantizeWeightsBiases<typename Desc::OptionalType>(Desc::optional(), wl, Quant<typename Desc::OptionalType>(), true);
+        return true;
+    }
+};
+
+}  // namespace details
+
+template<class Desc>
+class LayersQuantizer : public details::DataQuantizerBase {
+ public:
+    explicit LayersQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
+    template<class T>
+    bool operator()(T input) const {
+        return details::DataQuantizer<Desc, T>(scaleFactor)(input);
+    }
+};
+
+using QuantI16 = details::QuantPair<details::QuantI16, details::QuantI16>;
+using QuantI8 = details::QuantPair<details::QuantI8, details::QuantI16>;
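+
+/**
+ * Usage sketch (hypothetical, for illustration only): LayersQuantizer dispatches on the static
+ * layer type, so applying it through transformLayer selects the matching DataQuantizer
+ * specialisation, e.g.:
+ * @code
+ *   LayersQuantizer<QuantI16> lq(1.0f);
+ *   InferenceEngine::WeightableLayer *fc = ...;  // assumed to come from a parsed network
+ *   lq(fc);  // resolves to DataQuantizer<QuantI16, InferenceEngine::WeightableLayer *>
+ * @endcode
+ */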
+
+}  // namespace GNAPluginNS
diff --git a/inference-engine/src/gna_plugin/quantization/model_quantizer.hpp b/inference-engine/src/gna_plugin/quantization/model_quantizer.hpp
new file mode 100644 (file)
index 0000000..797c87c
--- /dev/null
@@ -0,0 +1,78 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vector>
+#include "gna_plugin_config.hpp"
+#include "layer_transform.hpp"
+#include "graph_tools.hpp"
+#include "details/ie_cnn_network_tools.h"
+#include "layer_quantizer.hpp"
+#include "scale_factor_calc.hpp"
+
+namespace GNAPluginNS {
+/**
+ * Quantizes an entire CNN network
+ * @tparam T - type trait describing the weights and biases precisions
+ */
+template<class T>
+class ModelQuantizer {
+ public:
+    CNNNetworkPtr quantize(InferenceEngine::ICNNNetwork &model, float scaleFactor) const {
+        return quantize(model, [](InferenceEngine::CNNNetPtr &){}, scaleFactor);
+    }
+
+    template <class PreQuantisationCb>
+    CNNNetworkPtr quantize(InferenceEngine::ICNNNetwork &model, const PreQuantisationCb &cb, float scaleFactor) const {
+        auto visitor = [&](InferenceEngine::CNNLayerPtr lp) {
+            return InferenceEngine::injectData<QuantizedLayerParams>(lp);
+        };
+        auto copiedNet = InferenceEngine::CNNNetCopy(model, visitor);
+
+        // TODO: dynamic_cast is probably not the best way to transform the precision here;
+        // one solution is to create new CNNNetCopy overloads that accept two functors: one for layer copy
+        // and another one for net copy
+        auto rawNet = dynamic_cast<InferenceEngine::details::CNNNetworkImpl *>(copiedNet.get());
+        rawNet->setPrecision(T::mandatory().getNetPrecision());
+
+        // let client code access the copied topology, to avoid extra copies if the user wants to chain
+        // quantisation with other preprocessing
+        cb(copiedNet);
+
+        LayersQuantizer<T> lc(scaleFactor);
+        auto sortedNewNet = InferenceEngine::details::CNNNetSortTopologically(*copiedNet.get());
+        gnalog() << "Sorted layers: " << std::endl;
+        for (auto &&layer : sortedNewNet) {
+            gnalog() << layer->name << std::endl;
+        }
+
+        // the weights scale is a hint; not all weightable layers preserve it in all possible precisions
+        propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size(), scaleFactor);
+
+        // the sorted order makes it possible to propagate quantisation along dependent layers
+        for (auto &&layer : sortedNewNet) {
+            transformLayer(layer, lc);
+        }
+
+        return copiedNet;
+    }
+
+ private:
+    void propagateScaleFactor(std::vector<InferenceEngine::CNNLayerPtr> & net, int weightsBytesSize, float scaleFactor) const {
+        ScaleFactorCalculator sf(net, weightsBytesSize, scaleFactor);
+
+        while (!sf.allLayersProcessed()) {
+            for (auto &&layer : sf.getStartLayers()) {
+                transformLayer(layer, sf);
+                // keep transforming until we hit a case where the output scale was updated because of a downstream layer
+                if (sf.needToRestart()) {
+                    break;
+                }
+            }
+        }
+    }
+};
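+
+/**
+ * Usage sketch (hypothetical, for illustration only; the network source and the scale factor
+ * value are assumptions):
+ * @code
+ *   InferenceEngine::ICNNNetwork &network = ...;  // e.g. obtained from a CNNNetReader
+ *   ModelQuantizer<QuantI16> q;
+ *   auto quantised = q.quantize(network, 1024.0f);  // 1024.0f is an example input scale factor
+ * @endcode
+ */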
+}  // namespace GNAPluginNS
diff --git a/inference-engine/src/gna_plugin/quantization/precision_ex.hpp b/inference-engine/src/gna_plugin/quantization/precision_ex.hpp
new file mode 100644 (file)
index 0000000..798345e
--- /dev/null
@@ -0,0 +1,95 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_precision.hpp"
+
+namespace InferenceEngine {
+
+/**
+ * @brief reverse trait for deducing a precision from its underlying memory type;
+ * this might not work for certain precisions, e.g. Q78 and U16
+ * @tparam T
+ */
+template<class T>
+struct precision_from_media {
+    static const Precision::ePrecision type = Precision::CUSTOM;
+};
+
+template<>
+struct precision_from_media<float> {
+    static const Precision::ePrecision type = Precision::FP32;
+};
+
+template<>
+struct precision_from_media<uint16_t> {
+    static const Precision::ePrecision type = Precision::FP16;
+};
+
+template<>
+struct precision_from_media<int16_t> {
+    static const Precision::ePrecision type = Precision::I16;
+};
+
+template<>
+struct precision_from_media<uint8_t> {
+    static const Precision::ePrecision type = Precision::U8;
+};
+
+template<>
+struct precision_from_media<int8_t> {
+    static const Precision::ePrecision type = Precision::I8;
+};
+
+template<>
+struct precision_from_media<int32_t> {
+    static const Precision::ePrecision type = Precision::I32;
+};
+
+/**
+ * @brief container storing both a precision and its underlying media type
+ * @tparam TMedia
+ */
+template <class TMedia>
+class TPrecision : public Precision {
+ public:
+    typedef TMedia MediaType;
+    TPrecision() : Precision(precision_from_media<TMedia>::type) {}
+    explicit TPrecision(const Precision & that) : Precision(that) {}
+    TPrecision & operator = (const Precision & that) {
+        Precision::operator=(that);
+        return *this;
+    }
+    explicit TPrecision(const Precision::ePrecision  value) : Precision(value) {}
+};
+
+template <class T> TPrecision<T> createTPrecision() {
+    TPrecision<T> cnt(InferenceEngine::Precision::fromType<T>());
+    return cnt;
+}
+
+template <InferenceEngine::Precision::ePrecision T>
+TPrecision<typename InferenceEngine::PrecisionTrait<T>::value_type> createTPrecision() {
+    TPrecision<typename InferenceEngine::PrecisionTrait<T>::value_type> cnt(T);
+    return cnt;
+}
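+
+// Usage sketch (hypothetical, for illustration only): both factory overloads produce a
+// TPrecision carrying the same precision / media-type pair, e.g.
+//   auto p1 = createTPrecision<int16_t>();                          // TPrecision<int16_t> holding Precision::I16
+//   auto p2 = createTPrecision<InferenceEngine::Precision::I16>();  // same result, selected by the enum value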
+
+
+// special case for mixed or undefined precisions
+template <>
+class TPrecision<void> : public Precision {
+ public:
+    typedef void MediaType;
+    TPrecision() = default;
+    explicit TPrecision(const Precision & that) : Precision(that) {}
+    TPrecision & operator = (const Precision & that) {
+        Precision::operator=(that);
+        return *this;
+    }
+    explicit TPrecision(const Precision::ePrecision  value) : Precision(value) {}
+};
+
+
+}  // namespace InferenceEngine
\ No newline at end of file
diff --git a/inference-engine/src/gna_plugin/quantization/quantization.cpp b/inference-engine/src/gna_plugin/quantization/quantization.cpp
new file mode 100644 (file)
index 0000000..457bff9
--- /dev/null
@@ -0,0 +1,699 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <cstring>
+#include <iostream>
+#include "quantization.h"
+
+void QuantizeAffine16(float *ptr_float_weights,
+                      float *ptr_float_biases,
+                      int16_t *ptr_int_weights,
+                      int32_t *ptr_int_biases,
+                      float input_scale_factor,
+                      float *ptr_weight_scale_factor,
+                      float *ptr_output_scale_factor,
+                      uint32_t num_rows,
+                      uint32_t num_columns,
+                      uint32_t num_rows_padded,
+                      uint32_t num_columns_padded) {
+    uint32_t num_saturate = 0;
+
+    if (*ptr_weight_scale_factor == 1.0) {
+        // scale factor for weights is not calculated yet
+        float mean_weight = 0.0;
+        float mean_weight_squared = 0.0;
+        float max_weight = -1e20f;
+        float var_weight;
+        float mean_plus_2stdev;
+
+        for (uint32_t i = 0; i < num_rows; i++) {
+            for (uint32_t j = 0; j < num_columns; j++) {
+                float weight = ptr_float_weights[i * num_columns + j];
+                mean_weight += weight;
+                mean_weight_squared += weight * weight;
+                if (fabs(weight) > max_weight) {
+                    max_weight = fabs(weight);
+                }
+            }
+        }
+
+        mean_weight /= static_cast<float>(num_rows * num_columns);
+        mean_weight_squared /= static_cast<float>(num_rows * num_columns);
+        var_weight = mean_weight_squared - mean_weight * mean_weight;
+        mean_plus_2stdev = mean_weight + 2.0f * static_cast<float>(sqrtf(var_weight));
+
+        *ptr_weight_scale_factor = static_cast<float>(MAX_VAL_2B_WEIGHT) / max_weight;
+        *ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor;
+    }
+
+    for (uint32_t row = 0; row < num_rows; row++) {
+        for (uint32_t col = 0; col < num_columns; col++) {
+            float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_weights[row * num_columns + col] * *ptr_weight_scale_factor + rounding_value;
+            int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            if (value > 32767.0) {
+                *ptr_weight_16 = 32767;
+                num_saturate++;
+            } else if (value < -32768.0) {
+                *ptr_weight_16 = -32768;
+                num_saturate++;
+            } else {
+                *ptr_weight_16 = (int16_t) value;
+            }
+        }
+        for (uint32_t col = num_columns; col < num_columns_padded; col++) {
+            int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            *ptr_weight_16 = 0;
+        }
+    }
+    for (uint32_t row = num_rows; row < num_rows_padded; row++) {
+        for (uint32_t col = 0; col < num_columns_padded; col++) {
+            int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            *ptr_weight_16 = 0;
+        }
+    }
+
+    // biases are optional (e.g. for an element-wise layer), so quantize them only when both buffers are provided
+    if (ptr_float_biases != nullptr && ptr_int_biases != nullptr) {
+        for (uint32_t j = 0; j < num_rows; j++) {
+            float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
+            if (value > 2147483647.0) {
+                ptr_int_biases[j] = 2147483647L;
+                num_saturate++;
+            } else if (value < -2147483648.0) {
+                ptr_int_biases[j] = -2147483648LL;
+                num_saturate++;
+            } else {
+                ptr_int_biases[j] = (int32_t) value;
+            }
+        }
+        for (uint32_t j = num_rows; j < num_rows_padded; j++) {
+            ptr_int_biases[j] = 0;
+        }
+    }
+
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning:  %d / %d saturations in QuantizeAffine16()\n",
+                     num_saturate,
+                     num_rows * num_columns + num_rows);
+    }
+}
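+
+// Worked example (illustrative values, not from the original sources): with a weight scale
+// factor of 1000.0f, a weight of 0.0305f maps to 0.0305 * 1000 + 0.5 = 31.0 and is stored as
+// int16 31, while a weight of 40.0f maps to 40000.5, which saturates to 32767 and increments
+// num_saturate.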
+
+void FixedQuantizeAffine16(float *ptr_float_weights,
+                           float *ptr_float_biases,
+                           int16_t *ptr_int_weights,
+                           int32_t *ptr_int_biases,
+                           float input_scale_factor,
+                           float weight_scale_factor,
+                           float *ptr_output_scale_factor,
+                           uint32_t num_rows,
+                           uint32_t num_columns,
+                           uint32_t num_rows_padded,
+                           uint32_t num_columns_padded) {
+    uint32_t num_saturate = 0;
+
+    for (uint32_t row = 0; row < num_rows; row++) {
+        for (uint32_t col = 0; col < num_columns; col++) {
+            float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value;
+            int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            if (value > 32767.0) {
+                *ptr_weight_16 = 32767;
+                num_saturate++;
+            } else if (value < -32768.0) {
+                *ptr_weight_16 = -32768;
+                num_saturate++;
+            } else {
+                *ptr_weight_16 = (int16_t) value;
+            }
+        }
+    }
+    for (uint32_t row = num_rows; row < num_rows_padded; row++) {
+        for (uint32_t col = 0; col < num_columns_padded; col++) {
+            int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            *ptr_weight_16 = 0;
+        }
+    }
+
+    *ptr_output_scale_factor = input_scale_factor * weight_scale_factor;
+
+    for (uint32_t j = 0; j < num_rows; j++) {
+        float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+        float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
+        if (value > 2147483647.0) {
+            ptr_int_biases[j] = 2147483647L;
+            num_saturate++;
+        } else if (value < -2147483648.0) {
+            ptr_int_biases[j] = -2147483648LL;
+            num_saturate++;
+        } else {
+            ptr_int_biases[j] = (int32_t) value;
+        }
+    }
+    for (uint32_t j = num_rows; j < num_rows_padded; j++) {
+        ptr_int_biases[j] = 0;
+    }
+
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning:  %d / %d saturations in FixedQuantizeAffine16()\n",
+                     num_saturate,
+                     num_rows * num_columns + num_rows);
+    }
+}
+
+float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements) {
+    float *ptr_float_feat = reinterpret_cast<float *>(ptr_float_memory);
+    float max = 0.0;
+    float scale_factor;
+
+    for (size_t i = 0; i < num_elements; i++) {
+        if (fabs(ptr_float_feat[i]) > max) {
+            max = fabs(ptr_float_feat[i]);
+        }
+    }
+
+    if (max == 0) {
+        scale_factor = 1.0;
+    } else {
+        scale_factor = target_max / max;
+    }
+
+    return (scale_factor);
+}
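+
+// Worked example (illustrative values): if the largest absolute value in the buffer is 0.25f
+// and target_max is MAX_VAL_2B_WEIGHT (16384), the returned scale factor is 16384 / 0.25 = 65536,
+// so the largest element quantizes to exactly 16384.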
+
+float ScaleFactorForQuantization(std::vector<std::vector<float>> &input_vectors, float target_max) {
+    float max = 0.0;
+    float scale_factor;
+    uint32_t num_vectors = (uint32_t) input_vectors.size();
+
+    for (uint32_t i = 0; i < num_vectors; i++) {
+        float *ptr_float_feat = input_vectors[i].data();
+        uint32_t num_elements = (uint32_t) input_vectors[i].size();
+        for (uint32_t j = 0; j < num_elements; j++) {
+            if (fabs(ptr_float_feat[j]) > max) {
+                max = fabs(ptr_float_feat[j]);
+            }
+        }
+    }
+
+    if (max == 0) {
+        scale_factor = 1.0;
+    } else {
+        scale_factor = target_max / max;
+    }
+
+    return (scale_factor);
+}
+
+float ScaleFactorForQuantization(std::vector<std::vector<float>> &input_vectors,
+                                 int index,
+                                 int num_group_size,
+                                 float target_max) {
+    float max = 0.0;
+    float scale_factor;
+    uint32_t start_index = (uint32_t) index;
+    uint32_t end_index =
+        (uint32_t) ((index + num_group_size > input_vectors.size()) ? input_vectors.size() - 1 : start_index
+            + num_group_size);
+
+    for (uint32_t i = start_index; i < end_index; i++) {
+        float *ptr_float_feat = input_vectors[i].data();
+        uint32_t num_elements = (uint32_t) input_vectors[i].size();
+        for (uint32_t j = 0; j < num_elements; j++) {
+            if (fabs(ptr_float_feat[j]) > max) {
+                max = fabs(ptr_float_feat[j]);
+            }
+        }
+    }
+
+    if (max == 0) {
+        scale_factor = 1.0;
+    } else {
+        scale_factor = target_max / max;
+    }
+
+    return (scale_factor);
+}
+
+void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor) {
+    float *ptr_float_feat = reinterpret_cast<float *>(ptr_float_memory);
+    uint32_t num_saturate = 0;
+
+    int16_t *ptr_int_feat = reinterpret_cast<int16_t *>(ptr_int_memory);
+    for (uint32_t i = 0; i < num_elements; i++) {
+        float rounding_value = (ptr_float_feat[i] > 0) ? 0.5f : -0.5f;
+        float value = ptr_float_feat[i] * scale_factor + rounding_value;
+        if (value > 32767.0) {
+            ptr_int_feat[i] = 32767;
+            num_saturate++;
+        } else if (value < -32768.0) {
+            ptr_int_feat[i] = -32768;
+            num_saturate++;
+        } else {
+            ptr_int_feat[i] = (int16_t) value;
+        }
+    }
+
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning:  %d / %d saturations during QuantizeVector16()\n", num_saturate, num_elements);
+    }
+}
+
+void QuantizeVector16(std::vector<std::vector<float>> &input_vectors,
+                      int16_t *ptr_int_memory,
+                      uint32_t index,
+                      uint32_t num_group_size,
+                      float scale_factor) {
+    int16_t *ptr_int_feat = reinterpret_cast<int16_t *> (ptr_int_memory);
+    uint32_t num_saturate = 0;
+    uint32_t num_elements = (uint32_t) input_vectors[0].size();  // assume all vectors are the same size
+    uint32_t start_index = (uint32_t) index;
+    uint32_t end_index =
+        (uint32_t) ((index + num_group_size > input_vectors.size()) ? input_vectors.size() - 1 : start_index
+            + num_group_size);
+
+    if (end_index - start_index < num_group_size) {
+        memset(ptr_int_feat, 0, num_elements * num_group_size * sizeof(int16_t));  // zero-pad a partial group
+    }
+    for (uint32_t j = start_index; j < end_index; j++) {
+        for (uint32_t i = 0; i < num_elements; i++) {
+            float *ptr_float_feat = input_vectors[j].data();
+            float rounding_value = (ptr_float_feat[i] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_feat[i] * scale_factor + rounding_value;
+            if (value > 32767.0) {
+                ptr_int_feat[i * num_group_size + j - start_index] = 32767;
+                num_saturate++;
+            } else if (value < -32768.0) {
+                ptr_int_feat[i * num_group_size + j - start_index] = -32768;
+                num_saturate++;
+            } else {
+                ptr_int_feat[i * num_group_size + j - start_index] = (int16_t) value;
+            }
+        }
+    }
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning:  %d / %d saturations during QuantizeVector16()\n",
+                     num_saturate,
+                     num_elements * num_group_size);
+    }
+}
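+
+// Layout note (illustrative example): the grouped variant stores element i of group member j at
+// ptr_int_feat[i * num_group_size + j - start_index], i.e. values are interleaved by group. For
+// num_group_size == 4 and input vectors A, B, C, D the output order is
+// A0 B0 C0 D0 A1 B1 C1 D1 ...; a partial final group is zero-padded by the memset above.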
+
+void ReQuantizeVector16(int16_t *ptr_int_memory, uint32_t num_elements, float prev_scale_factor, float scale_factor) {
+    uint32_t num_saturate = 0;
+
+    int16_t *ptr_int_feat = reinterpret_cast<int16_t *> (ptr_int_memory);
+    for (uint32_t i = 0; i < num_elements; i++) {
+        float float_value = ptr_int_feat[i] / prev_scale_factor;
+        float rounding_value = (float_value > 0) ? 0.5f : -0.5f;
+        float value = float_value * scale_factor + rounding_value;
+        if (value > 32767.0) {
+            ptr_int_feat[i] = 32767;
+            num_saturate++;
+        } else if (value < -32768.0) {
+            ptr_int_feat[i] = -32768;
+            num_saturate++;
+        } else {
+            ptr_int_feat[i] = (int16_t) value;
+        }
+    }
+
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning:  %d / %d saturations during ReQuantizeVector16()\n", num_saturate, num_elements);
+    }
+}
+
+void QuantizeBias16(float *ptr_float_biases,
+                    int32_t *ptr_int_biases,
+                    float input_scale_factor,
+                    float weight_scale_factor,
+                    float *ptr_output_scale_factor,
+                    uint32_t num_rows) {
+    uint32_t num_saturate = 0;
+
+    *ptr_output_scale_factor = input_scale_factor * weight_scale_factor;
+    for (uint32_t j = 0; j < num_rows; j++) {
+        float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+        float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
+        if (value > 2147483647.0) {
+            ptr_int_biases[j] = 2147483647L;
+            num_saturate++;
+        } else if (value < -2147483648.0) {
+            ptr_int_biases[j] = -2147483648LL;
+            num_saturate++;
+        } else {
+            ptr_int_biases[j] = (int32_t) value;
+        }
+    }
+
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning:  %d / %d saturations in QuantizeBias16()\n", num_saturate, num_rows);
+    }
+}
+
+void DeQuantizeVector16(int16_t *ptr_int_memory, std::vector<float> &float_vector, float scale_factor) {
+    int16_t *int16_vector = reinterpret_cast<int16_t *> (ptr_int_memory);
+    for (uint32_t i = 0; i < float_vector.size(); i++) {
+        float_vector[i] = int16_vector[i] / scale_factor;
+    }
+}
+
+void DeQuantizeVector32(int32_t *ptr_int_memory, std::vector<float> &float_vector, float scale_factor) {
+    int32_t *int32_vector = reinterpret_cast<int32_t  *> (ptr_int_memory);
+    for (uint32_t i = 0; i < float_vector.size(); i++) {
+        float_vector[i] = int32_vector[i] / scale_factor;
+    }
+}
+
+void DeQuantizeVector32(int32_t *ptr_int_memory,
+                        std::vector<float> &float_vector,
+                        uint32_t index,
+                        uint32_t num_group_size,
+                        float scale_factor) {
+    int32_t *int32_vector = reinterpret_cast<int32_t  *> (ptr_int_memory);
+    for (uint32_t i = 0; i < float_vector.size(); i++) {
+        float_vector[i] = int32_vector[i * num_group_size + index] / scale_factor;
+    }
+}
+bool IntegrityCheckAffine16(float *ptr_float_weights,
+                            float *ptr_float_biases,
+                            int16_t *ptr_int_weights,
+                            int32_t *ptr_int_biases,
+                            float weight_scale_factor,
+                            float output_scale_factor,
+                            uint32_t num_rows,
+                            uint32_t num_columns,
+                            uint32_t num_rows_padded,
+                            uint32_t num_columns_padded) {
+    bool model_ok = true;
+
+    for (uint32_t row = 0; row < num_rows; row++) {
+        for (uint32_t col = 0; col < num_columns; col++) {
+            float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value;
+            int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            int16_t int_value;
+            if (value > 32767.0) {
+                int_value = 32767;
+            } else if (value < -32768.0) {
+                int_value = -32768;
+            } else {
+                int_value = (int16_t) value;
+            }
+            if (int_value != *ptr_weight_16) {
+                model_ok = false;
+            }
+        }
+        for (uint32_t col = num_columns; col < num_columns_padded; col++) {
+            int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            if (*ptr_weight_16 != 0) {
+                model_ok = false;
+            }
+        }
+    }
+    for (uint32_t row = num_rows; row < num_rows_padded; row++) {
+        for (uint32_t col = 0; col < num_columns_padded; col++) {
+            int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            if (*ptr_weight_16 != 0) {
+                model_ok = false;
+            }
+        }
+    }
+
+    for (uint32_t j = 0; j < num_rows; j++) {
+        float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+        float value = ptr_float_biases[j] * output_scale_factor + rounding_value;
+        int32_t int_value;
+        if (value > 2147483647.0) {
+            int_value = 2147483647L;
+        } else if (value < -2147483648.0) {
+            int_value = -2147483648LL;
+        } else {
+            int_value = (int32_t) value;
+        }
+        if (int_value != ptr_int_biases[j]) {
+            model_ok = false;
+        }
+    }
+    for (uint32_t j = num_rows; j < num_rows_padded; j++) {
+        if (ptr_int_biases[j] != 0) {
+            model_ok = false;
+        }
+    }
+
+    return (model_ok);
+}
+
+bool IntegrityCheckAffineWeights16(float *ptr_float_weights,
+                                   int16_t *ptr_int_weights,
+                                   float weight_scale_factor,
+                                   uint32_t num_rows,
+                                   uint32_t num_columns,
+                                   uint32_t num_rows_padded,
+                                   uint32_t num_columns_padded) {
+    bool model_ok = true;
+
+    for (uint32_t row = 0; row < num_rows; row++) {
+        for (uint32_t col = 0; col < num_columns; col++) {
+            float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value;
+            int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            int16_t int_value;
+            if (value > 32767.0) {
+                int_value = 32767;
+            } else if (value < -32768.0) {
+                int_value = -32768;
+            } else {
+                int_value = (int16_t) value;
+            }
+            if (int_value != *ptr_weight_16) {
+                model_ok = false;
+            }
+        }
+        for (uint32_t col = num_columns; col < num_columns_padded; col++) {
+            int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            if (*ptr_weight_16 != 0) {
+                model_ok = false;
+            }
+        }
+    }
+    for (uint32_t row = num_rows; row < num_rows_padded; row++) {
+        for (uint32_t col = 0; col < num_columns_padded; col++) {
+            int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            if (*ptr_weight_16 != 0) {
+                model_ok = false;
+            }
+        }
+    }
+
+    return (model_ok);
+}
+
+
+void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases,
+                     int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases,
+                     float input_scale_factor, float *ptr_weight_scale_factor,
+                     float *ptr_output_scale_factor, uint32_t num_rows, uint32_t num_columns,
+                     uint32_t num_rows_padded, uint32_t num_columns_padded) {
+    uint32_t num_saturate = 0;
+
+    if (*ptr_weight_scale_factor == 1.0) {
+        // scale factor for weights is not calculated yet
+        float mean_weight = 0.0;
+        float mean_weight_squared = 0.0;
+        float max_weight = -1e20f;
+        float var_weight;
+        float mean_plus_2stdev;
+
+        for (uint32_t i = 0; i < num_rows; i++) {
+            for (uint32_t j = 0; j < num_columns; j++) {
+                float weight = ptr_float_weights[i*num_columns + j];
+                mean_weight += weight;
+                mean_weight_squared += weight * weight;
+                if (fabs(weight) > max_weight) {
+                    max_weight = fabs(weight);
+                }
+            }
+        }
+
+        mean_weight /= static_cast<float>(num_rows * num_columns);
+        mean_weight_squared /= static_cast<float>(num_rows * num_columns);
+        var_weight = mean_weight_squared - mean_weight * mean_weight;
+        mean_plus_2stdev = mean_weight + 2.0f * static_cast<float>(sqrtf(var_weight));
+
+        *ptr_weight_scale_factor = static_cast<float>(MAX_VAL_1B_WEIGHT) / max_weight;
+
+        // For 8-bit weights, quantize as follows (a worked example is sketched after this function):
+        // 1. adjust the scale factor to increase the dynamic range of the entire matrix by the max multiplier
+        // 2. find the maximum scaled weight for each row
+        // 3. find a multiplier such that dividing by it brings the row back within the 8-bit dynamic range
+        // 4. quantize and store the scaled row
+        *ptr_weight_scale_factor = MAX_OUT_MULTIPLIER * *ptr_weight_scale_factor;  // increase dynamic range by max multiplier
+        *ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor;
+    }
+    float valueAcc = 0.0;
+    for (uint32_t row = 0; row < num_rows; row++) {
+        float scaled_row_max = 0;
+        float rounding_value, value;
+        for (uint32_t col = 0; col < num_columns; col++) {
+            value = ptr_float_weights[row*num_columns + col] * *ptr_weight_scale_factor;
+            valueAcc += value;
+            if (fabs(value) > scaled_row_max) {
+                scaled_row_max = fabs(value);
+            }
+        }
+
+        value = scaled_row_max / static_cast<float>(MAX_VAL_1B_WEIGHT);
+        ptr_int_biases[row].multiplier = (uint8_t) (value + 0.5);
+        for (uint32_t col = 0; col < num_columns; col++) {
+            int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
+            rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
+
+
+            value = ptr_float_weights[row*num_columns + col] * (*ptr_weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value;
+            if (value > 127.0) {
+                *ptr_weight_8 = 127;
+                num_saturate++;
+            } else if (value < -128.0) {
+                *ptr_weight_8 = -128;
+                num_saturate++;
+            } else {
+                *ptr_weight_8 = (int8_t)value;
+            }
+        }
+        for (uint32_t col = num_columns; col < num_columns_padded; col++) {
+            int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
+            *ptr_weight_8 = 0;
+        }
+    }
+    for (uint32_t row = num_rows; row < num_rows_padded; row++) {
+        for (uint32_t col = 0; col < num_columns_padded; col++) {
+            int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
+            *ptr_weight_8 = 0;
+        }
+        ptr_int_biases[row].multiplier = 0;
+    }
+
+    // the bias value will only be used when an input bias is provided
+    if (ptr_float_biases != nullptr) {
+        for (uint32_t j = 0; j < num_rows; j++) {
+            float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
+            if (value > 2147483647.0) {
+                ptr_int_biases[j].bias = 2147483647L;
+                num_saturate++;
+            } else if (value < -2147483648.0) {
+                ptr_int_biases[j].bias = -2147483648LL;
+                num_saturate++;
+            } else {
+                ptr_int_biases[j].bias = (int32_t) value;
+            }
+        }
+    }
+
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning:  %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows);
+    }
+}
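+
+// Worked example (illustrative values): if the adjusted weight scale factor is 25400.0f and the
+// largest scaled weight in a row is 19050.0f, the row multiplier becomes
+// (uint8_t)(19050 / 127 + 0.5) == 150, and the row is then quantized with the effective scale
+// 25400 / 150, so its largest weight lands at roughly 127, the top of the int8 range.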
+
+
+void QuantizeBias8(float *ptr_float_biases,
+                   intel_compound_bias_t  *ptr_int_biases,
+                   float input_scale_factor,
+                   float weight_scale_factor,
+                   float *ptr_output_scale_factor, uint32_t num_rows) {
+    uint32_t num_saturate = 0;
+
+    *ptr_output_scale_factor = input_scale_factor * weight_scale_factor;
+    for (uint32_t j = 0; j < num_rows; j++) {
+        float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+        float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
+        if (value > 2147483647.0) {
+            ptr_int_biases[j].bias = 2147483647L;
+            num_saturate++;
+        } else if (value < -2147483648.0) {
+            ptr_int_biases[j].bias = -2147483648LL;
+            num_saturate++;
+        } else {
+            ptr_int_biases[j].bias = (int32_t)value;
+        }
+    }
+
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning:  %d / %d saturations in QuantizeBias8()\n", num_saturate, num_rows);
+    }
+}
+
+bool IntegrityCheckAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases,
+                           float weight_scale_factor, float output_scale_factor, uint32_t num_rows, uint32_t num_columns,
+                           uint32_t num_rows_padded, uint32_t num_columns_padded) {
+    bool model_ok = true;
+
+    for (uint32_t row = 0; row < num_rows; row++) {
+        float scaled_row_max = 0;
+        float rounding_value, value;
+        for (uint32_t col = 0; col < num_columns; col++) {
+            value = ptr_float_weights[row*num_columns + col] * weight_scale_factor;
+            if (fabs(value) > scaled_row_max) {
+                scaled_row_max = fabs(value);
+            }
+        }
+        value = scaled_row_max / static_cast<float>(MAX_VAL_1B_WEIGHT);
+        if (ptr_int_biases[row].multiplier != (uint8_t)(value + 0.5)) {
+            model_ok = false;
+        }
+        for (uint32_t col = 0; col < num_columns; col++) {
+            int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
+            int8_t int_value;
+            rounding_value = (ptr_float_weights[row*num_columns + col] > 0) ? 0.5f : -0.5f;
+            value = ptr_float_weights[row*num_columns + col] * (weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value;
+            if (value > 127.0) {
+                int_value = 127;
+            } else if (value < -128.0) {
+                int_value = -128;
+            } else {
+                int_value = (int8_t)value;
+            }
+            if (int_value != *ptr_weight_8) {
+                model_ok = false;
+            }
+        }
+        for (uint32_t col = num_columns; col < num_columns_padded; col++) {
+            int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
+            if (*ptr_weight_8 != 0) {
+                model_ok = false;
+            }
+        }
+    }
+    for (uint32_t row = num_rows; row < num_rows_padded; row++) {
+        for (uint32_t col = 0; col < num_columns_padded; col++) {
+            int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
+            if (*ptr_weight_8 != 0) {
+                model_ok = false;
+            }
+        }
+        if (ptr_int_biases[row].multiplier != 0) {
+            model_ok = false;
+        }
+    }
+
+    for (uint32_t j = 0; j < num_rows; j++) {
+        float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+        float value = ptr_float_biases[j] * output_scale_factor + rounding_value;
+        int32_t int_value;
+        if (value > 2147483647.0) {
+            int_value = 2147483647L;
+        } else if (value < -2147483648.0) {
+            int_value = -2147483648LL;
+        } else {
+            int_value = (int32_t)value;
+        }
+        if (int_value != ptr_int_biases[j].bias) {
+            model_ok = false;
+        }
+    }
+
+    return(model_ok);
+}
+
diff --git a/inference-engine/src/gna_plugin/quantization/quantization.h b/inference-engine/src/gna_plugin/quantization/quantization.h
new file mode 100644 (file)
index 0000000..bd1ff7b
--- /dev/null
@@ -0,0 +1,100 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+#include <cstdint>
+
+#define MAX_OUT_MULTIPLIER 230
+#define MAX_VAL_1B_WEIGHT 127
+#define MAX_VAL_2B_WEIGHT 16384
+#define MAX_VAL_2B_FEAT 16384
+#ifdef DEBUG
+#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
+#else
+#define QUANTWARNING(...)
+#endif
+
+void QuantizeAffine16(float *ptr_float_weights,
+                      float *ptr_float_biases,
+                      int16_t *ptr_int_weights,
+                      int32_t *ptr_int_biases,
+                      float input_scale_factor,
+                      float *ptr_weight_scale_factor,
+                      float *ptr_output_scale_factor,
+                      uint32_t num_rows,
+                      uint32_t num_columns,
+                      uint32_t num_rows_padded,
+                      uint32_t num_columns_padded);
+void FixedQuantizeAffine16(float *ptr_float_weights,
+                           float *ptr_float_biases,
+                           int16_t *ptr_int_weights,
+                           int32_t *ptr_int_biases,
+                           float input_scale_factor,
+                           float weight_scale_factor,
+                           float *ptr_output_scale_factor,
+                           uint32_t num_rows,
+                           uint32_t num_columns,
+                           uint32_t num_rows_padded,
+                           uint32_t num_columns_padded);
+float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements);
+float ScaleFactorForQuantization(std::vector<std::vector<float>> &input_vectors, float target_max);
+float ScaleFactorForQuantization(std::vector<std::vector<float>> &input_vectors,
+                                 int index,
+                                 int num_group_size,
+                                 float target_max);
+void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor);
+void QuantizeVector16(std::vector<std::vector<float>> &input_vectors,
+                      int16_t *ptr_int_memory,
+                      uint32_t index,
+                      uint32_t num_group_size,
+                      float scale_factor);
+void ReQuantizeVector16(int16_t *ptr_int_memory, uint32_t num_elements, float prev_scale_factor, float scale_factor);
+bool IntegrityCheckAffine16(float *ptr_float_weights,
+                            float *ptr_float_biases,
+                            int16_t *ptr_int_weights,
+                            int32_t *ptr_int_biases,
+                            float weight_scale_factor,
+                            float output_scale_factor,
+                            uint32_t num_rows,
+                            uint32_t num_columns,
+                            uint32_t num_rows_padded,
+                            uint32_t num_columns_padded);
+bool IntegrityCheckAffineWeights16(float *ptr_float_weights,
+                                   int16_t *ptr_int_weights,
+                                   float weight_scale_factor,
+                                   uint32_t num_rows,
+                                   uint32_t num_columns,
+                                   uint32_t num_rows_padded,
+                                   uint32_t num_columns_padded);
+void QuantizeBias16(float *ptr_float_biases,
+                    int32_t *ptr_int_biases,
+                    float input_scale_factor,
+                    float weight_scale_factor,
+                    float *ptr_output_scale_factor,
+                    uint32_t num_rows);
+void DeQuantizeVector16(int16_t *ptr_int_memory, std::vector<float> &float_vector, float scale_factor);
+void DeQuantizeVector32(int32_t *ptr_int_memory, std::vector<float> &float_vector, float scale_factor);
+void DeQuantizeVector32(int32_t *ptr_int_memory,
+                        std::vector<float> &float_vector,
+                        uint32_t index,
+                        uint32_t num_group_size,
+                        float scale_factor);
+
+#include "gna-api.h"
+
+void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases,
+                     float input_scale_factor, float *ptr_weight_scale_factor, float *ptr_output_scale_factor,
+                     uint32_t num_rows, uint32_t num_columns, uint32_t num_rows_padded, uint32_t num_columns_padded);
+void QuantizeBias8(float *ptr_float_biases, intel_compound_bias_t  *ptr_int_biases, float input_scale_factor,
+                   float weight_scale_factor, float *ptr_output_scale_factor, uint32_t num_rows);
+bool IntegrityCheckAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases,
+                           float weight_scale_factor, float output_scale_factor, uint32_t num_rows, uint32_t num_columns,
+                           uint32_t num_rows_padded, uint32_t num_columns_padded);
+
+
diff --git a/inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp b/inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp
new file mode 100644 (file)
index 0000000..347102b
--- /dev/null
@@ -0,0 +1,24 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+namespace GNAPluginNS {
+
+struct Quantization {
+    float scale = 1.0f;
+    float offset = 0.0f;
+    int shift = 0;
+};
+
+struct QuantizedLayerParams {
+    Quantization _src_quant;
+    Quantization _dst_quant;
+    Quantization _weights_quant;
+    Quantization _bias_quant;
+    float _o_shift = 0.0f;
+    float _b_shift = 0.0f;
+};
+
+}  // namespace GNAPluginNS
\ No newline at end of file
diff --git a/inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp b/inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp
new file mode 100644 (file)
index 0000000..a3ba22c
--- /dev/null
@@ -0,0 +1,339 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <vector>
+#include <algorithm>
+#include <utility>
+#include <limits>
+#include <string>
+#include "gna_layer_info.hpp"
+#include "ie_layers.h"
+#include "gna_plugin_log.hpp"
+
+namespace GNAPluginNS {
+namespace details {
+using namespace InferenceEngine;
+struct ScaleFactorUpdateResult {
+    CNNLayer *restartLayer = nullptr;
+    ScaleFactorUpdateResult() = default;
+    explicit ScaleFactorUpdateResult(CNNLayer * restartlayer) : restartLayer(restartlayer) {
+    }
+    operator bool() {
+        return restartLayer == nullptr;
+    }
+};
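+
+// Usage sketch (for illustration only): callers treat a default-constructed result as success
+// and a result holding a layer as a request to restart propagation from that layer:
+//   ScaleFactorUpdateResult result;
+//   ScaleFactorPerLayer<CNNLayer *>()(layer, weightsSize, inputScaleFactor, result);
+//   if (!result) { /* rescan, starting from result.restartLayer */ }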
+
+/**
+ * @brief calculates output scale factor per layer
+ * @tparam T
+ */
+template<class T>
+class ScaleFactorPerLayer {
+ public:
+    /**
+     * @brief calculates the weights scale factor so that the dynamic range fits into the target bit size,
+     * and also calculates the output scale factor for the given layer
+     * @param cnnLayer
+     * @param weightsSize
+     * @param inputScaleFactor
+     * @param result
+     * @return
+     */
+    bool operator()(T cnnLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
+        return false;
+    }
+};
+
+template<>
+class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
+ private:
+    const float activation_scale_factor = 2048.f;
+    const float identity_scale_factor = 2049.0f;
+    const float k = 5;
+    const float k_identity = 6;
+ public:
+    bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
+        if ( !cnnLayer ) {
+            THROW_IE_EXCEPTION << "Incorrect Convolutional Layer pointer \n";
+        }
+        LayerInfo layerInfo(*cnnLayer);
+        // TODO: the current approach sets the input scale factor of true input layer(s) to the provided factor
+        auto quant = getInjectedData<QuantizedLayerParams>(*cnnLayer);
+        if (InferenceEngine::details::CaselessEq<std::string>()(cnnLayer->type, "Memory")) {
+            // for a memory output layer we need to verify its input scale factor
+            if (CNNNetHasPrevLayer(cnnLayer)) {
+                auto prevLayer = CNNNetPrevLayer(cnnLayer);
+                auto inputQuant = getInjectedData<QuantizedLayerParams>(prevLayer);
+                if (inputQuant->_dst_quant.scale != activation_scale_factor) {
+                    gnawarn() << "[WARNING] quantization error: input scale factor (" << inputQuant->_dst_quant.scale << ")"
+                              << " for " << cnnLayer->name << ", which is a child of " << prevLayer->name << ", doesn't match "
+                              << activation_scale_factor << std::endl;
+                    inputQuant->_dst_quant.scale = activation_scale_factor;
+                    // restarting from that activation;
+                    result = ScaleFactorUpdateResult(prevLayer.get());
+                    return true;
+                }
+            }
+            quant->_src_quant.scale = quant->_dst_quant.scale = activation_scale_factor;
+            return true;
+        }
+
+        if (!CNNNetHasPrevLayer(cnnLayer)) {
+            quant->_dst_quant.scale = inputScaleFactor;
+            return ScaleFactorUpdateResult();
+        }
+
+        // by default a layer passes its scale factor through
+        auto inputQuant = getInjectedData<QuantizedLayerParams>(CNNNetPrevLayer(cnnLayer));
+        quant->_dst_quant.scale = inputQuant->_dst_quant.scale;
+        quant->_src_quant.scale = inputQuant->_dst_quant.scale;
+
+        if (layerInfo.isActivation()) {
+            // TODO: calculate a proper scale factor; it needs to be widened a bit to stay safely within int16 weights
+            // set the initial value
+            quant->_dst_quant.scale = layerInfo.isIdentity() ? identity_scale_factor:activation_scale_factor;
+            // if the activation is from the ReLU family, apply a heuristic to avoid activation output overflow
+            if (layerInfo.isRelu() &&
+                    static_cast<uint64_t>(quant->_dst_quant.scale * quant->_src_quant.scale)
+                                                                > std::numeric_limits<int32_t>::max()-1) {
+                quant->_dst_quant.scale = (quant->_dst_quant.scale * 0.5);
+            }
+        }
+        return true;
+    }
+};
+
+template<>
+class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
+ public:
+    bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
+        if ( !eltwiseLayer ) {
+            THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n";
+        }
+        auto in0 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 0);
+        auto in1 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 1);
+
+        auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
+        auto quantParams1 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in1);
+        auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*eltwiseLayer);
+
+        switch (eltwiseLayer->_operation) {
+            case InferenceEngine::EltwiseLayer::Prod: {
+                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale;
+                quantData->_dst_quant.scale     = quantParams0->_dst_quant.scale * quantParams1->_dst_quant.scale;
+                break;
+            }
+            case InferenceEngine::EltwiseLayer::Sum: {
+                // detect which input will be used as biases
+                if (LayerInfo(in0).has32BOutput()) {
+                    std::swap(in0, in1);
+                    std::swap(quantParams0, quantParams1);
+                }
+
+                // this path might result in significant data loss
+                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale / quantParams0->_dst_quant.scale;
+                quantData->_dst_quant.scale = quantParams1->_dst_quant.scale;
+
+                // eltwise will always work in int16
+                auto maxValue = std::numeric_limits<int16_t>::max() - 1;
+                if (quantData->_weights_quant.scale > maxValue + 1) {
+                    // rescale its activation input
+                    // iterate through the previous layers of the eltwise
+                    for (uint8_t i = 0; i < 2; ++i) {
+                        InferenceEngine::CNNLayerPtr in = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, i);
+                        // trick to get the opposite index (0 -> 1, 1 -> 0) by negating i
+                        auto quantParams =
+                                InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, !i));
+
+                        for (; InferenceEngine::CNNNetHasPrevLayer(in.get()); in = CNNNetPrevLayer(in)) {
+                            auto info = LayerInfo(in);
+                            // we skip only split layers so far; memory layers still need to be handled
+                            // this case covers input from port 0
+                            if (info.isSplit() || info.isSlice()) {
+                                continue;
+                            } else if (info.has16BOutput() && info.isActivation()) {
+                                auto newOutputScale = quantParams->_dst_quant.scale / maxValue;
+                                if (newOutputScale > std::numeric_limits<int16_t>::max() / 2) {
+                                    break;
+                                }
+                                auto quantDataForActivation = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
+                                gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name
+                                         << ". Layer new output scale: " << in->name << ", output_scale=" << newOutputScale
+                                         << ", was " << quantDataForActivation->_dst_quant.scale <<"\n" << std::flush;
+                                quantDataForActivation->_dst_quant.scale = newOutputScale;
+                                result = ScaleFactorUpdateResult(in.get());
+                                return true;
+                            } else if (info.has16BOutput()) {
+                                break;
+                            }
+
+                            // if we are here, it means we are on port 1
+                            if (info.isFullyConnected() || info.isConvolutional()) {
+                                auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
+                                auto newOutputScale = quantParams->_dst_quant.scale * maxValue;
+                                auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.scale;
+                                quantDataForInputLayer->_dst_quant.scale = newOutputScale;
+                                quantDataForInputLayer->_weights_quant.scale = newWeightScale;
+                                result = ScaleFactorUpdateResult(in.get());
+                                return true;
+                            }
+                        }
+                    }
+                    // we were unable to rescale the input - results might be degraded
+                    gnawarn() << "[INFO] weights saturated for " << eltwiseLayer->name << "\n";
+                }
+                break;
+            }
+            default: THROW_GNA_EXCEPTION << "Unsupported Eltwise layer for quantisation: " << eltwiseLayer->_operation;
+        }
+        return true;
+    }
+};
+
+template<>
+class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
+ private:
+    float const _scale_reduction_50 = 0.50;
+    float const _scale_reduction_45 = 0.45;
+    float const _scale_reduction_40 = 0.40;
+    float const _scale_reduction_35 = 0.35;
+
+    uint16_t const _scale_change_req_threshold = 30;
+    uint16_t const _scale_change_threshold_100 = 100;
+    uint16_t const _scale_change_threshold_150 = 150;
+    uint16_t const _scale_change_threshold_200 = 200;
+
+ public:
+    bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
+        if ( !wl ) {
+            THROW_GNA_EXCEPTION << "Incorrect WeightableLayer pointer\n";
+        } else if (!wl->_weights) {
+            THROW_GNA_EXCEPTION << "Incorrect weight value for " << wl->name << ":" << wl->type << "\n";
+        }
+
+        auto prevLayer = CNNNetPrevLayer(wl);
+        auto quantDataForInputLayer =
+            InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());
+
+        auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
+        // TODO: pass 8 bits somehow
+        if (quant->_weights_quant.scale == 1.0f) {
+            size_t scaleRange = 0;
+            if (weightsSize == 2) {
+                scaleRange = MAX_VAL_2B_WEIGHT;
+            } else if (weightsSize == 1) {
+                scaleRange = MAX_VAL_1B_WEIGHT;
+            } else {
+                THROW_GNA_EXCEPTION << "Unsupported weights size of: " << weightsSize;
+            }
+            quant->_weights_quant.scale =
+                ScaleFactorForQuantization(wl->_weights->buffer().as<float *>(), scaleRange, wl->_weights->size());
+
+            // TODO: find out why
+            if (weightsSize == 1) {
+                quant->_weights_quant.scale *= MAX_OUT_MULTIPLIER;
+            }
+        }
+
+        quant->_src_quant.scale = quantDataForInputLayer->_dst_quant.scale;
+
+        double tmp_dst_quant_scale = quant->_weights_quant.scale * quantDataForInputLayer->_dst_quant.scale;
+
+        if (weightsSize == 1 &&
+            static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.scale) >
+                                                    static_cast<uint64_t>(std::numeric_limits<int32_t>::max()-1) * _scale_change_req_threshold) {
+            gnawarn() << "Output scale for " << wl->name
+                                            << " is too large and is being reduced, otherwise saturation is likely\n";
+            // reduce the weight scale according to an experimental heuristic
+            if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_100) {
+                quant->_weights_quant.scale *= _scale_reduction_50;
+                tmp_dst_quant_scale *= _scale_reduction_50;
+            } else if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_150) {
+                quant->_weights_quant.scale *= _scale_reduction_45;
+                tmp_dst_quant_scale *= _scale_reduction_45;
+            } else if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_200) {
+                quant->_weights_quant.scale *= _scale_reduction_40;
+                tmp_dst_quant_scale *= _scale_reduction_40;
+            } else {
+                quant->_weights_quant.scale *= _scale_reduction_35;
+                tmp_dst_quant_scale *= _scale_reduction_35;
+            }
+        }
+
+        quant->_dst_quant.scale = tmp_dst_quant_scale;
+
+        return true;
+    }
+};
+
+template<>
+class ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
+ public:
+    bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
+        return ScaleFactorPerLayer<InferenceEngine::WeightableLayer*>::operator()(wl, 2, inputScaleFactor, result);
+    }
+};
+
+/**
+ * GNA convolutions cannot be quantized in int8, remove when library starts support that
+ */
+template<>
+class ScaleFactorPerLayer<InferenceEngine::ConvolutionLayer*> : public ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> {
+};
+
+
+}  // namespace details
+
+/**
+ * @brief the scale factor calculator computes only the output scale factors for each layer;
+ * if scale factor propagation is not possible, it indicates a restart condition
+ */
+class ScaleFactorCalculator {
+    using Cnt = std::vector<InferenceEngine::CNNLayerPtr>;
+    Cnt  net;
+    mutable Cnt::const_iterator idx;
+    float inputScaleFactor;
+    mutable bool needRestart = false;
+    int weightsBytesSize;
+
+ public:
+    ScaleFactorCalculator(Cnt &net, int weightsBytesSize, float inputScaleFactor)
+            : net(net), inputScaleFactor(inputScaleFactor), weightsBytesSize(weightsBytesSize) {
+        idx = std::begin(this->net);
+    }
+    bool needToRestart() const {
+        return needRestart;
+    }
+    bool allLayersProcessed() const {
+        return idx == std::end(net);
+    }
+    std::vector<InferenceEngine::CNNLayerPtr> getStartLayers() const {
+        return std::vector<InferenceEngine::CNNLayerPtr>(idx, std::end(net));
+    }
+    template<class T>
+    bool operator()(T ptr) const {
+        needRestart = false;
+        details::ScaleFactorUpdateResult result;
+        if (!details::ScaleFactorPerLayer<T>()(ptr, weightsBytesSize, inputScaleFactor, result)) {
+            return false;
+        }
+        if (result) {
+            idx++;
+            return true;
+        }
+
+        idx = std::find_if(net.begin(), net.end(), [&](InferenceEngine::CNNLayerPtr cnnLayer) {
+            if (!result) {
+                return result.restartLayer == cnnLayer.get();
+            }
+            return ptr == cnnLayer.get();
+        });
+        idx++;
+        needRestart = true;
+        return true;
+    }
+};
+
+}  // namespace GNAPluginNS
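
For illustration, a minimal sketch of how this calculator could be driven: a hypothetical driver loop, assuming the primary ScaleFactorPerLayer template (defined earlier in this header) handles generic layers. This is not the plugin's actual wiring:

    // Hedged sketch: propagate scale factors over a topologically sorted layer list,
    // restarting from getStartLayers() whenever an upstream rescale is reported.
    void propagateScaleFactors(std::vector<InferenceEngine::CNNLayerPtr> sortedNet,
                               int weightsBytesSize, float inputScaleFactor) {
        GNAPluginNS::ScaleFactorCalculator calc(sortedNet, weightsBytesSize, inputScaleFactor);
        while (!calc.allLayersProcessed()) {
            for (auto &&layer : calc.getStartLayers()) {
                calc(layer.get());             // dispatches to ScaleFactorPerLayer<CNNLayer*>
                if (calc.needToRestart()) {
                    break;                     // a previous layer was rescaled: re-run from it
                }
            }
        }
    }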
diff --git a/inference-engine/src/gna_plugin/util.cpp b/inference-engine/src/gna_plugin/util.cpp
new file mode 100644 (file)
index 0000000..c10e317
--- /dev/null
@@ -0,0 +1,46 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <cinttypes>
+#ifndef _WIN32
+#include <mm_malloc.h>
+#endif
+#include <cstring>
+#include <details/ie_exception.hpp>
+#include "util.h"
+#include "gna_plugin_log.hpp"
+
+void *AllocateMemory(uint32_t num_memory_bytes, const char *ptr_name) {
+    void *ptr_memory = _mm_malloc(num_memory_bytes, 64);
+    if (ptr_memory == NULL) {
+        THROW_GNA_EXCEPTION << "Memory allocation failed for " << ptr_name;
+    }
+    memset(ptr_memory, 0, num_memory_bytes);
+
+    return (ptr_memory);
+}
+
+void FreeMemory(void *ptr_memory) {
+    if (ptr_memory != NULL) {
+        _mm_free(ptr_memory);
+    }
+    ptr_memory = NULL;
+}
+
+int32_t MemoryOffset(void *ptr_target, void *ptr_base) {
+    uint64_t target = (uint64_t) ptr_target;
+    uint64_t base = (uint64_t) ptr_base;
+    if (target == 0) {  // handle NULL pointers separately
+        return (-1);
+    } else if (target < base) {
+        THROW_GNA_EXCEPTION << "Error: target address value " << target << " is less than base address " << base << " in MemoryOffset()";
+    } else {
+        uint64_t diff = target - base;
+        if (diff > 0x7fffffff) {
+            THROW_GNA_EXCEPTION << "Error: target address value " << target << " is too far from base address " << base << " in MemoryOffset()!";
+        }
+        return ((int32_t) diff);
+    }
+}
+
diff --git a/inference-engine/src/gna_plugin/util.h b/inference-engine/src/gna_plugin/util.h
new file mode 100644 (file)
index 0000000..0838bd2
--- /dev/null
@@ -0,0 +1,9 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+void *AllocateMemory(uint32_t num_memory_bytes, const char *ptr_name);
+void FreeMemory(void *ptr_memory);
+int32_t MemoryOffset(void *ptr_target, void *ptr_base);
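
A minimal usage sketch for these helpers (illustrative only): allocate a zeroed, 64-byte-aligned region, compute the offset of an address inside it relative to the base, and release it.

    #include <cstdint>
    #include "util.h"

    int main() {
        void *base  = AllocateMemory(1024, "scratch");   // zeroed, 64-byte-aligned
        void *field = static_cast<char *>(base) + 256;   // an address inside the region
        int32_t off = MemoryOffset(field, base);         // 256
        FreeMemory(base);
        return off == 256 ? 0 : 1;
    }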
index a8f7be9..7456834 100644 (file)
@@ -1,6 +1,7 @@
 # Copyright (C) 2018 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
+
 set (TARGET_NAME "HeteroPlugin")
 
 file(GLOB SOURCES
@@ -19,9 +20,7 @@ include_directories(
     ${CMAKE_CURRENT_SOURCE_DIR}
 )
 
-if(WIN32)
-    add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN)
-endif()
+add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN)
 
 add_library(${TARGET_NAME} SHARED ${SOURCES} ${HEADERS})
 target_link_libraries(${TARGET_NAME} inference_engine ${INTEL_ITT_LIBS})
index b645964..59f112a 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index ead3fff..1192abb 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -90,6 +89,10 @@ HeteroExecutableNetwork::HeteroExecutableNetwork(InferenceEngine::ICNNNetwork &n
     load(network, config, extensions, listener);
 }
 
+void dla_layer_colorer(const CNNLayerPtr layer,
+                       ordered_properties &printed_properties,
+                       ordered_properties &node_properties);
+
 void HeteroExecutableNetwork::load(InferenceEngine::ICNNNetwork &network_,
                                    const std::map<std::string, std::string> &config,
                                    const std::vector<InferenceEngine::IExtensionPtr> &extensions,
@@ -127,6 +130,14 @@ void HeteroExecutableNetwork::load(InferenceEngine::ICNNNetwork &network_,
         } else {
             THROW_IE_EXCEPTION << "The 'TARGET_FALLBACK' option was not defined for heterogeneous plugin";
         }
+    } else {
+        if (dumpDotFile) {
+            std::stringstream stream(std::stringstream::out);
+            stream << "hetero_affinity_" << network.getName() << ".dot";
+
+            std::ofstream file(stream.str().c_str());
+            saveGraphToDot(network, file, dla_layer_colorer);
+        }
     }
 
     details::CNNNetworkIterator el(&network);
@@ -226,6 +237,7 @@ void HeteroExecutableNetwork::load(InferenceEngine::ICNNNetwork &network_,
         // set precision for intermediate data (not for external) to FP32
         // later on we have to add Plugin::getPreferableInputPrecision(network) and
         // Plugin::getPreferableOutputPrecision(network) and set precision based on this info
+        // TODO(amalyshe) add clever selection of precision for intermediate blobs
         for (auto &&it : clonedInputs) {
             if (externalInputsData.find(it.first) == externalInputsData.end()) {
                 it.second->setInputPrecision(Precision::FP32);
@@ -242,6 +254,7 @@ void HeteroExecutableNetwork::load(InferenceEngine::ICNNNetwork &network_,
 
         // Temporal solution until each plugin starts to support desirable precision
         // Only for CPU registered device we are changing all FP16 types to FP32 and convert blobs if any
+        // TODO(amalyshe) remove this hack in favor of proper network.setPrecision(FP16) and feeding FP16 to the CPU plugin
         if (affinity == "CPU") {
             tempNetwork->setPrecision(Precision::FP32);
             details::CNNNetworkIterator itcpu(reinterpret_cast<ICNNNetwork *>(tempNetwork.get()));
index cfc5e52..24b59b0 100644 (file)
@@ -1,12 +1,7 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
-/**
- * @brief a header file for ExecutableNetwork
- * @file dlia_executable_network.hpp
- */
 #pragma once
 
 #include <memory>
index cc3dc7c..fff3d16 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -72,6 +71,7 @@ void Engine::AddExtension(InferenceEngine::IExtensionPtr extension) {
 
 void Engine::SetAffinity(InferenceEngine::ICNNNetwork &network,
                          const std::map<std::string, std::string> &config) {
+    // TODO(amalyshe) config is not used here; check with RAN why it appeared in the initial interface
     FallbackPolicy fbPolicy(_deviceLoaders, _config[KEY_HETERO_DUMP_GRAPH_DOT]== YES);
     fbPolicy.init(_config["TARGET_FALLBACK"], config, _extensions);
     fbPolicy.setAffinity(config, network);
@@ -83,7 +83,7 @@ INFERENCE_PLUGIN_API(StatusCode) CreatePluginEngine(
         ResponseDesc *resp) noexcept {
     try {
         plugin = new HeteroPluginBase<Engine>(
-                {{1, 4}, "heteroPlugin", "heteroPlugin"},
+                {{1, 5}, "heteroPlugin", "heteroPlugin"},
                 std::make_shared<Engine>());
         return OK;
     }
index 90611ff..93fa7b3 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index a666743..41f0e98 100644 (file)
@@ -6,6 +6,7 @@ set (TARGET_NAME "inference_engine")
 
 file (GLOB LIBRARY_SRC
         ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/builders/*.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/cpp_interfaces/*.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/shape_infer/*.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/shape_infer/built-in/*.cpp
@@ -34,6 +35,7 @@ if( (NOT DEFINED ENABLE_SSE42) OR ENABLE_SSE42)
     include_directories(${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42)
     set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42/blob_transform_sse42.cpp PROPERTIES COMPILE_FLAGS -msse4.2)
     set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42/ie_preprocess_data_sse42.cpp PROPERTIES COMPILE_FLAGS -msse4.2)
+    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp PROPERTIES COMPILE_FLAGS -msse4.2)
     add_definitions(-DHAVE_SSE=1)
 endif()
 
@@ -46,10 +48,7 @@ file (GLOB PUBLIC_HEADERS
        ${PUBLIC_HEADERS_DIR}/*.hpp
       )
 
-if(WIN32)
-  add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_API)
-endif()
-
+add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_API)
 
 # Create named folders for the sources within the .vcproj
 # Empty name lists them directly under the .vcproj
@@ -63,9 +62,9 @@ add_library(${TARGET_NAME} SHARED
             ${LIBRARY_SRC}
             ${LIBRARY_HEADERS}
             ${PUBLIC_HEADERS})
+set_ie_threading_interface_for(${TARGET_NAME})
 
-
-target_link_libraries(${TARGET_NAME} PRIVATE pugixml ade ${CMAKE_DL_LIBS} ${INTEL_ITT_LIBS})
+target_link_libraries(${TARGET_NAME} PRIVATE pugixml fluid ade ${CMAKE_DL_LIBS} ${INTEL_ITT_LIBS})
 
 # Properties->C/C++->General->Additional Include Directories
 target_include_directories(${TARGET_NAME} PUBLIC ${PUBLIC_HEADERS_DIR}
@@ -73,7 +72,8 @@ target_include_directories(${TARGET_NAME} PUBLIC ${PUBLIC_HEADERS_DIR}
                                                   "${IE_MAIN_SOURCE_DIR}/src/dumper")
 
 target_include_directories(${TARGET_NAME} SYSTEM PRIVATE "${IE_MAIN_SOURCE_DIR}/thirdparty/pugixml/src")
-target_include_directories(${TARGET_NAME} SYSTEM PRIVATE "${IE_MAIN_SOURCE_DIR}/thirdparty/ade/sources/ade/include")
+target_include_directories(${TARGET_NAME} SYSTEM PRIVATE "${IE_MAIN_SOURCE_DIR}/thirdparty/ocv")
+
 if(ENABLE_MKL_DNN)
     target_include_directories(${TARGET_NAME} SYSTEM PRIVATE "${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/src/cpu/xbyak")
 endif()
@@ -86,6 +86,7 @@ add_library(${TARGET_NAME}_s STATIC
             ${LIBRARY_SRC}
             ${LIBRARY_HEADERS}
             ${PUBLIC_HEADERS})
+set_ie_threading_interface_for(${TARGET_NAME}_s)
 
 # Properties->C/C++->General->Additional Include Directories
 target_include_directories(${TARGET_NAME}_s PUBLIC ${PUBLIC_HEADERS_DIR}
@@ -93,7 +94,8 @@ target_include_directories(${TARGET_NAME}_s PUBLIC ${PUBLIC_HEADERS_DIR}
                                                   "${IE_MAIN_SOURCE_DIR}/src/dumper")
 
 target_include_directories(${TARGET_NAME}_s SYSTEM PRIVATE "${IE_MAIN_SOURCE_DIR}/thirdparty/pugixml/src")
-target_include_directories(${TARGET_NAME}_s SYSTEM PRIVATE "${IE_MAIN_SOURCE_DIR}/thirdparty/ade/sources/ade/include")
+target_include_directories(${TARGET_NAME}_s SYSTEM PRIVATE "${IE_MAIN_SOURCE_DIR}/thirdparty/ocv")
+
 if(ENABLE_MKL_DNN)
   target_include_directories(${TARGET_NAME}_s SYSTEM PRIVATE "${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/src/cpu/xbyak")
 endif()
@@ -102,8 +104,10 @@ target_compile_definitions(${TARGET_NAME}_s PUBLIC -DUSE_STATIC_IE)
 
 set_target_properties(${TARGET_NAME}_s PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}_s)
 
-# export targets
+target_link_libraries(${TARGET_NAME}_s PRIVATE fluid
+                                       PRIVATE ade)
 
+# export targets
 export(TARGETS ${TARGET_NAME} NAMESPACE IE:: FILE "${CMAKE_BINARY_DIR}/targets.cmake")
 
 configure_file(
@@ -112,6 +116,6 @@ configure_file(
     COPYONLY)
 
 configure_file(
-    "${CMAKE_SOURCE_DIR}/cmake/share/InferenceEngineConfig-version.cmake"
+    "${CMAKE_SOURCE_DIR}/cmake/share/InferenceEngineConfig-version.cmake.in"
     "${CMAKE_BINARY_DIR}/InferenceEngineConfig-version.cmake"
-    COPYONLY)
+    COPYONLY)
\ No newline at end of file
index 4e668b4..041c565 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index f6f9841..7348354 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 95f0848..8be9ab9 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -16,6 +15,19 @@ InferenceEngine::Blob::Ptr make_blob_with_precision(const InferenceEngine::Tenso
     return make_blob_with_precision(desc.getPrecision(), desc, ptr);
 }
 
+InferenceEngine::Layout plain_layout(InferenceEngine::SizeVector dims) {
+    int n = dims.size();
+    return n == 1 ? InferenceEngine::C    :
+           n == 2 ? InferenceEngine::NC   :
+           n == 3 ? InferenceEngine::CHW  :
+           n == 4 ? InferenceEngine::NCHW :
+                    InferenceEngine::ANY;
+}
+
+InferenceEngine::Blob::Ptr make_plain_blob(InferenceEngine::Precision prec, const InferenceEngine::SizeVector dims) {
+    return make_blob_with_precision({prec, dims, plain_layout(dims)});
+}
+
 InferenceEngine::Blob::Ptr CreateBlobFromData(const InferenceEngine::DataPtr &data) {
     // TODO Here some decision should be made about the layout.
     // For now we just pass the layout and use conversion to NCHW for ANY.
index fc0f1b0..a4a5d20 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -36,6 +35,9 @@ template <InferenceEngine::Precision::ePrecision precision, class ... Args> Infe
 
 INFERENCE_ENGINE_API_CPP(InferenceEngine::Blob::Ptr) make_blob_with_precision(const InferenceEngine::TensorDesc& desc);
 INFERENCE_ENGINE_API_CPP(InferenceEngine::Blob::Ptr) make_blob_with_precision(const InferenceEngine::TensorDesc& desc, void* ptr);
+INFERENCE_ENGINE_API_CPP(InferenceEngine::Blob::Ptr) make_plain_blob(InferenceEngine::Precision prec, const InferenceEngine::SizeVector dims);
+
+INFERENCE_ENGINE_API_CPP(InferenceEngine::Layout) plain_layout(InferenceEngine::SizeVector dims);
 
 template <class ... Args>
 InferenceEngine::Blob::Ptr make_blob_with_precision(InferenceEngine::Precision precision, Args &&... args) {
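
A short sketch of the new helpers (illustrative): plain_layout maps a dims rank to the default dense layout, so for a 4D shape the two calls below produce equivalent blobs (call allocate() on the result before use):

    InferenceEngine::SizeVector dims{1, 3, 224, 224};
    auto blob = make_plain_blob(InferenceEngine::Precision::FP32, dims);   // NCHW for 4D
    auto same = make_blob_with_precision(
        {InferenceEngine::Precision::FP32, dims, InferenceEngine::NCHW});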
diff --git a/inference-engine/src/inference_engine/builders/ie_argmax_layer.cpp b/inference-engine/src/inference_engine/builders/ie_argmax_layer.cpp
new file mode 100644 (file)
index 0000000..265913f
--- /dev/null
@@ -0,0 +1,61 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_argmax_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::ArgMaxLayer::ArgMaxLayer(const std::string& name): LayerFragment("ArgMax", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+}
+
+Builder::ArgMaxLayer::ArgMaxLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "ArgMax"))
+        THROW_IE_EXCEPTION << "Cannot create ArgMaxLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::ArgMaxLayer& Builder::ArgMaxLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::ArgMaxLayer::getPort() const {
+    return getLayer().getInputPorts()[0];
+}
+
+Builder::ArgMaxLayer& Builder::ArgMaxLayer::setPort(const Port &port) {
+    getLayer().getInputPorts()[0] = port;
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+int Builder::ArgMaxLayer::getAxis() const {
+    return getLayer().getParameters()["axis"].asInt();
+}
+Builder::ArgMaxLayer& Builder::ArgMaxLayer::setAxis(int axis) {
+    getLayer().getParameters()["axis"] = axis;
+    return *this;
+}
+size_t Builder::ArgMaxLayer::getTopK() const {
+    return getLayer().getParameters()["top_k"].asUInt();
+}
+Builder::ArgMaxLayer& Builder::ArgMaxLayer::setTopK(size_t topK) {
+    getLayer().getParameters()["top_k"] = topK;
+    return *this;
+}
+size_t Builder::ArgMaxLayer::getOutMaxVal() const {
+    return getLayer().getParameters()["out_max_val"].asUInt();
+}
+Builder::ArgMaxLayer& Builder::ArgMaxLayer::setOutMaxVal(size_t outMaxVal) {
+    if (outMaxVal > 1)
+        THROW_IE_EXCEPTION << "OutMaxVal supports only 0 and 1 values.";
+    getLayer().getParameters()["out_max_val"] = outMaxVal;
+    return *this;
+}
+
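The builder decorators expose a fluent interface; a sketch of typical use with illustrative values (OutMaxVal = 0 is assumed to mean indices-only output, following Caffe's ArgMax semantics):

    InferenceEngine::Builder::ArgMaxLayer argMax("my_argmax");
    argMax.setAxis(1)
          .setTopK(5)
          .setOutMaxVal(0);   // only 0 and 1 are accepted by the setter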
diff --git a/inference-engine/src/inference_engine/builders/ie_batch_normalization_layer.cpp b/inference-engine/src/inference_engine/builders/ie_batch_normalization_layer.cpp
new file mode 100644 (file)
index 0000000..1c3d275
--- /dev/null
@@ -0,0 +1,68 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_batch_normalization_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::BatchNormalizationLayer::BatchNormalizationLayer(const std::string& name): LayerFragment("BatchNormalization", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+    setEpsilon(0.00000001f);
+}
+
+Builder::BatchNormalizationLayer::BatchNormalizationLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "BatchNormalization"))
+        THROW_IE_EXCEPTION << "Cannot create BatchNormalizationLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::BatchNormalizationLayer& Builder::BatchNormalizationLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::BatchNormalizationLayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::BatchNormalizationLayer& Builder::BatchNormalizationLayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+Builder::BatchNormalizationLayer& Builder::BatchNormalizationLayer::setWeights(const Blob::CPtr& weights) {
+    getLayer().addConstantData("weights", weights);
+    return *this;
+}
+Builder::BatchNormalizationLayer& Builder::BatchNormalizationLayer::setBiases(const Blob::CPtr& biases) {
+    getLayer().addConstantData("biases", biases);
+    return *this;
+}
+
+float Builder::BatchNormalizationLayer::getEpsilon() const {
+    return getLayer().getParameters()["epsilon"].asFloat();
+}
+Builder::BatchNormalizationLayer& Builder::BatchNormalizationLayer::setEpsilon(float eps) {
+    getLayer().getParameters()["epsilon"] = eps;
+    return *this;
+}
+
+void Builder::BatchNormalizationLayer::validate(const Layer& layer)  {
+    auto weightsIt = layer.getConstantData().find("weights");
+    auto biasesIt = layer.getConstantData().find("biases");
+    bool valid = weightsIt != layer.getConstantData().end() &&
+            biasesIt != layer.getConstantData().end() &&
+            weightsIt->second != nullptr &&
+            weightsIt->second->cbuffer() != nullptr &&
+            biasesIt->second != nullptr &&
+            biasesIt->second->cbuffer() != nullptr;
+    if (!valid)
+        THROW_IE_EXCEPTION << "Cannot create BatchNormalization layer! Weights and biases are required!";
+}
+
+REG_VALIDATOR_FOR(BatchNormalization,  Builder::BatchNormalizationLayer::validate);
\ No newline at end of file
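
A sketch of satisfying the validator above (illustrative; weightsBlob and biasesBlob are hypothetical pre-built per-channel blobs):

    InferenceEngine::Builder::BatchNormalizationLayer bn("bn1");
    bn.setWeights(weightsBlob)   // hypothetical Blob::CPtr with per-channel scales
      .setBiases(biasesBlob)     // hypothetical Blob::CPtr with per-channel shifts
      .setEpsilon(1e-5f);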
diff --git a/inference-engine/src/inference_engine/builders/ie_clamp_layer.cpp b/inference-engine/src/inference_engine/builders/ie_clamp_layer.cpp
new file mode 100644 (file)
index 0000000..0bc1fb9
--- /dev/null
@@ -0,0 +1,56 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_clamp_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::ClampLayer::ClampLayer(const std::string& name): LayerFragment("Clamp", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+    setMinValue(0.0f);
+    setMaxValue(1.0f);
+}
+
+Builder::ClampLayer::ClampLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Clamp"))
+        THROW_IE_EXCEPTION << "Cannot create ClampLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::ClampLayer& Builder::ClampLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::ClampLayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::ClampLayer& Builder::ClampLayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+float Builder::ClampLayer::getMaxValue() const {
+    return getLayer().getParameters()["max"].asFloat();
+}
+
+Builder::ClampLayer& Builder::ClampLayer::setMaxValue(float maxValue) {
+    getLayer().getParameters()["max"] = maxValue;
+    return *this;
+}
+
+float Builder::ClampLayer::getMinValue() const {
+    return getLayer().getParameters()["min"].asFloat();
+}
+
+Builder::ClampLayer& Builder::ClampLayer::setMinValue(float minValue) {
+    getLayer().getParameters()["min"] = minValue;
+    return *this;
+}
+
diff --git a/inference-engine/src/inference_engine/builders/ie_concat_layer.cpp b/inference-engine/src/inference_engine/builders/ie_concat_layer.cpp
new file mode 100644 (file)
index 0000000..8ba326f
--- /dev/null
@@ -0,0 +1,53 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_concat_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::ConcatLayer::ConcatLayer(const std::string& name): LayerFragment("Concat", name) {
+    getLayer().getOutputPorts().resize(1);
+    setAxis(1);
+}
+
+Builder::ConcatLayer::ConcatLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Concat"))
+        THROW_IE_EXCEPTION << "Cannot create ConcatLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::ConcatLayer& Builder::ConcatLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::ConcatLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::ConcatLayer& Builder::ConcatLayer::setOutputPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+const std::vector<Port>& Builder::ConcatLayer::getInputPorts() const {
+    return getLayer().getInputPorts();
+}
+
+Builder::ConcatLayer& Builder::ConcatLayer::setInputPorts(const std::vector<Port>& ports) {
+    getLayer().getInputPorts() = ports;
+    return *this;
+}
+
+size_t Builder::ConcatLayer::getAxis() const {
+    return getLayer().getParameters()["axis"].asUInt();
+}
+
+Builder::ConcatLayer& Builder::ConcatLayer::setAxis(size_t axis) {
+    getLayer().getParameters()["axis"] = axis;
+    return *this;
+}
diff --git a/inference-engine/src/inference_engine/builders/ie_const_layer.cpp b/inference-engine/src/inference_engine/builders/ie_const_layer.cpp
new file mode 100644 (file)
index 0000000..da5d43d
--- /dev/null
@@ -0,0 +1,39 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_const_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::ConstLayer::ConstLayer(const std::string& name): LayerFragment("Const", name) {
+    getLayer().getOutputPorts().resize(1);
+}
+
+Builder::ConstLayer::ConstLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Const"))
+        THROW_IE_EXCEPTION << "Cannot create ConstLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::ConstLayer& Builder::ConstLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::ConstLayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::ConstLayer& Builder::ConstLayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+Builder::ConstLayer& Builder::ConstLayer::setData(const Blob::CPtr& data) {
+    getLayer().addConstantData("custom", data);
+    return *this;
+}
+
diff --git a/inference-engine/src/inference_engine/builders/ie_convolution_layer.cpp b/inference-engine/src/inference_engine/builders/ie_convolution_layer.cpp
new file mode 100644 (file)
index 0000000..a66e155
--- /dev/null
@@ -0,0 +1,170 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_convolution_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::ConvolutionLayer::ConvolutionLayer(const std::string& name): LayerFragment("Convolution", name) {
+    getLayer().getInputPorts().resize(1);
+    getLayer().getOutputPorts().resize(1);
+}
+
+Builder::ConvolutionLayer::ConvolutionLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Convolution"))
+        THROW_IE_EXCEPTION << "Cannot create ConvolutionLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::ConvolutionLayer::operator Builder::Layer() const {
+    Layer genLayer(getLayer());
+
+    std::vector<size_t> l_kernel = getKernel();
+    std::vector<size_t> l_dilation = getDilation();
+    std::vector<size_t> l_paddingBegin = getPaddingsBegin();
+    std::vector<size_t> l_paddingEnd = getPaddingsEnd();
+    std::vector<size_t> l_strides = getStrides();
+
+    if (l_paddingBegin.empty() && !l_kernel.empty())
+        l_paddingBegin.resize(l_kernel.size(), 0);
+    if (l_paddingEnd.empty() && !l_kernel.empty())
+        l_paddingEnd.resize(l_kernel.size(), 0);
+    if (l_dilation.empty() && !l_kernel.empty())
+        l_dilation.resize(l_kernel.size(), 1);
+    if (l_strides.empty() && !l_kernel.empty())
+        l_strides.resize(l_kernel.size(), 1);
+
+    if (!getOutDepth() || l_kernel.empty() || l_kernel.size() != l_paddingBegin.size() || l_kernel.size() != l_paddingEnd.size() ||
+            l_kernel.size() != l_dilation.size() || l_kernel.size() != l_strides.size())
+        THROW_IE_EXCEPTION << genLayer.getType() << " node " << genLayer.getName() << " contains incorrect parameters!";
+
+    genLayer.getParameters()["kernel"] = l_kernel;
+    genLayer.getParameters()["strides"] = l_strides;
+    genLayer.getParameters()["pads_begin"] = l_paddingBegin;
+    genLayer.getParameters()["pads_end"] = l_paddingEnd;
+    genLayer.getParameters()["dilations"] = l_dilation;
+    return genLayer;
+}
+
+Builder::ConvolutionLayer &Builder::ConvolutionLayer::setName(const std::string &name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+Builder::ConvolutionLayer& Builder::ConvolutionLayer::setWeights(const Blob::CPtr& weights) {
+    getLayer().addConstantData("weights", weights);
+    return *this;
+}
+Builder::ConvolutionLayer& Builder::ConvolutionLayer::setBiases(const Blob::CPtr& biases) {
+    getLayer().addConstantData("biases", biases);
+    return *this;
+}
+
+const Port& Builder::ConvolutionLayer::getInputPort() const {
+    return getLayer().getInputPorts()[0];
+}
+
+Builder::ConvolutionLayer& Builder::ConvolutionLayer::setInputPort(const Port& port) {
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+const Port& Builder::ConvolutionLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::ConvolutionLayer& Builder::ConvolutionLayer::setOutputPort(const Port& port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+const std::vector<size_t> Builder::ConvolutionLayer::getKernel() const {
+    return uInts2size_t(getLayer().getParameters()["kernel"].asUInts({}));
+}
+Builder::ConvolutionLayer& Builder::ConvolutionLayer::setKernel(const std::vector<size_t>& kernel) {
+    getLayer().getParameters()["kernel"] = kernel;
+    return *this;
+}
+
+const std::vector<size_t> Builder::ConvolutionLayer::getStrides() const {
+    return uInts2size_t(getLayer().getParameters()["strides"].asUInts({}));
+}
+Builder::ConvolutionLayer& Builder::ConvolutionLayer::setStrides(const std::vector<size_t>& strides) {
+    getLayer().getParameters()["strides"] = strides;
+    return *this;
+}
+
+const std::vector<size_t> Builder::ConvolutionLayer::getDilation() const {
+    return uInts2size_t(getLayer().getParameters()["dilations"].asUInts({}));
+}
+Builder::ConvolutionLayer& Builder::ConvolutionLayer::setDilation(const std::vector<size_t>& dilation) {
+    getLayer().getParameters()["dilations"] = dilation;
+    return *this;
+}
+
+const std::vector<size_t> Builder::ConvolutionLayer::getPaddingsBegin() const {
+    return uInts2size_t(getLayer().getParameters()["pads_begin"].asUInts({}));
+}
+Builder::ConvolutionLayer& Builder::ConvolutionLayer::setPaddingsBegin(const std::vector<size_t>& paddings) {
+    getLayer().getParameters()["pads_begin"] = paddings;
+    return *this;
+}
+
+const std::vector<size_t> Builder::ConvolutionLayer::getPaddingsEnd() const {
+    return uInts2size_t(getLayer().getParameters()["pads_end"].asUInts({}));
+}
+Builder::ConvolutionLayer& Builder::ConvolutionLayer::setPaddingsEnd(const std::vector<size_t>& paddings) {
+    getLayer().getParameters()["pads_end"] = paddings;
+    return *this;
+}
+
+size_t Builder::ConvolutionLayer::getGroup() const {
+    return getLayer().getParameters()["group"].asUInt(1);
+}
+Builder::ConvolutionLayer& Builder::ConvolutionLayer::setGroup(size_t group) {
+    getLayer().getParameters()["group"] = group;
+    return *this;
+}
+
+size_t Builder::ConvolutionLayer::getOutDepth() const {
+    return getLayer().getParameters()["output"].asUInt(0);
+}
+Builder::ConvolutionLayer& Builder::ConvolutionLayer::setOutDepth(size_t outDepth) {
+    getLayer().getParameters()["output"] = outDepth;
+    return *this;
+}
+
+void Builder::ConvolutionLayer::validate(const Layer& layer) {
+    Layer convLayer = layer;
+    Builder::ConvolutionLayer convBuilder(convLayer);
+    std::vector<size_t> l_kernel = convBuilder.getKernel();
+
+    // WA for old IRs
+    if (l_kernel.empty() && layer.getParameters().find("kernel-x") != layer.getParameters().end() &&
+            layer.getParameters().find("kernel-y") != layer.getParameters().end())
+        return;
+
+    std::vector<size_t> l_dilation = convBuilder.getDilation();
+    std::vector<size_t> l_paddingBegin = convBuilder.getPaddingsBegin();
+    std::vector<size_t> l_paddingEnd = convBuilder.getPaddingsEnd();
+    std::vector<size_t> l_strides = convBuilder.getStrides();
+
+    if (l_paddingBegin.empty() && !l_kernel.empty())
+        l_paddingBegin.resize(l_kernel.size(), 0);
+    if (l_paddingEnd.empty() && !l_kernel.empty())
+        l_paddingEnd.resize(l_kernel.size(), 0);
+    if (l_dilation.empty() && !l_kernel.empty())
+        l_dilation.resize(l_kernel.size(), 1);
+    if (l_strides.empty() && !l_kernel.empty())
+        l_strides.resize(l_kernel.size(), 1);
+
+    if (!convBuilder.getOutDepth() || l_kernel.empty() || l_kernel.size() != l_paddingBegin.size() || l_kernel.size() != l_paddingEnd.size() ||
+            l_kernel.size() != l_dilation.size() || l_kernel.size() != l_strides.size())
+        THROW_IE_EXCEPTION << layer.getType() << " node " << layer.getName() << " contains incorrect parameters!";
+}
+
+REG_VALIDATOR_FOR(Convolution, Builder::ConvolutionLayer::validate);
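
A sketch of a configuration that passes the validator (all vectors sized to the kernel rank, non-zero output depth; values are illustrative):

    InferenceEngine::Builder::ConvolutionLayer conv("conv1");
    conv.setKernel({3, 3})
        .setStrides({1, 1})
        .setPaddingsBegin({1, 1})
        .setPaddingsEnd({1, 1})
        .setDilation({1, 1})
        .setGroup(1)
        .setOutDepth(64);   // the "output" parameter; must be non-zero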
diff --git a/inference-engine/src/inference_engine/builders/ie_crop_layer.cpp b/inference-engine/src/inference_engine/builders/ie_crop_layer.cpp
new file mode 100644 (file)
index 0000000..7fe2591
--- /dev/null
@@ -0,0 +1,69 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_crop_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::CropLayer::CropLayer(const std::string& name): LayerFragment("Crop", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(2);
+}
+
+Builder::CropLayer::CropLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Crop"))
+        THROW_IE_EXCEPTION << "Cannot create CropLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::CropLayer& Builder::CropLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const std::vector<Port>& Builder::CropLayer::getInputPorts() const {
+    return getLayer().getInputPorts();
+}
+
+Builder::CropLayer& Builder::CropLayer::setInputPorts(const std::vector<Port>& ports) {
+    getLayer().getInputPorts() = ports;
+    return *this;
+}
+
+const Port& Builder::CropLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::CropLayer& Builder::CropLayer::setOutputPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+const std::vector<size_t> Builder::CropLayer::getAxis() const {
+    return uInts2size_t(getLayer().getParameters()["axis"].asUInts());
+}
+
+Builder::CropLayer& Builder::CropLayer::setAxis(const std::vector<size_t>& axis) {
+    getLayer().getParameters()["axis"] = axis;
+    return *this;
+}
+
+const std::vector<size_t> Builder::CropLayer::getOffset() const {
+    return uInts2size_t(getLayer().getParameters()["offset"].asUInts());
+}
+
+Builder::CropLayer& Builder::CropLayer::setOffset(const std::vector<size_t>& offsets) {
+    getLayer().getParameters()["offset"] = offsets;
+    return *this;
+}
+
+void Builder::CropLayer::validate(const Layer& layer) {
+    if (layer.getInputPorts().size() != 2)
+        THROW_IE_EXCEPTION << "Incorrect parameters for layer " << layer.getName() << ": it should have 2 inputs!";
+}
+
+REG_VALIDATOR_FOR(Crop, Builder::CropLayer::validate);
\ No newline at end of file
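
Sketch of a crop set-up consistent with the validator (two input ports required; axis/offset values illustrative):

    InferenceEngine::Builder::CropLayer crop("crop1");   // ctor reserves 2 input ports
    crop.setAxis({2, 3})      // crop along the spatial dims
        .setOffset({0, 0});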
diff --git a/inference-engine/src/inference_engine/builders/ie_ctc_greedy_decoder_layer.cpp b/inference-engine/src/inference_engine/builders/ie_ctc_greedy_decoder_layer.cpp
new file mode 100644 (file)
index 0000000..c3e017a
--- /dev/null
@@ -0,0 +1,46 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_ctc_greedy_decoder_layer.hpp>
+#include <details/caseless.hpp>
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::CTCGreedyDecoderLayer::CTCGreedyDecoderLayer(const std::string& name): LayerFragment("CTCGreedyDecoder", name) {
+    getLayer().getOutputPorts().resize(1);
+}
+
+Builder::CTCGreedyDecoderLayer::CTCGreedyDecoderLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "CTCGreedyDecoder"))
+        THROW_IE_EXCEPTION << "Cannot create CTCGreedyDecoderLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::CTCGreedyDecoderLayer& Builder::CTCGreedyDecoderLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+const std::vector<Port>& Builder::CTCGreedyDecoderLayer::getInputPorts() const {
+    return getLayer().getInputPorts();
+}
+Builder::CTCGreedyDecoderLayer& Builder::CTCGreedyDecoderLayer::setInputPorts(const std::vector<Port>& ports) {
+    getLayer().getInputPorts() = ports;
+    return *this;
+}
+const Port& Builder::CTCGreedyDecoderLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+Builder::CTCGreedyDecoderLayer& Builder::CTCGreedyDecoderLayer::setOutputPort(const Port& port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+bool Builder::CTCGreedyDecoderLayer::getCTCMergeRepeated() const {
+    return getLayer().getParameters()["ctc_merge_repeated"].asBool();
+}
+Builder::CTCGreedyDecoderLayer& Builder::CTCGreedyDecoderLayer::setCTCMergeRepeated(bool flag) {
+    getLayer().getParameters()["ctc_merge_repeated"] = flag;
+    return *this;
+}
+
diff --git a/inference-engine/src/inference_engine/builders/ie_deconvolution_layer.cpp b/inference-engine/src/inference_engine/builders/ie_deconvolution_layer.cpp
new file mode 100644 (file)
index 0000000..dfb607a
--- /dev/null
@@ -0,0 +1,20 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_deconvolution_layer.hpp>
+#include <details/caseless.hpp>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::DeconvolutionLayer::DeconvolutionLayer(const std::string& name): ConvolutionLayer(name) {
+    getLayer().setType("Deconvolution");
+}
+Builder::DeconvolutionLayer::DeconvolutionLayer(Layer& genLayer): ConvolutionLayer(genLayer.getName()) {
+    getLayer().setName("");
+    getLayer().setType("");
+    getLayer() = genLayer;
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Deconvolution"))
+        THROW_IE_EXCEPTION << "Cannot create DeconvolutionLayer decorator for layer " << getLayer().getType();
+}
diff --git a/inference-engine/src/inference_engine/builders/ie_detection_output_layer.cpp b/inference-engine/src/inference_engine/builders/ie_detection_output_layer.cpp
new file mode 100644 (file)
index 0000000..f836445
--- /dev/null
@@ -0,0 +1,124 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_detection_output_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::DetectionOutputLayer::DetectionOutputLayer(const std::string& name): LayerFragment("DetectionOutput", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(2);
+}
+
+Builder::DetectionOutputLayer::DetectionOutputLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "DetectionOutput"))
+        THROW_IE_EXCEPTION << "Cannot create DetectionOutputLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const std::vector<Port>& Builder::DetectionOutputLayer::getInputPorts() const {
+    return getLayer().getInputPorts();
+}
+
+Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setInputPorts(const std::vector<Port> &ports) {
+    if (ports.size() != 3)
+        THROW_IE_EXCEPTION << "Incorrect number of inputs for DetectionOutput layer.";
+    getLayer().getInputPorts() = ports;
+    return *this;
+}
+
+const Port& Builder::DetectionOutputLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setOutputPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+size_t Builder::DetectionOutputLayer::getNumClasses() const {
+    return getLayer().getParameters()["num_classes"].asUInt();
+}
+Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setNumClasses(size_t num) {
+    getLayer().getParameters()["num_classes"] = num;
+    return *this;
+}
+int Builder::DetectionOutputLayer::getBackgroudLabelId() const {
+    return getLayer().getParameters()["background_label_id"].asInt(-1);
+}
+Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setBackgroudLabelId(int labelId) {
+    getLayer().getParameters()["background_label_id"] = labelId;
+    return *this;
+}
+int Builder::DetectionOutputLayer::getTopK() const {
+    return getLayer().getParameters()["top_k"].asInt();
+}
+Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setTopK(int topK) {
+    getLayer().getParameters()["top_k"] = topK;
+    return *this;
+}
+int Builder::DetectionOutputLayer::getKeepTopK() const {
+    return getLayer().getParameters()["keep_top_k"].asInt();
+}
+Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setKeepTopK(int topK) {
+    getLayer().getParameters()["keep_top_k"] = topK;
+    return *this;
+}
+int Builder::DetectionOutputLayer::getNumOrientClasses() const {
+    return getLayer().getParameters()["num_orient_classes"].asInt();
+}
+Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setNumOrientClasses(int numClasses) {
+    getLayer().getParameters()["num_orient_classes"] = numClasses;
+    return *this;
+}
+std::string Builder::DetectionOutputLayer::getCodeType() const {
+    return getLayer().getParameters()["code_type"];
+}
+Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setCodeType(std::string type) {
+    getLayer().getParameters()["code_type"] = type;
+    return *this;
+}
+int Builder::DetectionOutputLayer::getInterpolateOrientation() const {
+    return getLayer().getParameters()["interpolate_orientation"].asInt();
+}
+Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setInterpolateOrientation(int orient) {
+    getLayer().getParameters()["interpolate_orientation"] = orient;
+    return *this;
+}
+float Builder::DetectionOutputLayer::getNMSThreshold() const {
+    return getLayer().getParameters()["nms_threshold"].asFloat();
+}
+Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setNMSThreshold(float threshold) {
+    getLayer().getParameters()["nms_threshold"] = threshold;
+    return *this;
+}
+float Builder::DetectionOutputLayer::getConfidenceThreshold() const {
+    return getLayer().getParameters()["confidence_threshold"].asFloat();
+}
+Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setConfidenceThreshold(float threshold) {
+    getLayer().getParameters()["confidence_threshold"] = threshold;
+    return *this;
+}
+bool Builder::DetectionOutputLayer::getShareLocation() const {
+    return getLayer().getParameters()["share_location"].asBool();
+}
+Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setShareLocation(bool flag) {
+    getLayer().getParameters()["share_location"] = flag;
+    return *this;
+}
+bool Builder::DetectionOutputLayer::getVariantEncodedInTarget() const {
+    return getLayer().getParameters()["variance_encoded_in_target"].asBool();
+}
+Builder::DetectionOutputLayer& Builder::DetectionOutputLayer::setVariantEncodedInTarget(bool flag) {
+    getLayer().getParameters()["variance_encoded_in_target"] = flag;
+    return *this;
+}
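
A sketch of a typical configuration, with values loosely mirroring common SSD settings (illustrative only):

    InferenceEngine::Builder::DetectionOutputLayer det("detection_out");
    det.setNumClasses(21)
       .setBackgroudLabelId(0)
       .setTopK(400)
       .setKeepTopK(200)
       .setCodeType("caffe.PriorBoxParameter.CENTER_SIZE")
       .setNMSThreshold(0.45f)
       .setConfidenceThreshold(0.01f)
       .setShareLocation(true)
       .setVariantEncodedInTarget(false);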
diff --git a/inference-engine/src/inference_engine/builders/ie_eltwise_layer.cpp b/inference-engine/src/inference_engine/builders/ie_eltwise_layer.cpp
new file mode 100644 (file)
index 0000000..cffecaa
--- /dev/null
@@ -0,0 +1,86 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_eltwise_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::EltwiseLayer::EltwiseLayer(const std::string& name): LayerFragment("Eltwise", name) {
+    getLayer().getOutputPorts().resize(1);
+    setEltwiseType(EltwiseType::SUM);
+}
+
+Builder::EltwiseLayer::EltwiseLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Eltwise"))
+        THROW_IE_EXCEPTION << "Cannot create EltwiseLayer decorator for layer " << getLayer().getType();
+
+    std::string operatorStr = getLayer().getParameters()["operation"];
+    if (operatorStr == "max") {
+        type = MAX;
+    } else if (operatorStr == "sum") {
+        type = SUM;
+    } else if (operatorStr == "mul") {
+        type = MUL;
+    }
+}
+
+Builder::EltwiseLayer& Builder::EltwiseLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const std::vector<Port>& Builder::EltwiseLayer::getInputPorts() const {
+    return getLayer().getInputPorts();
+}
+
+Builder::EltwiseLayer& Builder::EltwiseLayer::setInputPorts(const std::vector<Port>& ports) {
+    getLayer().getInputPorts() = ports;
+    return *this;
+}
+
+const Port& Builder::EltwiseLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::EltwiseLayer& Builder::EltwiseLayer::setOutputPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+const std::vector<float> Builder::EltwiseLayer::getScales() const {
+    return getLayer().getParameters()["scales"].asFloats({});
+}
+
+// TODO: the IR doesn't contain scales
+Builder::EltwiseLayer& Builder::EltwiseLayer::setScales(const std::vector<float>& scales) {
+    getLayer().getParameters()["scales"] = scales;
+    return *this;
+}
+
+Builder::EltwiseLayer::EltwiseType Builder::EltwiseLayer::getEltwiseType() const {
+    return type;
+}
+
+Builder::EltwiseLayer& Builder::EltwiseLayer::setEltwiseType(Builder::EltwiseLayer::EltwiseType type) {
+    this->type = type;
+    std::string operatorStr;
+    switch (type) {
+    case MAX:
+        operatorStr = "max";
+        break;
+    case SUM:
+        operatorStr = "sum";
+        break;
+    case MUL:
+        operatorStr = "mul";
+    }
+    getLayer().getParameters()["operation"] = operatorStr;
+    return *this;
+}
+
+
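A sketch of the eltwise type round-trip (illustrative): the setter stores both the enum and its string form, and the string is what ends up in the layer parameters:

    InferenceEngine::Builder::EltwiseLayer elt("mul1");
    elt.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MUL);
    // the "operation" parameter now holds "mul"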
diff --git a/inference-engine/src/inference_engine/builders/ie_elu_layer.cpp b/inference-engine/src/inference_engine/builders/ie_elu_layer.cpp
new file mode 100644 (file)
index 0000000..5be0044
--- /dev/null
@@ -0,0 +1,46 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_elu_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::ELULayer::ELULayer(const std::string& name): LayerFragment("ELU", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+    setAlpha(1);
+}
+
+Builder::ELULayer::ELULayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "ELU"))
+        THROW_IE_EXCEPTION << "Cannot create ELULayer decorator for layer " << getLayer().getType();
+}
+
+Builder::ELULayer& Builder::ELULayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::ELULayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::ELULayer& Builder::ELULayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+float Builder::ELULayer::getAlpha() const {
+    return getLayer().getParameters()["alpha"].asFloat();
+}
+
+Builder::ELULayer& Builder::ELULayer::setAlpha(float alpha) {
+    getLayer().getParameters()["alpha"] = alpha;
+    return *this;
+}
+
diff --git a/inference-engine/src/inference_engine/builders/ie_fully_connected_layer.cpp b/inference-engine/src/inference_engine/builders/ie_fully_connected_layer.cpp
new file mode 100644 (file)
index 0000000..1abe7b8
--- /dev/null
@@ -0,0 +1,62 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_fully_connected_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::FullyConnectedLayer::FullyConnectedLayer(const std::string& name): LayerFragment("FullyConnected", name) {
+    getLayer().getInputPorts().resize(1);
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getParameters()["out-size"] = 0;
+}
+
+Builder::FullyConnectedLayer::FullyConnectedLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "FullyConnected"))
+        THROW_IE_EXCEPTION << "Cannot create FullyConnectedLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::FullyConnectedLayer &Builder::FullyConnectedLayer::setName(const std::string &name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+Builder::FullyConnectedLayer& Builder::FullyConnectedLayer::setWeights(const Blob::CPtr& weights) {
+    getLayer().addConstantData("weights", weights);
+    return *this;
+}
+Builder::FullyConnectedLayer& Builder::FullyConnectedLayer::setBiases(const Blob::CPtr& biases) {
+    getLayer().addConstantData("biases", biases);
+    return *this;
+}
+
+const Port& Builder::FullyConnectedLayer::getInputPort() const {
+    return getLayer().getInputPorts()[0];
+}
+
+Builder::FullyConnectedLayer& Builder::FullyConnectedLayer::setInputPort(const Port& port) {
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+const Port& Builder::FullyConnectedLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::FullyConnectedLayer& Builder::FullyConnectedLayer::setOutputPort(const Port& port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+size_t Builder::FullyConnectedLayer::getOutputNum() const {
+    return getLayer().getParameters()["out-size"].asUInt();
+}
+Builder::FullyConnectedLayer& Builder::FullyConnectedLayer::setOutputNum(size_t outNum) {
+    getLayer().getParameters()["out-size"] = outNum;
+    return *this;
+}
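
A minimal sketch (illustrative): the constructor initializes out-size to 0, so it must be set to the intended output dimension:

    InferenceEngine::Builder::FullyConnectedLayer fc("fc1");
    fc.setOutputNum(1000);   // writes the "out-size" parameter read back by getOutputNum()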
diff --git a/inference-engine/src/inference_engine/builders/ie_grn_layer.cpp b/inference-engine/src/inference_engine/builders/ie_grn_layer.cpp
new file mode 100644 (file)
index 0000000..1cc1a7a
--- /dev/null
@@ -0,0 +1,45 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_grn_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::GRNLayer::GRNLayer(const std::string& name): LayerFragment("GRN", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+    setBeta(0);
+}
+
+Builder::GRNLayer::GRNLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "GRN"))
+        THROW_IE_EXCEPTION << "Cannot create GRNLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::GRNLayer& Builder::GRNLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::GRNLayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::GRNLayer& Builder::GRNLayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+float Builder::GRNLayer::getBeta() const {
+    return getLayer().getParameters()["beta"].asFloat();
+}
+
+Builder::GRNLayer& Builder::GRNLayer::setBeta(float beta) {
+    getLayer().getParameters()["beta"] = beta;
+    return *this;
+}
diff --git a/inference-engine/src/inference_engine/builders/ie_input_layer_layer.cpp b/inference-engine/src/inference_engine/builders/ie_input_layer_layer.cpp
new file mode 100644 (file)
index 0000000..e7e099f
--- /dev/null
@@ -0,0 +1,40 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_input_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::InputLayer::InputLayer(const std::string& name): LayerFragment("Input", name) {
+    getLayer().getOutputPorts().resize(1);
+}
+
+Builder::InputLayer::InputLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Input"))
+        THROW_IE_EXCEPTION << "Cannot create InputLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::InputLayer& Builder::InputLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::InputLayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::InputLayer& Builder::InputLayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+void Builder::InputLayer::validate(const Layer& layer) {
+    if (layer.getOutputPorts()[0].shape().empty())
+        THROW_IE_EXCEPTION << layer.getType() << " node " << layer.getName() << " must have a non-empty shape!";
+}
+
+REG_VALIDATOR_FOR(Input,  Builder::InputLayer::validate);
\ No newline at end of file
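
REG_VALIDATOR_FOR registers Builder::InputLayer::validate for the "Input" type, so Layer::validate() dispatches it whenever such a layer is validated or built. A small sketch of what the check enforces, with an assumed illustrative shape:

    Builder::InputLayer in("data");
    in.setPort(Port({1, 3, 224, 224}));    // an empty shape would make validation throw
    Builder::Layer generic = in;           // LayerFragment's conversion operator runs validate()
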
diff --git a/inference-engine/src/inference_engine/builders/ie_layer_builder.cpp b/inference-engine/src/inference_engine/builders/ie_layer_builder.cpp
new file mode 100644 (file)
index 0000000..a65dd7c
--- /dev/null
@@ -0,0 +1,165 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_layer_builder.hpp>
+#include <details/caseless.hpp>
+#include <ie_network.hpp>
+
+#include <limits>
+#include <memory>
+#include <vector>
+#include <string>
+#include <map>
+
+using namespace InferenceEngine;
+
+Builder::Layer::Layer(const std::string& type, const std::string& name): id((std::numeric_limits<idx_t>::max)()), type(type), name(name) {}
+
+Builder::Layer::Layer(const ILayer::Ptr& layer) {
+    id = layer->getId();
+    getType() = layer->getType();
+    getName() = layer->getName();
+    getGraph() = layer->getGraph();
+    getParameters() = layer->getParameters()->getParameters();
+    getInputPorts() = layer->getInputPorts();
+    getOutputPorts() = layer->getOutputPorts();
+    getConstantData() = layer->getParameters()->getConstantData();
+}
+Builder::Layer::Layer(const ILayer::CPtr& layer) {
+    id = layer->getId();
+    getType() = layer->getType();
+    getName() = layer->getName();
+    getGraph() = layer->getGraph();
+    getParameters() = layer->getParameters()->getParameters();
+    getInputPorts() = layer->getInputPorts();
+    getOutputPorts() = layer->getOutputPorts();
+    getConstantData() = layer->getParameters()->getConstantData();
+}
+
+Builder::Layer::Layer(idx_t id, const Builder::Layer& layer): Layer(layer) {
+    this->id = id;
+}
+
+idx_t Builder::Layer::getId() const {
+    return id;
+}
+
+std::string& Builder::Layer::getType() {
+    return type;
+}
+const std::string& Builder::Layer::getType() const {
+    return type;
+}
+Builder::Layer& Builder::Layer::setType(const std::string& type) {
+    getType() = type;
+    return *this;
+}
+
+std::string& Builder::Layer::getName() {
+    return name;
+}
+const std::string& Builder::Layer::getName() const {
+    return name;
+}
+Builder::Layer& Builder::Layer::setName(const std::string& name) {
+    getName() = name;
+    return *this;
+}
+
+INetwork::Ptr& Builder::Layer::getGraph() {
+    return graph;
+}
+const INetwork::Ptr& Builder::Layer::getGraph() const {
+    return graph;
+}
+Builder::Layer& Builder::Layer::setGraph(const INetwork::Ptr& graph) {
+    getGraph() = graph;
+    return *this;
+}
+
+const std::map<std::string, Parameter>& Builder::Layer::getParameters() const {
+    return params;
+}
+std::map<std::string, Parameter>& Builder::Layer::getParameters() {
+    return params;
+}
+Builder::Layer& Builder::Layer::setParameters(const std::map<std::string, Parameter>& params) {
+    getParameters() = params;
+    return *this;
+}
+
+const std::map<std::string, Blob::CPtr>& Builder::Layer::getConstantData() const {
+    return constData;
+}
+std::map<std::string, Blob::CPtr>& Builder::Layer::getConstantData() {
+    return constData;
+}
+Builder::Layer& Builder::Layer::setConstantData(const std::map<std::string, Blob::Ptr>& constData) {
+    for (const auto& it : constData)
+        addConstantData(it.first, it.second);
+    return *this;
+}
+Builder::Layer& Builder::Layer::setConstantData(const std::map<std::string, Blob::CPtr>& constData) {
+    getConstantData() = constData;
+    return *this;
+}
+Builder::Layer& Builder::Layer::addConstantData(const std::string& name, const Blob::CPtr& data) {
+    getConstantData()[name] = data;
+    return *this;
+}
+
+std::vector<Port>& Builder::Layer::getInputPorts() {
+    return inPorts;
+}
+const std::vector<Port>& Builder::Layer::getInputPorts() const {
+    return inPorts;
+}
+Builder::Layer& Builder::Layer::setInputPorts(const std::vector<Port> &ports) {
+    getInputPorts() = ports;
+    return *this;
+}
+
+std::vector<Port>& Builder::Layer::getOutputPorts() {
+    return outPorts;
+}
+const std::vector<Port>& Builder::Layer::getOutputPorts() const {
+    return outPorts;
+}
+Builder::Layer& Builder::Layer::setOutputPorts(const std::vector<Port> &ports) {
+    getOutputPorts() = ports;
+    return *this;
+}
+
+const ILayer::Ptr Builder::Layer::build() const {
+    validate();
+    details::Layer::Ptr layer = std::make_shared<details::Layer>(id);
+
+    layer->getName() = name;
+    layer->getType() = type;
+    layer->setGraph(graph);
+    layer->getInputPorts() = inPorts;
+    layer->getOutputPorts() = outPorts;
+    layer->getParameters()->getParameters() = params;
+    layer->getParameters()->getConstantData() = constData;
+    return std::static_pointer_cast<ILayer>(layer);
+}
+
+void Builder::Layer::addValidator(const std::string &type, const std::function<void(const Layer&)>& validator) {
+    auto holder = getValidatorsHolder();
+    if (holder->validators.find(type) == holder->validators.end())
+        holder->validators[type] = validator;
+}
+
+void Builder::Layer::validate() const {
+    if (getValidatorsHolder()->validators.find(type) != getValidatorsHolder()->validators.end())
+        getValidatorsHolder()->validators[type](*this);
+}
+
+std::shared_ptr<Builder::ValidatorsHolder> Builder::Layer::getValidatorsHolder() {
+    static std::shared_ptr<ValidatorsHolder> localHolder;
+    if (localHolder == nullptr) {
+        localHolder = std::make_shared<ValidatorsHolder>();
+    }
+    return localHolder;
+}
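
Every typed decorator in this directory ultimately edits the untyped Layer defined above. A sketch of driving it directly, without a decorator; the "Norm"/"local-size" names follow the NormLayer file later in this patch, and the values are illustrative:

    Builder::Layer lrn("Norm", "lrn1");
    lrn.getInputPorts().resize(1);
    lrn.getOutputPorts().resize(1);
    lrn.getParameters()["local-size"] = size_t(5);
    ILayer::Ptr built = lrn.build();       // runs the validator registered for "Norm", if any
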
diff --git a/inference-engine/src/inference_engine/builders/ie_layer_fragment.cpp b/inference-engine/src/inference_engine/builders/ie_layer_fragment.cpp
new file mode 100644 (file)
index 0000000..8cefe78
--- /dev/null
@@ -0,0 +1,52 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_layer_fragment.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+using namespace details;
+
+Builder::LayerFragment::LayerFragment(const std::string& type, const std::string& name): layer(type, name), refLayer(layer) {}
+
+Builder::LayerFragment::LayerFragment(Layer& genLayer): layer("", ""), refLayer(genLayer) {}
+
+Builder::LayerFragment &Builder::LayerFragment::operator=(const Builder::LayerFragment &rval) {
+    layer = rval.layer;
+    refLayer = rval.refLayer;
+    if (!layer.getType().empty() && !layer.getName().empty())
+        refLayer = layer;
+    return *this;
+}
+
+Builder::LayerFragment::LayerFragment(const Builder::LayerFragment & rval): LayerFragment("", "") {
+    *this = rval;
+}
+
+Builder::LayerFragment::operator Builder::Layer() const {
+    getLayer().validate();
+    return getLayer();
+}
+
+const std::string& Builder::LayerFragment::getType() const {
+    return getLayer().getType();
+}
+const std::string& Builder::LayerFragment::getName() const {
+    return getLayer().getName();
+}
+
+Builder::Layer& Builder::LayerFragment::getLayer() const {
+    return refLayer;
+}
+
+const std::vector<size_t> Builder::LayerFragment::uInts2size_t(const std::vector<unsigned int>& vector) const {
+    std::vector<size_t> newVector;
+    newVector.reserve(vector.size());
+    for (const auto& it : vector) {
+        newVector.push_back(it);
+    }
+    return newVector;
+}
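
A sketch of the pattern this base class enables: a typed view over an existing generic Layer, whose setters write through refLayer into the original object (the GRN decorator from earlier in this patch serves as the example):

    Builder::Layer generic("GRN", "grn1");
    generic.getInputPorts().resize(1);
    generic.getOutputPorts().resize(1);
    Builder::GRNLayer grn(generic);        // throws for any type other than "GRN"
    grn.setBeta(0.5f);                     // mutates `generic` through the reference
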
diff --git a/inference-engine/src/inference_engine/builders/ie_memory_layer.cpp b/inference-engine/src/inference_engine/builders/ie_memory_layer.cpp
new file mode 100644 (file)
index 0000000..f987b07
--- /dev/null
@@ -0,0 +1,70 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_memory_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::MemoryLayer::MemoryLayer(const std::string& name): LayerFragment("Memory", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+}
+
+Builder::MemoryLayer::MemoryLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Memory"))
+        THROW_IE_EXCEPTION << "Cannot create MemoryLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::MemoryLayer& Builder::MemoryLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::MemoryLayer::getInputPort() const {
+    return getLayer().getInputPorts()[0];
+}
+
+Builder::MemoryLayer& Builder::MemoryLayer::setInputPort(const Port &port) {
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+const Port& Builder::MemoryLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::MemoryLayer& Builder::MemoryLayer::setOutputPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+const std::string Builder::MemoryLayer::getId() const {
+    return getLayer().getParameters()["id"];
+}
+Builder::MemoryLayer& Builder::MemoryLayer::setId(const std::string& id) {
+    getLayer().getParameters()["id"] = id;
+    return *this;
+}
+size_t Builder::MemoryLayer::getIndex() const {
+    return getLayer().getParameters()["index"].asUInt();
+}
+Builder::MemoryLayer& Builder::MemoryLayer::setIndex(size_t index) {
+    if (index > 1)
+        THROW_IE_EXCEPTION << "Index supports only 0 and 1 values.";
+    getLayer().getParameters()["index"] = index;
+    return *this;
+}
+size_t Builder::MemoryLayer::getSize() const {
+    return getLayer().getParameters()["size"].asUInt(2);
+}
+Builder::MemoryLayer& Builder::MemoryLayer::setSize(size_t size) {
+    if (size != 2)
+        THROW_IE_EXCEPTION << "Only size equal 2 is supported.";
+    getLayer().getParameters()["size"] = size;
+    return *this;
+}
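
Memory layers are typically used in pairs that share an id, with index distinguishing the two ends of the state; the setters above enforce that only indices 0 and 1 and a size of 2 are accepted. A hedged sketch with illustrative names:

    Builder::MemoryLayer memRead("mem_read");
    memRead.setId("state0").setIndex(0).setSize(2);
    Builder::MemoryLayer memWrite("mem_write");
    memWrite.setId("state0").setIndex(1).setSize(2);
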
diff --git a/inference-engine/src/inference_engine/builders/ie_mvn_layer.cpp b/inference-engine/src/inference_engine/builders/ie_mvn_layer.cpp
new file mode 100644 (file)
index 0000000..0211e9f
--- /dev/null
@@ -0,0 +1,60 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_mvn_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::MVNLayer::MVNLayer(const std::string& name): LayerFragment("MVN", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+    setEpsilon(9.999999717180685e-10f);
+    setNormalize(true);
+    setAcrossChannels(true);
+}
+
+Builder::MVNLayer::MVNLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "MVN"))
+        THROW_IE_EXCEPTION << "Cannot create MVNLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::MVNLayer& Builder::MVNLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::MVNLayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::MVNLayer& Builder::MVNLayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+bool Builder::MVNLayer::getAcrossChannels() const {
+    return getLayer().getParameters()["across_channels"].asBool(true);
+}
+Builder::MVNLayer& Builder::MVNLayer::setAcrossChannels(bool flag) {
+    getLayer().getParameters()["across_channels"] = flag ? 1 : 0;
+    return *this;
+}
+bool Builder::MVNLayer::getNormalize() const {
+    return getLayer().getParameters()["normalize_variance"].asBool(true);
+}
+Builder::MVNLayer& Builder::MVNLayer::setNormalize(bool flag) {
+    getLayer().getParameters()["normalize_variance"] = flag ? 1 : 0;
+    return *this;
+}
+float Builder::MVNLayer::getEpsilon() const {
+    return getLayer().getParameters()["eps"].asFloat();
+}
+Builder::MVNLayer& Builder::MVNLayer::setEpsilon(float eps) {
+    getLayer().getParameters()["eps"] = eps;
+    return *this;
+}
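
The constructor above pre-populates the MVN defaults (an epsilon of roughly 1e-9, with normalize_variance and across_channels both enabled), so callers only override what differs; the shape below is illustrative:

    Builder::MVNLayer mvn("mvn1");
    mvn.setPort(Port({1, 3, 224, 224}));   // one Port is applied to both input and output
    mvn.setAcrossChannels(false);          // stored as across_channels = 0
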
diff --git a/inference-engine/src/inference_engine/builders/ie_network_builder.cpp b/inference-engine/src/inference_engine/builders/ie_network_builder.cpp
new file mode 100644 (file)
index 0000000..70d3cde
--- /dev/null
@@ -0,0 +1,699 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <ie_builders.hpp>
+#include <ie_network.hpp>
+#include "graph_tools.hpp"
+
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <memory>
+#include <vector>
+#include <string>
+#include <limits>
+#include <map>
+
+#include <shape_infer/ie_reshaper.hpp>
+#include "ie_format_parser.h"
+#include "ie_layer_parsers.h"
+#include "blob_factory.hpp"
+#include <details/caseless.hpp>
+
+#include "ie_cnn_layer_builder.h"
+#include "ie_memcpy.h"
+
+using namespace InferenceEngine;
+
+/******************************************************************************
+ Network builder
+ ******************************************************************************/
+Builder::Network::Network(const std::string &name): Builder::Network(Context(), name) {}
+Builder::Network::Network(const INetwork &network): Builder::Network(Context(), network) {}
+Builder::Network::Network(const ICNNNetwork &network): Builder::Network(Context(), network) {}
+
+Builder::Network::Network(const Context& ieContext, const std::string &name): ctx(ieContext), name(name), version(3) {}
+
+Builder::Network::Network(const Context& ieContext, const INetwork &network): ctx(ieContext), name(network.getName()), version(3) {
+    for (const auto& layer : network) {
+        layers.push_back(Layer(layer));
+        const auto layerConnections = network.getLayerConnections(layer->getId());
+        for (const auto& connection : layerConnections) {
+            bool found = false;
+            for (const auto& con : connections) {
+                if (con == connection) {
+                    found = true;
+                    break;
+                }
+            }
+            if (!found) {
+                connections.push_back(connection);
+            }
+        }
+    }
+}
+
+Builder::Network::Network(const Context& ieContext, const ICNNNetwork &network): ctx(ieContext), name(network.getName()), version(0) {
+    auto allInputs = CNNNetGetAllInputLayers(network);
+    InputsDataMap inputs;
+    network.getInputsInfo(inputs);
+    if (inputs.empty() && allInputs.empty())
+        THROW_IE_EXCEPTION << "Cannot create graph! No inputs for the topology " << network.getName();
+
+    std::unordered_map<std::string, idx_t> name2id;
+    std::unordered_set<Data*> dataPtrs;
+    std::vector<CNNLayerPtr> queueLayers;
+
+    auto createGenericFromCNNLayer = [&](const CNNLayerPtr& cnnLayer) {
+        std::vector<Port> inputPorts;
+        for (const auto& data : cnnLayer->insData) {
+            auto lockedData = data.lock();
+            if (!lockedData)
+                continue;
+            if (dataPtrs.find(lockedData.get()) == dataPtrs.end()) {
+                dataPtrs.insert(lockedData.get());
+            }
+            inputPorts.emplace_back(lockedData->getTensorDesc().getDims());
+        }
+        std::vector<Port> outputPorts;
+        for (const auto& data : cnnLayer->outData) {
+            if (dataPtrs.find(data.get()) == dataPtrs.end()) {
+                dataPtrs.insert(data.get());
+            }
+            outputPorts.push_back(Port(data->getTensorDesc().getDims()));
+        }
+
+        std::map<std::string, Parameter> params;
+        for (const auto& it : cnnLayer->params) {
+            params[it.first] = it.second;
+        }
+        const auto layer = Layer(cnnLayer->type, cnnLayer->name)
+                .setInputPorts(inputPorts).setOutputPorts(outputPorts)
+                .setParameters(params).setConstantData(cnnLayer->blobs);
+        idx_t layerId = addLayer(layer);
+        name2id[layer.getName()] = layerId;
+        return layerId;
+    };
+
+    auto addPreProcessFor = [&](const InputInfo::Ptr& inputInfo) {
+        auto inputLayer = getLayer(name2id[inputInfo->name()]);
+        if (inputLayer.getType().empty() && inputLayer.getName().empty())
+            return;
+
+        ResizeAlgorithm alg = inputInfo->getPreProcess().getResizeAlgorithm();
+        std::string algStr;
+        switch (alg) {
+            case RESIZE_BILINEAR:
+                algStr = "RESIZE_BILINEAR";
+                break;
+            case RESIZE_AREA:
+                algStr = "RESIZE_AREA";
+                break;
+            default:
+                break;
+        }
+
+        if (!algStr.empty())
+            inputLayer.getParameters()["resize_alg"] = algStr;
+
+        switch (inputInfo->getPreProcess().getMeanVariant()) {
+            case MEAN_IMAGE: {
+                auto meanWidth = inputInfo->getPreProcess()[0]->meanData->dims()[0];
+                auto meanHeight = inputInfo->getPreProcess()[0]->meanData->dims()[1];
+
+                TensorDesc desc(Precision::FP32, inputLayer.getOutputPorts()[0].shape(), Layout::NCHW);
+                Blob::Ptr meanBuffer = make_blob_with_precision(desc);
+                meanBuffer->allocate();
+                auto *meanData = meanBuffer->buffer().as<float *>();
+                for (unsigned channel = 0; channel < inputInfo->getPreProcess().getNumberOfChannels(); channel++) {
+                    Blob::Ptr meanBlob = inputInfo->getPreProcess()[channel]->meanData;
+                    if (!meanBlob || meanBlob->precision() != Precision::FP32)
+                        THROW_IE_EXCEPTION << "mean image not provided or not in Float 32";
+                    if (meanBlob->size() != meanHeight*meanWidth) {
+                        THROW_IE_EXCEPTION << "mean image size does not match expected network input, expecting " << meanWidth << " x " << meanHeight;
+                    }
+                    ie_memcpy(meanData + channel*meanBlob->size(),
+                            meanBuffer->byteSize() - channel*meanBlob->size() * sizeof(float),
+                            meanBlob->buffer(),
+                            meanBlob->byteSize());
+                }
+
+                // WA for batch != 1
+                // Reshape for new batch is not supported for models with mean image
+                size_t noBatchSize = desc.getBlockingDesc().getStrides()[0];
+                for (size_t b = 1; b < inputLayer.getOutputPorts()[0].shape()[0]; b++) {
+                    ie_memcpy(meanData + noBatchSize*b,
+                              meanBuffer->byteSize() - noBatchSize * b * sizeof(float),
+                              meanData,
+                              noBatchSize * sizeof(float));
+                }
+
+                std::vector<PortInfo> outPorts;
+                std::vector<Connection> inputConnections = getLayerConnections(inputLayer.getId());
+                for (const auto& connection : inputConnections) {
+                    outPorts.push_back(connection.to());
+                    disconnect(connection);
+                }
+
+                idx_t constId = addLayer(Builder::ConstLayer(inputLayer.getName() + "_mean_image")
+                                                 .setPort(inputLayer.getOutputPorts()[0]).setData(meanBuffer));
+                idx_t constNegId = addLayer({{constId}}, Builder::PowerLayer(inputLayer.getName() + "_mean_image_neg")
+                                                 .setPort(inputLayer.getOutputPorts()[0]).setScale(-1));
+
+                idx_t eltwiseId = addLayer({{inputLayer.getId()}, {constNegId}},
+                        Builder::EltwiseLayer(inputLayer.getName() + "_mean_image_elt")
+                             .setInputPorts({inputLayer.getOutputPorts()[0], inputLayer.getOutputPorts()[0]})
+                             .setOutputPort(inputLayer.getOutputPorts()[0])
+                             .setEltwiseType(Builder::EltwiseLayer::EltwiseType::SUM));
+
+                for (const auto& port : outPorts) {
+                    connect({eltwiseId}, port);
+                }
+            }
+                break;
+            case MEAN_VALUE: {
+                TensorDesc desc(Precision::FP32, {inputInfo->getPreProcess().getNumberOfChannels()}, Layout::C);
+                Blob::Ptr mean = make_blob_with_precision(desc);
+                mean->allocate();
+                Blob::Ptr scale = make_blob_with_precision(desc);
+                scale->allocate();
+                Blob::Ptr emptyScale = make_blob_with_precision(desc);
+                emptyScale->allocate();
+                auto *meanData = mean->buffer().as<float *>();
+                auto *scaleData = scale->buffer().as<float *>();
+                auto *emptyScaleData = emptyScale->buffer().as<float *>();
+                bool noMean = true;
+                bool noScale = true;
+                for (size_t i = 0; i < inputInfo->getPreProcess().getNumberOfChannels(); i++) {
+                    meanData[i] = -inputInfo->getPreProcess()[i]->meanValue;
+                    noMean = noMean && (meanData[i] == 0);
+                    scaleData[i] = inputInfo->getPreProcess()[i]->stdScale;
+                    emptyScaleData[i] = 1;
+                    noScale = noScale && (scaleData[i] == 1);
+                }
+                std::vector<PortInfo> outPorts;
+                std::vector<Connection> inputConnections = getLayerConnections(inputLayer.getId());
+                for (const auto& connection : inputConnections) {
+                    outPorts.push_back(connection.to());
+                    disconnect(connection);
+                }
+
+                idx_t meanId = inputLayer.getId();
+                if (!noMean) {
+                    meanId = addLayer({{inputLayer.getId()}},
+                                            Builder::ScaleShiftLayer(inputLayer.getName() + "_mean_value")
+                                                    .setPort(inputLayer.getOutputPorts()[0])
+                                                    .setBiases(mean).setWeights(emptyScale));
+                }
+
+                idx_t scaleId = meanId;
+                if (!noScale) {
+                    scaleId = addLayer({{meanId}},
+                                             Builder::ScaleShiftLayer(inputLayer.getName() + "_scale_value")
+                                                     .setPort(inputLayer.getOutputPorts()[0])
+                                                     .setWeights(scale));
+                }
+
+                for (const auto& port : outPorts) {
+                    connect({scaleId}, port);
+                }
+            }
+                break;
+            default:
+                break;
+        }
+    };
+
+    for (auto input : inputs) {
+        auto inputLayer = input.second->getInputData()->getCreatorLayer().lock();
+
+        if (dataPtrs.find(input.second->getInputData().get()) == dataPtrs.end()) {
+            dataPtrs.insert(input.second->getInputData().get());
+        }
+
+        if (!inputLayer) {
+            // For v1 parser
+            inputLayer.reset(new CNNLayer({input.second->getInputData()->getName(),
+                                           "Input",
+                                           input.second->getInputData()->getPrecision()}));
+
+            inputLayer->outData.push_back(input.second->getInputData());
+        }
+        const auto layer = InputLayer(inputLayer->name).setPort(Port(inputLayer->outData[0]->getTensorDesc().getDims()));
+        name2id[layer.getName()] = addLayer(layer);
+
+        for (const auto &nlayer : input.second->getInputData()->getInputTo()) {
+            queueLayers.push_back(nlayer.second);
+        }
+    }
+    for (auto input : allInputs) {
+        auto isRealInput = std::find_if(std::begin(inputs), std::end(inputs),
+                                        [&](InputsDataMap::value_type &inputInfo) {
+                                            return inputInfo.second->getInputData()->getName() == input->name;
+                                        });
+        if (isRealInput != std::end(inputs)) {
+            continue;
+        }
+
+        details::CaselessEq<std::string> eq;
+        CNNLayerPtr cnnLayer = input;
+
+        if (eq(input->type, "Memory")) {
+            auto memoryId = input->GetParamAsString("id");
+            cnnLayer.reset(new CNNLayer({input->name + "/id=" + memoryId, "MemoryInput", input->precision}));
+            cnnLayer->params = input->params;
+            cnnLayer->outData = input->outData;
+        }
+
+        createGenericFromCNNLayer(cnnLayer);
+
+        for (auto &&outData : input->outData) {
+            for (auto &&nlayer : outData->getInputTo()) {
+                queueLayers.push_back(nlayer.second);
+            }
+        }
+    }
+    while (!queueLayers.empty()) {
+        auto cnnLayerPtr = *queueLayers.begin();
+
+        if (name2id.find(cnnLayerPtr->name) == name2id.end()) {
+            createGenericFromCNNLayer(cnnLayerPtr);
+
+            for (auto &&outData : cnnLayerPtr->outData) {
+                for (auto &&nlayer : outData->getInputTo()) {
+                    queueLayers.push_back(nlayer.second);
+                }
+            }
+        }
+
+        queueLayers.erase(queueLayers.begin());
+    }
+    std::map<std::string, DataPtr> output;
+    network.getOutputsInfo(output);
+
+    for (auto it = output.begin(); it != output.end(); it++) {
+        CNNLayerPtr creator = (*it).second->getCreatorLayer().lock();
+        if (name2id.find(creator->name) == name2id.end())
+            THROW_IE_EXCEPTION << "Cannot find output layer " << creator->name;
+
+        auto lastLayer = getLayer(name2id[creator->name]);
+        if (lastLayer.getName() == "" && lastLayer.getType().empty())
+            THROW_IE_EXCEPTION << "Cannot find output layer " << creator->name;
+
+        std::string name = "out_" + lastLayer.getName();
+
+        CNNLayerPtr cnnOutLayer(new CNNLayer({name, "Output", creator->outData[0]->getPrecision()}));
+        cnnOutLayer->insData.push_back((*it).second);
+
+        idx_t outLayerId = createGenericFromCNNLayer(cnnOutLayer);
+
+        idx_t inIdx(0);
+        for (size_t i = 0; i < creator->outData.size(); i++) {
+            if (creator->outData[i] == (*it).second) {
+                inIdx = i;
+                break;
+            }
+        }
+
+        connections.push_back(Connection({lastLayer.getId(), inIdx}, {outLayerId}));
+    }
+
+    for (const auto dataPtr : dataPtrs) {
+        auto cnnInputLayer = dataPtr->getCreatorLayer().lock();
+        idx_t inIdx(0);
+        if (!cnnInputLayer) {
+            // For v1 parser
+            cnnInputLayer.reset(new CNNLayer({dataPtr->getName(),
+                                              "Input",
+                                              dataPtr->getPrecision()}));
+        } else {
+            for (size_t i = 0; i < cnnInputLayer->outData.size(); i++) {
+                if (cnnInputLayer->outData[i].get() == dataPtr) {
+                    inIdx = i;
+                    break;
+                }
+            }
+        }
+        for (const auto& it : dataPtr->inputTo) {
+            if (name2id.find(cnnInputLayer->name) == name2id.end() || name2id.find(it.second->name) == name2id.end())
+                THROW_IE_EXCEPTION << "Cannot create connections between nodes: " << cnnInputLayer->name << " -> " << it.second->name;
+            idx_t outIdx(0);
+
+            for (size_t i = 0; i < it.second->insData.size(); i++) {
+                const auto lockedData = it.second->insData[i].lock();
+                if (lockedData && lockedData.get() == dataPtr) {
+                    outIdx = i;
+                    break;
+                }
+            }
+            connections.push_back(Connection({name2id[cnnInputLayer->name], inIdx}, {name2id[it.second->name], outIdx}));
+        }
+    }
+
+    for (auto input : inputs) {
+        addPreProcessFor(input.second);
+    }
+}
+
+std::vector<Builder::Layer>& Builder::Network::getLayers() {
+    return layers;
+}
+
+const std::vector<Builder::Layer>& Builder::Network::getLayers() const {
+    return layers;
+}
+
+idx_t Builder::Network::addLayer(const std::vector<PortInfo> &inputs,
+                                 const Layer& layer) {
+    auto layer_id = addLayer(layer);
+    for (size_t i = 0; i < inputs.size(); i++) {
+        connect({inputs[i].layerId(), inputs[i].portId()}, {layer_id, i});
+    }
+    return layer_id;
+}
+
+idx_t Builder::Network::addLayer(const Layer& layer) {
+    auto getAvailableId = [&](idx_t defaultId) {
+        if (defaultId == (std::numeric_limits<idx_t>::max)())
+            defaultId = 0;
+
+        // Scan for the first free id, restarting whenever a collision bumps it.
+        bool idIsTaken = true;
+        while (idIsTaken) {
+            idIsTaken = false;
+            for (const auto& l : layers) {
+                if (l.getId() == defaultId) {
+                    defaultId++;
+                    idIsTaken = true;
+                    break;
+                }
+            }
+        }
+        return defaultId;
+    };
+    auto generateAvailableName = [&](const std::string& name, idx_t id) {
+        const std::string idName = "id" + std::to_string(id);
+        std::string generatedName(name);
+        if (generatedName.empty())
+            generatedName = idName;
+        bool nameIsUnique(false);
+        while (!nameIsUnique) {
+            nameIsUnique = true;
+            for (const auto& layer : layers) {
+                if (generatedName == layer.getName()) {
+                    nameIsUnique = false;
+                    generatedName += "_" + idName;
+                }
+            }
+        }
+        return generatedName;
+    };
+    idx_t generatedId = getAvailableId(layer.getId());
+    const auto name = generateAvailableName(layer.getName(), generatedId);
+    layers.emplace_back(generatedId, layer);
+    layers[layers.size() - 1].getName() = name;
+    return generatedId;
+}
+
+void Builder::Network::connect(const PortInfo& input, const PortInfo& output) {
+    connections.emplace_back(input, output);
+}
+
+void Builder::Network::removeLayer(idx_t layerId) {
+    auto it = layers.begin();
+    for (; it != layers.end(); it++) {
+        if (it->getId() == layerId) {
+            break;
+        }
+    }
+    if (it != layers.end())
+        layers.erase(it);
+}
+
+void Builder::Network::disconnect(const Connection& connection) {
+    auto it = connections.begin();
+    for (; it != connections.end(); it++) {
+        if (connection == *it)
+            break;
+    }
+    if (it != connections.end())
+        connections.erase(it);
+}
+
+const INetwork::Ptr Builder::Network::build() const {
+    // Check that all ports are connected
+    for (const auto& layer : layers) {
+        std::vector<bool> existInCon(layer.getInputPorts().size());
+        std::vector<bool> existOutCon(layer.getOutputPorts().size());
+
+        const auto layerConnections = getLayerConnections(layer.getId());
+        for (const auto& connection : layerConnections) {
+            if (connection.from().layerId() == layer.getId()) {
+                existOutCon[connection.from().portId()] = true;
+                getLayer(connection.to().layerId());
+            }
+            if (connection.to().layerId() == layer.getId()) {
+                existInCon[connection.to().portId()] = true;
+                getLayer(connection.from().layerId());
+            }
+        }
+        bool allPortsConnected = true;
+        for (const auto& cons : {existInCon, existOutCon}) {
+            for (const auto &existCon : cons) {
+                allPortsConnected = allPortsConnected && existCon;
+            }
+        }
+        if (!allPortsConnected)
+            THROW_IE_EXCEPTION << "Not all ports of layer " << layer.getName() << " were connected!";
+    }
+
+    InferenceEngine::details::Network::Ptr network = std::make_shared<InferenceEngine::details::Network>(ctx, name);
+    for (const auto& layer : layers) {
+        network->addLayer(layer.build());
+    }
+    for (const auto& connection : connections) {
+        network->addConnection(connection);
+    }
+
+    // Check that all ports are connected
+    for (const auto& layer : *network) {
+        std::vector<bool> existInCon(layer->getInputPorts().size());
+        std::vector<bool> existOutCon(layer->getOutputPorts().size());
+
+        const auto layerConnections = network->getLayerConnections(layer->getId());
+        for (const auto& connection : layerConnections) {
+            if (connection.from().layerId() == layer->getId()) {
+                existOutCon[connection.from().portId()] = true;
+            }
+            if (connection.to().layerId() == layer->getId()) {
+                existInCon[connection.to().portId()] = true;
+            }
+        }
+        bool allPortsConnected = true;
+        for (const auto& cons : {existInCon, existOutCon}) {
+            for (const auto &existCon : cons) {
+                allPortsConnected = allPortsConnected && existCon;
+            }
+        }
+        if (!allPortsConnected)
+            THROW_IE_EXCEPTION << "Not all ports of layer " << layer->getName() << " were connected!";
+    }
+
+    std::map<std::string, SizeVector> inputShapes;
+    for (const auto& input : network->getInputs())
+        inputShapes[input->getName()] = input->getOutputPorts()[0].shape();
+
+    if (version) {
+        details::BaseCreator::version_ = version;
+    }
+
+    ShapeInfer::Reshaper reshaper(ctx, network);
+    ResponseDesc resp;
+    StatusCode sts = reshaper.run(inputShapes, &resp);
+    // NOT_FOUND means some layers lack a registered shape-infer implementation; if all shapes
+    // were already read from the IR and agree across every connection, accept the graph.
+    if (sts == NOT_FOUND) {
+        bool allShapesLooksGood = true;
+        for (const auto& connection : network->getConnections()) {
+            if (network->getLayer(connection.from().layerId())->
+                    getOutputPorts()[connection.from().portId()].shape() !=
+                network->getLayer(connection.to().layerId())->
+                        getInputPorts()[connection.to().portId()].shape()) {
+                allShapesLooksGood = false;
+                break;
+            }
+        }
+        if (allShapesLooksGood)
+            sts = OK;
+    }
+
+    if (sts != OK)
+        THROW_IE_EXCEPTION << resp.msg;
+
+    return std::static_pointer_cast<INetwork>(network);
+}
+
+const std::shared_ptr<ICNNNetwork> Builder::convertToICNNNetwork(const INetwork::Ptr& network) {
+    std::unique_ptr<details::CNNNetworkImpl> cnnNetworkImpl(new details::CNNNetworkImpl());
+
+    Precision detectedPrecision = Precision::FP32;
+    for (const auto& layer : *network) {
+        const auto& params = layer->getParameters();
+        if (!params)
+            continue;
+        Precision prc = Precision::UNSPECIFIED;
+        for (const auto& blobIterator : params->getConstantData()) {
+            if (blobIterator.second) {
+                prc = blobIterator.second->precision();
+                break;
+            }
+        }
+        if (prc != Precision::UNSPECIFIED) {
+            detectedPrecision = prc;
+            break;
+        }
+    }
+
+    auto createCNNLayer = [](const std::shared_ptr<const ILayer>& layer, Precision precision) {
+        static std::vector<std::shared_ptr<BaseConverter>> convertors = {
+                std::make_shared<LayerConverter<InferenceEngine::PowerLayer>>("Power"),
+                std::make_shared<LayerConverter<InferenceEngine::ConvolutionLayer>>("Convolution"),
+                std::make_shared<LayerConverter<InferenceEngine::DeconvolutionLayer>>("Deconvolution"),
+                std::make_shared<LayerConverter<InferenceEngine::PoolingLayer>>("Pooling"),
+                std::make_shared<LayerConverter<InferenceEngine::FullyConnectedLayer>>("InnerProduct"),
+                std::make_shared<LayerConverter<InferenceEngine::FullyConnectedLayer>>("FullyConnected"),
+                std::make_shared<LayerConverter<InferenceEngine::NormLayer>>("LRN"),
+                std::make_shared<LayerConverter<InferenceEngine::NormLayer>>("Norm"),
+                std::make_shared<LayerConverter<InferenceEngine::SoftMaxLayer>>("Softmax"),
+                std::make_shared<LayerConverter<InferenceEngine::GRNLayer>>("GRN"),
+                std::make_shared<LayerConverter<InferenceEngine::MVNLayer>>("MVN"),
+                std::make_shared<LayerConverter<InferenceEngine::ReLULayer>>("ReLU"),
+                std::make_shared<LayerConverter<InferenceEngine::ClampLayer>>("Clamp"),
+                std::make_shared<LayerConverter<InferenceEngine::SplitLayer>>("Split"),
+                std::make_shared<LayerConverter<InferenceEngine::SplitLayer>>("Slice"),
+                std::make_shared<LayerConverter<InferenceEngine::ConcatLayer>>("Concat"),
+                std::make_shared<LayerConverter<InferenceEngine::EltwiseLayer>>("Eltwise"),
+                std::make_shared<LayerConverter<InferenceEngine::ScaleShiftLayer>>("ScaleShift"),
+                std::make_shared<LayerConverter<InferenceEngine::PReLULayer>>("PReLU"),
+                std::make_shared<LayerConverter<InferenceEngine::CropLayer>>("Crop"),
+                std::make_shared<LayerConverter<InferenceEngine::ReshapeLayer>>("Reshape"),
+                std::make_shared<LayerConverter<InferenceEngine::ReshapeLayer>>("Flatten"),
+                std::make_shared<LayerConverter<InferenceEngine::TileLayer>>("Tile"),
+                std::make_shared<ActivationConverter>(),
+                std::make_shared<LayerConverter<InferenceEngine::BatchNormalizationLayer>>("BatchNormalization"),
+        };
+        for (auto &convertor : convertors) {
+            if (!convertor->canCreate(layer->getType()))
+                continue;
+            return convertor->createLayer(layer, precision);
+        }
+        static LayerConverter<CNNLayer> genericCreator("");
+        return genericCreator.createLayer(layer, precision);
+    };
+
+    cnnNetworkImpl->setName(network->getName());
+    cnnNetworkImpl->setPrecision(Precision::UNSPECIFIED);
+    for (const auto& layer : *network) {
+        if (details::CaselessEq<std::string>()(layer->getType(), "Output"))
+            continue;
+        CNNLayerPtr cnnLayer = createCNNLayer(layer, detectedPrecision);
+        if (cnnNetworkImpl->getPrecision() == Precision::UNSPECIFIED) {
+            cnnNetworkImpl->setPrecision(cnnLayer->precision);
+        } else if (cnnNetworkImpl->getPrecision() != Precision::MIXED &&
+                   cnnNetworkImpl->getPrecision() != cnnLayer->precision) {
+            cnnNetworkImpl->setPrecision(Precision::MIXED);
+        }
+
+        auto connections = network->getLayerConnections(layer->getId());
+        std::unordered_set<idx_t> inputNum, outputNum;
+        for (const auto& connection : connections) {
+            if (connection.from().layerId() != layer->getId())
+                inputNum.insert(connection.to().portId());
+            else
+                outputNum.insert(connection.from().portId());
+        }
+        cnnLayer->insData.resize(inputNum.size());
+        cnnLayer->outData.resize(outputNum.size());
+        cnnNetworkImpl->addLayer(cnnLayer);
+    }
+
+    for (const auto& layer : *network) {
+        auto connections = network->getLayerConnections(layer->getId());
+        CNNLayerPtr cnnLayer;
+        StatusCode sts = cnnNetworkImpl->getLayerByName(layer->getName().c_str(), cnnLayer, nullptr);
+        details::CaselessEq<std::string> eq;
+        if (sts != OK && eq(layer->getType(), "Output"))
+            continue;
+        else if (sts != OK)
+            THROW_IE_EXCEPTION << "Cannot find CNNLayer by name " << layer->getName();
+
+        for (const auto& connection : connections) {
+            if (connection.from().layerId() != layer->getId())
+                continue;
+
+            const auto& outLayer = network->getLayer(connection.to().layerId());
+
+            CNNLayerPtr cnnOutLayer;
+            sts = cnnNetworkImpl->getLayerByName(outLayer->getName().c_str(), cnnOutLayer, nullptr);
+            if (sts != OK && !eq(outLayer->getType(), "Output"))
+                THROW_IE_EXCEPTION << "Cannot find CNNLayer by name " << outLayer->getName();
+
+            std::string dataName = layer->getName();
+            if (cnnLayer->outData.size() > 1) {
+                dataName += "_" + std::to_string(connection.from().portId());
+            }
+            DataPtr& data = cnnNetworkImpl->getData(dataName);
+            if (!data) {
+                TensorDesc dataDesc(detectedPrecision, layer->getOutputPorts()[connection.from().portId()].shape(),
+                                    TensorDesc::getLayoutByDims(layer->getOutputPorts()[connection.from().portId()].shape()));
+                data = std::make_shared<Data>(layer->getName(), dataDesc);
+                data->creatorLayer = cnnLayer;
+            }
+            cnnLayer->outData[connection.from().portId()] = data;
+            if (cnnOutLayer) {
+                data->inputTo[outLayer->getName()] = cnnOutLayer;
+                cnnOutLayer->insData[connection.to().portId()] = data;
+            } else {
+                cnnNetworkImpl->addOutput(data->getName());
+            }
+        }
+
+        cnnLayer->validateLayer();
+        if (eq(cnnLayer->type, "Input")) {
+            InputInfo::Ptr inputInfo(new InputInfo());
+            inputInfo->setInputData(*cnnLayer->outData.begin());
+            cnnNetworkImpl->setInputInfo(inputInfo);
+        }
+    }
+
+    return std::shared_ptr<ICNNNetwork>(cnnNetworkImpl.release());
+}
+
+Builder::Network::operator const INetwork::Ptr() const {
+    return build();
+}
+
+const Builder::Layer &Builder::Network::getLayer(idx_t layerId) const {
+    for (auto& layer : getLayers()) {
+        if (layer.getId() == layerId)
+            return layer;
+    }
+    THROW_IE_EXCEPTION << "Cannot find layer with id: " << layerId;
+}
+
+Builder::Layer &Builder::Network::getLayer(idx_t layerId) {
+    for (auto& layer : getLayers()) {
+        if (layer.getId() == layerId)
+            return layer;
+    }
+    THROW_IE_EXCEPTION << "Cannot find layer with id: " << layerId;
+}
+
+const std::vector<Connection> Builder::Network::getLayerConnections(idx_t layerId) const noexcept {
+    std::vector<Connection> layerConnections;
+    for (const auto& connection : connections) {
+        if (connection.from().layerId() == layerId || connection.to().layerId() == layerId)
+            layerConnections.push_back(connection);
+    }
+    return layerConnections;
+}
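
Taken together, this file defines the whole builder flow: add layers, connect ports, build() an INetwork, and optionally lower it to the legacy ICNNNetwork. A hedged end-to-end sketch; the names and shapes are illustrative, and `weights`/`biases` are Blob::CPtr objects assumed to be prepared by the caller:

    Builder::Network net("toy-net");
    idx_t in = net.addLayer(Builder::InputLayer("data").setPort(Port({1, 784})));
    idx_t fc = net.addLayer({{in}}, Builder::FullyConnectedLayer("fc1")
                                        .setInputPort(Port({1, 784}))
                                        .setOutputPort(Port({1, 10}))
                                        .setOutputNum(10)
                                        .setWeights(weights)
                                        .setBiases(biases));
    net.addLayer({{fc}}, Builder::OutputLayer("out").setPort(Port({1, 10})));
    INetwork::Ptr graph = net.build();                  // throws if any port is left unconnected
    auto cnn = Builder::convertToICNNNetwork(graph);    // lowers to the legacy representation
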
diff --git a/inference-engine/src/inference_engine/builders/ie_norm_layer.cpp b/inference-engine/src/inference_engine/builders/ie_norm_layer.cpp
new file mode 100644 (file)
index 0000000..cb6d47b
--- /dev/null
@@ -0,0 +1,85 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_norm_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::NormLayer::NormLayer(const std::string& name): LayerFragment("Norm", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+    setAcrossMaps(false);
+    setSize(0);
+    setAlpha(0);
+    setBeta(0);
+}
+
+Builder::NormLayer::NormLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Norm"))
+        THROW_IE_EXCEPTION << "Cannot create NormLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::NormLayer& Builder::NormLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::NormLayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::NormLayer& Builder::NormLayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+size_t Builder::NormLayer::getSize() const {
+    return getLayer().getParameters()["local-size"].asUInt();
+}
+
+Builder::NormLayer& Builder::NormLayer::setSize(size_t size) {
+    getLayer().getParameters()["local-size"] = size;
+    return *this;
+}
+
+float Builder::NormLayer::getAlpha() const {
+    return getLayer().getParameters()["alpha"].asFloat();
+}
+
+Builder::NormLayer& Builder::NormLayer::setAlpha(float alpha) {
+    getLayer().getParameters()["alpha"] = alpha;
+    return *this;
+}
+
+float Builder::NormLayer::getBeta() const {
+    return getLayer().getParameters()["beta"].asFloat();
+}
+
+Builder::NormLayer& Builder::NormLayer::setBeta(float beta) {
+    getLayer().getParameters()["beta"] = beta;
+    return *this;
+}
+
+bool Builder::NormLayer::getAcrossMaps() const {
+    return getLayer().getParameters()["region"].asString() == "across";
+}
+
+Builder::NormLayer& Builder::NormLayer::setAcrossMaps(bool acrossMap)  {
+    std::string value = acrossMap ? "across" : "same";
+    getLayer().getParameters()["region"] = value;
+    return *this;
+}
+
+Builder::NormLayer::NormType Builder::NormLayer::getRegion() const {
+    return getAcrossMaps() ? Builder::NormLayer::NormType::ACROSS_CHANNELS :
+                             Builder::NormLayer::NormType::WITHIN_CHANNEL;
+}
+Builder::NormLayer& Builder::NormLayer::setRegion(Builder::NormLayer::NormType type) {
+    setAcrossMaps(type == Builder::NormLayer::NormType::ACROSS_CHANNELS);
+    return *this;
+}
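
A sketch of configuring the LRN decorator above; the values are classic AlexNet-style settings and purely illustrative:

    Builder::NormLayer lrn("norm1");
    lrn.setSize(5).setAlpha(1e-4f).setBeta(0.75f)
       .setRegion(Builder::NormLayer::NormType::ACROSS_CHANNELS);   // stored as region = "across"
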
diff --git a/inference-engine/src/inference_engine/builders/ie_normalize_layer.cpp b/inference-engine/src/inference_engine/builders/ie_normalize_layer.cpp
new file mode 100644 (file)
index 0000000..699993f
--- /dev/null
@@ -0,0 +1,65 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_normalize_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::NormalizeLayer::NormalizeLayer(const std::string& name): LayerFragment("Normalize", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+    setAcrossMaps(false);
+    setChannelShared(false);
+    setEpsilon(0.0000001f);
+}
+
+Builder::NormalizeLayer::NormalizeLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Normalize"))
+        THROW_IE_EXCEPTION << "Cannot create NormalizeLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::NormalizeLayer& Builder::NormalizeLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::NormalizeLayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::NormalizeLayer& Builder::NormalizeLayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+bool Builder::NormalizeLayer::getAcrossMaps() const {
+    return getLayer().getParameters()["region"].asBool();
+}
+
+Builder::NormalizeLayer& Builder::NormalizeLayer::setAcrossMaps(bool acrossMap)  {
+    getLayer().getParameters()["region"] = acrossMap ? 1 : 0;
+    return *this;
+}
+
+bool Builder::NormalizeLayer::getChannelShared() const {
+    return getLayer().getParameters()["channel_shared"].asBool();
+}
+
+Builder::NormalizeLayer& Builder::NormalizeLayer::setChannelShared(bool channelShared)  {
+    getLayer().getParameters()["channel_shared"] = channelShared ? 1 : 0;
+    return *this;
+}
+
+float Builder::NormalizeLayer::getEpsilon() const {
+    return getLayer().getParameters()["eps"].asFloat();
+}
+
+Builder::NormalizeLayer& Builder::NormalizeLayer::setEpsilon(float eps) {
+    getLayer().getParameters()["eps"] = eps;
+    return *this;
+}
diff --git a/inference-engine/src/inference_engine/builders/ie_output_layer_layer.cpp b/inference-engine/src/inference_engine/builders/ie_output_layer_layer.cpp
new file mode 100644 (file)
index 0000000..88dfcf1
--- /dev/null
@@ -0,0 +1,33 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_output_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::OutputLayer::OutputLayer(const std::string& name): LayerFragment("Output", name) {
+    getLayer().getInputPorts().resize(1);
+}
+
+Builder::OutputLayer::OutputLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Output"))
+        THROW_IE_EXCEPTION << "Cannot create OutputLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::OutputLayer& Builder::OutputLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::OutputLayer::getPort() const {
+    return getLayer().getInputPorts()[0];
+}
+
+Builder::OutputLayer& Builder::OutputLayer::setPort(const Port &port) {
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
diff --git a/inference-engine/src/inference_engine/builders/ie_permute_layer.cpp b/inference-engine/src/inference_engine/builders/ie_permute_layer.cpp
new file mode 100644 (file)
index 0000000..2cfa879
--- /dev/null
@@ -0,0 +1,52 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_permute_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+#include <vector>
+
+using namespace InferenceEngine;
+
+Builder::PermuteLayer::PermuteLayer(const std::string& name): LayerFragment("Permute", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+}
+
+Builder::PermuteLayer::PermuteLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Permute"))
+        THROW_IE_EXCEPTION << "Cannot create PermuteLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::PermuteLayer& Builder::PermuteLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::PermuteLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::PermuteLayer& Builder::PermuteLayer::setOutputPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+const Port& Builder::PermuteLayer::getInputPort() const {
+    return getLayer().getInputPorts()[0];
+}
+
+Builder::PermuteLayer& Builder::PermuteLayer::setInputPort(const Port &port) {
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+const std::vector<size_t> Builder::PermuteLayer::getOrder() const {
+    return uInts2size_t(getLayer().getParameters()["order"].asUInts());
+}
+Builder::PermuteLayer& Builder::PermuteLayer::setOrder(const std::vector<size_t>& order) {
+    getLayer().getParameters()["order"] = order;
+    return *this;
+}
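
The order parameter lists, for each output dimension, which input dimension it is taken from; a hedged NCHW-to-NHWC sketch with illustrative shapes:

    Builder::PermuteLayer permute("perm1");
    permute.setInputPort(Port({1, 3, 224, 224}));
    permute.setOutputPort(Port({1, 224, 224, 3}));
    permute.setOrder({0, 2, 3, 1});        // N, H, W, C drawn from the NCHW input
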
diff --git a/inference-engine/src/inference_engine/builders/ie_pooling_layer.cpp b/inference-engine/src/inference_engine/builders/ie_pooling_layer.cpp
new file mode 100644 (file)
index 0000000..41db6c8
--- /dev/null
@@ -0,0 +1,187 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_pooling_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::PoolingLayer::PoolingLayer(const std::string& name): LayerFragment("Pooling", name) {
+    getLayer().getInputPorts().resize(1);
+    getLayer().getOutputPorts().resize(1);
+    setExcludePad(false);
+    setPoolingType(PoolingType::MAX);
+    setRoundingType(RoundingType::CEIL);
+}
+
+Builder::PoolingLayer::PoolingLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Pooling"))
+        THROW_IE_EXCEPTION << "Cannot create PoolingLayer decorator for layer " << getLayer().getType();
+
+    std::string typeStr = getLayer().getParameters()["pool-method"].asString("max");
+    if (typeStr == "max")
+        type = MAX;
+    else if (typeStr == "avg")
+        type = AVG;
+
+    typeStr = getLayer().getParameters()["rounding_type"].asString("ceil");
+    if (typeStr == "ceil")
+        roundingType = CEIL;
+    else if (typeStr == "avg")
+        roundingType = FLOOR;
+}
+
+Builder::PoolingLayer::operator Builder::Layer() const {
+    Layer genLayer(getLayer());
+
+    std::vector<size_t> l_kernel = getKernel();
+    std::vector<size_t> l_paddingBegin = getPaddingsBegin();
+    std::vector<size_t> l_paddingEnd = getPaddingsEnd();
+    std::vector<size_t> l_strides = getStrides();
+
+    if (l_paddingBegin.empty() && !l_kernel.empty())
+        l_paddingBegin.resize(l_kernel.size(), 0);
+    if (l_paddingEnd.empty() && !l_kernel.empty())
+        l_paddingEnd.resize(l_kernel.size(), 0);
+    if (l_strides.empty() && !l_kernel.empty())
+        l_strides.resize(l_kernel.size(), 1);
+
+    if (l_kernel.empty() || l_kernel.size() != l_paddingBegin.size() || l_kernel.size() != l_paddingEnd.size() || l_kernel.size() != l_strides.size())
+        THROW_IE_EXCEPTION << genLayer.getType() << " node " << genLayer.getName() << " contains incorrect parameters!";
+
+    genLayer.getParameters()["kernel"] = l_kernel;
+    genLayer.getParameters()["strides"] = l_strides;
+    genLayer.getParameters()["pads_begin"] = l_paddingBegin;
+    genLayer.getParameters()["pads_end"] = l_paddingEnd;
+    return genLayer;
+}
+
+Builder::PoolingLayer &Builder::PoolingLayer::setName(const std::string &name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::PoolingLayer::getInputPort() const {
+    return getLayer().getInputPorts()[0];
+}
+
+Builder::PoolingLayer& Builder::PoolingLayer::setInputPort(const Port& port) {
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+const Port& Builder::PoolingLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::PoolingLayer& Builder::PoolingLayer::setOutputPort(const Port& port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+const std::vector<size_t> Builder::PoolingLayer::getKernel() const {
+    return uInts2size_t(getLayer().getParameters()["kernel"].asUInts({}));
+}
+Builder::PoolingLayer& Builder::PoolingLayer::setKernel(const std::vector<size_t>& kernel) {
+    getLayer().getParameters()["kernel"] = kernel;
+    return *this;
+}
+
+const std::vector<size_t> Builder::PoolingLayer::getStrides() const {
+    return uInts2size_t(getLayer().getParameters()["strides"].asUInts({}));
+}
+Builder::PoolingLayer& Builder::PoolingLayer::setStrides(const std::vector<size_t>& strides) {
+    getLayer().getParameters()["strides"] = strides;
+    return *this;
+}
+
+const std::vector<size_t> Builder::PoolingLayer::getPaddingsBegin() const {
+    return uInts2size_t(getLayer().getParameters()["pads_begin"].asUInts({}));
+}
+Builder::PoolingLayer& Builder::PoolingLayer::setPaddingsBegin(const std::vector<size_t>& paddings) {
+    getLayer().getParameters()["pads_begin"] = paddings;
+    return *this;
+}
+
+const std::vector<size_t> Builder::PoolingLayer::getPaddingsEnd() const {
+    return uInts2size_t(getLayer().getParameters()["pads_end"].asUInts({}));
+}
+Builder::PoolingLayer& Builder::PoolingLayer::setPaddingsEnd(const std::vector<size_t>& paddings) {
+    getLayer().getParameters()["pads_end"] = paddings;
+    return *this;
+}
+
+Builder::PoolingLayer::PoolingType Builder::PoolingLayer::getPoolingType() const {
+    return type;
+}
+Builder::PoolingLayer& Builder::PoolingLayer::setPoolingType(Builder::PoolingLayer::PoolingType type) {
+    this->type = type;
+    std::string typeStr;
+    switch (type) {
+        case MAX:
+            typeStr = "max";
+            break;
+        case AVG:
+            typeStr = "avg";
+            break;
+    }
+    getLayer().getParameters()["pool-method"] = typeStr;
+    return *this;
+}
+
+Builder::PoolingLayer::RoundingType Builder::PoolingLayer::getRoundingType() const {
+    return roundingType;
+}
+Builder::PoolingLayer& Builder::PoolingLayer::setRoundingType(Builder::PoolingLayer::RoundingType type) {
+    roundingType = type;
+    std::string typeStr;
+    switch (type) {
+        case CEIL:
+            typeStr = "ceil";
+            break;
+        case FLOOR:
+            typeStr = "floor";
+            break;
+    }
+    getLayer().getParameters()["rounding_type"] = typeStr;
+    return *this;
+}
+
+bool Builder::PoolingLayer::getExcludePad() const {
+    return getLayer().getParameters()["exclude-pad"].asBool();
+}
+
+Builder::PoolingLayer& Builder::PoolingLayer::setExcludePad(bool exclude) {
+    getLayer().getParameters()["exclude-pad"] = exclude;
+    return *this;
+}
+
+void Builder::PoolingLayer::validate(const Layer& layer) {
+    Layer poolLayer = layer;
+    Builder::PoolingLayer poolBuilder(poolLayer);
+    std::vector<size_t> l_kernel = poolBuilder.getKernel();
+    // WA for old IRs
+    if (l_kernel.empty() && layer.getParameters().find("kernel-x") != layer.getParameters().end() &&
+        layer.getParameters().find("kernel-y") != layer.getParameters().end())
+        return;
+    std::vector<size_t> l_paddingBegin = poolBuilder.getPaddingsBegin();
+    std::vector<size_t> l_paddingEnd = poolBuilder.getPaddingsEnd();
+    std::vector<size_t> l_strides = poolBuilder.getStrides();
+
+    if (l_paddingBegin.empty() && !l_kernel.empty())
+        l_paddingBegin.resize(l_kernel.size(), 0);
+    if (l_paddingEnd.empty() && !l_kernel.empty())
+        l_paddingEnd.resize(l_kernel.size(), 0);
+    if (l_strides.empty() && !l_kernel.empty())
+        l_strides.resize(l_kernel.size(), 1);
+
+    if (l_kernel.empty() || l_kernel.size() != l_paddingBegin.size() || l_kernel.size() != l_paddingEnd.size() || l_kernel.size() != l_strides.size())
+        THROW_IE_EXCEPTION << layer.getType() << " node " << layer.getName() << " contains incorrect parameters!";
+}
+
+REG_VALIDATOR_FOR(Pooling, Builder::PoolingLayer::validate);
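
For illustration, a minimal usage sketch of this builder (a sketch only; names and values are illustrative, and the header path is assumed by analogy with the includes in the files below). With CEIL rounding the spatial output size conventionally follows ceil((in + pads_begin + pads_end - kernel) / stride) + 1; FLOOR uses floor() instead:

    #include <builders/ie_pooling_layer.hpp>  // assumed header, following the pattern of the other builders

    using namespace InferenceEngine;

    // Sketch: 3x3 max pooling, stride 2, no padding, CEIL rounding.
    Builder::PoolingLayer pool("pool1");
    pool.setKernel({3, 3})
        .setStrides({2, 2})
        .setPaddingsBegin({0, 0})
        .setPaddingsEnd({0, 0})
        .setPoolingType(Builder::PoolingLayer::MAX)
        .setRoundingType(Builder::PoolingLayer::CEIL);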
diff --git a/inference-engine/src/inference_engine/builders/ie_power_layer.cpp b/inference-engine/src/inference_engine/builders/ie_power_layer.cpp
new file mode 100644 (file)
index 0000000..c3142fa
--- /dev/null
@@ -0,0 +1,66 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_power_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::PowerLayer::PowerLayer(const std::string& name): LayerFragment("Power", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+    setPower(1);
+    setScale(1);
+    setShift(0);
+}
+
+Builder::PowerLayer::PowerLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Power"))
+        THROW_IE_EXCEPTION << "Cannot create PowerLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::PowerLayer& Builder::PowerLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::PowerLayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::PowerLayer& Builder::PowerLayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+float Builder::PowerLayer::getPower() const {
+    return getLayer().getParameters()["power"].asFloat();
+}
+
+Builder::PowerLayer& Builder::PowerLayer::setPower(float power) {
+    getLayer().getParameters()["power"] = power;
+    return *this;
+}
+
+float Builder::PowerLayer::getScale() const {
+    return getLayer().getParameters()["scale"].asFloat();
+}
+
+Builder::PowerLayer& Builder::PowerLayer::setScale(float scale) {
+    getLayer().getParameters()["scale"] = scale;
+    return *this;
+}
+
+float Builder::PowerLayer::getShift() const {
+    return getLayer().getParameters()["shift"].asFloat();
+}
+
+Builder::PowerLayer& Builder::PowerLayer::setShift(float shift) {
+    getLayer().getParameters()["shift"] = shift;
+    return *this;
+}
+
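
For reference, the conventional Power layer semantics (Caffe-style, which this parameter set appears to follow) is y = (shift + scale * x)^power. A minimal sketch with illustrative values:

    // Sketch: y = (0.5 + 2*x)^1, i.e. a plain affine transform.
    Builder::PowerLayer power("power1");
    power.setPower(1.0f).setScale(2.0f).setShift(0.5f);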
diff --git a/inference-engine/src/inference_engine/builders/ie_prelu_layer.cpp b/inference-engine/src/inference_engine/builders/ie_prelu_layer.cpp
new file mode 100644 (file)
index 0000000..6263f96
--- /dev/null
@@ -0,0 +1,49 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_prelu_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::PReLULayer::PReLULayer(const std::string& name): LayerFragment("PReLU", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+    setChannelShared(false);
+}
+
+Builder::PReLULayer::PReLULayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "PReLU"))
+        THROW_IE_EXCEPTION << "Cannot create PReLULayer decorator for layer " << getLayer().getType();
+}
+
+Builder::PReLULayer& Builder::PReLULayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::PReLULayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::PReLULayer& Builder::PReLULayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+bool Builder::PReLULayer::getChannelShared() const {
+    return getLayer().getParameters()["channel_shared"].asBool();
+}
+Builder::PReLULayer& Builder::PReLULayer::setChannelShared(bool flag) {
+    getLayer().getParameters()["channel_shared"] = flag ? 1 : 0;
+    return *this;
+}
+
+Builder::PReLULayer& Builder::PReLULayer::setWeights(const Blob::CPtr& weights) {
+    getLayer().addConstantData("weights", weights);
+    return *this;
+}
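
A hedged sketch of configuring per-channel slopes; construction of the weights blob is elided because the Blob factory API is not part of this diff:

    Builder::PReLULayer prelu("prelu1");
    Blob::CPtr slopes = nullptr;  // assumed: a 1-D FP32 blob holding one negative slope per channel
    prelu.setChannelShared(false)  // one slope per channel rather than a single shared slope
         .setWeights(slopes);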
diff --git a/inference-engine/src/inference_engine/builders/ie_prior_box_clustered_layer.cpp b/inference-engine/src/inference_engine/builders/ie_prior_box_clustered_layer.cpp
new file mode 100644 (file)
index 0000000..c52b2f4
--- /dev/null
@@ -0,0 +1,124 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_prior_box_clustered_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::PriorBoxClusteredLayer::PriorBoxClusteredLayer(const std::string& name): LayerFragment("PriorBoxClustered", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(2);
+}
+
+Builder::PriorBoxClusteredLayer::PriorBoxClusteredLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "PriorBoxClustered"))
+        THROW_IE_EXCEPTION << "Cannot create PriorBoxClusteredLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const std::vector<Port>& Builder::PriorBoxClusteredLayer::getInputPorts() const {
+    return getLayer().getInputPorts();
+}
+
+Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setInputPorts(const std::vector<Port> &ports) {
+    if (ports.size() != 2)
+        THROW_IE_EXCEPTION << "Incorrect number of inputs for PriorBoxClustered layer.";
+    getLayer().getInputPorts() = ports;
+    return *this;
+}
+
+const Port& Builder::PriorBoxClusteredLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setOutputPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+float Builder::PriorBoxClusteredLayer::getVariance() const {
+    return getLayer().getParameters()["variance"].asFloat();
+}
+Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setVariance(float variance) {
+    getLayer().getParameters()["variance"] = variance;
+    return *this;
+}
+
+float Builder::PriorBoxClusteredLayer::getOffset() const {
+    return getLayer().getParameters()["offset"].asFloat();
+}
+Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setOffset(float offset) {
+    getLayer().getParameters()["offset"] = offset;
+    return *this;
+}
+
+float Builder::PriorBoxClusteredLayer::getWidth() const {
+    return getLayer().getParameters()["width"].asFloat();
+}
+Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setWidth(float width) {
+    getLayer().getParameters()["width"] = width;
+    return *this;
+}
+
+float Builder::PriorBoxClusteredLayer::getHeight() const {
+    return getLayer().getParameters()["height"].asFloat();
+}
+Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setHeight(float height) {
+    getLayer().getParameters()["height"] = height;
+    return *this;
+}
+
+const std::vector<float> Builder::PriorBoxClusteredLayer::getSteps() const {
+    return {getLayer().getParameters()["step_h"].asFloat(), getLayer().getParameters()["step_w"].asFloat()};
+}
+Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setSteps(const std::vector<float> steps) {
+    if (steps.size() != 2)
+        THROW_IE_EXCEPTION << "PriorBoxClusteredLayer supports sizes only for height and width dimensions!";
+    getLayer().getParameters()["step_h"] = steps[0];
+    getLayer().getParameters()["step_w"] = steps[1];
+    return *this;
+}
+
+const std::vector<float> Builder::PriorBoxClusteredLayer::getImgSizes() const {
+    return {getLayer().getParameters()["img_h"].asFloat(), getLayer().getParameters()["img_w"].asFloat()};
+}
+Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setImgSizes(const std::vector<float> sizes) {
+    if (sizes.size() != 2)
+        THROW_IE_EXCEPTION << "PriorBoxClusteredLayer allows to specify only height and width dimensions of an input image!";
+    getLayer().getParameters()["img_h"] = sizes[0];
+    getLayer().getParameters()["img_w"] = sizes[1];
+    return *this;
+}
+
+float Builder::PriorBoxClusteredLayer::getStep() const {
+    return getLayer().getParameters()["step"].asFloat();
+}
+Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setStep(float step) {
+    getLayer().getParameters()["step"] = step;
+    return *this;
+}
+
+bool Builder::PriorBoxClusteredLayer::getClip() const {
+    return getLayer().getParameters()["clip"].asBool();
+}
+Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setClip(bool flag) {
+    getLayer().getParameters()["clip"] = flag;
+    return *this;
+}
+
+bool Builder::PriorBoxClusteredLayer::getFlip() const {
+    return getLayer().getParameters()["flip"].asBool();
+}
+Builder::PriorBoxClusteredLayer& Builder::PriorBoxClusteredLayer::setFlip(bool flag) {
+    getLayer().getParameters()["flip"] = flag;
+    return *this;
+}
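
A sketch with illustrative values (note that the width/height setters exposed by this builder take a single float, i.e. one cluster per layer instance):

    Builder::PriorBoxClusteredLayer pbc("pbc1");
    pbc.setWidth(52.0f).setHeight(34.0f)
       .setSteps({16.0f, 16.0f})       // {step_h, step_w}
       .setImgSizes({300.0f, 300.0f})  // {img_h, img_w}
       .setVariance(0.1f)
       .setOffset(0.5f)
       .setClip(false);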
diff --git a/inference-engine/src/inference_engine/builders/ie_prior_box_layer.cpp b/inference-engine/src/inference_engine/builders/ie_prior_box_layer.cpp
new file mode 100644 (file)
index 0000000..dab36e0
--- /dev/null
@@ -0,0 +1,118 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_prior_box_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::PriorBoxLayer::PriorBoxLayer(const std::string& name): LayerFragment("PriorBox", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(2);
+    setScaleAllSizes(true);
+}
+
+Builder::PriorBoxLayer::PriorBoxLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "PriorBox"))
+        THROW_IE_EXCEPTION << "Cannot create PriorBoxLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::PriorBoxLayer& Builder::PriorBoxLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const std::vector<Port>& Builder::PriorBoxLayer::getInputPorts() const {
+    return getLayer().getInputPorts();
+}
+
+Builder::PriorBoxLayer& Builder::PriorBoxLayer::setInputPorts(const std::vector<Port> &ports) {
+    if (ports.size() != 2)
+        THROW_IE_EXCEPTION << "Incorrect number of inputs for PriorBox layer.";
+    getLayer().getInputPorts() = ports;
+    return *this;
+}
+
+const Port& Builder::PriorBoxLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::PriorBoxLayer& Builder::PriorBoxLayer::setOutputPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+float Builder::PriorBoxLayer::getVariance() const {
+    return getLayer().getParameters()["variance"].asFloat();
+}
+Builder::PriorBoxLayer& Builder::PriorBoxLayer::setVariance(float variance) {
+    getLayer().getParameters()["variance"] = variance;
+    return *this;
+}
+
+float Builder::PriorBoxLayer::getOffset() const {
+    return getLayer().getParameters()["offset"].asFloat();
+}
+Builder::PriorBoxLayer& Builder::PriorBoxLayer::setOffset(float offset) {
+    getLayer().getParameters()["offset"] = offset;
+    return *this;
+}
+
+float Builder::PriorBoxLayer::getStep() const {
+    return getLayer().getParameters()["step"].asFloat();
+}
+Builder::PriorBoxLayer& Builder::PriorBoxLayer::setStep(float step) {
+    getLayer().getParameters()["step"] = step;
+    return *this;
+}
+
+size_t Builder::PriorBoxLayer::getMinSize() const {
+    return getLayer().getParameters()["min_size"].asUInt();
+}
+Builder::PriorBoxLayer& Builder::PriorBoxLayer::setMinSize(size_t minSize) {
+    getLayer().getParameters()["min_size"] = minSize;
+    return *this;
+}
+size_t Builder::PriorBoxLayer::getMaxSize() const {
+    return getLayer().getParameters()["max_size"].asUInt();
+}
+Builder::PriorBoxLayer& Builder::PriorBoxLayer::setMaxSize(size_t maxSize) {
+    getLayer().getParameters()["max_size"] = maxSize;
+    return *this;
+}
+
+bool Builder::PriorBoxLayer::getScaleAllSizes() const {
+    return getLayer().getParameters()["scale_all_sizes"].asBool(true);
+}
+Builder::PriorBoxLayer& Builder::PriorBoxLayer::setScaleAllSizes(bool flag) {
+    getLayer().getParameters()["scale_all_sizes"] = flag;
+    return *this;
+}
+
+bool Builder::PriorBoxLayer::getClip() const {
+    return getLayer().getParameters()["clip"].asBool();
+}
+Builder::PriorBoxLayer& Builder::PriorBoxLayer::setClip(bool flag) {
+    getLayer().getParameters()["clip"] = flag;
+    return *this;
+}
+
+bool Builder::PriorBoxLayer::getFlip() const {
+    return getLayer().getParameters()["flip"].asBool();
+}
+Builder::PriorBoxLayer& Builder::PriorBoxLayer::setFlip(bool flag) {
+    getLayer().getParameters()["flip"] = flag;
+    return *this;
+}
+
+const std::vector<size_t> Builder::PriorBoxLayer::getAspectRatio() const {
+    return uInts2size_t(getLayer().getParameters()["aspect_ratio"].asUInts());
+}
+Builder::PriorBoxLayer& Builder::PriorBoxLayer::setAspectRatio(const std::vector<size_t>& aspectRatio) {
+    getLayer().getParameters()["aspect_ratio"] = aspectRatio;
+    return *this;
+}
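
A sketch with SSD-style illustrative values; note that this builder exposes aspect_ratio as size_t, so only integer ratios can be set through it:

    Builder::PriorBoxLayer pb("pb1");
    pb.setMinSize(30).setMaxSize(60)
      .setAspectRatio({2, 3})  // integer ratios only, as exposed by this setter
      .setFlip(true)
      .setClip(false)
      .setVariance(0.1f)
      .setStep(16.0f)
      .setOffset(0.5f);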
diff --git a/inference-engine/src/inference_engine/builders/ie_proposal_layer.cpp b/inference-engine/src/inference_engine/builders/ie_proposal_layer.cpp
new file mode 100644 (file)
index 0000000..2437b7c
--- /dev/null
@@ -0,0 +1,103 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_proposal_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::ProposalLayer::ProposalLayer(const std::string& name): LayerFragment("Proposal", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(3);
+}
+
+Builder::ProposalLayer::ProposalLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Proposal"))
+        THROW_IE_EXCEPTION << "Cannot create ProposalLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::ProposalLayer& Builder::ProposalLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const std::vector<Port>& Builder::ProposalLayer::getInputPorts() const {
+    return getLayer().getInputPorts();
+}
+
+Builder::ProposalLayer& Builder::ProposalLayer::setInputPorts(const std::vector<Port> &ports) {
+    if (ports.size() != 3)
+        THROW_IE_EXCEPTION << "Incorrect number of inputs for Proposal layer.";
+    getLayer().getInputPorts() = ports;
+    return *this;
+}
+
+const Port& Builder::ProposalLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::ProposalLayer& Builder::ProposalLayer::setOutputPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+size_t Builder::ProposalLayer::getPostNMSTopN() const {
+    return getLayer().getParameters()["post_nms_topn"].asUInt();
+}
+Builder::ProposalLayer& Builder::ProposalLayer::setPostNMSTopN(size_t topN) {
+    getLayer().getParameters()["post_nms_topn"] = topN;
+    return *this;
+}
+size_t Builder::ProposalLayer::getPreNMSTopN() const {
+    return getLayer().getParameters()["pre_nms_topn"].asUInt();
+}
+Builder::ProposalLayer& Builder::ProposalLayer::setPreNMSTopN(size_t topN) {
+    getLayer().getParameters()["pre_nms_topn"] = topN;
+    return *this;
+}
+float Builder::ProposalLayer::getNMSThresh() const {
+    return getLayer().getParameters()["nms_thresh"].asFloat();
+}
+Builder::ProposalLayer& Builder::ProposalLayer::setNMSThresh(float thresh) {
+    getLayer().getParameters()["nms_thresh"] = thresh;
+    return *this;
+}
+size_t Builder::ProposalLayer::getBaseSize() const {
+    return getLayer().getParameters()["base_size"].asUInt();
+}
+Builder::ProposalLayer& Builder::ProposalLayer::setBaseSize(size_t baseSize) {
+    getLayer().getParameters()["base_size"] = baseSize;
+    return *this;
+}
+size_t Builder::ProposalLayer::getMinSize() const {
+    return getLayer().getParameters()["min_size"].asUInt();
+}
+Builder::ProposalLayer& Builder::ProposalLayer::setMinSize(size_t minSize) {
+    getLayer().getParameters()["min_size"] = minSize;
+    return *this;
+}
+size_t Builder::ProposalLayer::getFeatStride() const {
+    return getLayer().getParameters()["feat_stride"].asUInt();
+}
+Builder::ProposalLayer& Builder::ProposalLayer::setFeatStride(size_t featStride) {
+    getLayer().getParameters()["feat_stride"] = featStride;
+    return *this;
+}
+const std::vector<float> Builder::ProposalLayer::getScale() const {
+    return getLayer().getParameters()["scale"].asFloats();
+}
+Builder::ProposalLayer& Builder::ProposalLayer::setScale(const std::vector<float>& scales) {
+    getLayer().getParameters()["scale"] = scales;
+    return *this;
+}
+const std::vector<float> Builder::ProposalLayer::getRatio() const {
+    return getLayer().getParameters()["ratio"].asFloats();
+}
+Builder::ProposalLayer& Builder::ProposalLayer::setRatio(const std::vector<float>& ratios) {
+    getLayer().getParameters()["ratio"] = ratios;
+    return *this;
+}
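
A sketch using typical Faster R-CNN RPN settings (the values are illustrative, not prescribed by this builder):

    Builder::ProposalLayer proposal("proposal1");
    proposal.setBaseSize(16).setFeatStride(16)
            .setPreNMSTopN(6000).setPostNMSTopN(300)
            .setNMSThresh(0.7f).setMinSize(16)
            .setRatio({0.5f, 1.0f, 2.0f})
            .setScale({8.0f, 16.0f, 32.0f});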
diff --git a/inference-engine/src/inference_engine/builders/ie_psroi_pooling_layer.cpp b/inference-engine/src/inference_engine/builders/ie_psroi_pooling_layer.cpp
new file mode 100644 (file)
index 0000000..8a023d3
--- /dev/null
@@ -0,0 +1,61 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_psroi_pooling_layer.hpp>
+#include <details/caseless.hpp>
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::PSROIPoolingLayer::PSROIPoolingLayer(const std::string& name): LayerFragment("PSROIPooling", name) {
+    getLayer().getOutputPorts().resize(1);
+}
+
+Builder::PSROIPoolingLayer::PSROIPoolingLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "PSROIPooling"))
+        THROW_IE_EXCEPTION << "Cannot create PSROIPoolingLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::PSROIPoolingLayer& Builder::PSROIPoolingLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+const std::vector<Port>& Builder::PSROIPoolingLayer::getInputPorts() const {
+    return getLayer().getInputPorts();
+}
+Builder::PSROIPoolingLayer& Builder::PSROIPoolingLayer::setInputPorts(const std::vector<Port>& ports) {
+    if (ports.size() != 2)
+        THROW_IE_EXCEPTION << "PSROIPoolingLayer should have 2 inputs!";
+    getLayer().getInputPorts() = ports;
+    return *this;
+}
+const Port& Builder::PSROIPoolingLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+Builder::PSROIPoolingLayer& Builder::PSROIPoolingLayer::setOutputPort(const Port& port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+float Builder::PSROIPoolingLayer::getSpatialScale() const {
+    return getLayer().getParameters()["spatial_scale"].asFloat();
+}
+Builder::PSROIPoolingLayer& Builder::PSROIPoolingLayer::setSpatialScale(float spatialScale) {
+    getLayer().getParameters()["spatial_scale"] = spatialScale;
+    return *this;
+}
+size_t Builder::PSROIPoolingLayer::getOutputDim() const {
+    return getLayer().getParameters()["output_dim"].asUInt();
+}
+Builder::PSROIPoolingLayer& Builder::PSROIPoolingLayer::setOutputDim(size_t outDim) {
+    getLayer().getParameters()["output_dim"] = outDim;
+    return *this;
+}
+size_t Builder::PSROIPoolingLayer::getGroupSize() const {
+    return getLayer().getParameters()["group_size"].asUInt();
+}
+Builder::PSROIPoolingLayer& Builder::PSROIPoolingLayer::setGroupSize(size_t size) {
+    getLayer().getParameters()["group_size"] = size;
+    return *this;
+}
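
In the R-FCN formulation this layer follows, the score-map input is expected to carry output_dim * group_size^2 channels; a sketch with illustrative values:

    // Sketch: R-FCN style settings; 21 classes with a 7x7 bin grid implies 21*7*7 = 1029 input channels.
    Builder::PSROIPoolingLayer psroi("psroi1");
    psroi.setOutputDim(21).setGroupSize(7).setSpatialScale(1.0f / 16.0f);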
diff --git a/inference-engine/src/inference_engine/builders/ie_region_yolo_layer.cpp b/inference-engine/src/inference_engine/builders/ie_region_yolo_layer.cpp
new file mode 100644 (file)
index 0000000..bcefcbb
--- /dev/null
@@ -0,0 +1,96 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_region_yolo_layer.hpp>
+#include <details/caseless.hpp>
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::RegionYoloLayer::RegionYoloLayer(const std::string& name): LayerFragment("RegionYolo", name) {
+    getLayer().getInputPorts().resize(1);
+    getLayer().getOutputPorts().resize(1);
+}
+
+Builder::RegionYoloLayer::RegionYoloLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "RegionYolo"))
+        THROW_IE_EXCEPTION << "Cannot create RegionYoloLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::RegionYoloLayer& Builder::RegionYoloLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+const Port& Builder::RegionYoloLayer::getInputPort() const {
+    return getLayer().getInputPorts()[0];
+}
+Builder::RegionYoloLayer& Builder::RegionYoloLayer::setInputPort(const Port& port) {
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+const Port& Builder::RegionYoloLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+Builder::RegionYoloLayer& Builder::RegionYoloLayer::setOutputPort(const Port& port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+int Builder::RegionYoloLayer::getCoords() const {
+    return getLayer().getParameters()["coords"].asInt();
+}
+Builder::RegionYoloLayer& Builder::RegionYoloLayer::setCoords(int coords) {
+    getLayer().getParameters()["coords"] = coords;
+    return *this;
+}
+int Builder::RegionYoloLayer::getClasses() const {
+    return getLayer().getParameters()["classes"].asInt();
+}
+Builder::RegionYoloLayer& Builder::RegionYoloLayer::setClasses(int classes) {
+    getLayer().getParameters()["classes"] = classes;
+    return *this;
+}
+int Builder::RegionYoloLayer::getNum() const {
+    return getLayer().getParameters()["num"].asInt();
+}
+Builder::RegionYoloLayer& Builder::RegionYoloLayer::setNum(int num) {
+    getLayer().getParameters()["num"] = num;
+    return *this;
+}
+bool Builder::RegionYoloLayer::getDoSoftMax() const {
+    return getLayer().getParameters()["do_softmax"].asBool();
+}
+Builder::RegionYoloLayer& Builder::RegionYoloLayer::setDoSoftMax(bool flag) {
+    getLayer().getParameters()["do_softmax"] = flag ? 1 : 0;
+    return *this;
+}
+float Builder::RegionYoloLayer::getAnchors() const {
+    return getLayer().getParameters()["anchors"].asFloat();
+}
+Builder::RegionYoloLayer& Builder::RegionYoloLayer::setAnchors(float anchors) {
+    getLayer().getParameters()["anchors"] = anchors;
+    return *this;
+}
+int Builder::RegionYoloLayer::getMask() const {
+    return getLayer().getParameters()["mask"].asInt();
+}
+Builder::RegionYoloLayer& Builder::RegionYoloLayer::setMask(int mask) {
+    getLayer().getParameters()["mask"] = mask;
+    return *this;
+}
+size_t Builder::RegionYoloLayer::getAxis() const {
+    return getLayer().getParameters()["axis"].asUInt();
+}
+Builder::RegionYoloLayer& Builder::RegionYoloLayer::setAxis(size_t axis) {
+    getLayer().getParameters()["axis"] = axis;
+    return *this;
+}
+size_t Builder::RegionYoloLayer::getEndAxis() const {
+    return getLayer().getParameters()["end_axis"].asUInt();
+}
+Builder::RegionYoloLayer& Builder::RegionYoloLayer::setEndAxis(size_t axis) {
+    getLayer().getParameters()["end_axis"] = axis;
+    return *this;
+}
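
By the usual YOLOv2 convention (assumed here, not stated by this file), the input feature map carries num * (coords + classes + 1) channels; a sketch:

    // Sketch: YOLOv2-style head, 80 classes and 5 anchors => 5*(4+80+1) = 425 channels.
    Builder::RegionYoloLayer ry("region1");
    ry.setCoords(4).setClasses(80).setNum(5).setDoSoftMax(true).setAxis(1).setEndAxis(3);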
diff --git a/inference-engine/src/inference_engine/builders/ie_relu6_layer.cpp b/inference-engine/src/inference_engine/builders/ie_relu6_layer.cpp
new file mode 100644 (file)
index 0000000..d39b2d0
--- /dev/null
@@ -0,0 +1,47 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_relu6_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::ReLU6Layer::ReLU6Layer(const std::string& name): LayerFragment("ReLU6", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+    setN(6);
+}
+
+Builder::ReLU6Layer::ReLU6Layer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "ReLU6"))
+        THROW_IE_EXCEPTION << "Cannot create ReLU6Layer decorator for layer " << getLayer().getType();
+}
+
+Builder::ReLU6Layer& Builder::ReLU6Layer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::ReLU6Layer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::ReLU6Layer& Builder::ReLU6Layer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+float Builder::ReLU6Layer::getN() const {
+    return getLayer().getParameters()["n"].asFloat();
+}
+
+Builder::ReLU6Layer& Builder::ReLU6Layer::setN(float n) {
+    getLayer().getParameters()["n"] = n;
+    return *this;
+}
+
diff --git a/inference-engine/src/inference_engine/builders/ie_relu_layer.cpp b/inference-engine/src/inference_engine/builders/ie_relu_layer.cpp
new file mode 100644 (file)
index 0000000..29793c4
--- /dev/null
@@ -0,0 +1,45 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_relu_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::ReLULayer::ReLULayer(const std::string& name): LayerFragment("ReLU", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+    setNegativeSlope(0);
+}
+
+Builder::ReLULayer::ReLULayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "ReLU"))
+        THROW_IE_EXCEPTION << "Cannot create ReLULayer decorator for layer " << getLayer().getType();
+}
+
+Builder::ReLULayer& Builder::ReLULayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::ReLULayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::ReLULayer& Builder::ReLULayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+float Builder::ReLULayer::getNegativeSlope() const {
+    return getLayer().getParameters()["negative_slope"].asFloat();
+}
+
+Builder::ReLULayer& Builder::ReLULayer::setNegativeSlope(float negativeSlope) {
+    getLayer().getParameters()["negative_slope"] = negativeSlope;
+    return *this;
+}
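
A negative_slope of 0 (the default set in the constructor) gives a plain ReLU; a nonzero slope gives a leaky ReLU, y = x for x > 0 and y = negative_slope * x otherwise. A one-line sketch:

    Builder::ReLULayer relu("relu1");
    relu.setNegativeSlope(0.1f);  // leaky ReLU; leave at 0 for a standard ReLU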
diff --git a/inference-engine/src/inference_engine/builders/ie_reorg_yolo_layer.cpp b/inference-engine/src/inference_engine/builders/ie_reorg_yolo_layer.cpp
new file mode 100644 (file)
index 0000000..83c831f
--- /dev/null
@@ -0,0 +1,47 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_reorg_yolo_layer.hpp>
+#include <details/caseless.hpp>
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::ReorgYoloLayer::ReorgYoloLayer(const std::string& name): LayerFragment("ReorgYolo", name) {
+    getLayer().getInputPorts().resize(1);
+    getLayer().getOutputPorts().resize(1);
+}
+
+Builder::ReorgYoloLayer::ReorgYoloLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "ReorgYolo"))
+        THROW_IE_EXCEPTION << "Cannot create ReorgYoloLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::ReorgYoloLayer& Builder::ReorgYoloLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+const Port& Builder::ReorgYoloLayer::getInputPort() const {
+    return getLayer().getInputPorts()[0];
+}
+Builder::ReorgYoloLayer& Builder::ReorgYoloLayer::setInputPort(const Port& port) {
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+const Port& Builder::ReorgYoloLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+Builder::ReorgYoloLayer& Builder::ReorgYoloLayer::setOutputPort(const Port& port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+int Builder::ReorgYoloLayer::getStride() const {
+    return getLayer().getParameters()["stride"].asInt();
+}
+Builder::ReorgYoloLayer& Builder::ReorgYoloLayer::setStride(int stride) {
+    getLayer().getParameters()["stride"] = stride;
+    return *this;
+}
+
diff --git a/inference-engine/src/inference_engine/builders/ie_reshape_layer.cpp b/inference-engine/src/inference_engine/builders/ie_reshape_layer.cpp
new file mode 100644 (file)
index 0000000..9f6c1f9
--- /dev/null
@@ -0,0 +1,54 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_reshape_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::ReshapeLayer::ReshapeLayer(const std::string& name): LayerFragment("Reshape", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+}
+
+Builder::ReshapeLayer::ReshapeLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Reshape"))
+        THROW_IE_EXCEPTION << "Cannot create ReshapeLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::ReshapeLayer& Builder::ReshapeLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::ReshapeLayer::getInputPort() const {
+    return getLayer().getInputPorts()[0];
+}
+
+Builder::ReshapeLayer& Builder::ReshapeLayer::setInputPort(const Port &port) {
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+const Port& Builder::ReshapeLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::ReshapeLayer& Builder::ReshapeLayer::setOutputPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+const std::vector<int> Builder::ReshapeLayer::getDims() const {
+    return getLayer().getParameters()["dim"].asInts();
+}
+
+Builder::ReshapeLayer& Builder::ReshapeLayer::setDims(const std::vector<int>& dims) {
+    getLayer().getParameters()["dim"] = dims;
+    return *this;
+}
+
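
Assuming the Caffe-style reshape convention that this "dim" parameter resembles, a value of 0 copies the corresponding input dimension and -1 infers one dimension from the rest; a sketch:

    // Sketch: flatten all trailing dimensions while keeping the batch dimension.
    Builder::ReshapeLayer reshape("reshape1");
    reshape.setDims({0, -1});  // 0 = keep input dim, -1 = infer (Caffe-style convention, assumed)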
diff --git a/inference-engine/src/inference_engine/builders/ie_roi_pooling_layer.cpp b/inference-engine/src/inference_engine/builders/ie_roi_pooling_layer.cpp
new file mode 100644 (file)
index 0000000..bd1cf4f
--- /dev/null
@@ -0,0 +1,58 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_roi_pooling_layer.hpp>
+#include <details/caseless.hpp>
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::ROIPoolingLayer::ROIPoolingLayer(const std::string& name): LayerFragment("ROIPooling", name) {
+    getLayer().getOutputPorts().resize(1);
+    setPooled({0, 0});
+}
+
+Builder::ROIPoolingLayer::ROIPoolingLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "ROIPooling"))
+        THROW_IE_EXCEPTION << "Cannot create ROIPoolingLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::ROIPoolingLayer& Builder::ROIPoolingLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+const std::vector<Port>& Builder::ROIPoolingLayer::getInputPorts() const {
+    return getLayer().getInputPorts();
+}
+Builder::ROIPoolingLayer& Builder::ROIPoolingLayer::setInputPorts(const std::vector<Port>& ports) {
+    if (ports.size() != 2)
+        THROW_IE_EXCEPTION << "ROIPoolingLayer should have 2 inputs!";
+    getLayer().getInputPorts() = ports;
+    return *this;
+}
+const Port& Builder::ROIPoolingLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+Builder::ROIPoolingLayer& Builder::ROIPoolingLayer::setOutputPort(const Port& port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+float Builder::ROIPoolingLayer::getSpatialScale() const {
+    return getLayer().getParameters()["spatial_scale"].asFloat();
+}
+Builder::ROIPoolingLayer& Builder::ROIPoolingLayer::setSpatialScale(float spatialScale) {
+    getLayer().getParameters()["spatial_scale"] = spatialScale;
+    return *this;
+}
+const std::vector<int> Builder::ROIPoolingLayer::getPooled() const {
+    return {getLayer().getParameters()["pooled_h"].asInt(0), getLayer().getParameters()["pooled_w"].asInt(0)};
+}
+Builder::ROIPoolingLayer& Builder::ROIPoolingLayer::setPooled(const std::vector<int>& pooled) {
+    if (pooled.size() != 2)
+        THROW_IE_EXCEPTION << "ROIPoolingLayer supports only pooled for height and width dimensions";
+    getLayer().getParameters()["pooled_h"] = pooled[0];
+    getLayer().getParameters()["pooled_w"] = pooled[1];
+    return *this;
+}
diff --git a/inference-engine/src/inference_engine/builders/ie_scale_shift_layer.cpp b/inference-engine/src/inference_engine/builders/ie_scale_shift_layer.cpp
new file mode 100644 (file)
index 0000000..534959b
--- /dev/null
@@ -0,0 +1,44 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_scale_shift_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::ScaleShiftLayer::ScaleShiftLayer(const std::string& name): LayerFragment("ScaleShift", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+}
+
+Builder::ScaleShiftLayer::ScaleShiftLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "ScaleShift"))
+        THROW_IE_EXCEPTION << "Cannot create ScaleShiftLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::ScaleShiftLayer& Builder::ScaleShiftLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::ScaleShiftLayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::ScaleShiftLayer& Builder::ScaleShiftLayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+Builder::ScaleShiftLayer& Builder::ScaleShiftLayer::setWeights(const Blob::CPtr& weights) {
+    getLayer().addConstantData("weights", weights);
+    return *this;
+}
+Builder::ScaleShiftLayer& Builder::ScaleShiftLayer::setBiases(const Blob::CPtr& biases) {
+    getLayer().addConstantData("biases", biases);
+    return *this;
+}
diff --git a/inference-engine/src/inference_engine/builders/ie_sigmoid_layer.cpp b/inference-engine/src/inference_engine/builders/ie_sigmoid_layer.cpp
new file mode 100644 (file)
index 0000000..72ccc80
--- /dev/null
@@ -0,0 +1,35 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_sigmoid_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::SigmoidLayer::SigmoidLayer(const std::string& name): LayerFragment("Sigmoid", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+}
+
+Builder::SigmoidLayer::SigmoidLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Sigmoid"))
+        THROW_IE_EXCEPTION << "Cannot create SigmoidLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::SigmoidLayer& Builder::SigmoidLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::SigmoidLayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::SigmoidLayer& Builder::SigmoidLayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
diff --git a/inference-engine/src/inference_engine/builders/ie_simpler_nms_layer.cpp b/inference-engine/src/inference_engine/builders/ie_simpler_nms_layer.cpp
new file mode 100644 (file)
index 0000000..1fc3e07
--- /dev/null
@@ -0,0 +1,89 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_simpler_nms_layer.hpp>
+#include <details/caseless.hpp>
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::SimplerNMSLayer::SimplerNMSLayer(const std::string& name): LayerFragment("SimplerNMS", name) {
+    getLayer().getOutputPorts().resize(1);
+}
+
+Builder::SimplerNMSLayer::SimplerNMSLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "SimplerNMS"))
+        THROW_IE_EXCEPTION << "Cannot create SimplerNMSLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+const std::vector<Port>& Builder::SimplerNMSLayer::getInputPorts() const {
+    return getLayer().getInputPorts();
+}
+Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setInputPorts(const std::vector<Port>& ports) {
+    getLayer().getInputPorts() = ports;
+    return *this;
+}
+const Port& Builder::SimplerNMSLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setOutputPort(const Port& port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+size_t Builder::SimplerNMSLayer::getPreNMSTopN() const {
+    return getLayer().getParameters()["pre_nms_topn"].asUInt();
+}
+Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setPreNMSTopN(size_t topN) {
+    getLayer().getParameters()["pre_nms_topn"] = topN;
+    return *this;
+}
+size_t Builder::SimplerNMSLayer::getPostNMSTopN() const {
+    return getLayer().getParameters()["post_nms_topn"].asUInt();
+}
+Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setPostNMSTopN(size_t topN) {
+    getLayer().getParameters()["post_nms_topn"] = topN;
+    return *this;
+}
+size_t Builder::SimplerNMSLayer::getFeatStride() const {
+    return getLayer().getParameters()["feat_stride"].asUInt();
+}
+Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setFeatStride(size_t featStride) {
+    getLayer().getParameters()["feat_stride"] = featStride;
+    return *this;
+}
+size_t Builder::SimplerNMSLayer::getMinBoxSize() const {
+    return getLayer().getParameters()["min_bbox_size"].asUInt();
+}
+Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setMinBoxSize(size_t minSize) {
+    getLayer().getParameters()["min_bbox_size"] = minSize;
+    return *this;
+}
+size_t Builder::SimplerNMSLayer::getScale() const {
+    return getLayer().getParameters()["scale"].asUInt();
+}
+Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setScale(size_t scale) {
+    getLayer().getParameters()["scale"] = scale;
+    return *this;
+}
+
+float Builder::SimplerNMSLayer::getCLSThreshold() const {
+    return getLayer().getParameters()["cls_threshold"].asFloat();
+}
+Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setCLSThreshold(float threshold) {
+    getLayer().getParameters()["cls_threshold"] = threshold;
+    return *this;
+}
+float Builder::SimplerNMSLayer::getIOUThreshold() const {
+    return getLayer().getParameters()["iou_threshold"].asFloat();
+}
+Builder::SimplerNMSLayer& Builder::SimplerNMSLayer::setIOUThreshold(float threshold) {
+    getLayer().getParameters()["iou_threshold"] = threshold;
+    return *this;
+}
diff --git a/inference-engine/src/inference_engine/builders/ie_softmax_layer.cpp b/inference-engine/src/inference_engine/builders/ie_softmax_layer.cpp
new file mode 100644 (file)
index 0000000..d4ccfa9
--- /dev/null
@@ -0,0 +1,45 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_softmax_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::SoftMaxLayer::SoftMaxLayer(const std::string& name): LayerFragment("SoftMax", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+    setAxis(1);
+}
+
+Builder::SoftMaxLayer::SoftMaxLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "SoftMax"))
+        THROW_IE_EXCEPTION << "Cannot create SoftMaxLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::SoftMaxLayer& Builder::SoftMaxLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::SoftMaxLayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::SoftMaxLayer& Builder::SoftMaxLayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+size_t Builder::SoftMaxLayer::getAxis() const {
+    return getLayer().getParameters()["axis"].asUInt();
+}
+
+Builder::SoftMaxLayer& Builder::SoftMaxLayer::setAxis(size_t axis) {
+    getLayer().getParameters()["axis"] = axis;
+    return *this;
+}
diff --git a/inference-engine/src/inference_engine/builders/ie_split_layer.cpp b/inference-engine/src/inference_engine/builders/ie_split_layer.cpp
new file mode 100644 (file)
index 0000000..50d04dd
--- /dev/null
@@ -0,0 +1,53 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_split_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::SplitLayer::SplitLayer(const std::string& name): LayerFragment("Concat", name) {
+    getLayer().getInputPorts().resize(1);
+    setAxis(1);
+}
+
+Builder::SplitLayer::SplitLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Concat"))
+        THROW_IE_EXCEPTION << "Cannot create SplitLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::SplitLayer& Builder::SplitLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::SplitLayer::getInputPort() const {
+    return getLayer().getInputPorts()[0];
+}
+
+Builder::SplitLayer& Builder::SplitLayer::setInputPort(const Port &port) {
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+const std::vector<Port>& Builder::SplitLayer::getOutputPorts() const {
+    return getLayer().getOutputPorts();
+}
+
+Builder::SplitLayer& Builder::SplitLayer::setOutputPorts(const std::vector<Port>& ports) {
+    getLayer().getOutputPorts() = ports;
+    return *this;
+}
+
+size_t Builder::SplitLayer::getAxis() const {
+    return getLayer().getParameters()["axis"].asUInt();
+}
+
+Builder::SplitLayer& Builder::SplitLayer::setAxis(size_t axis) {
+    getLayer().getParameters()["axis"] = axis;
+    return *this;
+}
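
A sketch of splitting along the channel axis; the data is divided across however many output ports are configured on the layer:

    // Sketch: split the input along the channel axis (axis 1), one slice per output port.
    Builder::SplitLayer split("split1");
    split.setAxis(1);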
diff --git a/inference-engine/src/inference_engine/builders/ie_tanh_layer.cpp b/inference-engine/src/inference_engine/builders/ie_tanh_layer.cpp
new file mode 100644 (file)
index 0000000..37eb7eb
--- /dev/null
@@ -0,0 +1,35 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_tanh_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::TanHLayer::TanHLayer(const std::string& name): LayerFragment("TanH", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+}
+
+Builder::TanHLayer::TanHLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "TanH"))
+        THROW_IE_EXCEPTION << "Cannot create TanHLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::TanHLayer& Builder::TanHLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::TanHLayer::getPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::TanHLayer& Builder::TanHLayer::setPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
\ No newline at end of file
diff --git a/inference-engine/src/inference_engine/builders/ie_tile_layer.cpp b/inference-engine/src/inference_engine/builders/ie_tile_layer.cpp
new file mode 100644 (file)
index 0000000..fade9f3
--- /dev/null
@@ -0,0 +1,62 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <builders/ie_tile_layer.hpp>
+#include <details/caseless.hpp>
+
+#include <vector>
+#include <string>
+
+using namespace InferenceEngine;
+
+Builder::TileLayer::TileLayer(const std::string& name): LayerFragment("Tile", name) {
+    getLayer().getOutputPorts().resize(1);
+    getLayer().getInputPorts().resize(1);
+}
+
+Builder::TileLayer::TileLayer(Layer& genLayer): LayerFragment(genLayer) {
+    if (!details::CaselessEq<std::string>()(getLayer().getType(), "Tile"))
+        THROW_IE_EXCEPTION << "Cannot create TileLayer decorator for layer " << getLayer().getType();
+}
+
+Builder::TileLayer& Builder::TileLayer::setName(const std::string& name) {
+    getLayer().getName() = name;
+    return *this;
+}
+
+const Port& Builder::TileLayer::getInputPort() const {
+    return getLayer().getInputPorts()[0];
+}
+
+Builder::TileLayer& Builder::TileLayer::setInputPort(const Port &port) {
+    getLayer().getInputPorts()[0] = port;
+    return *this;
+}
+
+const Port& Builder::TileLayer::getOutputPort() const {
+    return getLayer().getOutputPorts()[0];
+}
+
+Builder::TileLayer& Builder::TileLayer::setOutputPort(const Port &port) {
+    getLayer().getOutputPorts()[0] = port;
+    return *this;
+}
+
+size_t Builder::TileLayer::getTiles() const {
+    return getLayer().getParameters()["tiles"].asUInt();
+}
+
+Builder::TileLayer& Builder::TileLayer::setTiles(size_t tiles) {
+    getLayer().getParameters()["tiles"] = tiles;
+    return *this;
+}
+
+size_t Builder::TileLayer::getAxis() const {
+    return getLayer().getParameters()["axis"].asUInt();
+}
+
+Builder::TileLayer& Builder::TileLayer::setAxis(size_t axis) {
+    getLayer().getParameters()["axis"] = axis;
+    return *this;
+}
\ No newline at end of file
index 38fd18d..620fe34 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -14,6 +13,7 @@
 #include "debug.h"
 #include "graph_tools.hpp"
 #include <vector>
+#include "network_serializer.h"
 
 using namespace std;
 using namespace InferenceEngine;
@@ -100,7 +100,7 @@ void CNNNetworkImpl::validate(int version) {
         std::string inputType = "Input";
         for (auto i : inputs) {
             CNNLayerPtr layer = i.second->getInputData()->creatorLayer.lock();
-            if (!equal(layer->type, inputType)) {
+            if (layer && !equal(layer->type, inputType)) {
                 THROW_IE_EXCEPTION << "Input layer " << layer->name
                                    << " should have Input type but actually its type is " << layer->type;
             }
@@ -207,6 +207,19 @@ CNNNetworkImpl::AddExtension(const InferenceEngine::IShapeInferExtensionPtr& ext
     return OK;
 }
 
+StatusCode CNNNetworkImpl::serialize(const std::string &xmlPath, const std::string &binPath, ResponseDesc* resp) const noexcept {
+    try {
+        NetworkSerializer::serialize(xmlPath, binPath, (InferenceEngine::ICNNNetwork&)*this);
+    } catch (const InferenceEngineException& e) {
+        return DescriptionBuffer(GENERAL_ERROR, resp) << e.what();
+    } catch (const std::exception& e) {
+        return DescriptionBuffer(UNEXPECTED, resp) << e.what();
+    } catch (...) {
+        return DescriptionBuffer(UNEXPECTED, resp);
+    }
+    return OK;
+}
+
 StatusCode CNNNetworkImpl::setBatchSize(size_t size, ResponseDesc* responseDesc) noexcept {
     auto originalBatchSize = getBatchSize();
     if (originalBatchSize == size)
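
A minimal sketch of calling the new serialize entry point through the network interface (paths are illustrative, and `network` is assumed to be an ICNNNetwork reference):

    InferenceEngine::ResponseDesc resp;
    InferenceEngine::StatusCode sc = network.serialize("model.xml", "model.bin", &resp);
    if (sc != InferenceEngine::OK) {
        std::cerr << "Serialization failed: " << resp.msg << std::endl;
    }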
index 2c83f36..d2d9ae1 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "description_buffer.hpp"
 #include <string>
 #include <vector>
-#include <shape_infer/ie_reshaper.hpp>
 
 #include "cnn_network_stats_impl.hpp"
 
 namespace InferenceEngine {
+namespace ShapeInfer {
+class Reshaper;
+
+using ReshaperPtr = std::shared_ptr<Reshaper>;
+}  // namespace ShapeInfer
 namespace details {
 class INFERENCE_ENGINE_API_CLASS(CNNNetworkImpl) : public ICNNNetwork {
 public:
@@ -126,6 +129,8 @@ public:
     StatusCode
     AddExtension(const InferenceEngine::IShapeInferExtensionPtr &extension, InferenceEngine::ResponseDesc *resp) noexcept override;
 
+    StatusCode serialize(const std::string &xmlPath, const std::string &binPath, ResponseDesc* resp) const noexcept override;
+
 protected:
     Precision precision {Precision::MIXED};
     std::map<std::string, DataPtr> _data;
@@ -136,7 +141,7 @@ protected:
     /// @brief
     TargetDevice _targetDevice;
     DataPtr _emptyData;
-    InferenceEngine::ShapeInfer::Reshaper::Ptr _reshaper;
+    ShapeInfer::ReshaperPtr _reshaper;
     CNNNetworkStatsImplPtr _stats;
 };
 
index 0da74a7..58dd61f 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -17,6 +16,7 @@
 
 #include <ie_common.h>
 #include <details/ie_cnn_network_tools.h>
+#include <details/caseless.hpp>
 #include <blob_factory.hpp>
 #include <data_stats.h>
 #include "cnn_network_impl.hpp"
@@ -33,11 +33,300 @@ using namespace InferenceEngine::details;
 
 using StatsMap = std::map<std::string, InferenceEngine::NetworkNodeStatsPtr>;
 
-void CNNNetworkInt8Normalizer::AddLayerToCNNNetworkBeforeLayer(CNNLayer::Ptr newLayer, CNNLayer::Ptr successor) {
+
+CNNStatisticHelper::CNNStatisticHelper(CNNNetwork &network, const std::map<std::string, NetworkNodeStatsPtr> &internalNodesStats,
+                                       int maxSign, int maxUnsign) {
+    internalNodesStats_ = internalNodesStats;
+    network_ = network;
+    maxSign_ = maxSign;
+    maxUnsign_ = maxUnsign;
+
+    NormalizeStatistic();
+}
+
+bool CNNStatisticHelper::canLayerBeQuantized(const std::string &layerName) const {
+    // TODO(amalyshe) this verification should be extended: 1) check the inputs; 2) statistics
+    // might not exist for each and every layer, so we might need to traverse the layers to find them
+    if (internalNodesStats_.find(layerName) == internalNodesStats_.end()) {
+        return true;
+    }
+    return false;
+}
+
+void CNNStatisticHelper::copyStatistics(const std::string& srcName, const std::string& dstName) {
+    internalNodesStats_[dstName] = internalNodesStats_[srcName];
+}
+
+bool CNNStatisticHelper::hasNegativeOutput(const std::string &layerName, int outputPort) const {
+    // TODO(amalyshe) the outputPort parameter is not used yet; the logic of dispatching
+    // to a particular port should be implemented
+
+    NetworkNodeStatsPtr layerStat = internalNodesStats_.at(layerName);
+    for (auto v : layerStat->_minOutputs) {
+        if (v < 0.f) {
+            return true;
+        }
+    }
+    return false;
+}
+
+InferenceEngine::Blob::Ptr CNNStatisticHelper::getInputScale(CNNLayer::Ptr layer) const {
+    auto previousLayer = layer->insData[0].lock()->creatorLayer.lock();
+    std::string inputLayerName = previousLayer->name;
+
+    // for the case when the only layer before is an average pooling, we need to take the
+    // statistic from the input of the avg pooling to compensate for its effect
+    // and stay in int8 as much as we can
+    if (previousLayer->type == "Pooling" && (previousLayer->precision == Precision::I8 || previousLayer->precision == Precision::U8)) {
+        // take input name to the pooling
+        inputLayerName = previousLayer->insData[0].lock()->creatorLayer.lock()->name;
+    }
+    size_t inputChannels = layer->insData[0].lock()->getTensorDesc().getDims()[1];
+    return calculateScaleFactor(inputChannels, getStatistic(previousLayer),
+                                hasNegativeOutput(previousLayer->name) ? maxSign_ : maxUnsign_);
+}
+
+InferenceEngine::Blob::Ptr CNNStatisticHelper::getOutputScale(CNNLayer::Ptr layer) const {
+    // TODO(amalyshe) for now we only look at the precision on the data node
+    size_t outputChannels = layer->outData[0]->getTensorDesc().getDims()[1];
+    if (layer->outData.size() != 1) {
+        THROW_IE_EXCEPTION << "Trying to get scales after layer having multiple ouptut ports";
+    }
+    return calculateScaleFactor(outputChannels, getStatistic(layer),
+                                layer->outData[0]->getPrecision() == Precision::I8 ? maxSign_ : maxUnsign_);
+}
+
+int CNNStatisticHelper::getMaxSignValue() const {
+    return maxSign_;
+}
+
+InferenceEngine::Blob::Ptr CNNStatisticHelper::calculateScaleFactor(size_t channels,
+    NetworkNodeStatsPtr stats, int maxInt) const {
+    if (stats->_minOutputs.size() != channels || stats->_maxOutputs.size() != channels) {
+        THROW_IE_EXCEPTION << "min and max sizes should be equal to channels count";
+    }
+
+    // Creating i-scale blob
+    std::shared_ptr<Data> iScaleData = std::shared_ptr<Data>(new Data("scale", { channels }, Precision::FP32, Layout::C));
+    auto iScale = CreateBlobFromData(iScaleData);
+    iScale->allocate();
+    float* iScaleMemory = static_cast<float*>(iScale->buffer());
+
+    for (int c = 0; c < channels; c++) {
+        float maxc = 0;
+        // maxc = fmax(maxc, fabs(stats[k]->_minOutputs[c]));  // TODO Check if we should take minimums into account
+        maxc = fmax(maxc, fabs(stats->_maxOutputs[c]));
+        maxc = fmax(maxc, fabs(stats->_minOutputs[c]));
+
+        iScaleMemory[c] = maxc / static_cast<float>(maxInt);
+
+        if (fabs(iScaleMemory[c]) < 1e-7) {
+            iScaleMemory[c] = 1.0f;
+        }
+    }
+    return iScale;
+}
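+
+// A worked example of the formula above, with illustrative statistics that are not
+// taken from any real model: for a channel with min = -3.2, max = 2.5 and a signed
+// int8 target (maxInt = 127):
+//
+//   maxc  = fmax(fabs(2.5f), fabs(-3.2f)) = 3.2f
+//   scale = 3.2f / 127 ~ 0.0252f
+//
+// so a real activation x is represented as round(x / 0.0252f) in int8, and channels
+// whose maxc falls below 1e-7 get scale = 1.0f to avoid degenerate divisions downstream.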
+
+NetworkNodeStatsPtr CNNStatisticHelper::getStatistic(CNNLayer::Ptr layer) const {
+    // TODO(amalyshe) all the logic of traversing the network and getting the
+    // appropriate statistics should live here; for now it is a stub
+    auto it = internalNodesStats_.find(getLatestInFuse(layer)->name);
+    if (it != internalNodesStats_.end()) {
+        return it->second;
+    }
+    THROW_IE_EXCEPTION << "no stat for layer " << getLatestInFuse(layer)->name;
+}
+
+CNNLayer::Ptr CNNStatisticHelper::getLatestInFuse(CNNLayer::Ptr layer) const {
+    if (layer->outData[0]->inputTo.size() == 1 &&
+        CaselessEq<std::string>()(layer->outData[0]->inputTo.begin()->second->type, "relu")) {
+        return layer->outData[0]->inputTo.begin()->second;
+    }
+    // Conv-Sum-ReLU fuse: we need to return the original layer if it will be used
+    // as the sum parameter, and the ReLU if the layer ends up being fused.
+    // Iterate over the outputs of the pointed layer and look for the only eltwise.
+    CNNLayer::Ptr eltwise = nullptr;
+    if (layer->outData.size() == 1) {
+        for (auto it : layer->outData[0]->inputTo) {
+            if (CaselessEq<std::string>()(it.second->type, "eltwise")) {
+                if (eltwise) {
+                    THROW_IE_EXCEPTION << "Patterns when one layer passes data to several eltwise layers are not supported in int8 quantization";
+                }
+                eltwise = it.second;
+            }
+        }
+    }
+
+    if (eltwise) {
+        // if the current layer is not a convolution, return it as the end of the fuse
+        if (!CaselessEq<std::string>()(layer->type, "convolution")) {
+            return layer;
+        } else {
+            // look to the ports of eltwise
+            if (eltwise->insData[1].lock()->creatorLayer.lock() == layer &&
+                CaselessEq<std::string>()(eltwise->insData[0].lock()->creatorLayer.lock()->type, "convolution")) {
+                // this is the case when two convolutions come to the eltwise: the second one
+                // will be selected for the fuse and the first will be used as the sum operand
+                return layer;
+            }
+            // the given layer is a convolution and will be used for the fuse, but we need to verify whether there is a ReLU after the eltwise
+            if (eltwise->outData[0]->inputTo.size() == 1 &&
+                CaselessEq<std::string>()(eltwise->outData[0]->inputTo.begin()->second->type, "relu")) {
+                return eltwise->outData[0]->inputTo.begin()->second;
+            }
+            return eltwise;
+        }
+    }
+
+    return layer;
+}
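+
+// The fuse patterns recognized above, sketched for reference (brackets mark an
+// optional layer):
+//
+//   Conv -> ReLU                      => the ReLU is the latest layer in the fuse
+//   ... -> Eltwise(Sum) -> [ReLU]     => for a producer used as the sum input the
+//                                        fuse ends at the producer itself
+//   Conv -> Eltwise(Sum) -> [ReLU]    => for the convolution being fused the latest
+//                                        layer is the ReLU after the eltwise if it
+//                                        exists, otherwise the eltwise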
+
+
+void CNNStatisticHelper::NormalizeStatistic() {
+    StatsMap newMap;
+
+    float dummy;
+
+    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network_);
+    for (auto l : sortedLayers) {
+        // if layer's statistic exists in the newMap, ignore it
+        if (newMap.find(l->name) != newMap.end()) {
+            continue;
+        }
+        // verify if the layer is a starter layer for statistics propagation
+        bool isStarterLayer = false;
+
+        // the case when we do not have converted statistics before the current layer:
+        // go over all inputs and verify that statistics exist for all of them
+        bool allInputsHaveStatistics = true;
+        for (auto i : l->insData) {
+            if (newMap.find(i.lock()->creatorLayer.lock()->name) == newMap.end()) {
+                allInputsHaveStatistics = false;
+            }
+        }
+        // if we do not have statistics, verify who the consumers of this layer are
+        if (!allInputsHaveStatistics) {
+            if (l->outData.size() == 1) {
+                for (auto it : l->outData[0]->inputTo) {
+                    if (CaselessEq<std::string>()(it.second->type, "scaleshift") ||
+                        CaselessEq<std::string>()(it.second->type, "convolution")) {
+                        isStarterLayer = true;
+                    }
+                }
+            }
+        } else {
+            isStarterLayer = true;
+        }
+        if (!isStarterLayer) {
+            continue;
+        }
+
+        // we do not yet support quantization of layers which split data
+        if (l->outData.size() != 1) {
+            continue;
+        }
+
+        InferenceEngine::NetworkNodeStatsPtr currentStat = std::make_shared<NetworkNodeStats>();
+
+        bool perChannelScale = true;
+
+        if (CaselessEq<std::string>()(l->type, "concat")
+            && l->outData.size() == 1 && l->outData[0]->getTensorDesc().getDims().size() == 4) {
+            size_t concatLayerIdx = 0;
+            for (int k = 0; k < l->insData.size(); k++) {
+                auto prevKLayer = l->insData[k].lock()->creatorLayer.lock();
+                // looking for the statistic for prevKLayer
+                auto kLayerStat = newMap.find(prevKLayer->name);
+                if (kLayerStat != newMap.end()) {
+                    for (size_t ikStat = 0; ikStat < kLayerStat->second->_maxOutputs.size(); ikStat++, concatLayerIdx++) {
+                        currentStat->_maxOutputs.push_back(kLayerStat->second->_maxOutputs[ikStat]);
+                        currentStat->_minOutputs.push_back(kLayerStat->second->_minOutputs[ikStat]);
+                    }
+                } else {
+                    THROW_IE_EXCEPTION << "We have incomplete statistic for predecessors of concat layer " << l->name;
+                }
+            }
+        } else {
+            // go over all children until we get a convolution, scaleshift, eltwise or an
+            // unknown layer (Pooling and ReLU layers are passthrough) to understand the
+            // granularity of the scaling; a concat layer produces its own statistics and
+            // propagates them down
+            std::vector<CNNLayer::Ptr> toAnalyze;
+            for (auto it : l->outData[0]->inputTo) {
+                toAnalyze.push_back(it.second);
+            }
+
+            if (CaselessEq<std::string>()(l->type, "eltwise")) {
+                perChannelScale = false;
+            }
+            while (!toAnalyze.empty() && perChannelScale) {
+                CNNLayer::Ptr tl = toAnalyze.back();
+                toAnalyze.pop_back();
+                if (CaselessEq<std::string>()(tl->type, "pooling") ||
+                    CaselessEq<std::string>()(tl->type, "relu") ||
+                    CaselessEq<std::string>()(tl->type, "concat")) {
+                    if (tl->outData.size() == 1) {
+                        for (auto it : tl->outData[0]->inputTo) {
+                            toAnalyze.push_back(it.second);
+                        }
+                    }
+                } else if (CaselessEq<std::string>()(tl->type, "convolution")) {
+                    // verify number of groups
+                    ConvolutionLayer *pConv = dynamic_cast<ConvolutionLayer *>(tl.get());
+                    if (pConv->_group != pConv->_out_depth) {
+                        perChannelScale = false;
+                    }
+                } else if (CaselessEq<std::string>()(tl->type, "eltwise")) {
+                    perChannelScale = false;
+                }
+            }
+
+            auto itOld = internalNodesStats_.find(getLatestInFuse(l)->name);
+            if (itOld != internalNodesStats_.end()) {
+                currentStat->_maxOutputs = itOld->second->_maxOutputs;
+                currentStat->_minOutputs = itOld->second->_minOutputs;
+
+                if (!perChannelScale) {
+                    float min = FLT_MAX;
+                    float max = FLT_MIN;
+                    if (!itOld->second->_maxOutputs.empty()) {
+                        DataStats::GetDataAbsMax(&itOld->second->_maxOutputs[0], itOld->second->_maxOutputs.size(), max);
+                        std::fill(currentStat->_maxOutputs.begin(), currentStat->_maxOutputs.end(), max);
+                    }
+                    if (!itOld->second->_minOutputs.empty()) {
+                        DataStats::GetDataMinMax(&itOld->second->_minOutputs[0], itOld->second->_minOutputs.size(), min, dummy);
+                        std::fill(currentStat->_minOutputs.begin(), currentStat->_minOutputs.end(), min);
+                    }
+                }
+            }
+        }
+
+        // propagate these statistics to all layers which have no scales in their primitives
+        std::vector<CNNLayer::Ptr> toAnalyze;
+        toAnalyze.push_back(l);
+        while (!toAnalyze.empty()) {
+            CNNLayer::Ptr tl = toAnalyze.back();
+            toAnalyze.pop_back();
+            newMap[tl->name] = currentStat;
+            if (tl->outData.size() == 1) {
+                for (auto it : tl->outData[0]->inputTo) {
+                    if (CaselessEq<std::string>()(it.second->type, "pooling") ||
+                        CaselessEq<std::string>()(it.second->type, "relu")) {
+                        toAnalyze.push_back(it.second);
+                    }
+                }
+            }
+        }
+    }
+
+    internalNodesStats_ = newMap;
+}
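+
+// An illustrative example of the per-tensor collapse above (numbers are made up):
+// when perChannelScale is turned off, e.g. because the data feeds an eltwise,
+// per-channel statistics
+//
+//   _minOutputs = {-1.0, -4.0, -2.5},  _maxOutputs = {3.0, 0.5, 6.0}
+//
+// are flattened via GetDataAbsMax/GetDataMinMax into one shared range
+//
+//   _minOutputs = {-4.0, -4.0, -4.0},  _maxOutputs = {6.0, 6.0, 6.0}
+//
+// so every channel of the tensor is quantized with a single common scale.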
+
+void CNNNetworkInt8Normalizer::AddLayerToCNNNetworkBeforeLayer(CNNLayer::Ptr newLayer, CNNLayer::Ptr successor, size_t port) {
     // verify if data exists
-    if (newLayer && successor && successor->insData.size() == 1) {
+    if (newLayer && successor && successor->insData.size() > port) {
         // get the insData
-        DataPtr pData = successor->insData[0].lock();
+        DataPtr pData = successor->insData[port].lock();
 
         Data *edge2 = new Data(*pData.get());
         DataPtr newEdge(edge2);
@@ -45,7 +334,7 @@ void CNNNetworkInt8Normalizer::AddLayerToCNNNetworkBeforeLayer(CNNLayer::Ptr new
         newEdge->getInputTo()[successor->name] = successor;
         newEdge->name = newLayer->name;
         newEdge->getCreatorLayer() = newLayer;
-        successor->insData[0] = newEdge;
+        successor->insData[port] = newEdge;
         newLayer->outData.push_back(newEdge);
 
         newLayer->insData.push_back(pData);
@@ -66,7 +355,7 @@ void CNNNetworkInt8Normalizer::AddLayerToCNNNetworkAfterData(DataPtr pData, CNNL
         newEdgeAfterLayer->creatorLayer = layer;
         newEdgeAfterLayer->inputTo.clear();
         newEdgeAfterLayer->inputTo[nextLayerName] = nextLayer;
-        newEdgeAfterLayer->precision = Precision::FP32;
+        newEdgeAfterLayer->setPrecision(Precision::FP32);
 
         pData->getInputTo().erase(nextLayerName);
         pData->getInputTo()[layer->name] = layer;
@@ -114,7 +403,14 @@ void CNNNetworkInt8Normalizer::fillInScaleShift(ScaleShiftLayer* scshLayer, size
     }
 }
 
-void CNNNetworkInt8Normalizer::AddScaleShiftBetween(CNNNetwork& net, const CNNLayerPtr layer1, const CNNLayerPtr layer2) {
+void CNNNetworkInt8Normalizer::AddScaleShiftBetween(CNNNetwork& net, const CNNLayerPtr layer1, const CNNLayerPtr layer2,
+    CNNStatisticHelper& statHelper) {
+
+    if (CaselessEq<std::string>()(layer2->type, "priorbox") ||
+        CaselessEq<std::string>()(layer2->type, "priorboxclustered")) {
+        return;
+    }
+
     // Searching the connection between the layers
     int l1_out_i = 0;
     for (; l1_out_i < layer1->outData.size(); l1_out_i++) {
@@ -176,11 +472,15 @@ void CNNNetworkInt8Normalizer::AddScaleShiftBetween(CNNNetwork& net, const CNNLa
             fillInScaleShift(scshLayer, c, oScaleBuffer, iScaleBuffer);
         }
 
-        ssCnnLayer->outData[0]->precision = ssCnnLayer->outData[0]->inputTo.begin()->second->precision;
+        Precision odPrecision = Precision::FP32;
+        if (layer2->precision == Precision::I8) {
+            odPrecision = statHelper.hasNegativeOutput(layer1->name) ? Precision::I8 : Precision::U8;
+        }
+        ssCnnLayer->outData[0]->setPrecision(odPrecision);
     }
 }
 
-void CNNNetworkInt8Normalizer::AddScaleShifts(CNNNetwork& net) {
+void CNNNetworkInt8Normalizer::AddScaleShifts(CNNNetwork& net, CNNStatisticHelper& statHelper) {
     std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);
 
     std::vector<std::pair<CNNLayerPtr, CNNLayerPtr>> pairs;
@@ -191,9 +491,13 @@ void CNNNetworkInt8Normalizer::AddScaleShifts(CNNNetwork& net) {
                 CNNLayer::Ptr next = nextIter.second;
 
                 // Checking for an INT8 convolution with FP32 output
-                if (iter->type == "Convolution" && iter->precision == Precision::I8 && next->precision == Precision::FP32) {
-                    // Do nothing here
-                    // MKLDNNPlugin will generate u8->f32 convolution
+                if (iter->type == "Convolution" &&
+                    iter->precision == Precision::I8 &&
+                    next->precision == Precision::FP32 &&
+                    iter->outData[l1_out_i]->getPrecision() == Precision::FP32) {
+                    // Do nothing here, but only when iter provides data to fp32 layers:
+                    // MKLDNNPlugin will generate an x8->f32 convolution
+
                 } else if ((iter->precision != Precision::FP32 && next->precision == Precision::FP32) ||
                            (iter->precision == Precision::FP32 && next->precision != Precision::FP32)) {
                     pairs.push_back(std::pair<CNNLayerPtr, CNNLayerPtr>(iter, next));
@@ -203,7 +507,7 @@ void CNNNetworkInt8Normalizer::AddScaleShifts(CNNNetwork& net) {
     }
 
     for (auto& pair : pairs) {
-        AddScaleShiftBetween(net, pair.first, pair.second);
+        AddScaleShiftBetween(net, pair.first, pair.second, statHelper);
     }
 }
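+
+// The pass above pairs up precision boundaries; a sketch with an assumed topology:
+// for Conv(I8) -> Softmax(FP32) a ScaleShift is inserted on the edge between the two
+// layers, and symmetrically for FP32 -> int8 edges. An I8 convolution whose output
+// data is already FP32 is left alone, since MKLDNNPlugin generates an x8->f32
+// convolution for it.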
 
@@ -267,478 +571,363 @@ void CNNNetworkInt8Normalizer::ScaleDataToInt(const float* srcData, size_t srcSi
     }
 }
 
-NetworkNodeStatsPtr CNNNetworkInt8Normalizer::mergeNetworkNodesStats(std::vector<NetworkNodeStatsPtr> stats) {
-    int c = stats[0]->_maxOutputs.size();
-    for (auto s : stats) {
-        if (s->_maxOutputs.size() != c || s->_minOutputs.size() != c) {
-            THROW_IE_EXCEPTION << "Inconsistent stats";
+CNNLayer::Ptr CNNNetworkInt8Normalizer::createDWConvolutionForScale(const std::string &layerName, size_t channels, float *ssWValues, float *ssSValues) {
+    // create new Convolution layer
+    LayerParams params;
+    params.name = layerName;
+    params.precision = Precision::FP32;
+    params.type = "Convolution";
+
+    CNNLayerPtr lptr = std::make_shared<ConvolutionLayer>(params);
+    ConvolutionLayer *pConv = dynamic_cast<ConvolutionLayer *>(lptr.get());
+
+    pConv->_kernel.insert(X_AXIS, 1);
+    pConv->_kernel.insert(Y_AXIS, 1);
+    pConv->_stride.insert(X_AXIS, 1);
+    pConv->_stride.insert(Y_AXIS, 1);
+    pConv->_padding.insert(X_AXIS, 0);
+    pConv->_padding.insert(Y_AXIS, 0);
+    pConv->_pads_end.insert(X_AXIS, 0);
+    pConv->_pads_end.insert(Y_AXIS, 0);
+    pConv->_dilation.insert(X_AXIS, 1);
+    pConv->_dilation.insert(Y_AXIS, 1);
+
+    pConv->_out_depth = channels;
+    // mkl-dnn does not have an i8 depthwise convolution accepting signed i8 input;
+    // when it becomes available, the lines below need to be uncommented
+
+    // workaround - creation of new weights for simple convolution
+    if (pConv->_out_depth % 16 == 0) {
+        pConv->_group = pConv->_out_depth / 16;
+        Blob::Ptr weights = nullptr;
+        std::shared_ptr<Data> wData = std::shared_ptr<Data>(new Data("weights", { pConv->_out_depth * 16 }, Precision::FP32, Layout::C));
+        weights = CreateBlobFromData(wData);
+        weights->allocate();
+        float *buffer = weights->buffer().as<float *>();
+        size_t iDist = 0, iSrc = 0;
+        for (size_t g = 0; g < pConv->_group; g++) {
+            for (size_t k = 0; k < 16; k++) {
+                for (size_t s = 0; s < 16; s++) {
+                    buffer[iDist++] = (s == k) ? ssWValues[iSrc++] : 0.f;
+                }
+            }
         }
-    }
-
-    NetworkNodeStatsPtr res = NetworkNodeStatsPtr(new NetworkNodeStats(c));
-    for (int i = 0; i < c; i++) {
-        float globalMin = stats[0]->_minOutputs[i], globalMax = stats[0]->_maxOutputs[i];
-        for (auto s : stats) {
-            if (s->_maxOutputs[i] > globalMax) globalMax = s->_maxOutputs[i];
-            if (s->_minOutputs[i] < globalMin) globalMin = s->_minOutputs[i];
+        pConv->_weights = weights;
+        pConv->blobs["weights"] = weights;
+    } else {
+        Blob::Ptr weights = nullptr;
+        std::shared_ptr<Data> wData = std::shared_ptr<Data>(new Data("weights", { pConv->_out_depth * pConv->_out_depth }, Precision::FP32, Layout::C));
+        weights = CreateBlobFromData(wData);
+        weights->allocate();
+        float *buffer = weights->buffer().as<float *>();
+        for (size_t i = 0, idx = 0; i < pConv->_out_depth; i++) {
+            for (size_t j = 0; j < pConv->_out_depth; j++) {
+                if (i == j) {
+                    buffer[idx] = ssWValues[i];
+                } else {
+                    buffer[idx] = 0.f;
+                }
+                idx++;
+            }
         }
-        res->_minOutputs[i] = globalMin;
-        res->_maxOutputs[i] = globalMax;
+        pConv->_weights = weights;
+        pConv->blobs["weights"] = weights;
+        pConv->_group = 1;
     }
-
-    return res;
-}
-
-InferenceEngine::Blob::Ptr CNNNetworkInt8Normalizer::calculateScaleFactor(const std::string& name, size_t channels,
-                                                                          std::vector<NetworkNodeStatsPtr> stats, int maxInt) {
-    for (int k = 0; k < stats.size(); k++) {
-        if (stats[k]->_minOutputs.size() != channels || stats[k]->_maxOutputs.size() != channels) {
-            THROW_IE_EXCEPTION << "min and max sizes should be equal to channels count";
-        }
+    // end of workaround
+
+    // filling of biases
+    Blob::Ptr biasesBlob = nullptr;
+    std::shared_ptr<Data> bData = std::shared_ptr<Data>(new Data("biases", { pConv->_out_depth }, Precision::FP32, Layout::C));
+    biasesBlob = CreateBlobFromData(bData);
+    biasesBlob->allocate();
+    float *bufferBiases = biasesBlob->buffer().as<float *>();
+    for (size_t c = 0; c < pConv->_out_depth; c++) {
+        bufferBiases[c] = ssSValues[c];
     }
+    pConv->_biases = biasesBlob;
 
-    // Creating i-scale blob
-    std::shared_ptr<Data> iScaleData = std::shared_ptr<Data>(new Data(name, { channels }, Precision::FP32, Layout::C));
-    auto iScale = CreateBlobFromData(iScaleData);
-    iScale->allocate();
-    float* iScaleMemory = static_cast<float*>(iScale->buffer());
-
-    for (int c = 0; c < channels; c++) {
-        float maxc = 0;
-        for (int k = 0; k < stats.size(); k++) {
-            // maxc = fmax(maxc, fabs(stats[k]->_minOutputs[c]));        // TODO Check if we should take minimums into account
-            maxc = fmax(maxc, fabs(stats[k]->_maxOutputs[c]));
-        }
-
-        iScaleMemory[c] = maxc / static_cast<float>(maxInt);
-
-        if (fabs(iScaleMemory[c]) < 1e-7) {
-            iScaleMemory[c] = 1.0f;
-        }
-    }
-    return iScale;
+    pConv->blobs["weights"] = pConv->_weights;
+    pConv->blobs["biases"] = pConv->_biases;
+    return lptr;
 }
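+
+// Layout of the weights produced above, sketched for channels = 32 (so _group = 2;
+// the numbers are illustrative): each group holds a 16x16 block that is zero except
+// for its diagonal, which carries the per-channel ScaleShift weights:
+//
+//   buffer[g * 16 * 16 + k * 16 + s] = (s == k) ? ssWValues[g * 16 + k] : 0.f;
+//
+// so the 1x1 grouped convolution multiplies channel c by ssWValues[c], and the bias
+// blob adds ssSValues[c], reproducing the original ScaleShift.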
 
-std::vector<NetworkNodeStatsPtr> splitStats(NetworkNodeStatsPtr stats, std::vector<size_t> channels) {
-    NetworkNodeStats s = *stats.get();  // Copying the stats
-    std::vector<NetworkNodeStatsPtr> res;
-
-    size_t j = 0;
-    for (size_t ci = 0; ci < channels.size(); ci++) {
-        NetworkNodeStatsPtr latest = NetworkNodeStatsPtr(new NetworkNodeStats(channels[ci]));
-        for (size_t k = 0; k < channels[ci]; k++) {
-            if (j > stats->_minOutputs.size()) THROW_IE_EXCEPTION << "Incorrect stats or channels";
-            latest->_minOutputs[k] = stats->_minOutputs[j];
-            latest->_maxOutputs[k] = stats->_maxOutputs[j];
-            j++;
-        }
-        res.push_back(latest);
-    }
-    return res;
-}
-
-void CNNNetworkInt8Normalizer::ConvertToInt8(int maxSign, int maxUnsign, CNNNetwork& net, const std::map<std::string, NetworkNodeStatsPtr>& netNodesStats) {
-    std::map<std::string, NetworkNodeStatsPtr> internalNodesStats = netNodesStats;
+void CNNNetworkInt8Normalizer::replaceScaleShiftByDWConvolution(CNNNetwork &net) {
     std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);
-
-    // Back iterating the network, searching for the "Eltwise-driven subnets"
-    // Eltwise-driven subnet is a subnet which ends with an Eltwise and starts with I8-convolution.
-    // All the nodes in the subnet should have the same o-scales
-    std::vector<CNNLayerPtr> backSortedLayers = sortedLayers;
-    std::reverse(std::begin(backSortedLayers), std::end(backSortedLayers));
-
-    std::set<CNNLayerPtr> skippedEltwises;
-    // Back propagating statistics
-    std::set<CNNLayerPtr> eltwisesProcessed;
-    for (auto iter : backSortedLayers) {
-        if (iter->params.find("quantization_level") != iter->params.end() && iter->params["quantization_level"] == "FP32") {
-            continue;
-        }
-        if (internalNodesStats.find(iter->name) == internalNodesStats.end()) {
-            continue;
-        }
-
-        if (iter->type == "Eltwise") {
-            // Counting Eltwises in a row
-            std::set<CNNLayerPtr> eltwisesSequence;
-            CNNLayerPtr ptr = iter;
-            bool added;
-            do {
-                added = false;
-                for (auto& n : ptr->insData) {
-                    CNNLayerPtr in = n.lock()->creatorLayer.lock();
-                    if (in->type == "ReLU") {
-                        in = in->insData[0].lock()->creatorLayer.lock();
-                    }
-
-                    if (in->type == "Eltwise") {
-                        ptr = in;
-                        eltwisesSequence.insert(in);
-                        added = true;
-                    }
+    for (auto layer : sortedLayers) {
+        if (CaselessEq<std::string>()(layer->type, "scaleshift")
+            && layer->insData[0].lock()->creatorLayer.lock()
+            && !CaselessEq<std::string>()(layer->insData[0].lock()->creatorLayer.lock()->type, "input")
+            && layer->outData[0]->inputTo.size() > 0) {
+            // verification if this layer does not pass data to PriorBox, if it passes, we do not substitute
+            bool notToPriorBox = true;
+            for (auto o : layer->outData[0]->inputTo) {
+                if (CaselessEq<std::string>()(o.second->type, "priorbox") ||
+                    CaselessEq<std::string>()(o.second->type, "priorboxclustered")) {
+                    notToPriorBox = false;
                 }
-            } while (added);
-
-            if (eltwisesSequence.size() > 5) {
-                skippedEltwises.insert(eltwisesSequence.begin(), eltwisesSequence.end());
+            }
+            if (notToPriorBox) {
+                ScaleShiftLayer *pSS = dynamic_cast<ScaleShiftLayer *>(layer.get());
+                float *ssWValues = pSS->_weights->buffer().as<float *>();
+                float *ssSValues = pSS->_biases->buffer().as<float *>();
+                CNNLayer::Ptr newLayer = createDWConvolutionForScale(layer->name, layer->outData[0]->getTensorDesc().getDims()[1], ssWValues, ssSValues);
+
+                newLayer->outData = layer->outData;
+                newLayer->outData[0]->creatorLayer = newLayer;
+                newLayer->insData = layer->insData;
+                newLayer->insData[0].lock()->inputTo.erase(layer->name);
+                newLayer->insData[0].lock()->inputTo[newLayer->name] = newLayer;
             }
         }
+    }
+}
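+
+// The substituted convolution computes exactly the ScaleShift transform; for an NCHW
+// tensor x (shapes illustrative):
+//
+//   y[n][c][h][w] = ssWValues[c] * x[n][c][h][w] + ssSValues[c]
+//
+// which is what the 1x1 convolution with the diagonal weights from
+// createDWConvolutionForScale plus the ssSValues biases produces.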
 
-        if (iter->type == "Eltwise" &&
-                eltwisesProcessed.find(iter) == eltwisesProcessed.end() &&
-                internalNodesStats.find(iter->name) != internalNodesStats.end()) {
-            eltwisesProcessed.insert(iter);
+void CNNNetworkInt8Normalizer::QuantizeConvolution(CNNLayer::Ptr convolution,
+                                                    CNNStatisticHelper& statHelper) {
+    size_t inputChannels = convolution->insData[0].lock()->getTensorDesc().getDims()[1];
+    size_t outputChannels = convolution->outData[0]->getTensorDesc().getDims()[1];
 
-            // Collecting all the convolutions that starts the "Eltwise-driven subnet"
-            std::set<DataPtr> edgesToWatch;
-            std::map<DataPtr, NetworkNodeStatsPtr> edgeStats;
+    auto iScale = statHelper.getInputScale(convolution);
 
-            for (auto in : iter->insData) {
-                edgesToWatch.insert(in.lock());
-            }
+    convolution->blobs["i-scale"] = iScale;
 
-            // Add the statistics of the Eltwise to each edge
-            for (auto e : edgesToWatch) {
-                NetworkNodeStatsPtr p = internalNodesStats.find(iter->name)->second;
-                edgeStats.insert({ e, p });
-            }
+    Blob::Ptr weights = nullptr;
+    Blob::Ptr biases = nullptr;
 
-            do {
-                std::set<DataPtr> previousETW = edgesToWatch;
-                // For each LayerToWatch processing all its direct inputs
-                for (auto e : previousETW) {
-                    auto prevLayer = e->creatorLayer.lock();
-                        if (internalNodesStats.find(prevLayer->name) != internalNodesStats.end()) {
-                            if (prevLayer->type == "Convolution") {
-                            // Setting the current node's stats to the stats saved for the edge after it
-                            internalNodesStats[prevLayer->name] = edgeStats[e];
-#ifndef NDEBUG
-                            std::cout << "Propagated stats from " << e->name << " to " << prevLayer->name
-                                    << "(" << internalNodesStats[prevLayer->name]->_maxOutputs[0] << ")" << std::endl;
-#endif
-                            } else if (prevLayer->type == "Eltwise") {
-                                eltwisesProcessed.insert(prevLayer);
-                            // Setting the current node's stats to the stats saved for the edge after it
-                            internalNodesStats[prevLayer->name] = edgeStats[e];
-#ifndef NDEBUG
-                            std::cout << "Propagated stats from " << e->name << " to " << prevLayer->name
-                                    << "(" << internalNodesStats[prevLayer->name]->_maxOutputs[0] << ")" << std::endl;
-#endif
-                            for (auto ee : prevLayer->insData) {
-                                // Adding the edges before the node to the watch list
-                                edgesToWatch.insert(ee.lock());
-                                // Propagating the stats upwards
-                                edgeStats.insert({ ee.lock(), internalNodesStats[prevLayer->name] });
-                            }
-                            } else if (prevLayer->type == "Pooling") {
-                            // Setting the current node's stats to the stats saved for the edge after it
-                            internalNodesStats[prevLayer->name] = edgeStats[e];
-#ifndef NDEBUG
-                            std::cout << "Propagated stats from " << e->name << " to " << prevLayer->name
-                                    << "(" << internalNodesStats[prevLayer->name]->_maxOutputs[0] << ")" << std::endl;
-#endif
-                            for (auto ee : prevLayer->insData) {
-                                // Adding the edges before the node to the watch list
-                                edgesToWatch.insert(ee.lock());
-                                // Propagating the stats upwards
-                                edgeStats.insert({ ee.lock(), internalNodesStats[prevLayer->name] });
-                            }
-                        } else if (prevLayer->type == "ReLU") {
-                            // Setting the current node's stats to the stats saved for the edge after it
-                            internalNodesStats[prevLayer->name] = NetworkNodeStatsPtr(new NetworkNodeStats(*edgeStats[e].get()));
-                            for (auto& mo : internalNodesStats[prevLayer->name]->_minOutputs) {
-                                mo = 0;
-                        }
-#ifndef NDEBUG
-                            std::cout << "Propagated stats from " << e->name << " to " << prevLayer->name
-                                    << ", zeroing the minimal values" << "(" << internalNodesStats[prevLayer->name]->_maxOutputs[0] << ")" << std::endl;
-#endif
-                            for (auto ee : prevLayer->insData) {
-                                // Adding the edges before the node to the watch list
-                                edgesToWatch.insert(ee.lock());
-                                // Propagating the stats upwards
-                                edgeStats.insert({ ee.lock(), internalNodesStats[prevLayer->name] });
-                    }
-                        } else if (prevLayer->type == "Concat") {
-                            // Setting the current node's stats to the stats saved for the edge after it
-                            internalNodesStats[prevLayer->name] = edgeStats[e];
-#ifndef NDEBUG
-                            std::cout << "Propagated stats from " << e->name << " to " << prevLayer->name << "("
-                                    << internalNodesStats[prevLayer->name]->_maxOutputs[0] << ")" << std::endl;
-#endif
-                            // Getting the inputs channels counts
-                            std::vector<size_t> inputsChannels;
-                            for (auto i : prevLayer->insData) {
-                                size_t channels = i.lock()->getTensorDesc().getDims()[1];
-                                inputsChannels.push_back(channels);
-                            }
+    Blob::Ptr int8weights = nullptr;
+    Blob::Ptr int32biases = nullptr;
 
-                            // Splitting the stats to feed them upwards the Concat inputs
-                            std::vector<NetworkNodeStatsPtr> inStats = splitStats(internalNodesStats[prevLayer->name], inputsChannels);
-                            auto in = prevLayer->insData.begin();
-                            for (size_t i = 0; i < inStats.size(); i++) {
-                                edgeStats.insert({ in->lock(), inStats[i] });
-                                // Adding the edges before the node to the watch list
-                                edgesToWatch.insert(in->lock());
-                                in++;
-                            }
-
-                        } else {
-                            // Setting the current node's stats to the stats saved for the edge after it
-                            internalNodesStats[prevLayer->name] = edgeStats[e];
-                            for (auto ee : prevLayer->insData) {
-                                // Adding the edges before the node to the watch list
-                                edgesToWatch.insert(ee.lock());
-                                // Propagating the stats upwards
-                                edgeStats.insert({ ee.lock(), internalNodesStats[prevLayer->name] });
-                    }
-                }
-            }
+    if (convolution->blobs.find("weights") != convolution->blobs.end()) {
+        weights = convolution->blobs["weights"];
 
-                    edgesToWatch.erase(e);
-                }
-            } while (!edgesToWatch.empty());
-        }
+        // Creating int8 weights blob
+        std::shared_ptr<Data> int8WeightsData = std::shared_ptr<Data>(new Data("weights", weights->dims(), Precision::I8, weights->layout()));
+        int8weights = CreateBlobFromData(int8WeightsData);
+        int8weights->allocate();
+        convolution->blobs["weights"] = int8weights;
     }
 
-    // Converting layers to Int8. Calculating the multipliers if needed
-    for (auto iter : sortedLayers) {
-        if (iter->params.find("quantization_level") != iter->params.end() && iter->params["quantization_level"] == "FP32") {
-            continue;
-        }
-        if (internalNodesStats.find(iter->name) == internalNodesStats.end()) {
-            continue;
-        }
+    if (convolution->blobs.find("biases") != convolution->blobs.end()) {
+        biases = convolution->blobs["biases"];
 
-        if (iter->type == "Eltwise") {
-            if (skippedEltwises.find(iter) != skippedEltwises.end()) {
-#ifndef NDEBUG
-                std::cout << "Skipping Eltwise " << iter->name << " conversion" << std::endl;
-#endif
-                continue;
-            }
+        // Creating int8 biases blob
+        std::shared_ptr<Data> int32BiasesData = std::shared_ptr<Data>(new Data("biases", biases->dims(), Precision::I32, biases->layout()));
+        int32biases = CreateBlobFromData(int32BiasesData);
+        int32biases->allocate();
+        convolution->blobs["biases"] = int32biases;
+    }
 
-            auto eltw = dynamic_cast<EltwiseLayer*>(iter.get());
-            if (eltw == nullptr) THROW_IE_EXCEPTION << "Can't interpret " << iter->name << " as an Eltwise layer";
+    std::vector<float> weightScalers;
 
-            // Checking if all the previous layers are I8
-            bool canConvert = true;
-            for (auto in : iter->insData) {
-                auto previousLayer = in.lock()->creatorLayer.lock();
-                if (previousLayer->precision != Precision::I8) {
-                    // If the precision isn't I8, we don't convert the Eltwise
-                    canConvert = false;
-                }
-            }
 
-            if (canConvert && eltw->_operation == EltwiseLayer::eOperation::Sum) {
-                // Mark it I8
-                iter->precision = Precision::I8;
-                if (iter->outData[0]->inputTo.size() == 1 &&
-                    iter->outData[0]->inputTo.begin()->second->type == "ReLU") {
-                    auto reluLayer = iter->outData[0]->inputTo.begin()->second;
+    // Creating w-scale blob
+    if (weights) {
+        const float *weight = static_cast<const float *>(weights->buffer());
 
-                    // Signed int8 between Eltwise and ReLU
-                    for (auto&& out : iter->outData) {
-                        out->precision = Precision::I8;
-                    }
+        ConvolutionLayer *pConv = dynamic_cast<ConvolutionLayer *>(convolution.get());
+        if (pConv->_group == 0) {
+            THROW_IE_EXCEPTION << "Convolution '" << convolution->name << "' has wrong groups number == 0";
+        }
+
+        std::vector<float> newWeights;  // "new" weights are weights multiplied by i-scale
 
-                    // ReLU after Eltwise is being set to signed int8 type unlike ReLU after a Convolution.
-                    // This is the best way to support Eltwise-ReLU-Eltwise chain (that is common in ResNet-like nets)
-                    reluLayer->precision = Precision::I8;
+        size_t W_CO = outputChannels / pConv->_group,
+               W_CI = inputChannels / pConv->_group,
+               W_HW = weights->size() / W_CI / W_CO / pConv->_group;
 
-                    // Signed int8 after ReLU
-                    for (auto&& out : reluLayer->outData) {
-                        out->precision = Precision::I8;
+        {
+            float *iScaleMemory = static_cast<float *>(iScale->buffer());
+            for (size_t g = 0; g < pConv->_group; g++) {
+                for (size_t co = 0; co < W_CO; co++) {
+                    for (size_t ci = 0; ci < W_CI; ci++) {
+                        size_t kernelBase = g * W_CO * W_CI * W_HW + co * W_CI * W_HW + ci * W_HW;
+                        for (size_t hw = 0; hw < W_HW; hw++) {
+                            newWeights.push_back(weight[kernelBase + hw] * iScaleMemory[g * W_CI + ci]);
+                        }
                     }
                 }
             }
-        } else if (iter->type == "Convolution") {
-            size_t inputChannels = iter->insData[0].lock()->dims[2];
-            size_t outputChannels = iter->outData[0]->dims[2];
-
-            auto previousLayer = iter->insData[0].lock()->creatorLayer.lock();
-            std::string inputLayerName = previousLayer->name;
-
-            // for case when we have the only average pooling before, we need to take this
-            // statistic from input of avg pooloing to compensate work of average pooling
-            // and to stay in int8 as much as we can
-            if (previousLayer->type == "Pooling" && (previousLayer->precision == Precision::I8 || previousLayer->precision == Precision::U8)) {
-                // take input name to the pooling
-                inputLayerName = previousLayer->insData[0].lock()->creatorLayer.lock()->name;
-            }
-
-
-            if (internalNodesStats.find(inputLayerName) == internalNodesStats.end()) {
-                THROW_IE_EXCEPTION << "No stats for layer " << inputLayerName;
-            }
-
-            // Checking the topology
-            if (iter->outData.size() != 1) {
-                THROW_IE_EXCEPTION << "Strange convolution with multiple outputs";
-            }
-
-            // Checking if we have negative inputs
-            float min_inp = 0;
-            for (int c = 0; c < inputChannels; c++) {
-                if (internalNodesStats.at(inputLayerName)->_minOutputs[c] < min_inp)
-                    min_inp = internalNodesStats.at(inputLayerName)->_minOutputs[c];
-            }
-            // Layer has negative input and can't be converted to INT8
-            if (min_inp < 0) {
-                continue;
-            }
+        }
+        size_t outChannelSize = weights->dims()[0] / W_CO / pConv->_group;
 
-            auto iScale = calculateScaleFactor("i-scale", inputChannels, { internalNodesStats.at(inputLayerName) }, maxUnsign);
-            iter->blobs["i-scale"] = iScale;
+        // Calculating weights normalization scale factor (w-scale)
+        float *weight_convolution;
+        size_t co;
+        for (co = 0, weight_convolution = &newWeights[0]; co < outputChannels; co++, weight_convolution += outChannelSize) {
+            float max = FLT_MIN;
+            DataStats::GetDataAbsMax(weight_convolution, outChannelSize, max);
 
-            Blob::Ptr weights = nullptr;
-            Blob::Ptr biases = nullptr;
+            float scaler = static_cast<float>(statHelper.getMaxSignValue()) / max;
+            weightScalers.push_back(scaler);
+        }
 
-            Blob::Ptr int8weights = nullptr;
-            Blob::Ptr int32biases = nullptr;
+        std::shared_ptr<Data> wScaleData = std::shared_ptr<Data>(new Data("w-scale", { outputChannels }, Precision::FP32, Layout::C));
+        auto wScale = CreateBlobFromData(wScaleData);
+        wScale->allocate();
 
-            if (iter->blobs.find("weights") != iter->blobs.end()) {
-                weights = iter->blobs["weights"];
+        float *wScaleMemory = static_cast<float *>(wScale->buffer());
 
-                // Creating int8 weights blob
-                std::shared_ptr<Data> int8WeightsData = std::shared_ptr<Data>(new Data("weights", weights->dims(), Precision::I8, weights->layout()));
-                int8weights = CreateBlobFromData(int8WeightsData);
-                int8weights->allocate();
-                iter->blobs["weights"] = int8weights;
-            }
-
-            if (iter->blobs.find("biases") != iter->blobs.end()) {
-                biases = iter->blobs["biases"];
+        for (size_t i = 0; i < outputChannels; i++) {
+            wScaleMemory[i] = 1.0 / weightScalers[i];
+        }
+        convolution->blobs["w-scale"] = wScale;
 
-                // Creating int8 biases blob
-                std::shared_ptr<Data> int32BiasesData = std::shared_ptr<Data>(new Data("biases", biases->dims(), Precision::I32, biases->layout()));
-                int32biases = CreateBlobFromData(int32BiasesData);
-                int32biases->allocate();
-                iter->blobs["biases"] = int32biases;
-            }
+        auto oScale = statHelper.getOutputScale(statHelper.getLatestInFuse(convolution));
+        convolution->blobs["o-scale"] = oScale;
 
-            std::vector<float> weightScalers;
+        // debug scales. Need to compare with actual values in FP32 scoring
+        convolution->blobs["ext-scale"] = convolution->blobs["o-scale"];
 
+        // Normalizing the weights
+        ScaleDataToInt(&newWeights[0], weights->size(), int8weights, weightScalers);
+    }
 
-            // Creating w-scale blob
-            if (weights) {
-                const float* weight = static_cast<const float*>(weights->buffer());
+    // Normalizing the biases
+    if (biases) {
+        const float *bias = static_cast<const float *>(biases->buffer());
+        ScaleDataToInt(bias, biases->size(), int32biases, weightScalers);
+    }
+}
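+
+// Scale bookkeeping of the quantization above, written out as formulas (W and B are
+// the original FP32 weights and biases, co indexes output channels):
+//
+//   newW          = W * i-scale                   // input scale folded into weights
+//   w-scaler[co]  = maxSign / max|newW[co, :]|    // per-output-channel normalizer
+//   W_int8        = round(newW * w-scaler)        // done by ScaleDataToInt
+//   w-scale[co]   = 1 / w-scaler[co]              // stored for the plugin
+//   B_int32       = round(B * w-scaler)           // biases share the weight scalers
+//
+// so the plugin can recover the FP32 result by multiplying the int8 accumulator by
+// w-scale, while o-scale describes the quantized range expected by the next consumer.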
 
-                ConvolutionLayer* pConv = dynamic_cast<ConvolutionLayer*>(iter.get());
-                if (pConv->_group == 0) {
-                    THROW_IE_EXCEPTION << "Convolution '" << iter->name << "'has wrong groups number == 0";
-                }
+void CNNNetworkInt8Normalizer::returnTailToFP32(CNNLayer::Ptr layer) {
+    std::set<CNNLayer::Ptr> layersToReturn;
+    layersToReturn.insert(layer);
+    while (!layersToReturn.empty()) {
+        CNNLayer::Ptr layerA = *layersToReturn.begin();
+        layersToReturn.erase(layerA);
+        // 1. if it is a Pooling or Concat layer, we can return it to FP32 as well;
+        // we need to return its out data too
+        if ((CaselessEq<std::string>()(layerA->type, "pooling")
+            || CaselessEq<std::string>()(layerA->type, "concat")) &&
+            layerA->outData.size() == 1) {
+            layerA->precision = Precision::FP32;
+            layerA->outData[0]->setPrecision(Precision::FP32);
+        }
 
-                std::vector<float> newWeights;  // "new" weights are weights multiplied by i-scale
+        if ((CaselessEq<std::string>()(layerA->type, "convolution")
+            || CaselessEq<std::string>()(layerA->type, "relu")) &&
+            layerA->outData.size() == 1) {
+            layerA->outData[0]->setPrecision(Precision::FP32);
+        }
 
-                size_t W_CO = outputChannels / pConv->_group,
-                        W_CI = inputChannels / pConv->_group,
-                        W_HW = weights->dims()[0] / W_CI / W_CO / pConv->_group;
 
-                {
-                    float* iScaleMemory = static_cast<float*>(iScale->buffer());
-                    for (size_t g = 0; g < pConv->_group; g++) {
-                        for (size_t co = 0; co < W_CO; co++) {
-                            for (size_t ci = 0; ci < W_CI; ci++) {
-                                size_t kernelBase = g * W_CO * W_CI * W_HW + co * W_CI * W_HW + ci * W_HW;
-                                for (size_t hw = 0; hw < W_HW; hw++) {
-                                    newWeights.push_back(weight[kernelBase + hw] * iScaleMemory[g * W_CI + ci]);
-                                }
-                            }
+        // adding parents for analysis
+        if (!CaselessEq<std::string>()(layerA->type, "convolution")) {
+            // for all parents, if they produce data only to FP32 layers
+            for (auto i : layerA->insData) {
+                DataPtr d = i.lock();
+                if (d->creatorLayer.lock()->precision != Precision::FP32
+                    && (CaselessEq<std::string>()(layerA->type, "pooling")
+                        || CaselessEq<std::string>()(layerA->type, "relu")
+                        || CaselessEq<std::string>()(layerA->type, "concat"))) {
+                    // check if the layer produces data only to FP32 consumers
+                    bool consumersFP32 = true;
+                    for (auto dOut : d->inputTo) {
+                        if (dOut.second->precision != Precision::FP32) {
+                            consumersFP32 = false;
                         }
                     }
+                    if (consumersFP32) {
+                        layersToReturn.insert(d->creatorLayer.lock());
+                    }
                 }
-                size_t outChannelSize = weights->dims()[0] / W_CO / pConv->_group;
-
-                // Calculating weights normalization scale factor (w-scale)
-                float* weight_iter;
-                size_t co;
-                for (co = 0, weight_iter = &newWeights[0]; co < outputChannels; co++, weight_iter += outChannelSize) {
-                    float max = FLT_MIN;
-                    DataStats::GetDataAbsMax(weight_iter, outChannelSize, max);
-
-                    float scaler = static_cast<float>(maxSign) / max;
-                    weightScalers.push_back(scaler);
-                }
-
-                std::shared_ptr<Data> wScaleData = std::shared_ptr<Data>(new Data("w-scale", { outputChannels }, Precision::FP32, Layout::C));
-                auto wScale = CreateBlobFromData(wScaleData);
-                wScale->allocate();
-
-                float* wScaleMemory = static_cast<float*>(wScale->buffer());
-
-                for (size_t i = 0; i < outputChannels; i++) {
-                    wScaleMemory[i] = 1.0 / weightScalers[i];
-                }
-                iter->blobs["w-scale"] = wScale;
+            }
+        }
+    }
+}
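+
+// An illustrative walk of the routine above: given Conv(I8) -> Pooling(I8) -> Concat
+// where the concat also has an FP32 producer on another input, the Pooling and the
+// Concat are flipped back to FP32, and the walk continues upwards through
+// pooling/relu/concat parents as long as all consumers of their data are already
+// FP32; it stops at the convolution, whose output data is merely re-marked as FP32.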
 
-                // Creating o-scale blob
-                // verify if there is ReLU just after the convolution, in this case
-                // we will normalize only positive values to maxUnsign
-                // anther decision - we will not propagate o-scale for cases if we do not have
-                // conv-relu pattern because there is no sense right now to normalize to sihned I8
-                // no primitives can process such input for a while
-                if (iter->outData[0]->inputTo.size() == 1 &&
-                    iter->outData[0]->inputTo.begin()->second->type == "ReLU") {
-                    auto reluLayer = iter->outData[0]->inputTo.begin()->second;
+bool CNNNetworkInt8Normalizer::isNextFusionAllowed(CNNLayer::Ptr layer) const {
+    // fusion can happen only if the initial layer supplies data to exactly one layer;
+    // if it sends data to several layers, it is safe to execute the initial layer in any precision
+    if (layer->outData[0]->inputTo.size() == 1) {
+        std::string aType = layer->outData[0]->inputTo.begin()->second->type;
+        if (CaselessEq<std::string>()(aType, "relu")) {
+            ReLULayer *rL = dynamic_cast<ReLULayer *>(layer->outData[0]->inputTo.begin()->second.get());
+            if (rL->negative_slope != 0.f) {
+                return false;
+            }
+        } else {
+            static const InferenceEngine::details::caseless_set<std::string> nonSupportedActivations =
+                {"elu", "clamp", "tanh", "logistic", "square", "abs",
+                 "sqrt", "linear", "bounded_elu", "sort_relu", "relu6"};
+            return nonSupportedActivations.find(aType) == nonSupportedActivations.end();
+        }
+    }
+    return true;
+}
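+
+// Examples of the rule above (topologies illustrative): Conv -> ReLU with
+// negative_slope == 0 keeps the fusion allowed; Conv -> ReLU(negative_slope = 0.1),
+// i.e. leaky ReLU, blocks it, as does any activation from the unsupported set, such
+// as Conv -> Clamp. A convolution feeding several consumers always returns true,
+// since no fusion is attempted across a branching point.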
 
-                    auto oScale = calculateScaleFactor("o-scale", outputChannels, { internalNodesStats.at(reluLayer->name) }, maxUnsign);
-                    iter->blobs["o-scale"] = oScale;
+void CNNNetworkInt8Normalizer::DefinesExecutionPrecision(CNNNetwork &net, CNNStatisticHelper &statHelper) {
+    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);
 
-                    // Unsigned int8 precision for ReLU
-                    reluLayer->precision = Precision::U8;
+    // Converting layers to Int8. Calculating the multipliers if needed
+    for (auto iter : sortedLayers) {
+        if (iter->params.find("quantization_level") != iter->params.end() && iter->params["quantization_level"] == "FP32") {
+            continue;
+        }
 
-                } else {
-                    auto oScale = calculateScaleFactor("o-scale", outputChannels, { internalNodesStats.at(iter->name) }, maxUnsign);
-                    iter->blobs["o-scale"] = oScale;
-                }
+        if (statHelper.canLayerBeQuantized(iter->name)) {
+            continue;
+        }
 
+        if (CaselessEq<std::string>()(iter->type, "convolution")) {
+            if (isNextFusionAllowed(iter)) {
                 iter->precision = Precision::I8;
-                // Normalizing the weights
-                ScaleDataToInt(&newWeights[0], weights->size(), int8weights, weightScalers);
+                // we will override I8 to U8 while analysing the Conv-ReLU and Conv-Sum-ReLU fusions
+                iter->outData[0]->setPrecision(Precision::I8);
             }
-
-            // Normalizing the biases
-            if (biases) {
-                const float* bias = static_cast<const float*>(biases->buffer());
-                ScaleDataToInt(bias, biases->size(), int32biases, weightScalers);
+        } else if (CaselessEq<std::string>()(iter->type, "relu")) {
+            // casting to ReLU
+            ReLULayer *rL = dynamic_cast<ReLULayer *>(iter.get());
+            DataPtr outData = iter->outData.size() ? iter->outData[0] : nullptr;
+            if (iter->insData[0].lock()->creatorLayer.lock()->precision != Precision::FP32
+                && outData->getPrecision() == Precision::FP32) {
+                iter->precision = Precision::I8;
+                if (rL->negative_slope != 0.0f) {
+                    outData->setPrecision(Precision::I8);
+                } else {
+                    outData->setPrecision(Precision::U8);
+                    // if convolution is a predecessor, change its data to U8 also
+                    CNNLayer::Ptr prevLayer = iter->insData[0].lock()->creatorLayer.lock();
+                    if (prevLayer && CaselessEq<std::string>()(prevLayer->type, "convolution")) {
+                        iter->insData[0].lock()->setPrecision(Precision::U8);
+                    }
+                    // if there is a pattern A0 -> Eltwise -> ReLU with a Convolution -> Eltwise -> ReLU branch,
+                    // we need to mark the data after the convolution as U8
+                    if (prevLayer && CaselessEq<std::string>()(prevLayer->type, "eltwise")) {
+                        iter->insData[0].lock()->setPrecision(Precision::U8);
+                        // deciding which input will be used for the conv-sum-relu fusion
+                        CNNLayer::Ptr input1 = prevLayer->insData[0].lock()->creatorLayer.lock();
+                        CNNLayer::Ptr input2 = prevLayer->insData[1].lock()->creatorLayer.lock();
+                        CNNLayer::Ptr convLayer = nullptr;
+                        CNNLayer::Ptr sumLayer = nullptr;
+
+                        if (!CaselessEq<std::string>()(input1->type, "convolution")) {
+                            sumLayer = input1;
+                            convLayer = input2;
+                        } else {
+                            // this covers the case when the first input is a convolution, including when both inputs are convolutions
+                            convLayer = input1;
+                            sumLayer = input2;
+                        }
+                        convLayer->outData[0]->setPrecision(sumLayer->outData[0]->getPrecision());
+                    }
+                }
             }
-        } else if (iter->type == "Pooling") {
-            auto pool = dynamic_cast<PoolingLayer*>(iter.get());
+        } else if (CaselessEq<std::string>()(iter->type, "pooling")) {
+            auto pool = dynamic_cast<PoolingLayer *>(iter.get());
             if (pool && (pool->_type == PoolingLayer::MAX
-                || (pool->_type == PoolingLayer::AVG
-                    && pool->outData.size() == 1
-                    && pool->outData[0]->inputTo.size() == 1
-                    && pool->outData[0]->inputTo.begin()->second->type == "Convolution"))) {
+                         || (pool->_type == PoolingLayer::AVG
+                             && pool->outData.size() == 1))) {
                 auto prevLayer = iter->insData[0].lock()->creatorLayer.lock();
                 if (prevLayer && (prevLayer->precision == Precision::I8 || prevLayer->precision == Precision::U8)) {
                     iter->precision = Precision::I8;
-                    if (iter->outData.size() == 1) {
-                        for (auto&& out : iter->outData) {
-                            out->precision = Precision::U8;
-                        }
-                    }
-                }
-            }
-        } else if (iter->type == "Concat") {
-            bool allParentsInt = true;
-
-#ifndef NDEBUG
-            Precision p = iter->insData[0].lock()->precision;
-            for (auto inputData : iter->insData) {
-                if (inputData.lock()->precision != p) {
-                    std::cerr << "WARNING: We have a Concat " << iter->name << " whose inputs have different precisions" << std::endl;
-                }
-            }
-#endif
-
-            for (auto inputData : iter->insData) {
-                auto inPrecision = inputData.lock()->creatorLayer.lock()->precision;
-                if (inPrecision != Precision::I8 && inPrecision != Precision::U8) {
-                    allParentsInt = false;
+                    iter->outData[0]->setPrecision(
+                        statHelper.hasNegativeOutput(iter->name) ? Precision::I8 : Precision::U8);
                 }
             }
+        } else if (CaselessEq<std::string>()(iter->type, "concat")) {
+            // we can do a safe
             // casting to concat and take axis parameter
             // we can concat scales only if the concat does concatenation by feature maps
             bool axisFeatureMaps = false;
@@ -752,83 +941,154 @@ void CNNNetworkInt8Normalizer::ConvertToInt8(int maxSign, int maxUnsign, CNNNetw
             } else {
                 THROW_IE_EXCEPTION << "Int8 Normalizer error: cannot cast layer " << iter->name << " to concat";
             }
-            if (allParentsInt && axisFeatureMaps) {
-                iter->precision = Precision::I8;
-                if (iter->outData.size() == 1) {
-                    for (auto&& out : iter->outData) {
-                        out->precision = Precision::U8;
+
+            if (axisFeatureMaps) {
+                // verification of input data types
+                bool inputFP32 = false;
+                bool inputI8 = false;
+                bool inputU8 = false;
+
+                for (auto inputData : iter->insData) {
+                    auto data = inputData.lock();
+                    if (data->getPrecision() == Precision::FP32) {
+                        inputFP32 = true;
+                    } else if (data->getPrecision() == Precision::I8) {
+                        inputI8 = true;
+                    } else if (data->getPrecision() == Precision::U8) {
+                        inputU8 = true;
+                    } else {
+                        // Is it the case of a network input, i.e. passing I16 to concat?
+                        // TODO(amalyshe) handle inputs as a separate use case
+                        THROW_IE_EXCEPTION << "I8 normalizer: input data has unknown precision on the edge for concat: " << data->name;
                     }
                 }
-            } else {
-                for (auto&& id : iter->insData) {
-                    id.lock()->precision = Precision::FP32;
-                }
-            }
-        }
-    }
 
-    // Processing edges precisions
-    for (auto iter : sortedLayers) {
-       if (iter->params.find("quantization_level") != iter->params.end() && iter->params["quantization_level"] == "FP32") {
-           continue;
-       }
-       if (internalNodesStats.find(iter->name) == internalNodesStats.end()) {
-           continue;
-       }
-
-       if (iter->type == "Convolution") {
-           if (iter->outData[0]->inputTo.size() > 0) {
-               auto nextFirstLayer = iter->outData[0]->inputTo.begin()->second;
-
-               // If we have only a single ReLU after the convolution
-               if (iter->outData[0]->inputTo.size() == 1 && nextFirstLayer->type == "ReLU") {
-                   // Setting precision I8 between the convolution and ReLU
-                   // (this will be eliminated by the MKLDNNPlugin GraphOptimizer, but it's beautiful)
-                   iter->outData[0]->precision = Precision::I8;
-                   // If any integer output found, setting ReLU output to U8
-                   nextFirstLayer->outData[0]->precision = Precision::U8;
-
-               } else {
-                   // If there is no ReLU after the convolution...
-                   for (auto&& inTo : iter->outData[0]->inputTo) {
-                       if (inTo.second->precision == Precision::I8 || inTo.second->precision == Precision::U8) {
-                           // If any integer output found, setting the convolution output to I8
-                           iter->outData[0]->precision = Precision::I8;
-                           break;
-                       }
-                   }
-               }
-           }
-        } else if (iter->type == "Eltwise") {
-            if (iter->precision == Precision::I8) {
-                size_t outputChannels = iter->outData[0]->dims[2];
+                if (inputFP32) {
+                    for (auto i : iter->insData) {
+                        if (i.lock()->creatorLayer.lock()->precision != Precision::FP32) {
+                            returnTailToFP32(i.lock()->creatorLayer.lock());
+                        }
+                    }
+                } else {
+                    iter->precision = Precision::I8;
 
-                std::vector<NetworkNodeStatsPtr> stats;
-                stats.push_back(internalNodesStats.at(iter->name));
+                    // we set output precision to U8 only if all inputs are U8; otherwise it will be I8
+                    auto outputPrecision = (inputU8 && !inputI8) ? Precision::U8 : Precision::I8;
+
+                    // if we have inputs mixing I8 and U8, we have to insert a scale on the U8 edges to convert them to I8.
+                    // Yes, it leads to losing some precision and might cause some performance degradation
+                    // until we have a scale supporting s8/u8 input and s8/u8 output.
+                    if (inputU8 && inputI8) {
+                        // looking for all edges having U8
+                        for (size_t d = 0; d < iter->insData.size(); d++) {
+                            auto data = iter->insData[d].lock();
+                            if (data->getPrecision() == Precision::U8) {
+                                size_t c = static_cast<size_t>(data->getDims()[1]);
+
+                                std::vector<float> ssWValues;
+                                std::vector<float> ssSValues;
+                                for (auto i = 0; i < c; i++) {
+                                    ssWValues.push_back(1.0f);
+                                    ssSValues.push_back(0.0f);
+                                }
+                                std::string layerName = data->creatorLayer.lock()->name + "_ScaleShift_U8I8_" + iter->name;
+                                CNNLayer::Ptr newLayer = createDWConvolutionForScale(layerName, c, ssWValues.data(), ssSValues.data());
+                                newLayer->precision = Precision::I8;
+                                AddLayerToCNNNetworkBeforeLayer(newLayer, iter, d);
+
+                                // update the statistics so that quantization passes smoothly
+                                std::string inputLayerName = newLayer->insData[0].lock()->creatorLayer.lock()->name;
+                                statHelper.copyStatistics(inputLayerName, layerName);
+                                newLayer->outData[0]->setPrecision(Precision::I8);
+                            }
+                        }
+                    }
 
-                auto oScale = calculateScaleFactor("o-scale", outputChannels, stats, maxUnsign);
+                    if (iter->outData.size() == 1) {
+                        for (auto &&out : iter->outData) {
+                            out->setPrecision(outputPrecision);
+                        }
+                    }
+                }
+            }
+        } else if (CaselessEq<std::string>()(iter->type, "eltwise")) {
+            // we decide which of the layers will be in int-8 mode and initialize the special scale which will be used
+            // later in the "conv-sum-relu" fusion. i8 execution of eltwise always assumes this fusion
+            if (isNextFusionAllowed(iter)) {
+                if (iter->insData.size() == 2) {
+                    CNNLayer::Ptr input1 = iter->insData[0].lock()->creatorLayer.lock();
+                    CNNLayer::Ptr input2 = iter->insData[1].lock()->creatorLayer.lock();
+                    if ((CaselessEq<std::string>()(input1->type, "convolution")
+                         || CaselessEq<std::string>()(input2->type, "convolution")) &&
+                        !CaselessEq<std::string>()(input1->type, "concat") &&
+                        !CaselessEq<std::string>()(input2->type, "concat") &&
+                        input1->precision != Precision::FP32 &&
+                        input2->precision != Precision::FP32) {
+                        // understand which layer will be used for sum
+                        CNNLayer::Ptr sumLayer = nullptr;
+                        CNNLayer::Ptr convLayer = nullptr;
+
+                        if (!CaselessEq<std::string>()(input1->type, "convolution")) {
+                            sumLayer = input1;
+                            convLayer = input2;
+                        } else {
+                            // it covers the case when both inputs are convolutions or when the first input is not a convolution
+                            sumLayer = input2;
+                            convLayer = input1;
+                        }
 
-                size_t inputChannels = iter->insData[0].lock()->dims[2];
+                        // mark eltwise as I8 executable, mark its output data as I8
+                        iter->precision = Precision::I8;
+                        iter->outData[0]->setPrecision(Precision::I8);
+                        // calculate the single required scale
+                        Blob::Ptr sumLayerScales = statHelper.getOutputScale(sumLayer);
+                        Blob::Ptr convLayerScales = statHelper.getOutputScale(statHelper.getLatestInFuse(convLayer));
+                        float *sumScale = sumLayerScales->buffer().as<float *>();
+                        float *convScale = convLayerScales->buffer().as<float *>();
+                        for (size_t i = 0; i < sumLayerScales->size(); i++) {
+                            sumScale[i] /= convScale[i];
+                        }
 
-                for (auto inputData : iter->insData) {
-                    auto prevData = inputData.lock();
-                    auto prevLayer = prevData->creatorLayer.lock();
-                    prevData->precision = Precision::I8;
+                        iter->blobs["eltwise-sum-scale"] = sumLayerScales;
+                    }
                 }
+            } else {
+                // if any inputs to this eltwise are convolutions, we forcibly move them to FP32
+                for (auto i : iter->insData) {
+                    if (CaselessEq<std::string>()(i.lock()->creatorLayer.lock()->type, "convolution")) {
+                        i.lock()->creatorLayer.lock()->precision = Precision::FP32;
+                        i.lock()->setPrecision(Precision::FP32);
+                    }
+                }
+            }
+        }
+    }
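For illustration, the rescaling performed in the eltwise branch above boils down to the following minimal scalar sketch (the sumScale/convScale buffers mirror the code; the standalone helper itself is hypothetical):

    #include <vector>

    // In the conv-sum-relu fusion the convolution accumulates directly into the
    // buffer of its "sum" input. That buffer is quantized with the sum layer's
    // scale while the accumulator works in the convolution's output scale, so
    // each channel of the sum input is rescaled by sumScale[c] / convScale[c].
    static void rescaleSumInput(std::vector<float>& sumScale,
                                const std::vector<float>& convScale) {
        for (size_t c = 0; c < sumScale.size(); ++c)
            sumScale[c] /= convScale[c];  // later stored as the "eltwise-sum-scale" blob
    }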
 
-                // Setting the self oScale to the same as the previous convolutions
-                iter->blobs["o-scale"] = oScale;
-                iter->precision = Precision::I8;
+    // quantization of weights/biases
+    sortedLayers = CNNNetSortTopologically(net);
+    for (auto iter : sortedLayers) {
+        if (iter->precision == Precision::I8 && CaselessEq<std::string>()(iter->type, "convolution")) {
+            QuantizeConvolution(iter, statHelper);
+        }
+    }
 
-                for (auto&& out : iter->outData) {
-                    out->precision = Precision::I8;
-                }
+    // Return tails to FP32 mode if the optimistic approach marked them as I8;
+    // there is no sense in doing pooling in i8, we can return to FP32 right after the convolution
+    for (auto iter : sortedLayers) {
+        // TODO(amalyshe) this handles the case when iter provides data to only one next layer;
+        // it needs to be extended to cases when it provides data to many layers
+        if (iter->precision == Precision::I8
+            && iter->outData.size() == 1) {
+            if ((iter->outData[0]->inputTo.size() == 1
+                 && iter->outData[0]->inputTo.begin()->second->precision == Precision::FP32)
+                || iter->outData[0]->inputTo.size() == 0) {
+                returnTailToFP32(iter);
             }
         }
     }
 }
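A sketch of what returning a tail amounts to, under the single-consumer assumption noted in the TODO above (the loop body is illustrative, not the actual returnTailToFP32 implementation):

    // Walk back from an I8 layer whose consumer is FP32 (or absent), flipping the
    // layer and its output data to FP32 until a convolution is reached: the
    // convolution can emit FP32 for free via its oi-scale.
    void returnTailToFP32Sketch(CNNLayer::Ptr layer) {
        while (layer && layer->precision == Precision::I8 &&
               !CaselessEq<std::string>()(layer->type, "convolution")) {
            layer->precision = Precision::FP32;
            for (auto&& out : layer->outData)
                out->setPrecision(Precision::FP32);
            if (layer->insData.empty()) break;
            layer = layer->insData[0].lock()->creatorLayer.lock();
        }
    }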
 
-void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net) {
+void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net, const CNNStatisticHelper& statHelper) {
     std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);
 
     std::vector<CNNLayer::Ptr> oScaleLayers;
@@ -841,7 +1101,7 @@ void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net) {
             for (int k = 0; k < iter->insData.size(); k++) {
                 auto prevKLayer = iter->insData[k].lock()->creatorLayer.lock();
                 if ((prevKLayer->precision != Precision::I8 && prevKLayer->precision != Precision::U8) ||
-                    prevKLayer->blobs.find("o-scale") == prevKLayer->blobs.end()) {
+                    prevKLayer->blobs.find("i-concat-scale") == prevKLayer->blobs.end()) {
                     all_inputs_are_int8 = false;
                     break;
                 }
@@ -859,14 +1119,13 @@ void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net) {
                 float* oScaleMemory = static_cast<float*>(oScale->buffer());
                 int cc = 0;
                 for (int in = 0; in < iter->insData.size(); in++) {
-                    auto prevOScale = iter->insData[in].lock()->creatorLayer.lock()->blobs["o-scale"];
+                    auto prevOScale = iter->insData[in].lock()->creatorLayer.lock()->blobs["i-concat-scale"];
                     float* prevOScaleMemory = static_cast<float*>(prevOScale->buffer());
 
                     for (int c = 0; c < prevOScale->size(); c++) {
                         oScaleMemory[cc] = prevOScaleMemory[c];
                         cc++;
                     }
-                    iter->insData[in].lock()->creatorLayer.lock()->blobs.erase("o-scale");
                 }
                 if (cc != outputChannels) THROW_IE_EXCEPTION << "Size of o-scale after " << iter->name << " isn't equal to the channels count";
 
@@ -876,7 +1135,8 @@ void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net) {
         }
 
         if (iter->blobs.find("o-scale") != iter->blobs.end()) {
-            bool canPropagate = true;
+            int int8Consumers = 0;
+            int fp32Consumers = 0;
             if (iter->outData.size() > 1) {
                 THROW_IE_EXCEPTION << "normalization algorithm for int8 found layer having o-scale and multiple ports";
             }
@@ -885,32 +1145,51 @@ void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net) {
                     if (l.second->precision == Precision::I8 || l.second->precision == Precision::U8) {
                         if (l.second->type == "Pooling" || l.second->type == "ReLU") {
                             l.second->blobs["o-scale"] = iter->blobs["o-scale"];
+                            // debug scales; needed to compare with the actual values in FP32 scoring
+                            l.second->blobs["ext-scale"] = l.second->blobs["o-scale"];
+                            int8Consumers++;
                         } else if (l.second->type == "Convolution") {
                             l.second->blobs.erase("i-scale");
-                        } else if (l.second->type == "Eltwise") {
-                            canPropagate = true;
+                            int8Consumers++;
+                        } else if (CaselessEq<std::string>()(l.second->type, "Eltwise")) {
+                            if (statHelper.getLatestInFuse(iter) != iter) {
+                                l.second->blobs["o-scale"] = iter->blobs["o-scale"];
+                            }
+                            int8Consumers++;
+                        } else if ((l.second->precision == Precision::I8) &&
+                            CaselessEq<std::string>()(l.second->type, "concat")) {
+                            // if concat is i8, we can propagate the o-scale further through the concat.
+                            // The logic around o-scale assumes that if a layer still has it after the iteration
+                            // of this loop, it must not be removed and a scale has to be placed. For concat,
+                            // however, we step one layer back and need to analyze the o-scale again, and it is
+                            // not clear whether the o-scale must be restored or existed only for the concat.
+                            // Having all of this in mind, it's better to rename o-scale to i-concat-scale
+                            iter->blobs["i-concat-scale"] = iter->blobs["o-scale"];
+                            int8Consumers++;
                         } else {
-                            canPropagate = false;
+                            fp32Consumers++;
                         }
+                    } else if (CaselessEq<std::string>()(l.second->type, "priorbox") ||
+                        CaselessEq<std::string>()(l.second->type, "priorboxclustered")) {
                     } else {
                        // we still leave the o-scale for adding a scale-shift before the FP32 layer
-                        canPropagate = false;
+                        fp32Consumers++;
                     }
                 }
 
                 if (iter->outData[0]->inputTo.empty()) {
-                    canPropagate = false;
+                    fp32Consumers++;
                 }
 
-                if (canPropagate) {
-                    if (iter->type == "Convolution") {
+                if (CaselessEq<std::string>()(iter->type, "Convolution")) {
+                    if (int8Consumers) {
                         iter->blobs["oi-scale"] = iter->blobs["o-scale"];
+                    } else {
+                        iter->outData[0]->setPrecision(Precision::FP32);
                     }
+                }
+                if (!fp32Consumers) {
                     iter->blobs.erase("o-scale");
-                } else {
-                    if (iter->type == "Convolution") {
-                        iter->blobs.erase("o-scale");
-                    }
                 }
             }
         }
@@ -922,6 +1201,11 @@ void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net) {
         if (iter->blobs.find("o-scale") != iter->blobs.end()) {
             // go over out data. if all outputs are fp32, continue this optimization
             bool canOptimize = true;
+
+            // current layer must not be convolution
+            if (CaselessEq<std::string>()(iter->type, "convolution")) {
+                canOptimize = false;
+            }
             for (auto o : iter->outData) {
                 for (auto ol : o->inputTo) {
                     if (ol.second->precision == Precision::I8) {
@@ -954,8 +1238,12 @@ void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net) {
             }
             if (eliminateOScale && curLayer) {
                 for (auto o : iter->outData) {
-                    o->precision = Precision::FP32;
+                    o->setPrecision(Precision::FP32);
                 }
+                for (auto o : curLayer->outData) {
+                    o->setPrecision(Precision::FP32);
+                }
+
                 curLayer->blobs.erase("oi-scale");
                 iter->blobs.erase("o-scale");
                 auto iLayer = iter;
@@ -1006,59 +1294,6 @@ void precisionColoring(const CNNLayerPtr layer,
     }
 }
 
-StatsMap ConvertAllStatsToMax(const ICNNNetwork &network, const StatsMap &statsMap) {
-    StatsMap newMap = statsMap;
-
-    float dummy;
-
-    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network);
-    for (auto l : sortedLayers) {
-        auto it = newMap.find(l->name);
-        if (l->type == "Pooling") {
-            // get predecessor statistic and update it for current layer
-            auto parent = l->insData[0].lock()->creatorLayer.lock();
-            auto itPStat = newMap.find(parent->name);
-            if (itPStat != newMap.end()) {
-                newMap[l->name] = itPStat->second;
-            } else if (it != newMap.end()) {
-                THROW_IE_EXCEPTION << "pool has statistic but parent does not have it. Not implemented case.";
-            }
-        } else if (it != newMap.end()) {
-            float min = FLT_MAX;
-            float max = FLT_MIN;
-
-            if (l->type == "Concat"
-                && l->outData.size() == 1 && l->outData[0]->getTensorDesc().getDims().size() == 4) {
-                size_t concatLayerIdx = 0;
-                for (int k = 0; k < l->insData.size(); k++) {
-                    auto prevKLayer = l->insData[k].lock()->creatorLayer.lock();
-                    // looking for the statistic for prevKLayer
-                    auto kLayerStat = newMap.find(prevKLayer->name);
-                    if (kLayerStat != newMap.end()) {
-                        for (size_t ikStat = 0; ikStat < kLayerStat->second->_maxOutputs.size(); ikStat++, concatLayerIdx++) {
-                            it->second->_maxOutputs[concatLayerIdx] = kLayerStat->second->_maxOutputs[ikStat];
-                            it->second->_minOutputs[concatLayerIdx] = kLayerStat->second->_minOutputs[ikStat];
-                        }
-                    } else {
-                        THROW_IE_EXCEPTION << "We have incomplete statistic for predecessors of concat layer " << l->name;
-                    }
-                }
-            } else {
-                if (!it->second->_maxOutputs.empty()) {
-                    DataStats::GetDataAbsMax(&it->second->_maxOutputs[0], it->second->_maxOutputs.size(), max);
-                    std::fill(it->second->_maxOutputs.begin(), it->second->_maxOutputs.end(), max);
-                }
-                if (!it->second->_minOutputs.empty()) {
-                    DataStats::GetDataMinMax(&it->second->_minOutputs[0], it->second->_minOutputs.size(), min, dummy);
-                    std::fill(it->second->_minOutputs.begin(), it->second->_minOutputs.end(), min);
-                }
-            }
-        }
-    }
-
-    return newMap;
-}
-
 void CNNNetworkInt8Normalizer::NormalizeNetwork(ICNNNetwork& network, ICNNNetworkStats& netStats) {
     CNNNetwork cnnn(&network);
 
@@ -1067,11 +1302,14 @@ void CNNNetworkInt8Normalizer::NormalizeNetwork(ICNNNetwork& network, ICNNNetwor
 
     // Applying int8-conversion
     StatsMap statsMap = netStats.getNodesStats();
-    statsMap = ConvertAllStatsToMax(network, statsMap);
 
-    ConvertToInt8(maxSign, maxUnsign, cnnn, statsMap);
-    PropagateScaleFactors(cnnn);
-    AddScaleShifts(cnnn);
+    CNNStatisticHelper statHelper(cnnn, statsMap, maxSign, maxUnsign);
+
+    replaceScaleShiftByDWConvolution(cnnn);
+
+    DefinesExecutionPrecision(cnnn, statHelper);
+    PropagateScaleFactors(cnnn, statHelper);
+    AddScaleShifts(cnnn, statHelper);
 #ifndef NDEBUG
     std::ofstream file("i8_normalized.dot");
     saveGraphToDot(cnnn, file, precisionColoring);
index 664959c..69e94b1 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 namespace InferenceEngine {
 namespace details {
 
+/**
+* We have raw statistics from the stat collection tool, and these statistics should be processed to get the
+* best accuracy. This transformation depends on the topology and on the parameters of the layers;
+* e.g. data going to a regular and to a depth-wise convolution would be scaled differently. In the case of a
+* regular convolution it should be scaled with a tensor-wide approach; for a depth-wise convolution it
+* should be scaled with a per-channel approach.
+* This class contains the logic for getting the scales
+*/
+class CNNStatisticHelper {
+public:
+    /**
+    * We need the topology to make decisions about scales
+    * @param network initial network to be quantized, the topology can be changed during quantization
+    * @param internalNodesStats initial statistic
+    * @param maxSign - maximal signed value to be used for calculation of scales
+    * @param maxUnsign - maximal unsigned value to be used for calculation of scales
+    *
+    */
+    CNNStatisticHelper(CNNNetwork& network,
+                       const std::map<std::string, NetworkNodeStatsPtr>& internalNodesStats,
+                       int maxSign,
+                       int maxUnsign);
+
+    /**
+    * Returns whether we can quantize a layer based on the statistics existing before and after
+    * the layer
+    */
+    bool canLayerBeQuantized(const std::string &layerName) const;
+
+    /**
+     * The topology is allowed to change; we need to modify the statistics accordingly.
+     *
+     * Currently only copying of statistics is needed
+     *
+     * @param srcName name of the layer whose statistics need to be taken
+     * @param dstName name of the layer to which the statistics will be applied
+     */
+    void copyStatistics(const std::string& srcName, const std::string& dstName);
+
+    /**
+    * Returns whether the layer produces negative data according to the collected statistics:
+    * true means that the layer produces negative values,
+    * false means that the layer produces only positive numbers
+    * @param layerName - layer of interest
+    * @param outputPort - number of the port to verify. -1 stands for verification of all outputs of the
+    * layer
+    */
+    bool hasNegativeOutput(const std::string &layerName, int outputPort = -1) const;
+
+    /**
+     * Returns input scale for layer based on statistic
+     * @return blob with scales per channel
+     */
+    InferenceEngine::Blob::Ptr getInputScale(CNNLayer::Ptr layer) const;
+
+    /**
+     * Returns output scale for layer based on statistic
+     * @return blob with scales per channel
+     */
+    InferenceEngine::Blob::Ptr getOutputScale(CNNLayer::Ptr layer) const;
+
+    /**
+     * Provides the max signed value; this is the single place for synchronization with other algorithms in
+     * the normalizer which require it
+     */
+    int getMaxSignValue() const;
+
+    /**
+     * Returns the latest layer in a fusion; the data from the returned layer will go to another layer,
+     * which means that for all layers which will be fused we have to use only the statistics from that latest layer
+     * @param layer - layer of interest
+     *
+     * @return the layer whose statistics should be used for the calculation of all scales for the layer
+     *         passed as a parameter to this method
+     */
+    CNNLayer::Ptr getLatestInFuse(CNNLayer::Ptr layer) const;
+
+private:
+    /**
+     * Calculates the scale factor according to the statistics for the layer passed to this function. No logic for
+     * selecting another layer is implemented here.
+     *
+     * @param channels redundant parameter, should be removed
+     * @param stats redundant parameter, should be removed
+     * @param maxInt - we can quantize to I8 even if the data is unsigned, so such a max number needs to be
+     *               provided explicitly
+     *
+     * @return InferenceEngine::Blob::Ptr
+     */
+    InferenceEngine::Blob::Ptr calculateScaleFactor(size_t channels,
+                                                    NetworkNodeStatsPtr stats,
+                                                    int maxInt) const;
+
+    /**
+     * Selects the latest layer in the fusion and returns its statistics
+    */
+    NetworkNodeStatsPtr getStatistic(CNNLayer::Ptr layer) const;
+
+    /**
+     * Passes over all statistics and normalizes them to a single scale per tensor, individual per-channel
+     * scales, or a mix, depending on the pattern in the network
+     */
+    void NormalizeStatistic();
+
+    CNNNetwork network_;
+    std::map<std::string, NetworkNodeStatsPtr> internalNodesStats_;
+    int maxSign_;
+    int maxUnsign_;
+};
+
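To make the scale selection above concrete, here is a minimal sketch of a per-channel scale computed from collected min/max statistics, in the spirit of calculateScaleFactor (the standalone helper below is hypothetical; the field names mirror NetworkNodeStats):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // scale[c] maps the observed dynamic range of channel c onto [-maxInt, maxInt]:
    // x_i8 = round(x_fp32 * scale[c]).
    static std::vector<float> scaleFromStats(const std::vector<float>& minOutputs,
                                             const std::vector<float>& maxOutputs,
                                             int maxInt) {
        std::vector<float> scale(maxOutputs.size());
        for (size_t c = 0; c < maxOutputs.size(); ++c) {
            float absMax = std::max(std::fabs(minOutputs[c]), std::fabs(maxOutputs[c]));
            scale[c] = absMax > 0.f ? static_cast<float>(maxInt) / absMax : 1.f;
        }
        return scale;
    }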
+/**
+ * This class normalizes and quantizes a network to the "Int8" state.
+ * The converted network will have
+ *  1) scale-shifts which normalize activation values to the int8 (S8/U8) range
+ *  2) quantized weights and biases of convolutions
+ *  3) special attributes added to layers, because the semantics of an int8 layer differ from the floating
+ *  point ones. For example, after a convolution we need to return back to denormalized values and
+ *  there should be a special scale here
+ *  4) some layers transformed to other ones. For example, if an i8-to-i8 ScaleShift is not supported
+ *  by the backend, this scale-shift will be converted to a grouped (depth-wise in the ideal case) convolution
+ *
+ *  This class strongly depends on the backend and its fusion. It assumes that fusion must be executed all
+ *  the time; we cannot split it into independent execution of two layers in int8 mode. This is
+ *  done to calculate normalization factors in the most optimal way to preserve accuracy.
+ *  Currently supported fusions:
+ *  1. Conv-ReLU
+ *  2. Conv-Sum-ReLU, which appears from the pattern
+ *  Conv        Something
+ *    \            /
+ *        Eltwise
+ *         ReLU
+ *  Here, the output from "Something" will be used as the in-place storage for the accumulation of the
+ *  convolution results. That leads to a tricky case in int8 when we have signed int8 input and
+ *  unsigned u8 output
+ *  */
 class INFERENCE_ENGINE_API_CLASS(CNNNetworkInt8Normalizer) {
 public:
     CNNNetworkInt8Normalizer() {
     }
 private:
+    /** Helper function for filling scale-shift weights for the normalization of activations */
     static void fillInScaleShift(ScaleShiftLayer* scshLayer, size_t c, float* weightsN, float* weightsD);
 
 public:
+    /** Main entry point to invoke quantization */
     void NormalizeNetwork(ICNNNetwork& network, ICNNNetworkStats& netStats);
 
 protected:
-    void AddLayerToCNNNetworkBeforeLayer(CNNLayer::Ptr newLayer, CNNLayer::Ptr successor);
+    /** Helper function to add scale-shifts and other layers for the transformation of the topology */
+    void AddLayerToCNNNetworkBeforeLayer(CNNLayer::Ptr newLayer, CNNLayer::Ptr successor, size_t port);
+    /** Helper function to add scale-shifts and other layers for the transformation of the topology */
     void AddLayerToCNNNetworkAfterData(DataPtr pData, CNNLayer::Ptr layer, const std::string& nextLayerName);
+    /**  Adds ScaleShift between two specified layers  */
+    void AddScaleShiftBetween(CNNNetwork& net, const CNNLayerPtr layer1, const CNNLayerPtr layer2, CNNStatisticHelper& statHelper);
 
 
     /**
-     * Adds ScaleShift between two specified layers
+     * Function which recalculates weights according to the input scales, quantizes weights and biases, and
+     * adds the o-scale and w-scale
+     * w-scale - multiplying the i8 convolution result by this scale produces denormalized fp32
+     * data
+     * o-scale - multiplying by this scale converts the denormalized fp32 above to i8 for the next layer
      */
-    void AddScaleShiftBetween(CNNNetwork& net, const CNNLayerPtr layer1, const CNNLayerPtr layer2);
+    void QuantizeConvolution(CNNLayer::Ptr convolution, CNNStatisticHelper& statHelper);
+
+    /**  Adds ScaleShifts everywhere */
+    void AddScaleShifts(CNNNetwork& net, CNNStatisticHelper& statHelper);
+
     /**
-     * Adds ScaleShifts everywhere
+     * Goes over all layers, marks which layers will be executed in FP32/I8, and marks the data between
+     * layers as I8/U8/FP32
      */
-    void AddScaleShifts(CNNNetwork& net);
+    void DefinesExecutionPrecision(CNNNetwork& net, CNNStatisticHelper& statHelper);
 
     /**
-     * Converts the CNNNetwork from FP32 to Int8
+     * Since o-scales exist only for convolutions, we need to propagate them down over concats and
+     * linear layers
      */
-    void ConvertToInt8(int maxSign, int maxUnsign, CNNNetwork& net, const std::map<std::string, NetworkNodeStatsPtr>& netNodesStats);
+    void PropagateScaleFactors(CNNNetwork& net, const CNNStatisticHelper& statHelper);
 
     /**
-     * Merging statistics from multiple sources.
-     * The target statistics has max[i] = max(max1[i], max2[i]) and min[i] = min(min1[i], min2[i])
+     * Normalizes and quantizes srcData, using the scales for normalization and the precision of int8blob for
+     * quantization
      */
-    NetworkNodeStatsPtr mergeNetworkNodesStats(std::vector<NetworkNodeStatsPtr> stats);
+    void ScaleDataToInt(const float* srcData, size_t srcSize, Blob::Ptr int8blob, const std::vector<float>& scales);
 
     /**
-     * Calculates a scale factor from statistics
+     * Replaces all ScaleShift layers met in the model with depth-wise convolutions with the same
+     * weights and biases.
+     *
+     * Exceptions:
+     * 1. a ScaleShift directly following an Input layer is not converted to a depth-wise convolution
+     * 2. a ScaleShift producing an output of the network
+     * 3. a ScaleShift passing data to a PriorBox
+     *
+     * This conversion allows us to avoid introducing one more i8 primitive - a ScaleShift accepting i8 input
+     * and producing i8 output
      */
-    InferenceEngine::Blob::Ptr calculateScaleFactor(const std::string& name, size_t channels, std::vector<NetworkNodeStatsPtr> stats, int maxInt);
+    void replaceScaleShiftByDWConvolution(CNNNetwork& net);
 
+    /** Helper function which creates a DW/grouped/regular convolution from the passed weights and biases */
+    CNNLayer::Ptr createDWConvolutionForScale(const std::string& layerName, size_t channels, float *weights, float *biases);
 
-    void PropagateScaleFactors(CNNNetwork& net);
-    void ScaleDataToInt(const float* srcData, size_t srcSize, Blob::Ptr int8blob, const std::vector<float>& scales);
+    /**
+    * Returns tails from I8 to FP32 back up to a convolution - this is the most performant approach because
+    * a convolution can convert to FP32 for free, while adding one more scale would decrease performance
+    */
+    void returnTailToFP32(CNNLayer::Ptr layer);
+
+    /**
+     * Verifies whether the next layer has a type which can potentially be fused with a convolution
+     * and whether the activation is supported for int8
+     * @return true if the layer does not have an activation improper for fusion
+     */
+    bool isNextFusionAllowed(CNNLayer::Ptr layer) const;
 };
 
 typedef std::shared_ptr<CNNNetworkInt8Normalizer> CNNNetworkNormalizerPtr;
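As a reading aid for the w-scale/o-scale convention documented above, a scalar sketch of the round trip for one accumulator value; the factor derivation is one consistent reading of that convention, while the authoritative logic lives in QuantizeConvolution:

    #include <cmath>
    #include <cstdint>

    // i-scale maps fp32 activations to i8 inputs, wq maps fp32 weights to i8 weights,
    // so an i8 convolution yields the fp32 result multiplied by (i_scale * wq). Hence
    // w-scale = 1 / (i_scale * wq) denormalizes the i32 accumulator back to fp32, and
    // o-scale renormalizes that fp32 value to i8 for the next layer.
    static float dequantize(int32_t acc, float wScale) { return acc * wScale; }
    static int8_t requantize(float x, float oScale) {
        float q = x * oScale;
        q = q < -128.f ? -128.f : (q > 127.f ? 127.f : q);
        return static_cast<int8_t>(std::lround(q));
    }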
index 12af66e..8e4c693 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -19,6 +18,7 @@ namespace InferenceEngine {
 
 TaskExecutor::TaskExecutor(std::string name) : _isStopped(false), _name(name) {
     _thread = std::make_shared<std::thread>([&] {
+        anotateSetThreadName(("TaskExecutor thread for " + _name).c_str());
         while (!_isStopped) {
             bool isQueueEmpty;
             Task::Ptr currentTask;
index f6aa1aa..c9afe39 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -16,6 +15,7 @@
 #include "debug.h"
 #include "cpp_interfaces/exception2status.hpp"
 #include "ie_preprocess_data.hpp"
+#include "ie_memcpy.h"
 
 namespace InferenceEngine {
 
@@ -44,7 +44,8 @@ public:
                         newPtr->getPreProcess()[i]->meanData =
                                 make_blob_with_precision(newPtr->getPreProcess()[i]->meanData->getTensorDesc());
                         newPtr->getPreProcess()[i]->meanData->allocate();
-                        memcpy(newPtr->getPreProcess()[i]->meanData->buffer(), blob->cbuffer(), blob->byteSize());
+                        ie_memcpy(newPtr->getPreProcess()[i]->meanData->buffer(), newPtr->getPreProcess()[i]->meanData->byteSize(),
+                                  blob->cbuffer(), blob->byteSize());
                     }
                 }
                 newData->inputTo.clear();
@@ -168,14 +169,15 @@ public:
     /**
      * @brief Checks and executes input data pre-processing if needed.
      */
-    void execDataPreprocessing(InferenceEngine::BlobMap& inputs) {
+    void execDataPreprocessing(InferenceEngine::BlobMap& inputs, bool serial = false) {
         for (auto &input : inputs) {
             // If there is a pre-process entry for an input then it must be pre-processed
             // using preconfigured resize algorithm.
             auto it = _preProcData.find(input.first);
             if (it != _preProcData.end()) {
                 _preProcData[input.first].execute(input.second,
-                                                  _networkInputs[input.first]->getPreProcess().getResizeAlgorithm());
+                                                  _networkInputs[input.first]->getPreProcess().getResizeAlgorithm(),
+                                                  serial);
             }
         }
     }
index 161909f..d9bee35 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -17,6 +16,7 @@
 #include "cpp_interfaces/interface/ie_iplugin_internal.hpp"
 #include "cpp_interfaces/base/ie_executable_network_base.hpp"
 #include "cpp_interfaces/impl/ie_executable_network_internal.hpp"
+#include "ie_memcpy.h"
 
 namespace InferenceEngine {
 
@@ -83,7 +83,8 @@ public:
                         newPtr->getPreProcess()[i]->meanData =
                                 make_blob_with_precision(newPtr->getPreProcess()[i]->meanData->getTensorDesc());
                         newPtr->getPreProcess()[i]->meanData->allocate();
-                        memcpy(newPtr->getPreProcess()[i]->meanData->buffer(), blob->cbuffer(), blob->byteSize());
+                        ie_memcpy(newPtr->getPreProcess()[i]->meanData->buffer(), newPtr->getPreProcess()[i]->meanData->byteSize(),
+                                  blob->cbuffer(), blob->byteSize());
                     }
                 }
                 newData->inputTo.clear();
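The switch from memcpy to ie_memcpy in both hunks above adds a destination-size check; a minimal model of that guard, under the assumption that the signature matches the call sites (dst, dst_size, src, src_size):

    #include <cstring>

    // Copy only when the destination can hold the payload; report failure otherwise.
    static int ie_memcpy_model(void* dst, size_t dstSize, const void* src, size_t srcSize) {
        if (dst == nullptr || src == nullptr || srcSize > dstSize) return -1;
        std::memcpy(dst, src, srcSize);
        return 0;
    }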
diff --git a/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp b/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp
new file mode 100644 (file)
index 0000000..ea37235
--- /dev/null
@@ -0,0 +1,1817 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ie_preprocess_gapi_kernels.hpp"
+#include "ie_preprocess_gapi_kernels_impl.hpp"
+#include "ie_preprocess_gapi_kernels_sse42.hpp"
+
+// NB: include this before opencv_hal_sse.hpp
+#include "nmmintrin.h"
+
+// NB: define these before opencv_hal_sse.hpp
+namespace cv {
+namespace hal {
+
+enum StoreMode {
+    STORE_UNALIGNED = 0,
+    STORE_ALIGNED = 1,
+    STORE_ALIGNED_NOCACHE = 2
+};
+
+}  // namespace hal
+}  // namespace cv
+
+// NB: define these before opencv_hal_sse.hpp
+#define OPENCV_HAL_ADD(a, b) ((a) + (b))
+#define OPENCV_HAL_AND(a, b) ((a) & (b))
+#define OPENCV_HAL_NOP(a) (a)
+#define OPENCV_HAL_1ST(a, b) (a)
+
+// NB: define these before opencv_hal_sse.hpp
+#ifdef CV_SSE4_2
+  #undef CV_SSE4_2
+  #undef CV_SSE4_1
+  #undef CV_SSSE3
+  #undef CV_SSE3
+  #undef CV_SSE2
+  #undef CV_SSE
+#endif
+#define CV_SSE4_2 1
+#define CV_SSE4_1 1
+#define CV_SSSE3  1
+#define CV_SSE3   1
+#define CV_SSE2   1
+#define CV_SSE    1
+#define CV_CPU_HAS_SUPPORT_SSE2 1
+#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN  // empty
+#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+// OpenCV universal intrinsic
+#include "opencv_hal_sse.hpp"
+
+// AFTER "opencv_hal_sse.hpp"
+// (CV_SIMD128 defined there)
+#if   !CV_SIMD128
+#error CV_SIMD128 is required!
+#endif
+
+#include <cstring>
+
+using namespace cv;
+
+namespace InferenceEngine {
+namespace gapi {
+namespace kernels {
+
+//----------------------------------------------------------------------
+
+#if CV_SSE
+static inline void v_deinterleave(const v_float32x4& low, const v_float32x4& high,
+                                        v_float32x4& even,      v_float32x4& odd) {
+    __m128 tmp0 = _mm_unpacklo_ps(low.val, high.val);
+    __m128 tmp1 = _mm_unpackhi_ps(low.val, high.val);
+    even.val = _mm_unpacklo_ps(tmp0, tmp1);
+    odd .val = _mm_unpackhi_ps(tmp0, tmp1);
+}
+#endif
+
+#if CV_SSE2
+static inline void v_deinterleave(const v_uint8x16& i0, const v_uint8x16& i1,
+                                  const v_uint8x16& i2, const v_uint8x16& i3,
+                                        v_uint8x16& o0,       v_uint8x16& o1,
+                                        v_uint8x16& o2,       v_uint8x16& o3) {
+    __m128i u0 = i0.val;                     // a0 b0 c0 d0 a1 b1 c1 d1 ...
+    __m128i u1 = i1.val;                     // a4 b4 c4 d4 ...
+    __m128i u2 = i2.val;                     // a8 b8 c8 d8 ...
+    __m128i u3 = i3.val;                     // a12 b12 c12 d12 ...
+
+    __m128i v0 = _mm_unpacklo_epi8(u0, u2);  // a0 a8 b0 b8 ...
+    __m128i v1 = _mm_unpackhi_epi8(u0, u2);  // a2 a10 b2 b10 ...
+    __m128i v2 = _mm_unpacklo_epi8(u1, u3);  // a4 a12 b4 b12 ...
+    __m128i v3 = _mm_unpackhi_epi8(u1, u3);  // a6 a14 b6 b14 ...
+
+    u0 = _mm_unpacklo_epi8(v0, v2);          // a0 a4 a8 a12 ...
+    u1 = _mm_unpacklo_epi8(v1, v3);          // a2 a6 a10 a14 ...
+    u2 = _mm_unpackhi_epi8(v0, v2);          // a1 a5 a9 a13 ...
+    u3 = _mm_unpackhi_epi8(v1, v3);          // a3 a7 a11 a15 ...
+
+    v0 = _mm_unpacklo_epi8(u0, u1);          // a0 a2 a4 a6 ...
+    v1 = _mm_unpacklo_epi8(u2, u3);          // a1 a3 a5 a7 ...
+    v2 = _mm_unpackhi_epi8(u0, u1);          // c0 c2 c4 c6 ...
+    v3 = _mm_unpackhi_epi8(u2, u3);          // c1 c3 c5 c7 ...
+
+    o0.val = _mm_unpacklo_epi8(v0, v1);      // a0 a1 a2 a3 ...
+    o1.val = _mm_unpackhi_epi8(v0, v1);      // b0 b1 b2 b3 ...
+    o2.val = _mm_unpacklo_epi8(v2, v3);      // c0 c1 c2 c3 ...
+    o3.val = _mm_unpackhi_epi8(v2, v3);      // d0 d1 d2 d3 ...
+}
+
+static inline v_uint8x16 v_interleave_low(const v_uint8x16& a, const v_uint8x16& b) {
+    return v_uint8x16(_mm_unpacklo_epi8(a.val, b.val));
+}
+
+static inline v_uint8x16 v_interleave_high(const v_uint8x16& a, const v_uint8x16& b) {
+    return v_uint8x16(_mm_unpackhi_epi8(a.val, b.val));
+}
+
+static inline v_int16x8 v_interleave_low(const v_int16x8& a, const v_int16x8& b) {
+    return v_int16x8(_mm_unpacklo_epi16(a.val, b.val));
+}
+
+static inline v_int16x8 v_interleave_high(const v_int16x8& a, const v_int16x8& b) {
+    return v_int16x8(_mm_unpackhi_epi16(a.val, b.val));
+}
+
+static inline v_uint16x8 v_expand_low(const v_uint8x16& a) {
+    return v_uint16x8(_mm_unpacklo_epi8(a.val, _mm_setzero_si128()));
+}
+
+static inline v_uint16x8 v_expand_high(const v_uint8x16& a) {
+    return v_uint16x8(_mm_unpackhi_epi8(a.val, _mm_setzero_si128()));
+}
+
+static inline v_uint8x16 v_saturate_u8(const v_int16x8& a) {
+    v_uint8x16 r;
+    r.val = _mm_packus_epi16(a.val, _mm_setzero_si128());
+    return r;
+}
+
+static inline v_int16x8 v_saturate_s16(const v_int32x4& a) {
+    v_int16x8 r;
+    r.val = _mm_packs_epi32(a.val, _mm_setzero_si128());
+    return r;
+}
+
+// for each j=index[k], load two chars src[j] and src[j+1]
+static inline v_uint8x16 v_gather_pairs(const uchar src[], const v_int16x8& index) {
+    v_uint8x16 r;
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 0)]), 0);
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 1)]), 1);
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 2)]), 2);
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 3)]), 3);
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 4)]), 4);
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 5)]), 5);
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 6)]), 6);
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const ushort*>(&src[_mm_extract_epi16(index.val, 7)]), 7);
+    return r;
+}
+
+static inline v_int16x8 v_gather_chan(const uchar src[], const v_int16x8& index, int channel, int pos) {
+    constexpr const int chanNum = 3;
+    v_int16x8 r;
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 0) + pos) + channel]), 0);
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 1) + pos) + channel]), 1);
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 2) + pos) + channel]), 2);
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 3) + pos) + channel]), 3);
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 4) + pos) + channel]), 4);
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 5) + pos) + channel]), 5);
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 6) + pos) + channel]), 6);
+    r.val = _mm_insert_epi16(r.val, *reinterpret_cast<const uchar*>(&src[chanNum*(_mm_extract_epi16(index.val, 7) + pos) + channel]), 7);
+    return r;
+}
+
+static inline void v_gather_pairs(const float src[], const v_int32x4& index,
+                                  v_float32x4& low, v_float32x4& high) {
+    int i[4];
+    v_store(i, index);
+
+    __m128 l = _mm_setzero_ps();
+    l = _mm_loadl_pi(l, (const __m64*)&src[i[0]]);  // pair of floats
+    l = _mm_loadh_pi(l, (const __m64*)&src[i[1]]);
+    low.val = l;
+
+    __m128 h = _mm_setzero_ps();
+    h = _mm_loadl_pi(h, (const __m64*)&src[i[2]]);
+    h = _mm_loadh_pi(h, (const __m64*)&src[i[3]]);
+    high.val = h;
+}
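The pair gathers above implement, in SIMD form, the following scalar access pattern (a reference model only, not part of the patch):

    #include <cstdint>

    // For each k, fetch the adjacent pixel pair (src[j], src[j+1]) with j = index[k];
    // the horizontal resize pass blends each pair with its alpha weight.
    static void gather_pairs_scalar(const uint8_t src[], const short index[], int n,
                                    uint8_t left[], uint8_t right[]) {
        for (int k = 0; k < n; ++k) {
            left[k]  = src[index[k]];
            right[k] = src[index[k] + 1];
        }
    }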
+
+static inline v_int32x4 v_madd(const v_int16x8& a, const v_int16x8& b) {
+    v_int32x4 r;
+    r.val = _mm_madd_epi16(a.val, b.val);
+    return r;
+}
+
+static inline v_int16x8 v_mulhi(const v_int16x8& a, short b) {
+    v_int16x8 r;
+    r.val = _mm_mulhi_epi16(a.val, _mm_set1_epi16(b));
+    return r;
+}
+
+static inline v_uint16x8 v_mulhi(const v_uint16x8& a, v_uint16x8 b) {
+    v_uint16x8 r;
+    r.val = _mm_mulhi_epu16(a.val, b.val);
+    return r;
+}
+
+static inline v_uint16x8 v_mulhi(const v_uint16x8& a, uint16_t b) {
+    v_uint16x8 r;
+    r.val = _mm_mulhi_epu16(a.val, _mm_set1_epi16(b));
+    return r;
+}
+
+static inline v_int16x8 v_mulhrs(const v_int16x8& a, const v_int16x8& b) {
+    v_int16x8 r;
+    r.val = _mm_mulhrs_epi16(a.val, b.val);
+    return r;
+}
+
+static inline v_int16x8 v_mulhrs(const v_int16x8& a, short b) {
+    return v_mulhrs(a, v_setall_s16(b));
+}
+#endif  // SSE2
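The v_mulhrs wrappers above expose _mm_mulhrs_epi16 (PMULHRSW); for reference, a scalar model of that rounding multiply, which the resize kernels below rely on:

    #include <cstdint>

    // Q15 multiply with rounding: r = (a*b + 0x4000) >> 15, computed in 32 bits.
    // This is what makes t = v_mulhrs(s0 - s1, beta) + s1 a fixed-point blend.
    static inline int16_t mulhrs_scalar(int16_t a, int16_t b) {
        int32_t p = static_cast<int32_t>(a) * static_cast<int32_t>(b);
        return static_cast<int16_t>((p + 0x4000) >> 15);
    }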
+
+#ifdef CV_SSE3
+static inline void v_deinterleave_expand(const v_uint8x16& src, v_int16x8& even, v_int16x8& odd) {
+    static const __m128i mask_even = _mm_setr_epi8(0, -1, 2, -1, 4, -1, 6, -1, 8, -1, 10, -1, 12, -1, 14, -1);
+    static const __m128i mask_odd  = _mm_setr_epi8(1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1);
+    even.val = _mm_shuffle_epi8(src.val, mask_even);
+    odd .val = _mm_shuffle_epi8(src.val, mask_odd);
+}
+#endif
+
+static inline v_float32x4 v_fma(const v_float32x4& a, float b, const v_float32x4& c) {
+    return v_fma(a, v_setall_f32(b), c);
+}
+
+static inline v_int16x8 operator+ (const v_int16x8& a, short b) {
+    return a + v_setall_s16(b);
+}
+
+static inline v_int16x8 operator- (short a, const v_int16x8& b) {
+    return v_setall_s16(a) - b;
+}
+
+static inline v_float32x4 operator- (float a, const v_float32x4& b) {
+    return v_setall_f32(a) - b;
+}
+
+static inline v_float32x4 operator* (const v_float32x4& a, float b) {
+    return a * v_setall_f32(b);
+}
+
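Before the SIMD variants, a plain float reference of the same two-pass bi-linear scheme may help: a vertical blend of two source rows followed by a horizontal blend of pixel pairs selected by mapsx. Parameter names follow calcRowLinear_8U below; this reference function itself is not part of the patch:

    #include <cstdint>

    static void calcRowLinearRef(uint8_t dst[], const uint8_t src0[], const uint8_t src1[],
                                 const float alpha[],   // horizontal weight per dst pixel
                                 const short mapsx[],   // left source index per dst pixel
                                 float beta,            // vertical weight for src0
                                 float tmp[],           // scratch row, inWidth elements
                                 int inWidth, int outWidth) {
        for (int w = 0; w < inWidth; w++)      // vertical pass
            tmp[w] = beta * src0[w] + (1.f - beta) * src1[w];
        for (int x = 0; x < outWidth; x++) {   // horizontal pass
            float a = alpha[x];
            float v = a * tmp[mapsx[x]] + (1.f - a) * tmp[mapsx[x] + 1];
            dst[x] = static_cast<uint8_t>(v + 0.5f);
        }
    }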
+//------------------------------------------------------------------------------
+
+// Resize (bi-linear, 8U)
+void calcRowLinear_8U(uint8_t *dst[],
+                const uint8_t *src0[],
+                const uint8_t *src1[],
+                const short    alpha[],
+                const short    clone[],  // 4 clones of alpha
+                const short    mapsx[],
+                const short    beta[],
+                      uint8_t  tmp[],
+                const Size   & inSz,
+                const Size   & outSz,
+                      int      lpi) {
+    bool xRatioEq1 = inSz.width  == outSz.width;
+    bool yRatioEq1 = inSz.height == outSz.height;
+
+    if (!xRatioEq1 && !yRatioEq1) {
+        if (4 == lpi) {
+            // vertical pass
+            GAPI_DbgAssert(inSz.width >= 8);
+
+            __m128i b0 = _mm_set1_epi16(beta[0]);
+            __m128i b1 = _mm_set1_epi16(beta[1]);
+            __m128i b2 = _mm_set1_epi16(beta[2]);
+            __m128i b3 = _mm_set1_epi16(beta[3]);
+
+            for (int w = 0; w < inSz.width; ) {
+                for (; w <= inSz.width - 8; w += 8) {
+                #if USE_CVKL
+                    //--------------------------------------------
+                    // reworked from: ie_preprocess_data_sse42.cpp
+                    //      function: resize_bilinear_u8
+                    //         label: vertical_pass
+                    //--------------------------------------------
+
+                    __m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[0][w])),
+                                                                     *reinterpret_cast<const int64_t*>(&src0[1][w]), 1);
+                    __m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[2][w])),
+                                                                     *reinterpret_cast<const int64_t*>(&src0[3][w]), 1);
+                    __m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[0][w])),
+                                                                     *reinterpret_cast<const int64_t*>(&src1[1][w]), 1);
+                    __m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[2][w])),
+                                                                     *reinterpret_cast<const int64_t*>(&src1[3][w]), 1);
+
+                    __m128i val0_0 = _mm_cvtepu8_epi16(val0lo);
+                    __m128i val0_2 = _mm_cvtepu8_epi16(val0hi);
+                    __m128i val1_0 = _mm_cvtepu8_epi16(val1lo);
+                    __m128i val1_2 = _mm_cvtepu8_epi16(val1hi);
+
+                    __m128i val0_1 = _mm_unpackhi_epi8(val0lo, _mm_setzero_si128());
+                    __m128i val0_3 = _mm_unpackhi_epi8(val0hi, _mm_setzero_si128());
+                    __m128i val1_1 = _mm_unpackhi_epi8(val1lo, _mm_setzero_si128());
+                    __m128i val1_3 = _mm_unpackhi_epi8(val1hi, _mm_setzero_si128());
+
+                    __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), b0);
+                    __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), b1);
+                    __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), b2);
+                    __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), b3);
+
+                    __m128i r0 = _mm_add_epi16(val1_0, t0);
+                    __m128i r1 = _mm_add_epi16(val1_1, t1);
+                    __m128i r2 = _mm_add_epi16(val1_2, t2);
+                    __m128i r3 = _mm_add_epi16(val1_3, t3);
+
+                    __m128i q0 = _mm_packus_epi16(r0, r1);
+                    __m128i q1 = _mm_packus_epi16(r2, r3);
+
+                    __m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/);
+                    __m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/);
+
+                    __m128i q4 = _mm_shuffle_epi8(q2, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
+                    __m128i q5 = _mm_shuffle_epi8(q3, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
+
+                    _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w +  0]), q4);
+                    _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w + 16]), q5);
+
+                #else
+                    // let: t[i] = src0[i][w]*beta0[i] + src1[i][w]*beta1[i]
+                    // here: beta0[i] = beta[i], beta1[i] = 1 - beta0[i]
+                    v_int16x8 t0, t1, t2, t3;
+                    {
+                        v_int16x8 s0, s1;
+
+                        s0 = v_reinterpret_as_s16(v_load_expand(&src0[0][w]));
+                        s1 = v_reinterpret_as_s16(v_load_expand(&src1[0][w]));
+                        t0 = v_mulhrs(s0 - s1, beta[0]) + s1;
+
+                        s0 = v_reinterpret_as_s16(v_load_expand(&src0[1][w]));
+                        s1 = v_reinterpret_as_s16(v_load_expand(&src1[1][w]));
+                        t1 = v_mulhrs(s0 - s1, beta[1]) + s1;
+
+                        s0 = v_reinterpret_as_s16(v_load_expand(&src0[2][w]));
+                        s1 = v_reinterpret_as_s16(v_load_expand(&src1[2][w]));
+                        t2 = v_mulhrs(s0 - s1, beta[2]) + s1;
+
+                        s0 = v_reinterpret_as_s16(v_load_expand(&src0[3][w]));
+                        s1 = v_reinterpret_as_s16(v_load_expand(&src1[3][w]));
+                        t3 = v_mulhrs(s0 - s1, beta[3]) + s1;
+                    }
+                    // store as groups of 4 pixels: each group to have a pixel per row
+                    {
+                        v_uint8x16 a0, a1, a2, a3;
+                        a0 = v_pack_u(t0, v_setall_s16(0));
+                        a1 = v_pack_u(t1, v_setall_s16(0));
+                        a2 = v_pack_u(t2, v_setall_s16(0));
+                        a3 = v_pack_u(t3, v_setall_s16(0));
+
+                        v_int16x8 b0, b1;
+                        b0 = v_reinterpret_as_s16(v_interleave_low(a0, a1));  // 0th, 1st
+                        b1 = v_reinterpret_as_s16(v_interleave_low(a2, a3));  // 2nd, 3rd
+
+                        v_uint8x16 d0, d1;
+                        d0 = v_reinterpret_as_u8(v_interleave_low(b0,  b1));
+                        d1 = v_reinterpret_as_u8(v_interleave_high(b0, b1));
+
+                        v_store(&tmp[4*w +  0], d0);
+                        v_store(&tmp[4*w + 16], d1);
+                    }
+                #endif
+                }
+
+                if (w < inSz.width) {
+                    w = inSz.width - 8;
+                }
+            }
+
+            // horizontal pass
+            GAPI_DbgAssert(outSz.width >= 8);
+            for (int x = 0; x < outSz.width; ) {
+                for (; x <= outSz.width - 8; x += 8) {
+                #if USE_CVKL
+                    //--------------------------------------------
+                    // reworked from: ie_preprocess_data_sse42.cpp
+                    //      function: resize_bilinear_u8
+                    //         label: horizontal_pass
+                    //--------------------------------------------
+
+                #if 1
+                    __m128i a10 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 *  x]));
+                    __m128i a32 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 2)]));
+                    __m128i a54 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 4)]));
+                    __m128i a76 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 6)]));
+                #else
+                    // provided alpha[x..x+7] = { a0, a1, a2, a3, a4, a5, a6, a7},
+                    // clone each a[i] 4 times - one item per each of LPI rows,
+                    // so that a10 = {a0, a0, a0, a0, a1, a1, a1, a1}, etc.
+                    __m128i a10, a32, a54, a76;
+                    __m128i alpha0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&alpha[x]));
+                    a10 = _mm_unpacklo_epi16(alpha0, alpha0);  // {a0, a0, a1, a1, a2, a2, a3, a3}
+                    a32 = _mm_unpackhi_epi16(a10, a10);        // {a2, a2, a2, a2, a3, a3, a3, a3}
+                    a10 = _mm_unpacklo_epi16(a10, a10);        // {a0, a0, a0, a0, a1, a1, a1, a1}
+                    a54 = _mm_unpackhi_epi16(alpha0, alpha0);  // {a4, a4, a5, a5, a6, a6, a7, a7}
+                    a76 = _mm_unpackhi_epi16(a54, a54);        // {a6, a6, a6, a6, a7, a7, a7, a7}
+                    a54 = _mm_unpacklo_epi16(a54, a54);        // {a4, a4, a4, a4, a5, a5, a5, a5}
+                #endif
+
+                    __m128d val0d, val1d, val2d, val3d;
+                    val0d = _mm_load_sd(/****/  reinterpret_cast<double*>(&tmp[4 * mapsx[x + 0]]));
+                    val0d = _mm_loadh_pd(val0d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 1]]));
+                    val1d = _mm_load_sd(/****/  reinterpret_cast<double*>(&tmp[4 * mapsx[x + 2]]));
+                    val1d = _mm_loadh_pd(val1d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 3]]));
+                    val2d = _mm_load_sd(/****/  reinterpret_cast<double*>(&tmp[4 * mapsx[x + 4]]));
+                    val2d = _mm_loadh_pd(val2d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 5]]));
+                    val3d = _mm_load_sd(/****/  reinterpret_cast<double*>(&tmp[4 * mapsx[x + 6]]));
+                    val3d = _mm_loadh_pd(val3d, reinterpret_cast<double*>(&tmp[4 * mapsx[x + 7]]));
+
+                    __m128i val_0 = _mm_castpd_si128(val0d);
+                    __m128i val_1 = _mm_castpd_si128(val1d);
+                    __m128i val_2 = _mm_castpd_si128(val2d);
+                    __m128i val_3 = _mm_castpd_si128(val3d);
+
+                    val_0 = _mm_shuffle_epi32(val_0, _MM_SHUFFLE(3, 1, 2, 0));
+                    val_1 = _mm_shuffle_epi32(val_1, _MM_SHUFFLE(3, 1, 2, 0));
+                    val_2 = _mm_shuffle_epi32(val_2, _MM_SHUFFLE(3, 1, 2, 0));
+                    val_3 = _mm_shuffle_epi32(val_3, _MM_SHUFFLE(3, 1, 2, 0));
+
+                    __m128i val0_0 = _mm_cvtepu8_epi16(val_0);
+                    __m128i val0_1 = _mm_cvtepu8_epi16(val_1);
+                    __m128i val0_2 = _mm_cvtepu8_epi16(val_2);
+                    __m128i val0_3 = _mm_cvtepu8_epi16(val_3);
+
+                    __m128i val1_0 = _mm_unpackhi_epi8(val_0, _mm_setzero_si128());
+                    __m128i val1_1 = _mm_unpackhi_epi8(val_1, _mm_setzero_si128());
+                    __m128i val1_2 = _mm_unpackhi_epi8(val_2, _mm_setzero_si128());
+                    __m128i val1_3 = _mm_unpackhi_epi8(val_3, _mm_setzero_si128());
+
+                    __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), a10);
+                    __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), a32);
+                    __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), a54);
+                    __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), a76);
+
+                    __m128i r0 = _mm_add_epi16(val1_0, t0);
+                    __m128i r1 = _mm_add_epi16(val1_1, t1);
+                    __m128i r2 = _mm_add_epi16(val1_2, t2);
+                    __m128i r3 = _mm_add_epi16(val1_3, t3);
+
+                    __m128i q0 = _mm_packus_epi16(r0, r1);
+                    __m128i q1 = _mm_packus_epi16(r2, r3);
+
+                    __m128i q2 = _mm_shuffle_epi8(q0, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
+                    __m128i q3 = _mm_shuffle_epi8(q1, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
+
+                    __m128i q4 = _mm_blend_epi16(q2, _mm_slli_si128(q3, 4), 0xCC /*0b11001100*/);
+                    __m128i q5 = _mm_blend_epi16(_mm_srli_si128(q2, 4), q3, 0xCC /*0b11001100*/);
+
+                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[0][x]),                q4);
+                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[1][x]), _mm_srli_si128(q4, 8));
+                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[2][x]),                q5);
+                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[3][x]), _mm_srli_si128(q5, 8));
+
+                #else
+                    // let: t be 2 pairs of groups of 4 pixels (each group is for 4 dst rows)
+                    // each pair of groups corresponds to pixels indexed as sx0 and sx1=sx0+1
+                    // so: low part of t0 is 2x4 pixels corresponding to sx0=mapsx[x+0], etc.
+                    v_uint8x16 t0, t1, t2, t3;
+                    {
+                        t0.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 0]])),
+                                                                 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 1]]), 1);
+                        t1.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 2]])),
+                                                                 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 3]]), 1);
+                        t2.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 4]])),
+                                                                 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 5]]), 1);
+                        t3.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 6]])),
+                                                                 *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 7]]), 1);
+                    }
+
+                    // let: r0 be pixels for 0th row, etc
+                    v_uint8x16 r0, r1, r2, r3;
+                    v_deinterleave(t0, t1, t2, t3, r0, r1, r2, r3);
+
+                    // let: dl be resulting 8 pixels for l'th row
+                    //      dl = alpha0*s0l + alpha1*s1l
+                    // note that alpha0 + alpha1 = 1
+                    {
+                        v_int16x8 s0, s1, d, alpha0;
+
+                        alpha0 = v_load(&alpha[x]);  // 8 coefficients
+
+                        v_deinterleave_expand(r0, s0, s1);
+                        d = v_mulhrs(s0 - s1, alpha0) + s1;
+                        v_pack_u_store(&dst[0][x], d);
+
+                        v_deinterleave_expand(r1, s0, s1);
+                        d = v_mulhrs(s0 - s1, alpha0) + s1;
+                        v_pack_u_store(&dst[1][x], d);
+
+                        v_deinterleave_expand(r2, s0, s1);
+                        d = v_mulhrs(s0 - s1, alpha0) + s1;
+                        v_pack_u_store(&dst[2][x], d);
+
+                        v_deinterleave_expand(r3, s0, s1);
+                        d = v_mulhrs(s0 - s1, alpha0) + s1;
+                        v_pack_u_store(&dst[3][x], d);
+                    }
+                #endif
+                }
+
+                if (x < outSz.width) {
+                    x = outSz.width - 8;
+                }
+            }
+
+        } else {  // any LPI
+            for (int l = 0; l < lpi; l++) {
+                short beta0 =                            beta[l];
+            //  short beta1 = saturate_cast<short>(ONE - beta[l]);
+
+                // vertical pass
+                GAPI_DbgAssert(inSz.width >= 8);
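+                // interpolate rows as t = beta0*(s0 - s1) + s1, which equals
+                // beta0*s0 + beta1*s1 since beta0 + beta1 == 1 (v_mulhrs is a
+                // rounding fixed-point multiply)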
+                for (int w = 0; w < inSz.width; ) {
+                    for (; w <= inSz.width - 8; w += 8) {
+                        v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(&src0[l][w]));
+                        v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(&src1[l][w]));
+                        v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1;
+                        v_pack_u_store(tmp + w, t);
+                    }
+
+                    if (w < inSz.width) {
+                        w = inSz.width - 8;
+                    }
+                }
+
+                // horizontal pass
+                GAPI_DbgAssert(outSz.width >= 8);
+                for (int x = 0; x < outSz.width; ) {
+                    for (; x <= outSz.width - 8; x += 8) {
+                        v_int16x8 a0 = v_load(&alpha[x]);        // as signed Q1.1.14
+                        v_int16x8 sx = v_load(&mapsx[x]);        // as integer (int16)
+                        v_uint8x16 t = v_gather_pairs(tmp, sx);  // 8 pairs of src0 pixels
+                        v_int16x8 t0, t1;
+                        v_deinterleave_expand(t, t0, t1);        // tmp pixels as int16
+                        v_int16x8 d = v_mulhrs(t0 - t1, a0) + t1;
+                        v_pack_u_store(&dst[l][x], d);
+                    }
+
+                    if (x < outSz.width) {
+                        x = outSz.width - 8;
+                    }
+                }
+            }
+        }  // if lpi == 4
+
+    } else if (!xRatioEq1) {
+        GAPI_DbgAssert(yRatioEq1);
+
+        if (4 == lpi) {
+            // vertical pass
+            GAPI_DbgAssert(inSz.width >= 16);
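+            // y ratio == 1 here, so the "vertical pass" only interleaves the
+            // 4 input rows into tmp (4 bytes per pixel, one per row); the
+            // horizontal pass below then fetches an sx/sx+1 pair across all
+            // 4 rows with a single 64-bit load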
+            for (int w = 0; w < inSz.width; ) {
+                for (; w <= inSz.width - 16; w += 16) {
+                    v_uint8x16 s0, s1, s2, s3;
+                    s0 = v_load(&src0[0][w]);
+                    s1 = v_load(&src0[1][w]);
+                    s2 = v_load(&src0[2][w]);
+                    s3 = v_load(&src0[3][w]);
+                    v_store_interleave(&tmp[4*w], s0, s1, s2, s3);
+                }
+
+                if (w < inSz.width) {
+                    w = inSz.width - 16;
+                }
+            }
+
+            // horizontal pass
+            GAPI_DbgAssert(outSz.width >= 8);
+            for (int x = 0; x < outSz.width; ) {
+                for (; x <= outSz.width - 8; x += 8) {
+                    v_uint8x16 t0, t1, t2, t3;
+                    t0.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 0]])),
+                                                             *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 1]]), 1);
+                    t1.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 2]])),
+                                                             *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 3]]), 1);
+                    t2.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 4]])),
+                                                             *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 5]]), 1);
+                    t3.val = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<__m128i*>(&tmp[4 * mapsx[x + 6]])),
+                                                             *reinterpret_cast<int64_t*>(&tmp[4 * mapsx[x + 7]]), 1);
+
+                    v_uint8x16 r0, r1, r2, r3;
+                    v_deinterleave(t0, t1, t2, t3, r0, r1, r2, r3);
+
+                    v_int16x8 s0, s1, d, alpha0;
+
+                    alpha0 = v_load(&alpha[x]);  // 8 coefficients
+
+                    v_deinterleave_expand(r0, s0, s1);
+                    d = v_mulhrs(s0 - s1, alpha0) + s1;
+                    v_pack_u_store(&dst[0][x], d);
+
+                    v_deinterleave_expand(r1, s0, s1);
+                    d = v_mulhrs(s0 - s1, alpha0) + s1;
+                    v_pack_u_store(&dst[1][x], d);
+
+                    v_deinterleave_expand(r2, s0, s1);
+                    d = v_mulhrs(s0 - s1, alpha0) + s1;
+                    v_pack_u_store(&dst[2][x], d);
+
+                    v_deinterleave_expand(r3, s0, s1);
+                    d = v_mulhrs(s0 - s1, alpha0) + s1;
+                    v_pack_u_store(&dst[3][x], d);
+                }
+
+                if (x < outSz.width) {
+                    x = outSz.width - 8;
+                }
+            }
+
+        } else {  // any LPI
+            for (int l = 0; l < lpi; l++) {
+                const uchar *src = src0[l];
+
+                // horizontal pass
+                GAPI_DbgAssert(outSz.width >= 8);
+                for (int x = 0; x < outSz.width; ) {
+                    for (; x <= outSz.width - 8; x += 8) {
+                        v_int16x8 a0 = v_load(&alpha[x]);        // as signed Q1.1.14
+                        v_int16x8 sx = v_load(&mapsx[x]);        // as integer (int16)
+                        v_uint8x16 t = v_gather_pairs(src, sx);  // 8 pairs of src0 pixels
+                        v_int16x8 t0, t1;
+                        v_deinterleave_expand(t, t0, t1);        // tmp pixels as int16
+                        v_int16x8 d = v_mulhrs(t0 - t1, a0) + t1;
+                        v_pack_u_store(&dst[l][x], d);
+                    }
+
+                    if (x < outSz.width) {
+                        x = outSz.width - 8;
+                    }
+                }
+            }
+        }
+
+    } else if (!yRatioEq1) {
+        GAPI_DbgAssert(xRatioEq1);
+        int length = inSz.width;  // == outSz.width
+
+        for (int l = 0; l < lpi; l++) {
+            short beta0 =                            beta[l];
+        //  short beta1 = saturate_cast<short>(ONE - beta[l]);
+
+            // vertical pass
+            GAPI_DbgAssert(inSz.width >= 8);
+            for (int w = 0; w < length; ) {
+                for (; w <= length - 8; w += 8) {
+                    v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(src0[l] + w));
+                    v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(src1[l] + w));
+                    v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1;
+                    v_pack_u_store(dst[l] + w, t);
+                }
+
+                if (w < length) {
+                    w = length - 8;
+                }
+            }
+        }
+
+    } else {
+        GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
+        int length = inSz.width;  // == outSz.width
+
+        for (int l = 0; l < lpi; l++) {
+            memcpy(dst[l], src0[l], length);
+        }
+    }
+}
+
+// Resize (bi-linear, 8UC3)
+void calcRowLinear_8UC3(std::array<std::array<uint8_t*, 4>, 3> &dst,
+                  const uint8_t *src0[],
+                  const uint8_t *src1[],
+                  const short    alpha[],
+                  const short    clone[],  // 4 clones of alpha
+                  const short    mapsx[],
+                  const short    beta[],
+                        uint8_t  tmp[],
+                  const Size    &inSz,
+                  const Size    &outSz,
+                        int      lpi) {
+    constexpr const int chanNum = 3;
+
+    if (4 == lpi) {
+        // vertical pass
+        GAPI_DbgAssert(inSz.width >= 8);
+
+        __m128i b0 = _mm_set1_epi16(beta[0]);
+        __m128i b1 = _mm_set1_epi16(beta[1]);
+        __m128i b2 = _mm_set1_epi16(beta[2]);
+        __m128i b3 = _mm_set1_epi16(beta[3]);
+
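+        // the vertical pass below writes its result to tmp with the 4 output
+        // rows interleaved per pixel, so the horizontal pass can gather a
+        // 4-row group for any source pixel with one 32-bit load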
+        for (int w = 0; w < inSz.width*chanNum; ) {
+            for (; w <= inSz.width*chanNum - 8; w += 8) {
+                //--------------------------------------------
+                // reworked from: ie_preprocess_data_sse42.cpp
+                //      function: resize_bilinear_u8
+                //         label: vertical_pass
+                //--------------------------------------------
+
+                __m128i val0lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[0][w])),
+                        *reinterpret_cast<const int64_t*>(&src0[1][w]), 1);
+                __m128i val0hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src0[2][w])),
+                        *reinterpret_cast<const int64_t*>(&src0[3][w]), 1);
+                __m128i val1lo = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[0][w])),
+                        *reinterpret_cast<const int64_t*>(&src1[1][w]), 1);
+                __m128i val1hi = _mm_insert_epi64(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src1[2][w])),
+                        *reinterpret_cast<const int64_t*>(&src1[3][w]), 1);
+
+                __m128i val0_0 = _mm_cvtepu8_epi16(val0lo);
+                __m128i val0_2 = _mm_cvtepu8_epi16(val0hi);
+                __m128i val1_0 = _mm_cvtepu8_epi16(val1lo);
+                __m128i val1_2 = _mm_cvtepu8_epi16(val1hi);
+
+                __m128i val0_1 = _mm_unpackhi_epi8(val0lo, _mm_setzero_si128());
+                __m128i val0_3 = _mm_unpackhi_epi8(val0hi, _mm_setzero_si128());
+                __m128i val1_1 = _mm_unpackhi_epi8(val1lo, _mm_setzero_si128());
+                __m128i val1_3 = _mm_unpackhi_epi8(val1hi, _mm_setzero_si128());
+
+                __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), b0);
+                __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), b1);
+                __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), b2);
+                __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), b3);
+
+                __m128i r0 = _mm_add_epi16(val1_0, t0);
+                __m128i r1 = _mm_add_epi16(val1_1, t1);
+                __m128i r2 = _mm_add_epi16(val1_2, t2);
+                __m128i r3 = _mm_add_epi16(val1_3, t3);
+
+                __m128i q0 = _mm_packus_epi16(r0, r1);
+                __m128i q1 = _mm_packus_epi16(r2, r3);
+
+                __m128i q2 = _mm_blend_epi16(q0, _mm_slli_si128(q1, 4), 0xCC /*0b11001100*/);
+                __m128i q3 = _mm_blend_epi16(_mm_srli_si128(q0, 4), q1, 0xCC /*0b11001100*/);
+
+                __m128i q4 = _mm_shuffle_epi8(q2, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
+                __m128i q5 = _mm_shuffle_epi8(q3, _mm_setr_epi8(0, 8, 4, 12, 1, 9, 5, 13, 2, 10, 6, 14, 3, 11, 7, 15));
+
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w +  0]), q4);
+                _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp[4*w + 16]), q5);
+            }
+
+            if (w < inSz.width*chanNum) {
+                w = inSz.width*chanNum - 8;
+            }
+        }
+
+        // horizontal pass
+        GAPI_DbgAssert(outSz.width >= 8);
+        for (int x = 0; x < outSz.width; ) {
+            for (; x <= outSz.width - 8; x += 8) {
+                //--------------------------------------------
+                // reworked from: ie_preprocess_data_sse42.cpp
+                //      function: resize_bilinear_u8
+                //         label: horizontal_pass
+                //--------------------------------------------
+
+                __m128i a10 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 *  x]));
+                __m128i a32 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 2)]));
+                __m128i a54 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 4)]));
+                __m128i a76 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&clone[4 * (x + 6)]));
+
+                __m128i val_0 = _mm_setzero_si128();
+                __m128i val_1 = _mm_setzero_si128();
+                __m128i val_2 = _mm_setzero_si128();
+                __m128i val_3 = _mm_setzero_si128();
+
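+                // for each channel c, gather the left (mapsx[..]) and right
+                // (mapsx[..]+1) samples for 8 output pixels; every 32-bit
+                // load brings the 4 interleaved rows of one sample at once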
+                for (int c = 0; c < chanNum; c++) {
+                    val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 0]      + c)]), 0);
+                    val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 0] + 1) + c)]), 1);
+                    val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 1]      + c)]), 2);
+                    val_0 = _mm_insert_epi32(val_0, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 1] + 1) + c)]), 3);
+
+                    val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 2]      + c)]), 0);
+                    val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 2] + 1) + c)]), 1);
+                    val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 3]      + c)]), 2);
+                    val_1 = _mm_insert_epi32(val_1, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 3] + 1) + c)]), 3);
+
+                    val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 4]      + c)]), 0);
+                    val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 4] + 1) + c)]), 1);
+                    val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 5]      + c)]), 2);
+                    val_2 = _mm_insert_epi32(val_2, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 5] + 1) + c)]), 3);
+
+                    val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 6]      + c)]), 0);
+                    val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 6] + 1) + c)]), 1);
+                    val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum *  mapsx[x + 7]      + c)]), 2);
+                    val_3 = _mm_insert_epi32(val_3, *reinterpret_cast<const int*>(&tmp[4 * (chanNum * (mapsx[x + 7] + 1) + c)]), 3);
+
+                    val_0 = _mm_shuffle_epi32(val_0, _MM_SHUFFLE(3, 1, 2, 0));
+                    val_1 = _mm_shuffle_epi32(val_1, _MM_SHUFFLE(3, 1, 2, 0));
+                    val_2 = _mm_shuffle_epi32(val_2, _MM_SHUFFLE(3, 1, 2, 0));
+                    val_3 = _mm_shuffle_epi32(val_3, _MM_SHUFFLE(3, 1, 2, 0));
+
+                    __m128i val0_0 = _mm_cvtepu8_epi16(val_0);
+                    __m128i val0_1 = _mm_cvtepu8_epi16(val_1);
+                    __m128i val0_2 = _mm_cvtepu8_epi16(val_2);
+                    __m128i val0_3 = _mm_cvtepu8_epi16(val_3);
+
+                    __m128i val1_0 = _mm_unpackhi_epi8(val_0, _mm_setzero_si128());
+                    __m128i val1_1 = _mm_unpackhi_epi8(val_1, _mm_setzero_si128());
+                    __m128i val1_2 = _mm_unpackhi_epi8(val_2, _mm_setzero_si128());
+                    __m128i val1_3 = _mm_unpackhi_epi8(val_3, _mm_setzero_si128());
+
+                    __m128i t0 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_0, val1_0), a10);
+                    __m128i t1 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_1, val1_1), a32);
+                    __m128i t2 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_2, val1_2), a54);
+                    __m128i t3 = _mm_mulhrs_epi16(_mm_sub_epi16(val0_3, val1_3), a76);
+
+                    __m128i r0 = _mm_add_epi16(val1_0, t0);
+                    __m128i r1 = _mm_add_epi16(val1_1, t1);
+                    __m128i r2 = _mm_add_epi16(val1_2, t2);
+                    __m128i r3 = _mm_add_epi16(val1_3, t3);
+
+                    __m128i q0 = _mm_packus_epi16(r0, r1);
+                    __m128i q1 = _mm_packus_epi16(r2, r3);
+
+                    __m128i q2 = _mm_shuffle_epi8(q0, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
+                    __m128i q3 = _mm_shuffle_epi8(q1, _mm_setr_epi8(0, 4, 8, 12, 2, 6, 10, 14, 1, 5, 9, 13, 3, 7, 11, 15));
+
+                    __m128i q4 = _mm_blend_epi16(q2, _mm_slli_si128(q3, 4), 0xCC /*0b11001100*/);
+                    __m128i q5 = _mm_blend_epi16(_mm_srli_si128(q2, 4), q3, 0xCC /*0b11001100*/);
+
+                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][0][x]),                q4);
+                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][1][x]), _mm_srli_si128(q4, 8));
+                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][2][x]),                q5);
+                    _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst[c][3][x]), _mm_srli_si128(q5, 8));
+                }
+            }
+
+            if (x < outSz.width) {
+                x = outSz.width - 8;
+            }
+        }
+    } else {  // any LPI
+        for (int l = 0; l < lpi; l++) {
+            short beta0 = beta[l];
+
+            // vertical pass
+            GAPI_DbgAssert(inSz.width*chanNum >= 8);
+            for (int w = 0; w < inSz.width*chanNum; ) {
+                for (; w <= inSz.width*chanNum - 8; w += 8) {
+                    v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(&src0[l][w]));
+                    v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(&src1[l][w]));
+                    v_int16x8 t = v_mulhrs(s0 - s1, beta0) + s1;
+                    v_pack_u_store(tmp + w, t);
+                }
+
+                if (w < inSz.width*chanNum) {
+                    w = inSz.width*chanNum - 8;
+                }
+            }
+
+            // horizontal pass
+            GAPI_DbgAssert(outSz.width >= 8);
+            for (int x = 0; x < outSz.width; ) {
+                for (; x <= outSz.width - 8; x += 8) {
+                    for (int c = 0; c < chanNum; c++) {
+                        v_int16x8 a0 = v_load(&alpha[x]);        // as signed Q1.1.14
+                        v_int16x8 sx = v_load(&mapsx[x]);        // as integer (int16)
+                        v_int16x8 t0 = v_gather_chan(tmp, sx, c, 0);
+                        v_int16x8 t1 = v_gather_chan(tmp, sx, c, 1);
+                        v_int16x8 d = v_mulhrs(t0 - t1, a0) + t1;
+                        v_pack_u_store(&dst[c][l][x], d);
+                    }
+                }
+
+                if (x < outSz.width) {
+                    x = outSz.width - 8;
+                }
+            }
+        }
+    }
+}
+
+// Resize (bi-linear, 32F)
+void calcRowLinear_32F(float *dst[],
+                 const float *src0[],
+                 const float *src1[],
+                 const float  alpha[],
+                 const int    mapsx[],
+                 const float  beta[],
+                       float  tmp[],
+                 const Size & inSz,
+                 const Size & outSz,
+                       int    lpi) {
+    UNUSED(tmp);
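+    // tmp is unused in the 32F path: interpolation happens directly in
+    // registers, so no intermediate row buffer is needed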
+
+    bool xRatioEq1 = inSz.width  == outSz.width;
+    bool yRatioEq1 = inSz.height == outSz.height;
+
+    if (!xRatioEq1 && !yRatioEq1) {
+        for (int l = 0; l < lpi; l++) {
+            float beta0 = beta[l];
+            float beta1 = 1 - beta0;
+
+            int x = 0;
+
+        #if CV_SIMD128
+            for (; x <= outSz.width - 4; x += 4) {
+                v_float32x4 alpha0 = v_load(&alpha[x]);
+            //  v_float32x4 alpha1 = 1.f - alpha0;
+
+                v_int32x4 sx = v_load(&mapsx[x]);
+
+                v_float32x4 s0l, s0h, s00, s01;
+                v_gather_pairs(src0[l], sx, s0l, s0h);
+                v_deinterleave(s0l, s0h, s00, s01);
+
+            //  v_float32x4 res0 = s00*alpha0 + s01*alpha1;
+                v_float32x4 res0 = v_fma(s00 - s01, alpha0, s01);
+
+                v_float32x4 s1l, s1h, s10, s11;
+                v_gather_pairs(src1[l], sx, s1l, s1h);
+                v_deinterleave(s1l, s1h, s10, s11);
+
+            //  v_float32x4 res1 = s10*alpha0 + s11*alpha1;
+                v_float32x4 res1 = v_fma(s10 - s11, alpha0, s11);
+
+            //  v_float32x4 d = res0*beta0 + res1*beta1;
+                v_float32x4 d = v_fma(res0 - res1, beta0, res1);
+
+                v_store(&dst[l][x], d);
+            }
+        #endif
+
+            for (; x < outSz.width; x++) {
+                float alpha0 = alpha[x];
+                float alpha1 = 1 - alpha0;
+                int   sx0 = mapsx[x];
+                int   sx1 = sx0 + 1;
+                float res0 = src0[l][sx0]*alpha0 + src0[l][sx1]*alpha1;
+                float res1 = src1[l][sx0]*alpha0 + src1[l][sx1]*alpha1;
+                dst[l][x] = beta0*res0 + beta1*res1;
+            }
+        }
+
+    } else if (!xRatioEq1) {
+        GAPI_DbgAssert(yRatioEq1);
+
+        for (int l = 0; l < lpi; l++) {
+            int x = 0;
+
+        #if CV_SIMD128
+            for (; x <= outSz.width - 4; x += 4) {
+                v_float32x4 alpha0 = v_load(&alpha[x]);
+            //  v_float32x4 alpha1 = 1.f - alpha0;
+
+                v_int32x4 sx = v_load(&mapsx[x]);
+
+                v_float32x4 s0l, s0h, s00, s01;
+                v_gather_pairs(src0[l], sx, s0l, s0h);
+                v_deinterleave(s0l, s0h, s00, s01);
+
+            //  v_float32x4 d = s00*alpha0 + s01*alpha1;
+                v_float32x4 d = v_fma(s00 - s01, alpha0, s01);
+
+                v_store(&dst[l][x], d);
+            }
+        #endif
+
+            for (; x < outSz.width; x++) {
+                float alpha0 = alpha[x];
+                float alpha1 = 1 - alpha0;
+                int   sx0 = mapsx[x];
+                int   sx1 = sx0 + 1;
+                dst[l][x] = src0[l][sx0]*alpha0 + src0[l][sx1]*alpha1;
+            }
+        }
+
+    } else if (!yRatioEq1) {
+        GAPI_DbgAssert(xRatioEq1);
+        int length = inSz.width;  // == outSz.width
+
+        for (int l = 0; l < lpi; l++) {
+            float beta0 = beta[l];
+            float beta1 = 1 - beta0;
+
+            int x = 0;
+
+        #if CV_SIMD128
+            for (; x <= length - 4; x += 4) {
+                v_float32x4 s0 = v_load(&src0[l][x]);
+                v_float32x4 s1 = v_load(&src1[l][x]);
+
+            //  v_float32x4 d = s0*beta0 + s1*beta1;
+                v_float32x4 d = v_fma(s0 - s1, beta0, s1);
+
+                v_store(&dst[l][x], d);
+            }
+        #endif
+
+            for (; x < length; x++) {
+                dst[l][x] = beta0*src0[l][x] + beta1*src1[l][x];
+            }
+        }
+
+    } else {
+        GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
+        int length = inSz.width;  // == outSz.width
+        for (int l = 0; l < lpi; l++) {
+            memcpy(dst[l], src0[l], length * sizeof(float));
+        }
+    }
+}
+
+//------------------------------------------------------------------------------
+
+// vertical pass
+template<typename T, typename A, typename I, typename W>
+static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ymap, A yalpha,
+                         W vbuf[]) {
+    int y_1st = ymap.index0;
+    int ylast = ymap.index1 - 1;
+
+    // yratio > 1, so at least 2 rows
+    GAPI_DbgAssert(y_1st < ylast);
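+    // the boundary rows carry their own partial-coverage weights
+    // (ymap.alpha0, ymap.alpha1), while every fully covered inner row is
+    // accumulated with the common weight yalpha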
+
+    // 1st and last rows
+    {
+        int w = 0;
+
+    #if CV_SIMD128
+        if (std::is_same<T, uint8_t>::value) {
+            for (; w <= inWidth - 8; w += 8) {
+                v_uint16x8 vsrc0 = v_load_expand(reinterpret_cast<const uint8_t*>(& src[0][w]));
+                v_uint16x8 vsrc1 = v_load_expand(reinterpret_cast<const uint8_t*>(& src[ylast - y_1st][w]));
+                v_uint16x8 vres = v_mulhi(vsrc0 << 8, static_cast<Q0_16>(ymap.alpha0)) +
+                                  v_mulhi(vsrc1 << 8, static_cast<Q0_16>(ymap.alpha1));
+                v_store(reinterpret_cast<Q8_8*>(& vbuf[w]), vres);
+            }
+        }
+    #endif
+
+        for (; w < inWidth; w++) {
+            vbuf[w] = mulas(ymap.alpha0, src[0][w])
+                    + mulas(ymap.alpha1, src[ylast - y_1st][w]);
+        }
+    }
+
+    // inner rows (if any)
+    for (int i = 1; i < ylast - y_1st; i++) {
+        int w = 0;
+
+    #if CV_SIMD128
+        if (std::is_same<T, uint8_t>::value) {
+            for (; w <= inWidth - 8; w += 8) {
+                v_uint16x8 vsrc = v_load_expand(reinterpret_cast<const uint8_t*>(& src[i][w]));
+                v_uint16x8 vres = v_load(reinterpret_cast<Q8_8*>(& vbuf[w]));
+                vres = vres + v_mulhi(vsrc << 8, static_cast<Q0_16>(yalpha));
+                v_store(reinterpret_cast<Q8_8*>(& vbuf[w]), vres);
+            }
+        }
+    #endif
+
+        for (; w < inWidth; w++) {
+            vbuf[w] += mulas(yalpha, src[i][w]);
+        }
+    }
+}
+
+// horizontal pass
+template<typename T, typename A, typename I, typename W>
+static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[], const A xalpha[],
+                         const W vbuf[]) {
+#define HSUM(xmaxdf) \
+    for (int x = 0; x < outWidth; x++) { \
+        int      index =  xindex[x]; \
+        const A *alpha = &xalpha[x * xmaxdf]; \
+\
+        W sum = 0; \
+        for (int i = 0; i < xmaxdf; i++) { \
+            sum += mulaw(alpha[i], vbuf[index + i]); \
+        } \
+\
+        dst[x] = convert_cast<T>(sum); \
+    }
+
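+    // dispatch on the common small xmaxdf values so the inner loop gets a
+    // compile-time trip count, letting the compiler fully unroll it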
+    if (2 == xmaxdf) {
+        HSUM(2);
+    } else if (3 == xmaxdf) {
+        HSUM(3);
+    } else if (4 == xmaxdf) {
+        HSUM(4);
+    } else if (5 == xmaxdf) {
+        HSUM(5);
+    } else if (6 == xmaxdf) {
+        HSUM(6);
+    } else if (7 == xmaxdf) {
+        HSUM(7);
+    } else if (8 == xmaxdf) {
+        HSUM(8);
+    } else {
+        HSUM(xmaxdf);
+    }
+#undef HSUM
+}
+
+template<typename T, typename A, typename I, typename W>
+static void calcRowArea_impl(T dst[], const T *src[], const Size& inSz, const Size& outSz,
+    A yalpha, const MapperUnit<A, I>& ymap, int xmaxdf, const I xindex[], const A xalpha[],
+    W vbuf[]) {
+    bool xRatioEq1 = inSz.width  == outSz.width;
+    bool yRatioEq1 = inSz.height == outSz.height;
+
+    if (!yRatioEq1 && !xRatioEq1) {
+        downy(src, inSz.width, ymap, yalpha, vbuf);
+        downx(dst, outSz.width, xmaxdf, xindex, xalpha, vbuf);
+
+    } else if (!yRatioEq1) {
+        GAPI_DbgAssert(xRatioEq1);
+        downy(src, inSz.width, ymap, yalpha, vbuf);
+        for (int x = 0; x < outSz.width; x++) {
+            dst[x] = convert_cast<T>(vbuf[x]);
+        }
+
+    } else if (!xRatioEq1) {
+        GAPI_DbgAssert(yRatioEq1);
+        for (int w = 0; w < inSz.width; w++) {
+            vbuf[w] = convert_cast<W>(src[0][w]);
+        }
+        downx(dst, outSz.width, xmaxdf, xindex, xalpha, vbuf);
+
+    } else {
+        GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
+        memcpy(dst, src[0], outSz.width * sizeof(T));
+    }
+}
+
+void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz, const Size& outSz,
+    Q0_16 yalpha, const MapperUnit8U &ymap, int xmaxdf, const short xindex[], const Q0_16 xalpha[],
+    Q8_8 vbuf[]) {
+    calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
+}
+
+void calcRowArea_32F(float dst[], const float *src[], const Size& inSz, const Size& outSz,
+    float yalpha, const MapperUnit32F& ymap, int xmaxdf, const int xindex[], const float xalpha[],
+    float vbuf[]) {
+    calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
+}
+
+//------------------------------------------------------------------------------
+#if USE_CVKL
+
+// from: ie_preprocess_data.hpp
+static inline uint8_t saturateU32toU8(uint32_t v) {
+    return static_cast<uint8_t>(v > UINT8_MAX ? UINT8_MAX : v);
+}
+
+// from: ie_preprocess_data_sse42.cpp
+static inline uint16_t mulq16(uint16_t a, uint16_t b) {
+    return static_cast<uint16_t>(((uint32_t)a * (uint32_t)b) >> 16);
+}
+
+// extracted from: ie_preprocess_data_sse42.cpp
+// (and reworked for 1-channel and fluid's src)
+void calcRowArea_CVKL_U8_SSE42(const uchar  * src[],
+                                     uchar    dst[],
+                               const Size   & inSz,
+                               const Size   & outSz,
+                                     int      y,
+                               const uint16_t xsi[],
+                               const uint16_t ysi[],
+                               const uint16_t xalpha[],
+                               const uint16_t yalpha[],
+                                     int      x_max_count,
+                                     int      y_max_count,
+                                     uint16_t vert_sum[]) {
+    int dwidth  = outSz.width;
+//  int dheight = outSz.height;
+    int swidth  =  inSz.width;
+    int sheight =  inSz.height;
+
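+    // scratch layout, filled in by the caller: the vertical sums (2*swidth
+    // entries), four alpha tables (dwidth each) and four byte-shuffle index
+    // tables (4*dwidth each), packed back to back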
+    int vert_sum_size = 2*swidth;
+//  uint16_t* vert_sum = yalpha + dheight*y_max_count;
+    uint16_t* alpha0 = vert_sum + vert_sum_size;
+    uint16_t* alpha1 = alpha0 + dwidth;
+    uint16_t* alpha2 = alpha1 + dwidth;
+    uint16_t* alpha3 = alpha2 + dwidth;
+    uint16_t* sxid0 = alpha3 + dwidth;
+    uint16_t* sxid1 = sxid0 + 4*dwidth;
+    uint16_t* sxid2 = sxid1 + 4*dwidth;
+    uint16_t* sxid3 = sxid2 + 4*dwidth;
+
+    uint8_t * pdst_row  = dst;
+    uint16_t* vert_sum_ = vert_sum;
+
+    int ysi_row = ysi[y];
+
+    memset(vert_sum_, 0, swidth * sizeof(uint16_t));
+
+    for (int dy = 0; dy < y_max_count; dy++) {
+        if (ysi_row + dy >= sheight)
+            break;
+
+        uint16_t yalpha_dy = yalpha[y * y_max_count + dy];
+        const uint8_t *sptr_dy = src[dy];
+
+        int x = 0;
+
+        __m128i yalpha_dy_sse = _mm_set1_epi16(yalpha_dy);
+        for (; x <= swidth - 16; x += 16) {
+            __m128i sval = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sptr_dy + x));
+
+            // sptr_dy[x] << 8
+            __m128i sval_Q16_lo = _mm_unpacklo_epi8(_mm_setzero_si128(), sval);
+            __m128i sval_Q16_hi = _mm_unpackhi_epi8(_mm_setzero_si128(), sval);
+
+            __m128i vert_sum_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 0));
+            __m128i vert_sum_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + x + 8));
+
+            vert_sum_lo = _mm_add_epi16(vert_sum_lo, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_lo));
+            vert_sum_hi = _mm_add_epi16(vert_sum_hi, _mm_mulhi_epu16(yalpha_dy_sse, sval_Q16_hi));
+
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 0), vert_sum_lo);
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(vert_sum_ + x + 8), vert_sum_hi);
+        }
+
+        for (; x < swidth; x++) {
+            vert_sum_[x] += mulq16(yalpha_dy, static_cast<uint16_t>(sptr_dy[x] << 8));
+        }
+    }
+
+    if (x_max_count == 2) {
+        int x = 0;
+        for (; x <= dwidth - 8; x += 8) {
+            __m128i res = _mm_set1_epi16(1 << (8 - 1));
+
+            int id0 = xsi[x];
+
+            __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
+            __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
+
+            __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2));
+            __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 2 + 8));
+
+            __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2));
+            __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 2 + 8));
+
+            __m128i vert_sum0 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
+                                             _mm_shuffle_epi8(chunk1, sx0_id1));
+            __m128i vert_sum1 = _mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
+                                             _mm_shuffle_epi8(chunk1, sx1_id1));
+
+            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
+            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
+
+            res = _mm_srli_epi16(res, 8);
+            res = _mm_packus_epi16(res, res);
+            _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
+        }
+
+        for (; x < dwidth; x++) {
+            uint16_t res = 1 << (8 - 1);
+            int id = xsi[x];
+            res += mulq16(alpha0[x], vert_sum_[id + 0]);
+            res += mulq16(alpha1[x], vert_sum_[id + 1]);
+            pdst_row[x] = saturateU32toU8(res >> 8);
+        }
+    } else if (x_max_count == 3) {
+        int x = 0;
+        for (; x <= dwidth - 8; x += 8) {
+            __m128i res = _mm_set1_epi16(1 << (8 - 1));
+
+            int id0 = xsi[x];
+
+            __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
+            __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
+            __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));
+
+            __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3));
+            __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 8));
+            __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 3 + 16));
+
+            __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3));
+            __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 8));
+            __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 3 + 16));
+
+            __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3));
+            __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 8));
+            __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 3 + 16));
+
+            __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
+                                                          _mm_shuffle_epi8(chunk1, sx0_id1)),
+                                             _mm_shuffle_epi8(chunk2, sx0_id2));
+            __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
+                                                          _mm_shuffle_epi8(chunk1, sx1_id1)),
+                                             _mm_shuffle_epi8(chunk2, sx1_id2));
+            __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
+                                                          _mm_shuffle_epi8(chunk1, sx2_id1)),
+                                             _mm_shuffle_epi8(chunk2, sx2_id2));
+
+            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
+            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
+            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));
+
+            res = _mm_srli_epi16(res, 8);
+            res = _mm_packus_epi16(res, res);
+            _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
+        }
+
+        for (; x < dwidth; x++) {
+            uint16_t res = 1 << (8 - 1);
+            int id = xsi[x];
+            res += mulq16(alpha0[x], vert_sum_[id + 0]);
+            res += mulq16(alpha1[x], vert_sum_[id + 1]);
+            res += mulq16(alpha2[x], vert_sum_[id + 2]);
+            pdst_row[x] = saturateU32toU8(res >> 8);
+        }
+    } else if (x_max_count == 4) {
+        int x = 0;
+        for (; x <= dwidth - 8; x += 8) {
+            __m128i res = _mm_set1_epi16(1 << (8 - 1));
+
+            int id0 = xsi[x];
+
+            __m128i chunk0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0));
+            __m128i chunk1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 8));
+            __m128i chunk2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 16));
+            __m128i chunk3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id0 + 24));
+
+            __m128i sx0_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4));
+            __m128i sx0_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 8));
+            __m128i sx0_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 16));
+            __m128i sx0_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid0 + x * 4 + 24));
+
+            __m128i sx1_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4));
+            __m128i sx1_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 8));
+            __m128i sx1_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 16));
+            __m128i sx1_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid1 + x * 4 + 24));
+
+            __m128i sx2_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4));
+            __m128i sx2_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 8));
+            __m128i sx2_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 16));
+            __m128i sx2_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid2 + x * 4 + 24));
+
+            __m128i sx3_id0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4));
+            __m128i sx3_id1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 8));
+            __m128i sx3_id2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 16));
+            __m128i sx3_id3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sxid3 + x * 4 + 24));
+
+            __m128i vert_sum0 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx0_id0),
+                                                          _mm_shuffle_epi8(chunk1, sx0_id1)),
+                                             _mm_or_si128(_mm_shuffle_epi8(chunk2, sx0_id2),
+                                                          _mm_shuffle_epi8(chunk3, sx0_id3)));
+            __m128i vert_sum1 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx1_id0),
+                                                          _mm_shuffle_epi8(chunk1, sx1_id1)),
+                                             _mm_or_si128(_mm_shuffle_epi8(chunk2, sx1_id2),
+                                                          _mm_shuffle_epi8(chunk3, sx1_id3)));
+            __m128i vert_sum2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx2_id0),
+                                                          _mm_shuffle_epi8(chunk1, sx2_id1)),
+                                             _mm_or_si128(_mm_shuffle_epi8(chunk2, sx2_id2),
+                                                          _mm_shuffle_epi8(chunk3, sx2_id3)));
+            __m128i vert_sum3 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(chunk0, sx3_id0),
+                                                          _mm_shuffle_epi8(chunk1, sx3_id1)),
+                                             _mm_or_si128(_mm_shuffle_epi8(chunk2, sx3_id2),
+                                                          _mm_shuffle_epi8(chunk3, sx3_id3)));
+
+            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha0 + x)), vert_sum0));
+            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha1 + x)), vert_sum1));
+            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha2 + x)), vert_sum2));
+            res = _mm_add_epi16(res, _mm_mulhi_epu16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(alpha3 + x)), vert_sum3));
+
+            res = _mm_srli_epi16(res, 8);
+            res = _mm_packus_epi16(res, res);
+            _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
+        }
+
+        for (; x < dwidth; x++) {
+            uint16_t res = 1 << (8 - 1);
+            int id = xsi[x];
+            res += mulq16(alpha0[x], vert_sum_[id + 0]);
+            res += mulq16(alpha1[x], vert_sum_[id + 1]);
+            res += mulq16(alpha2[x], vert_sum_[id + 2]);
+            res += mulq16(alpha3[x], vert_sum_[id + 3]);
+            pdst_row[x] = saturateU32toU8(res >> 8);
+        }
+    } else if (x_max_count <= 7) {
+        int x = 0;
+        for (; x <= dwidth - 8; x += 8) {
+            __m128i res = _mm_set1_epi16(1 << (8 - 1));
+            for (int i = 0; i < x_max_count; i++) {
+                __m128i valpha = _mm_setr_epi16(xalpha[x * x_max_count + x_max_count * 0 + i],
+                                                xalpha[x * x_max_count + x_max_count * 1 + i],
+                                                xalpha[x * x_max_count + x_max_count * 2 + i],
+                                                xalpha[x * x_max_count + x_max_count * 3 + i],
+                                                xalpha[x * x_max_count + x_max_count * 4 + i],
+                                                xalpha[x * x_max_count + x_max_count * 5 + i],
+                                                xalpha[x * x_max_count + x_max_count * 6 + i],
+                                                xalpha[x * x_max_count + x_max_count * 7 + i]);
+                __m128i vvert_sum = _mm_setr_epi16(vert_sum_[xsi[x + 0] + i],
+                                                   vert_sum_[xsi[x + 1] + i],
+                                                   vert_sum_[xsi[x + 2] + i],
+                                                   vert_sum_[xsi[x + 3] + i],
+                                                   vert_sum_[xsi[x + 4] + i],
+                                                   vert_sum_[xsi[x + 5] + i],
+                                                   vert_sum_[xsi[x + 6] + i],
+                                                   vert_sum_[xsi[x + 7] + i]);
+
+                res = _mm_add_epi16(res, _mm_mulhi_epu16(valpha, vvert_sum));
+            }
+            res = _mm_srli_epi16(res, 8);
+            res = _mm_packus_epi16(res, res);
+            _mm_storel_epi64(reinterpret_cast<__m128i*>(pdst_row + x), res);
+        }
+
+        for (; x < dwidth; x++) {
+            uint16_t res = 1 << (8 - 1);
+            for (int i = 0; i < x_max_count; i++) {
+                uint16_t a = xalpha[x * x_max_count + i];
+                int sx = xsi[x] + i;
+
+                res += mulq16(a, vert_sum_[sx]);
+            }
+            pdst_row[x] = saturateU32toU8(res >> 8);
+        }
+    } else {
+        for (int x = 0; x < dwidth; x++) {
+            uint16_t res = 1 << (8 - 1);
+            __m128i vres = _mm_setzero_si128();
+            int id = xsi[x];
+
+            int i = 0;
+            for (; i <= x_max_count - 8; i += 8) {
+                __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(xalpha + x * x_max_count + i));
+                __m128i s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(vert_sum_ + id + i));
+
+                vres = _mm_add_epi16(vres, _mm_mulhi_epu16(a, s));
+            }
+            vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 2));
+            vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 4));
+            vres = _mm_add_epi16(vres, _mm_slli_si128(vres, 8));
+            res += static_cast<uint16_t>(_mm_extract_epi16(vres, 7));
+
+            for (; i < x_max_count; i++) {
+                uint16_t a = xalpha[x * x_max_count + i];
+                uint16_t s = vert_sum_[id + i];
+
+                res += mulq16(a, s);
+            }
+
+            pdst_row[x] = saturateU32toU8(res >> 8);
+        }
+    }
+}
+
+#endif  // USE_CVKL
+//------------------------------------------------------------------------------
+
+void mergeRow_8UC2(const uint8_t in0[],
+                   const uint8_t in1[],
+                         uint8_t out[],
+                             int length) {
+    int l = 0;
+
+#if CV_SIMD128
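+    // vector loop with an overlapping tail: if fewer than 16 pixels remain
+    // and the row has at least 16, back up to length - 16 and redo the last
+    // full vector; the overlapping stores are safe because each output byte
+    // depends only on the inputs. The same tail trick recurs in the other
+    // merge/split kernels below.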
+    cycle:
+    for (; l <= length - 16; l += 16) {
+        v_uint8x16 r0, r1;
+        r0 = v_load(&in0[l]);
+        r1 = v_load(&in1[l]);
+        v_store_interleave(&out[2*l], r0, r1);
+    }
+
+    if (l < length && length >= 16) {
+        l = length - 16;
+        goto cycle;
+    }
+#endif
+
+    for (; l < length; l++) {
+        out[2*l + 0] = in0[l];
+        out[2*l + 1] = in1[l];
+    }
+}
+
+void mergeRow_8UC3(const uint8_t in0[],
+                   const uint8_t in1[],
+                   const uint8_t in2[],
+                         uint8_t out[],
+                             int length) {
+    int l = 0;
+
+#if CV_SIMD128
+    cycle:
+    for (; l <= length - 16; l += 16) {
+        v_uint8x16 r0, r1, r2;
+        r0 = v_load(&in0[l]);
+        r1 = v_load(&in1[l]);
+        r2 = v_load(&in2[l]);
+        v_store_interleave(&out[3*l], r0, r1, r2);
+    }
+
+    if (l < length && length >= 16) {
+        l = length - 16;
+        goto cycle;
+    }
+#endif
+
+    for (; l < length; l++) {
+        out[3*l + 0] = in0[l];
+        out[3*l + 1] = in1[l];
+        out[3*l + 2] = in2[l];
+    }
+}
+
+void mergeRow_8UC4(const uint8_t in0[],
+                   const uint8_t in1[],
+                   const uint8_t in2[],
+                   const uint8_t in3[],
+                         uint8_t out[],
+                             int length) {
+    int l = 0;
+
+#if CV_SIMD128
+    cycle:
+    for (; l <= length - 16; l += 16) {
+        v_uint8x16 r0, r1, r2, r3;
+        r0 = v_load(&in0[l]);
+        r1 = v_load(&in1[l]);
+        r2 = v_load(&in2[l]);
+        r3 = v_load(&in3[l]);
+        v_store_interleave(&out[4*l], r0, r1, r2, r3);
+    }
+
+    if (l < length && length >= 16) {
+        l = length - 16;
+        goto cycle;
+    }
+#endif
+
+    for (; l < length; l++) {
+        out[4*l + 0] = in0[l];
+        out[4*l + 1] = in1[l];
+        out[4*l + 2] = in2[l];
+        out[4*l + 3] = in3[l];
+    }
+}
+
+void mergeRow_32FC2(const float in0[],
+                    const float in1[],
+                          float out[],
+                            int length) {
+    int l = 0;
+
+#if CV_SIMD128
+    cycle:
+    for (; l <= length - 4; l += 4) {
+        v_float32x4 r0, r1;
+        r0 = v_load(&in0[l]);
+        r1 = v_load(&in1[l]);
+        v_store_interleave(&out[2*l], r0, r1);
+    }
+
+    if (l < length && length >= 4) {
+        l = length - 4;
+        goto cycle;
+    }
+#endif
+
+    for (; l < length; l++) {
+        out[2*l + 0] = in0[l];
+        out[2*l + 1] = in1[l];
+    }
+}
+
+void mergeRow_32FC3(const float in0[],
+                    const float in1[],
+                    const float in2[],
+                          float out[],
+                            int length) {
+    int l = 0;
+
+#if CV_SIMD128
+    cycle:
+    for (; l <= length - 4; l += 4) {
+        v_float32x4 r0, r1, r2;
+        r0 = v_load(&in0[l]);
+        r1 = v_load(&in1[l]);
+        r2 = v_load(&in2[l]);
+        v_store_interleave(&out[3*l], r0, r1, r2);
+    }
+
+    if (l < length && length >= 4) {
+        l = length - 4;
+        goto cycle;
+    }
+#endif
+
+    for (; l < length; l++) {
+        out[3*l + 0] = in0[l];
+        out[3*l + 1] = in1[l];
+        out[3*l + 2] = in2[l];
+    }
+}
+
+void mergeRow_32FC4(const float in0[],
+                    const float in1[],
+                    const float in2[],
+                    const float in3[],
+                          float out[],
+                            int length) {
+    int l = 0;
+
+#if CV_SIMD128
+    cycle:
+    for (; l <= length - 4; l += 4) {
+        v_float32x4 r0, r1, r2, r3;
+        r0 = v_load(&in0[l]);
+        r1 = v_load(&in1[l]);
+        r2 = v_load(&in2[l]);
+        r3 = v_load(&in3[l]);
+        v_store_interleave(&out[4*l], r0, r1, r2, r3);
+    }
+
+    if (l < length && length >= 4) {
+        l = length - 4;
+        goto cycle;
+    }
+#endif
+
+    for (; l < length; l++) {
+        out[4*l + 0] = in0[l];
+        out[4*l + 1] = in1[l];
+        out[4*l + 2] = in2[l];
+        out[4*l + 3] = in3[l];
+    }
+}
+
+void splitRow_8UC2(const uint8_t in[],
+                         uint8_t out0[],
+                         uint8_t out1[],
+                             int length) {
+    int l = 0;
+
+#if CV_SIMD128
+    cycle:
+    for (; l <= length - 16; l += 16) {
+        v_uint8x16 r0, r1;
+        v_load_deinterleave(&in[2*l], r0, r1);
+        v_store(&out0[l], r0);
+        v_store(&out1[l], r1);
+    }
+    if (l < length && length >= 16) {
+        l = length - 16;
+        goto cycle;
+    }
+#endif
+
+    for (; l < length; l++) {
+        out0[l] = in[2*l + 0];
+        out1[l] = in[2*l + 1];
+    }
+}
+
+void splitRow_8UC3(const uint8_t in[],
+                         uint8_t out0[],
+                         uint8_t out1[],
+                         uint8_t out2[],
+                             int length) {
+    int l = 0;
+
+#if CV_SIMD128
+    cycle:
+    for (; l <= length - 16; l += 16) {
+        v_uint8x16 r0, r1, r2;
+        v_load_deinterleave(&in[3*l], r0, r1, r2);
+        v_store(&out0[l], r0);
+        v_store(&out1[l], r1);
+        v_store(&out2[l], r2);
+    }
+    if (l < length && length >= 16) {
+        l = length - 16;
+        goto cycle;
+    }
+#endif
+
+    for (; l < length; l++) {
+        out0[l] = in[3*l + 0];
+        out1[l] = in[3*l + 1];
+        out2[l] = in[3*l + 2];
+    }
+}
+
+void splitRow_8UC4(const uint8_t in[],
+                         uint8_t out0[],
+                         uint8_t out1[],
+                         uint8_t out2[],
+                         uint8_t out3[],
+                             int length) {
+    int l = 0;
+
+#if CV_SIMD128
+    cycle:
+    for (; l <= length - 16; l += 16) {
+        v_uint8x16 r0, r1, r2, r3;
+        v_load_deinterleave(&in[4*l], r0, r1, r2, r3);
+        v_store(&out0[l], r0);
+        v_store(&out1[l], r1);
+        v_store(&out2[l], r2);
+        v_store(&out3[l], r3);
+    }
+    if (l < length && length >= 16) {
+        l = length - 16;
+        goto cycle;
+    }
+#endif
+
+    for (; l < length; l++) {
+        out0[l] = in[4*l + 0];
+        out1[l] = in[4*l + 1];
+        out2[l] = in[4*l + 2];
+        out3[l] = in[4*l + 3];
+    }
+}
+
+void splitRow_32FC2(const float in[],
+                          float out0[],
+                          float out1[],
+                            int length) {
+    int l = 0;
+
+#if CV_SIMD128
+    cycle:
+    for (; l <= length - 4; l += 4) {
+        v_float32x4 r0, r1;
+        v_load_deinterleave(&in[2*l], r0, r1);
+        v_store(&out0[l], r0);
+        v_store(&out1[l], r1);
+    }
+
+    if (l < length && length >= 4) {
+        l = length - 4;
+        goto cycle;
+    }
+#endif
+
+    for (; l < length; l++) {
+        out0[l] = in[2*l + 0];
+        out1[l] = in[2*l + 1];
+    }
+}
+
+void splitRow_32FC3(const float in[],
+                          float out0[],
+                          float out1[],
+                          float out2[],
+                            int length) {
+    int l = 0;
+
+#if CV_SIMD128
+    cycle:
+    for (; l <= length - 4; l += 4) {
+        v_float32x4 r0, r1, r2;
+        v_load_deinterleave(&in[3*l], r0, r1, r2);
+        v_store(&out0[l], r0);
+        v_store(&out1[l], r1);
+        v_store(&out2[l], r2);
+    }
+
+    if (l < length && length >= 4) {
+        l = length - 4;
+        goto cycle;
+    }
+#endif
+
+    for (; l < length; l++) {
+        out0[l] = in[3*l + 0];
+        out1[l] = in[3*l + 1];
+        out2[l] = in[3*l + 2];
+    }
+}
+
+void splitRow_32FC4(const float in[],
+                          float out0[],
+                          float out1[],
+                          float out2[],
+                          float out3[],
+                            int length) {
+    int l = 0;
+
+#if CV_SIMD128
+    cycle:
+    for (; l <= length - 4; l += 4) {
+        v_float32x4 r0, r1, r2, r3;
+        v_load_deinterleave(&in[4*l], r0, r1, r2, r3);
+        v_store(&out0[l], r0);
+        v_store(&out1[l], r1);
+        v_store(&out2[l], r2);
+        v_store(&out3[l], r3);
+    }
+
+    if (l < length && length >= 4) {
+        l = length - 4;
+        goto cycle;
+    }
+#endif
+
+    for (; l < length; l++) {
+        out0[l] = in[4*l + 0];
+        out1[l] = in[4*l + 1];
+        out2[l] = in[4*l + 2];
+        out3[l] = in[4*l + 3];
+    }
+}
+
+}  // namespace kernels
+}  // namespace gapi
+}  // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp b/inference-engine/src/inference_engine/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.hpp
new file mode 100644 (file)
index 0000000..bbb0d6e
--- /dev/null
@@ -0,0 +1,157 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_preprocess_gapi_kernels.hpp"
+#include "ie_preprocess_gapi_kernels_impl.hpp"
+
+namespace InferenceEngine {
+namespace gapi {
+namespace kernels {
+
+//----------------------------------------------------------------------
+
+typedef MapperUnit<float,   int> MapperUnit32F;
+typedef MapperUnit<Q0_16, short> MapperUnit8U;
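+// Q0_16 and Q8_8 are 16-bit fixed-point formats (a 0.16 fraction and an
+// 8.8 value, respectively); see ie_preprocess_gapi_kernels_impl.hpp.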
+
+void calcRowArea_8U(uchar dst[], const uchar *src[], const Size &inSz, const Size &outSz,
+    Q0_16 yalpha, const MapperUnit8U& ymap, int xmaxdf, const short xindex[], const Q0_16 xalpha[],
+    Q8_8 vbuf[]);
+
+void calcRowArea_32F(float dst[], const float *src[], const Size &inSz, const Size &outSz,
+    float yalpha, const MapperUnit32F& ymap, int xmaxdf, const int xindex[], const float xalpha[],
+    float vbuf[]);
+
+#if USE_CVKL
+void calcRowArea_CVKL_U8_SSE42(const uchar  * src[],
+                                     uchar    dst[],
+                               const Size   & inSz,
+                               const Size   & outSz,
+                                     int      y,
+                               const uint16_t xsi[],
+                               const uint16_t ysi[],
+                               const uint16_t xalpha[],
+                               const uint16_t yalpha[],
+                                     int      x_max_count,
+                                     int      y_max_count,
+                                     uint16_t vert_sum[]);
+#endif
+
+//----------------------------------------------------------------------
+
+// Resize (bi-linear, 8U)
+void calcRowLinear_8U(uint8_t *dst[],
+                const uint8_t *src0[],
+                const uint8_t *src1[],
+                const short    alpha[],
+                const short    clone[],
+                const short    mapsx[],
+                const short    beta[],
+                      uint8_t  tmp[],
+                const Size   & inSz,
+                const Size   & outSz,
+                      int      lpi);
+
+void calcRowLinear_8UC3(std::array<std::array<uint8_t*, 4>, 3> &dst,
+                  const uint8_t *src0[],
+                  const uint8_t *src1[],
+                  const short    alpha[],
+                  const short    clone[],
+                  const short    mapsx[],
+                  const short    beta[],
+                        uint8_t  tmp[],
+                  const Size    &inSz,
+                  const Size    &outSz,
+                        int      lpi);
+
+// Resize (bi-linear, 32F)
+void calcRowLinear_32F(float *dst[],
+                 const float *src0[],
+                 const float *src1[],
+                 const float  alpha[],
+                 const int    mapsx[],
+                 const float  beta[],
+                       float  tmp[],
+                 const Size & inSz,
+                 const Size & outSz,
+                       int    lpi);
+
+//----------------------------------------------------------------------
+
+void mergeRow_8UC2(const uint8_t in0[],
+                   const uint8_t in1[],
+                         uint8_t out[],
+                             int length);
+
+void mergeRow_8UC3(const uint8_t in0[],
+                   const uint8_t in1[],
+                   const uint8_t in2[],
+                         uint8_t out[],
+                             int length);
+
+void mergeRow_8UC4(const uint8_t in0[],
+                   const uint8_t in1[],
+                   const uint8_t in2[],
+                   const uint8_t in3[],
+                         uint8_t out[],
+                             int length);
+
+void mergeRow_32FC2(const float in0[],
+                    const float in1[],
+                          float out[],
+                            int length);
+
+void mergeRow_32FC3(const float in0[],
+                    const float in1[],
+                    const float in2[],
+                          float out[],
+                            int length);
+
+void mergeRow_32FC4(const float in0[],
+                    const float in1[],
+                    const float in2[],
+                    const float in3[],
+                          float out[],
+                            int length);
+
+void splitRow_8UC2(const uint8_t in[],
+                         uint8_t out0[],
+                         uint8_t out1[],
+                             int length);
+
+void splitRow_8UC3(const uint8_t in[],
+                         uint8_t out0[],
+                         uint8_t out1[],
+                         uint8_t out2[],
+                             int length);
+
+void splitRow_8UC4(const uint8_t in[],
+                         uint8_t out0[],
+                         uint8_t out1[],
+                         uint8_t out2[],
+                         uint8_t out3[],
+                             int length);
+
+void splitRow_32FC2(const float in[],
+                          float out0[],
+                          float out1[],
+                            int length);
+
+void splitRow_32FC3(const float in[],
+                          float out0[],
+                          float out1[],
+                          float out2[],
+                            int length);
+
+void splitRow_32FC4(const float in[],
+                          float out0[],
+                          float out1[],
+                          float out2[],
+                          float out3[],
+                            int length);
+
+}  // namespace kernels
+}  // namespace gapi
+}  // namespace InferenceEngine
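
The Q0_16 and Q8_8 names above suggest 16-bit fixed-point formats with 16 and 8 fractional bits respectively; their real definitions live in ie_preprocess_gapi_kernels_impl.hpp and are not shown in this diff. Purely as an assumption-labeled illustration of Q8.8 arithmetic:

    // Hypothetical Q8.8 helpers (an assumption from the typedef names,
    // not code from this patch): 8 integer bits, 8 fractional bits.
    #include <cstdint>

    typedef int16_t Q8_8_t;   // stand-in for the real Q8_8 typedef

    static inline Q8_8_t toQ8_8(float v)    { return static_cast<Q8_8_t>(v * 256.0f); }
    static inline float  fromQ8_8(Q8_8_t v) { return v / 256.0f; }
    static inline Q8_8_t mulQ8_8(Q8_8_t a, Q8_8_t b) {
        // widen to 32 bits, multiply, then drop the extra 8 fractional bits
        return static_cast<Q8_8_t>((static_cast<int32_t>(a) * b) >> 8);
    }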
index e68a243..58e43a1 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index bff1480..b25f1d0 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index fedbd3a..8c5df8e 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -207,22 +206,6 @@ inline std::string tolower(const std::string &s) {
     std::transform(s.begin(), s.end(), ret.begin(), ::tolower);
     return ret;
 }
-
-/**
- * @brief Wierd function to perform string formatting
- * @param msg - base format
- * @param ... - arguments for formatting
- * @return formatted string
- */
-static inline std::string stringFormat(const char *msg, ...) {
-    va_list va;
-    va_start(va, msg);
-    char buffer[65536];
-
-    vsnprintf_s(buffer, 65535, msg, va);
-    va_end(va);
-    return std::string(buffer);
-}
 }  // namespace details
 }  // namespace InferenceEngine
 
index 6767195..ae2bf3f 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -68,8 +67,8 @@ private:
         stream.reset(new std::ostream(this));
 
         if (nullptr != ptr && len > 0) {
-            (*stream.get()) << ptr;
             ptr[len - 1] = 0;
+            (*stream.get()) << ptr;
         }
     }
 };
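
The reorder in the hunk above is a buffer-safety fix: ptr must be NUL-terminated before it is streamed, since operator<<(const char*) reads until a terminator and could otherwise run past len. A self-contained illustration of the corrected order:

    #include <iostream>
    #include <sstream>

    int main() {
        char buf[8] = {'h', 'e', 'l', 'l', 'o', '!', '!', '!'};  // not NUL-terminated
        const size_t len = sizeof(buf);
        std::ostringstream stream;
        buf[len - 1] = 0;   // terminate first (the fixed order in the hunk above)
        stream << buf;      // safe: streams the well-formed C string "hello!!"
        std::cout << stream.str() << std::endl;
        return 0;
    }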
index c529336..2860d03 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 00ac92b..7b38b9f 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index fe36e7d..a3e2276 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 9615f22..e123c75 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 14c9770..bce8a70 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -175,6 +174,25 @@ inline bool CNNNetDFS(const InferenceEngine::CNNLayerPtr &layer, const T &visit,
     std::unordered_map < CNNLayer *, bool> visited;
     return details::DFS(visited, layer, visit, visitBefore);
 }
+/**
+ * DFS algorithm with multiple starting data objects
+ * @param heads - starting data objects
+ * @param visit - callback to be called upon visiting a layer
+ * @param bVisitBefore - indicates whether the callback is invoked before visiting child nodes or after
+ */
+template<class T>
+inline bool CNNNetForestDFS(const std::vector<DataPtr> &heads, const T &visit, bool bVisitBefore) {
+    std::unordered_map< CNNLayer *, bool> visited;
+    for (const auto &in : heads) {
+        for (const auto &to : in->inputTo) {
+            if (visited.find(to.second.get()) != visited.end()) continue;
+            if (!details::DFS(visited, to.second, visit, bVisitBefore)) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
 
 /**
  * DFS algorithm with multiple starting nodes
@@ -740,7 +758,7 @@ inline void CNNNetworkInsertLayer(CNNLayerPtr after, CNNLayerPtr before, CNNLaye
 
         if (!bLocated) {
             if (before != nullptr) {
-                THROW_IE_EXCEPTION << "Layers are not adjiacend: " << after->name << " vs " << before->name;
+                THROW_IE_EXCEPTION << "Layers are not adjacent: " << after->name << " vs " << before->name;
             }
             // inserting into a node that doesn't have children
             IE_ASSERT(!after->outData.empty());
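
For orientation, a hypothetical call to the new CNNNetForestDFS helper added above; collectHeads is a stand-in for however the caller gathers the starting DataPtr objects, and the visitor signature is assumed to match the single-start CNNNetDFS:

    std::vector<InferenceEngine::DataPtr> heads = collectHeads(network);  // hypothetical
    size_t visitedCount = 0;
    InferenceEngine::CNNNetForestDFS(heads,
        [&](const InferenceEngine::CNNLayerPtr &layer) {
            visitedCount++;            // e.g. count every reachable layer once
        },
        /*bVisitBefore=*/true);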
index 6396059..b770590 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -74,7 +73,7 @@ public:
      * @brief Creates a new empty rvalue LockedMemory instance of type void
      * @return LockedMemory instance of type void
      */
-    LockedMemory<void> buffer() override {
+    LockedMemory<void> buffer() noexcept override {
         return {getAllocator().get(), getHandle(), offset};
     }
 
@@ -82,7 +81,7 @@ public:
      * @brief Creates a new empty rvalue LockedMemory instance of type const void
      * @return LockedMemory instance of type const void
      */
-    LockedMemory<const void> cbuffer() const override {
+    LockedMemory<const void> cbuffer() const noexcept override {
         return {getAllocator().get(), getHandle(), offset};
     }
 
@@ -90,7 +89,7 @@ public:
      * @brief Creates a LockedMemory instance of the given type
      * @return LockedMemory instance of the given type
      */
-    LockedMemory <T> data() override {
+    LockedMemory <T> data() noexcept override {
         return {getAllocator().get(), getHandle(), offset};
     }
 
@@ -98,7 +97,7 @@ public:
     * @brief Creates a readOnly LockedMemory instance of the given type
     * @return Read-only LockedMemory instance of the given type
     */
-    LockedMemory<const T> readOnly() const override {
+    LockedMemory<const T> readOnly() const noexcept override {
         return {getAllocator().get(), getHandle(), offset};
     }
 
@@ -134,18 +133,14 @@ protected:
      * @brief Allocates TBlobProxy data
      * No-op: the proxy relies on the allocation of the actual blob it proxies
      */
-    void allocate() override {
-        THROW_IE_EXCEPTION
-                << "Proxy blob are not to use allocate, it should rely on actual blob allocation it proxies";
-    }
+    void allocate() noexcept override {}
 
     /**
      * @brief Deallocates TBlobProxy data
      * Always returns false: the proxy does not own the allocation it proxies
      */
-    bool deallocate() override {
-        THROW_IE_EXCEPTION
-                << "Proxy blob are not to use deallocate(), it should rely on actual blob allocation it proxies";
+    bool deallocate() noexcept override {
+        return false;
     }
 
 private:
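
The replacement bodies above must not throw: the signatures are now noexcept, and an exception escaping a noexcept function calls std::terminate(). A standalone sketch of why the return-code style is used instead:

    #include <cstdio>

    struct Base {
        virtual bool deallocate() noexcept = 0;
        virtual ~Base() = default;
    };

    struct Proxy : Base {
        bool deallocate() noexcept override {
            // An exception escaping a noexcept function would call std::terminate(),
            // so the proxy reports failure through the return value instead.
            return false;
        }
    };

    int main() {
        Proxy p;
        std::printf("deallocate -> %d\n", p.deallocate() ? 1 : 0);  // prints 0
        return 0;
    }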
diff --git a/inference-engine/src/inference_engine/ie_cnn_layer_builder.h b/inference-engine/src/inference_engine/ie_cnn_layer_builder.h
new file mode 100644 (file)
index 0000000..8cad3ca
--- /dev/null
@@ -0,0 +1,102 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <details/caseless.hpp>
+#include <ie_inetwork.hpp>
+#include <ie_layers.h>
+#include <ie_blob.h>
+#include <memory>
+#include <string>
+
+namespace InferenceEngine {
+
+namespace Builder {
+
+class BaseConverter {
+public:
+    explicit BaseConverter(const std::string& type): type(type) {}
+
+    virtual CNNLayer::Ptr createLayer(const std::shared_ptr<const ILayer>& layer, Precision precision) = 0;
+    virtual bool canCreate(const std::string& nodeType) const = 0;
+
+protected:
+    std::string type;
+};
+
+template <class CLT>
+class LayerConverter: public BaseConverter {
+public:
+    explicit LayerConverter(const std::string& type): BaseConverter(type) {}
+
+    CNNLayer::Ptr createLayer(const std::shared_ptr<const ILayer>& layer, Precision precision) override {
+        LayerParams params = {layer->getName(), layer->getType(), precision};
+        auto res = std::make_shared<CLT>(params);
+
+        auto * weightLayerPtr = dynamic_cast<WeightableLayer *>(res.get());
+
+        for (auto& it : layer->getParameters()->getConstantData()) {
+            res->blobs[it.first] = std::const_pointer_cast<Blob>(it.second);
+            if (weightLayerPtr == nullptr)
+                continue;
+            if (it.first == "weights") {
+                weightLayerPtr->_weights =  std::const_pointer_cast<Blob>(it.second);
+            } else if (it.first == "biases") {
+                weightLayerPtr->_biases =  std::const_pointer_cast<Blob>(it.second);
+            }
+        }
+
+        for (const auto& it : layer->getParameters()->getParameters()) {
+            res->params[it.first] = it.second;
+        }
+        return res;
+    }
+
+    bool canCreate(const std::string& nodeType) const override {
+        details::CaselessEq<std::string> comparator;
+        return comparator(nodeType, type);
+    }
+};
+
+class ActivationConverter: public BaseConverter {
+public:
+    ActivationConverter(): BaseConverter("Activation") {}
+
+    CNNLayer::Ptr createLayer(const std::shared_ptr<const ILayer>& layer, Precision precision) override {
+        LayerParams params = {layer->getName(), layer->getType(), precision};
+        static details::caseless_map<std::string, std::shared_ptr<BaseConverter>> activationCreators = {
+                {"relu", std::make_shared<LayerConverter<InferenceEngine::ReLULayer>>("ReLU")},
+                {"prelu", std::make_shared<LayerConverter<InferenceEngine::PReLULayer>>("PReLU")},
+                {"clamp", std::make_shared<LayerConverter<InferenceEngine::ClampLayer>>("Clamp")},
+                {"elu", std::make_shared<LayerConverter<InferenceEngine::CNNLayer>>("ELU")},
+                {"sigmoid", std::make_shared<LayerConverter<InferenceEngine::CNNLayer>>("Sigmoid")},
+                {"tanh", std::make_shared<LayerConverter<InferenceEngine::CNNLayer>>("TanH")},
+        };
+
+        auto typeIt = layer->getParameters()->getParameters().find("type");
+        if (typeIt == layer->getParameters()->getParameters().end())
+            THROW_IE_EXCEPTION << "Unsupported Activation layer. Type is unknown.";
+
+        auto activationBuilder = activationCreators.find(typeIt->second);
+        if (activationBuilder == activationCreators.end()) {
+            THROW_IE_EXCEPTION << "Unsupported Activation layer type: " << typeIt->second.asString();
+        }
+
+        auto activation = activationBuilder->second->createLayer(layer, precision);
+
+        activation->type = activationBuilder->first;
+        activation->params.erase("type");
+        activation->validateLayer();
+        return activation;
+    }
+
+    bool canCreate(const std::string& nodeType) const override {
+        details::CaselessEq<std::string> comparator;
+        return comparator(nodeType, type);
+    }
+};
+
+}  // namespace Builder
+}  // namespace InferenceEngine
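
A hypothetical sketch of how these converters might be dispatched; the converter list and the ilayer input are illustrative assumptions, not API from this patch:

    using namespace InferenceEngine;
    // ilayer stands in for a std::shared_ptr<const ILayer> obtained elsewhere
    std::vector<std::shared_ptr<Builder::BaseConverter>> converters = {
        std::make_shared<Builder::LayerConverter<ConvolutionLayer>>("Convolution"),
        std::make_shared<Builder::LayerConverter<ReLULayer>>("ReLU"),
        std::make_shared<Builder::ActivationConverter>(),
    };
    CNNLayer::Ptr result;
    for (const auto &cvt : converters) {
        if (cvt->canCreate(ilayer->getType())) {   // caseless comparison inside
            result = cvt->createLayer(ilayer, Precision::FP32);
            break;
        }
    }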
index ccceab6..2db4c2a 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -12,7 +11,7 @@
 #include "debug.h"
 #include "parsers.h"
 #include <ie_cnn_net_reader_impl.h>
-#include "v2_format_parser.h"
+#include "ie_format_parser.h"
 #include <file_utils.h>
 #include <ie_plugin.hpp>
 #include "xml_parse_utils.h"
@@ -73,11 +72,11 @@ StatusCode CNNNetReaderImpl::ReadNetwork(const void* model, size_t size, Respons
 }
 
 StatusCode CNNNetReaderImpl::ReadWeights(const char* filepath, ResponseDesc* resp) noexcept {
-    long long fileSize = FileUtils::fileSize(filepath);
-    if (fileSize == 0)
-        return OK;
+    int64_t fileSize = FileUtils::fileSize(filepath);
+
     if (fileSize < 0)
-        return DescriptionBuffer(resp) << "filesize for: " << filepath << " - " << fileSize << "<0";
+        return DescriptionBuffer(resp) << "filesize for: " << filepath << " - " << fileSize
+                                       << " < 0. Please check that the weights file exists.";
 
     if (network.get() == nullptr) {
         return DescriptionBuffer(resp) << "network is empty";
@@ -142,7 +141,7 @@ StatusCode CNNNetReaderImpl::ReadNetwork(pugi::xml_document& xmlDoc) {
 
         _version = GetFileVersion(root);
         if (_version < 1) THROW_IE_EXCEPTION << "deprecated IR version: " << _version;
-        if (_version > 3) THROW_IE_EXCEPTION << "cannot parse future versions: " << _version;
+        if (_version > 4) THROW_IE_EXCEPTION << "cannot parse future versions: " << _version;
         _parser = parserCreator->create(_version);
         network = _parser->Parse(root);
         name = network->getName();
@@ -169,42 +168,8 @@ StatusCode CNNNetReaderImpl::ReadNetwork(pugi::xml_document& xmlDoc) {
     return OK;
 }
 
-StatusCode CNNNetReaderImpl::ReadSubNetwork(pugi::xml_node &xmlRoot) {
-    description.clear();
-
-    try {
-        _parser = parserCreator->create(_version);
-        network = _parser->Parse(xmlRoot);
-        name = network->getName();
-        // network->validate(version);
-        parseSuccess = true;
-    }
-    catch (const std::string& err) {
-        description = err;
-        parseSuccess = false;
-        return GENERAL_ERROR;
-    }
-    catch (const InferenceEngineException& e) {
-        description = e.what();
-        parseSuccess = false;
-        return GENERAL_ERROR;
-    }
-    catch (const std::exception& e) {
-        description = e.what();
-        parseSuccess = false;
-        return GENERAL_ERROR;
-    }
-    catch (...) {
-        description = "Unknown exception thrown";
-        parseSuccess = false;
-        return UNEXPECTED;
-    }
-
-    return OK;
-}
-
 std::shared_ptr<IFormatParser> V2FormatParserCreator::create(int version) {
-    return std::make_shared<V2FormatParser>(version);
+    return std::make_shared<FormatParser>(version);
 }
 
 INFERENCE_ENGINE_API(InferenceEngine::ICNNNetReader*) InferenceEngine::CreateCNNNetReader() noexcept {
index b635ee3..fb9bd49 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -7,6 +6,7 @@
 
 #include "ie_icnn_net_reader.h"
 #include "cnn_network_impl.hpp"
+#include "parsers.h"
 #include <memory>
 #include <string>
 #include <map>
@@ -72,11 +72,6 @@ public:
         delete this;
     }
 
-    StatusCode ReadSubNetwork(pugi::xml_node &xmlRoot);
-    void CopyBlobs(void* layerParsePrms, std::string name) {
-        _parser->CopyBlobsByName(layerParsePrms, name);
-    }
-
 private:
     std::shared_ptr<InferenceEngine::details::IFormatParser> _parser;
 
diff --git a/inference-engine/src/inference_engine/ie_context.cpp b/inference-engine/src/inference_engine/ie_context.cpp
new file mode 100644 (file)
index 0000000..8f8335b
--- /dev/null
@@ -0,0 +1,62 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <ie_context.hpp>
+#include <shape_infer/built-in/ie_built_in_holder.hpp>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace InferenceEngine;
+
+Context::Context() {
+    auto builtIn = std::make_shared<ShapeInfer::BuiltInShapeInferHolder>();
+    addExtension(builtIn);
+}
+
+void Context::addExtension(const IShapeInferExtensionPtr &ext) {
+    // Get all shape infer impls
+    char** types = nullptr;
+    unsigned int size = 0;
+    ResponseDesc resp;
+    StatusCode sts = ext->getShapeInferTypes(types, size, &resp);
+    if (sts != OK)
+        THROW_IE_EXCEPTION << "Failed to get types from extension: " << resp.msg;
+    std::vector<std::string> implTypes;
+
+    std::string badLayerTypes;
+    for (int i = 0; i < size; i++) {
+        std::string type(types[i], strlen(types[i]));
+        delete[] types[i];
+        if (shapeInferImpls.find(type) != shapeInferImpls.end()) {
+            if (!badLayerTypes.empty())
+                badLayerTypes += ", ";
+            badLayerTypes += type;
+        }
+        implTypes.emplace_back(type);
+    }
+    delete[] types;
+
+    if (!badLayerTypes.empty())
+        THROW_IE_EXCEPTION << "Failed to add extension with already registered types: " << badLayerTypes;
+
+    for (const auto& implType : implTypes) {
+        IShapeInferImpl::Ptr impl;
+        sts = ext->getShapeInferImpl(impl, implType.c_str(), &resp);
+        if (sts != OK)
+            THROW_IE_EXCEPTION << "Failed to get implementation for " << implType << " type: " << resp.msg;
+        shapeInferImpls[implType] = impl;
+    }
+}
+
+void Context::addShapeInferImpl(const std::string &type, const IShapeInferImpl::Ptr &impl) {
+    if (shapeInferImpls.find(type) != shapeInferImpls.end())
+        THROW_IE_EXCEPTION << "Failed to add implementation for already registered type: " << type;
+    shapeInferImpls[type] = impl;
+}
+
+IShapeInferImpl::Ptr Context::getShapeInferImpl(const std::string &type) {
+    return shapeInferImpls.find(type) == shapeInferImpls.end() ? nullptr : shapeInferImpls[type];
+}
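
A hypothetical usage sketch of the new Context class; myExtension stands in for a user-supplied IShapeInferExtensionPtr:

    InferenceEngine::Context ctx;      // built-in shape-infer impls are pre-registered
    ctx.addExtension(myExtension);     // throws if a layer type is already registered
    auto impl = ctx.getShapeInferImpl("SomeCustomType");
    if (impl == nullptr) {
        // the type is not registered; fall back or report an error
    }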
index 8338c04..7626620 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -51,6 +50,9 @@ const TensorDesc& Data::getTensorDesc() const {
             (!tensorDesc.getPrecision() && precision)) {
         THROW_IE_EXCEPTION << "Tensor descriptor is empty!";
     }
+    if (precision && tensorDesc.getPrecision() != precision) {
+        tensorDesc.setPrecision(precision);
+    }
     return tensorDesc;
 }
 
index 2c68097..3094414 100644 (file)
@@ -1,8 +1,9 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
+#include <vector>
+#include <string>
+#include <algorithm>
 #include <ie_device.hpp>
 #include <details/ie_exception.hpp>
 #include "description_buffer.hpp"
 using namespace InferenceEngine;
 
 FindPluginResponse InferenceEngine::findPlugin(const FindPluginRequest& req) {
+    std::vector<std::string> pluginVec;
     switch (req.device) {
-    case TargetDevice::eCPU:
-        return { {
+        case TargetDevice::eCPU:
 #ifdef ENABLE_MKL_DNN
-                "MKLDNNPlugin",
+            pluginVec.push_back("MKLDNNPlugin");
 #endif
 #ifdef ENABLE_OPENVX_CVE
-                "OpenVXPluginCVE",
+            pluginVec.push_back("OpenVXPluginCVE");
 #elif defined ENABLE_OPENVX
-                "OpenVXPlugin",
+            pluginVec.push_back("OpenVXPlugin");
 #endif
-            } };
-    case TargetDevice::eGPU:
-        return { {
+            break;
+        case TargetDevice::eGPU:
 #ifdef ENABLE_CLDNN
-                "clDNNPlugin",
+            pluginVec.push_back("clDNNPlugin");
 #endif
 #ifdef ENABLE_OPENVX
-                "OpenVXPlugin",
+            pluginVec.push_back("OpenVXPlugin");
 #endif
-            } };
-    case TargetDevice::eFPGA:
-        return{ {
+            break;
+        case TargetDevice::eFPGA:
 #ifdef ENABLE_DLIA
-                "dliaPlugin",
+            pluginVec.push_back("dliaPlugin");
 #endif
 #ifdef ENABLE_OPENVX
-                "OpenVXPlugin",
+            pluginVec.push_back("OpenVXPlugin");
 #endif
-            } };
-    case TargetDevice::eMYRIAD:
-        return{ {
+            break;
+        case TargetDevice::eMYRIAD:
 #ifdef ENABLE_MYRIAD
-                "myriadPlugin",
+            pluginVec.push_back("myriadPlugin");
+#endif
+            break;
+        case TargetDevice::eHDDL:
+#ifdef ENABLE_HDDL
+            pluginVec.push_back("HDDLPlugin");
 #endif
-            } };
+            break;
         case TargetDevice::eGNA:
-            return{ {
 #ifdef ENABLE_GNA
-                        "GNAPlugin",
+            pluginVec.push_back("GNAPlugin");
 #endif
-                    } };
-    case TargetDevice::eHETERO:
-        return{ {
-                "HeteroPlugin",
-            } };
+            break;
+        case TargetDevice::eHETERO:
+            pluginVec.push_back("HeteroPlugin");
+            break;
 
-    default:
-        THROW_IE_EXCEPTION << "Cannot find plugin for device: " << getDeviceName(req.device);
+        default:
+            THROW_IE_EXCEPTION << "Cannot find plugin for device: " << getDeviceName(req.device);
     }
+    std::for_each(pluginVec.begin(), pluginVec.end(), [](std::string &name) { name += IE_BUILD_POSTFIX; });
+    return {pluginVec};
 }
 
 INFERENCE_ENGINE_API(StatusCode) InferenceEngine::findPlugin(
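
Sketch of the resulting behavior of the refactored findPlugin above (the request/response field names are assumptions based on the surrounding code):

    FindPluginRequest req;
    req.device = TargetDevice::eCPU;
    FindPluginResponse res = findPlugin(req);
    // res.names would contain "MKLDNNPlugin" with IE_BUILD_POSTFIX appended
    // (e.g. "MKLDNNPlugind" in a debug build), assuming ENABLE_MKL_DNN is defined.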
@@ -1,17 +1,18 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <set>
 #include <unordered_set>
-#include "v2_format_parser.h"
-#include "v2_layer_parsers.h"
+#include "ie_format_parser.h"
+#include "ie_layer_parsers.h"
 #include "xml_parse_utils.h"
 #include "ie_blob_proxy.hpp"
 #include "range_iterator.hpp"
 #include <fstream>
+#include <sstream>
 #include "ie_icnn_network_stats.hpp"
+#include "ie_layers_prv.h"
 
 using namespace InferenceEngine;
 using namespace InferenceEngine::details;
@@ -36,12 +37,10 @@ void LayerParseParameters::addInputPort(const LayerPortData &port) {
 }
 
 inline void ParseSegment(LayerParseParameters& prms, const pugi::xml_node &blob) {
-    int size = GetIntAttr(blob, "size", 0);
-    int start = GetIntAttr(blob, "offset", 0);
+    uint64_t size = GetUInt64Attr(blob, "size", 0);
+    uint64_t start = GetUInt64Attr(blob, "offset", 0);
     if (!size)
         return;
-    if (size < 0 || start < 0)
-        THROW_IE_EXCEPTION << "Layer " << prms.prms.name << " has incorrect blob: " << blob.name();
 
     WeightSegment& segment = prms.blobs[blob.name()];
     segment.start = static_cast<size_t>(start);
@@ -53,17 +52,18 @@ inline void ParseSegment(LayerParseParameters& prms, const pugi::xml_node &blob)
         segment.precision = prms.prms.precision;
 }
 
-int BaseCreator::version_ = 3;
+int BaseCreator::version_ = 4;
 
-void V2FormatParser::ParsePort(LayerParseParameters::LayerPortData& port, pugi::xml_node &node) const {
+void FormatParser::ParsePort(LayerParseParameters::LayerPortData& port, pugi::xml_node &node) const {
     port.portId = GetIntAttr(node, "id");
     ParseDims(port.dims, node);
     const std::string &preStr = GetStrAttr(node, "precision", "");
     if (!preStr.empty()) port.precision = Precision::FromStr(preStr);
 }
 
-void V2FormatParser::ParseGenericParams(pugi::xml_node& node, LayerParseParameters& layerParsePrms) const {
+void FormatParser::ParseGenericParams(pugi::xml_node& node, LayerParseParameters& layerParsePrms) const {
     layerParsePrms.layerId = GetIntAttr(node, "id");
+    layerParsePrms.underIRVersion = _version;
 
     InferenceEngine::LayerParams& prms = layerParsePrms.prms;
     prms.type = XMLParseUtils::GetStrAttr(node, "type");
@@ -113,18 +113,22 @@ void V2FormatParser::ParseGenericParams(pugi::xml_node& node, LayerParseParamete
     }
 }
 
-InferenceEngine::CNNLayer::Ptr V2FormatParser::CreateLayer(pugi::xml_node& node,
+static inline std::string gen_id(int layer_id, int port_id) {
+    return (std::to_string(layer_id) + '.' + std::to_string(port_id));
+}
+
+InferenceEngine::CNNLayer::Ptr FormatParser::CreateLayer(pugi::xml_node& node,
                                                        LayerParseParameters& layerParsePrms) const {
     for (auto &creator : getCreators()) {
         if (!creator->shouldCreate(layerParsePrms.prms.type))
             continue;
         return creator->CreateLayer(node, layerParsePrms);
     }
-    static V2LayerCreator<GenericLayer> genericCreator("");
+    static LayerCreator<GenericLayer> genericCreator("");
     return genericCreator.CreateLayer(node, layerParsePrms);
 }
 
-void V2FormatParser::SetLayerInput(CNNNetworkImpl& network, const std::string& dataId,
+void FormatParser::SetLayerInput(CNNNetworkImpl& network, const std::string& dataId,
                                    CNNLayerPtr& targetLayer, int inputPort) {
     DataPtr& dataPtr = _portsToData[dataId];
     if (!dataPtr) THROW_IE_EXCEPTION << "in Layer " << targetLayer->name
@@ -155,16 +159,18 @@ void V2FormatParser::SetLayerInput(CNNNetworkImpl& network, const std::string& d
                                << " dims input: " << dumpVec(parseInfo.inputPorts[i].dims)
                                << " dims output: " << dumpVec(dataPtr->getDims());
         targetLayer->insData[i] = dataPtr;
+        const auto insId = gen_id(parseInfo.layerId, parseInfo.inputPorts[i].portId);
+        _portsToData[insId] = dataPtr;
         return;
     }
     THROW_IE_EXCEPTION << "input port " << inputPort << " does not exist in layer " << targetLayer->name;
 }
 
-V2FormatParser::V2FormatParser(int version): _version(version) {
+FormatParser::FormatParser(int version): _version(version) {
     BaseCreator::version_ = version;
 }
 
-CNNNetworkImplPtr V2FormatParser::Parse(pugi::xml_node& root) {
+CNNNetworkImplPtr FormatParser::Parse(pugi::xml_node& root) {
     _network.reset(new CNNNetworkImpl());
     _network->setName(GetStrAttr(root, "name", ""));
     _defPrecision = Precision::FromStr(GetStrAttr(root, "precision", "UNSPECIFIED"));
@@ -210,10 +216,12 @@ CNNNetworkImplPtr V2FormatParser::Parse(pugi::xml_node& root) {
             }
         }
 
-        for (const auto& outPort : lprms.outputPorts) {
-            const std::string outId = details::stringFormat("%d.%d", lprms.layerId, outPort.portId);
-            const std::string outName = lprms.outputPorts.size() == 1 ? lprms.prms.name
-                : details::stringFormat("%s.%d", lprms.prms.name.c_str(), outPort.portId);
+        for (int i = 0; i < lprms.outputPorts.size(); i++) {
+            const auto &outPort = lprms.outputPorts[i];
+            const auto outId = gen_id(lprms.layerId, outPort.portId);
+            const std::string outName = lprms.outputPorts.size() == 1
+                    ? lprms.prms.name
+                    : lprms.prms.name + "." + std::to_string(i);
             DataPtr& ptr = _network->getData(outName.c_str());
             if (!ptr) {
                 ptr.reset(new Data(outName, outPort.dims, outPort.precision, TensorDesc::getLayoutByDims(outPort.dims)));
@@ -240,7 +248,7 @@ CNNNetworkImplPtr V2FormatParser::Parse(pugi::xml_node& root) {
         int toLayer = GetIntAttr(_ec, "to-layer");
         int toPort = GetIntAttr(_ec, "to-port");
 
-        auto dataId = details::stringFormat("%d.%d", fromLayer, fromPort);
+        const auto dataId = gen_id(fromLayer, fromPort);
         auto targetLayer = layerById[toLayer];
         if (!targetLayer)
             THROW_IE_EXCEPTION << "Layer ID " << toLayer << " was not found while connecting edge at offset "
@@ -287,30 +295,57 @@ CNNNetworkImplPtr V2FormatParser::Parse(pugi::xml_node& root) {
         }
         if (!inputWasSet) THROW_IE_EXCEPTION << "network does not have any input layer";
     } else {  // version 2: inputs are marked as input layers
-        auto prepareInputLayer = [](vector<CNNLayer::Ptr> layers,
-                                    function<void(const InputInfo::Ptr&)> register_input) {
-            for (auto inLayer : layers) {
-                if (inLayer->outData.size() != 1) {
-                    THROW_IE_EXCEPTION << "Input layer must have 1 output.\n"
-                        "See documentation for details, "
-                        "'Notice On Using Model Optimizer tool' in UseOfTheInferenceEngine.html.\n"
-                        "You need to modify prototxt and generate new IR.";
-                }
-                InputInfo::Ptr info(new InputInfo());
-                info->setInputData(*inLayer->outData.begin());
-                Precision inputPrecision = info->getInputPrecision();
-                if (inputPrecision == Precision::Q78)
-                    info->setInputPrecision(Precision::I16);
-                if (inputPrecision == Precision::FP16)
-                    info->setInputPrecision(Precision::FP32);
-
-                register_input(info);
-            }
-        };
+        auto keep_input_info = [&] (DataPtr &in_data) {
+            InputInfo::Ptr info(new InputInfo());
+            info->setInputData(in_data);
+            Precision prc = info->getInputPrecision();
+
+            // Convert precision into native format (keep element size)
+            prc = prc == Precision::Q78 ? Precision::I16 :
+                  prc == Precision::FP16 ? Precision::FP32 :
+                  static_cast<Precision::ePrecision>(prc);
 
-        prepareInputLayer(inputLayers, [&](const InputInfo::Ptr& info) {
+            info->setInputPrecision(prc);
             _network->setInputInfo(info);
-        });
+        };
+
+        // Keep all data from InputLayers
+        for (auto inLayer : inputLayers) {
+            if (inLayer->outData.size() != 1)
+                THROW_IE_EXCEPTION << "Input layer must have 1 output. "
+                                      "See documentation for details.";
+            keep_input_info(inLayer->outData[0]);
+        }
+
+        // Keep all data which has no creator layer
+        for (auto &kvp : _network->allLayers()) {
+            const CNNLayer::Ptr& layer = kvp.second;
+            auto pars_info = layersParseInfo[layer->name];
+
+            if (layer->insData.empty())
+                layer->insData.resize(pars_info.inputPorts.size());
+
+            for (int i = 0; i < layer->insData.size(); i++) {
+                if (!layer->insData[i].lock()) {
+                    std::string data_name = (layer->insData.size() == 1)
+                            ? layer->name
+                            : layer->name + "." + std::to_string(i);
+
+                    DataPtr data(new Data(data_name,
+                            pars_info.inputPorts[i].dims,
+                            pars_info.inputPorts[i].precision,
+                            TensorDesc::getLayoutByDims(pars_info.inputPorts[i].dims)));
+
+                    layer->insData[i] = data;
+                    data->inputTo[layer->name] = layer;
+
+                    const auto insId = gen_id(pars_info.layerId, pars_info.inputPorts[i].portId);
+                    _portsToData[insId] = data;
+
+                    keep_input_info(data);
+                }
+            }
+        }
     }
 
     auto statNode = root.child("statistics");
@@ -322,38 +357,33 @@ CNNNetworkImplPtr V2FormatParser::Parse(pugi::xml_node& root) {
     // check all input ports are occupied
     for (const auto& kvp : _network->allLayers()) {
         const CNNLayer::Ptr& layer = kvp.second;
-        if (_version) {
-            const LayerParseParameters& parseInfo = layersParseInfo[layer->name];
-            size_t inSize = layer->insData.size();
-            if (inSize != parseInfo.inputPorts.size())
-                THROW_IE_EXCEPTION << "Layer " << layer->name << " does not have any edge connected to it";
-
-            for (unsigned i = 0; i < inSize; i++) {
-                if (!layer->insData[i].lock()) {
-                    THROW_IE_EXCEPTION << "Layer " << layer->name.c_str() << " input port "
-                        << parseInfo.inputPorts[i].portId << " is not connected to any data";
-                }
+        const LayerParseParameters& parseInfo = layersParseInfo[layer->name];
+        size_t inSize = layer->insData.size();
+        if (inSize != parseInfo.inputPorts.size())
+            THROW_IE_EXCEPTION << "Layer " << layer->name << " does not have any edge connected to it";
+
+        for (unsigned i = 0; i < inSize; i++) {
+            if (!layer->insData[i].lock()) {
+                THROW_IE_EXCEPTION << "Layer " << layer->name.c_str() << " input port "
+                                   << parseInfo.inputPorts[i].portId << " is not connected to any data";
             }
         }
         layer->validateLayer();
     }
+    // parse mean image
+    ParsePreProcess(root);
+    _network->resolveOutput();
+
+    // Set default output precision to FP32 (for back-compatibility)
+    OutputsDataMap outputsInfo;
+    _network->getOutputsInfo(outputsInfo);
+    for (auto outputInfo : outputsInfo) {
+        outputInfo.second->setPrecision(Precision::FP32);
+    }
 
-    if (_version) {
-        // parse mean image
-        ParsePreProcess(root);
-        _network->resolveOutput();
-
-        // Set default output precision to FP32 (for back-compatibility)
-        OutputsDataMap outputsInfo;
-        _network->getOutputsInfo(outputsInfo);
-        for (auto outputInfo : outputsInfo) {
-            outputInfo.second->setPrecision(Precision::FP32);
-        }
-
-        if (_version == 1) {
-            int batchSize = GetIntAttr(root, "batch", 1);
-            _network->setBatchSize(batchSize);
-        }
+    if (_version == 1) {
+        int batchSize = GetIntAttr(root, "batch", 1);
+        _network->setBatchSize(batchSize);
     }
 
     return _network;
@@ -362,7 +392,7 @@ CNNNetworkImplPtr V2FormatParser::Parse(pugi::xml_node& root) {
 template<typename BlobType>
 inline Blob::Ptr GetTypedBlobFromSegment(const TBlob<uint8_t>::Ptr& weights, const WeightSegment& segment) {
     if (segment.getEnd() > weights->size())
-        THROW_IE_EXCEPTION << "metadata is incorrect - segment exceeds given buffer limits. Please validate input data";
+        THROW_IE_EXCEPTION << "segment exceeds given buffer limits. Please validate the weights file";
 
     size_t noOfElement = segment.size / sizeof(BlobType);
     // RanC: TODO: IR does not provide me with weights layout.
@@ -381,7 +411,7 @@ inline Blob::Ptr GetTypedBlobFromSegment(const TBlob<uint8_t>::Ptr& weights, con
     return binBlob;
 }
 
-Blob::Ptr V2FormatParser::GetBlobFromSegment(const TBlob<uint8_t>::Ptr& weights, const WeightSegment& segment) const {
+Blob::Ptr FormatParser::GetBlobFromSegment(const TBlob<uint8_t>::Ptr& weights, const WeightSegment& segment) const {
     if (segment.precision == Precision::FP32) {
         return GetTypedBlobFromSegment<float>(weights, segment);
     } else if (segment.precision == Precision::I16 || segment.precision == Precision::Q78 || segment.precision == Precision::FP16) {
@@ -395,14 +425,7 @@ Blob::Ptr V2FormatParser::GetBlobFromSegment(const TBlob<uint8_t>::Ptr& weights,
     }
 }
 
-void V2FormatParser::CopyBlobsByName(void* layerParsePrms, std::string name) {
-    auto internalParams = layersParseInfo.find(name);
-
-    LayerParseParameters* params = static_cast<LayerParseParameters *>(layerParsePrms);
-    params->blobs = internalParams->second.blobs;
-}
-
-void V2FormatParser::SetWeights(const TBlob<uint8_t>::Ptr& weights) {
+void FormatParser::SetWeights(const TBlob<uint8_t>::Ptr& weights) {
     for (auto& kvp : _network->allLayers()) {
         auto fit = layersParseInfo.find(kvp.second->name);
         // todo: may check that earlier - while parsing...
@@ -420,13 +443,16 @@ void V2FormatParser::SetWeights(const TBlob<uint8_t>::Ptr& weights) {
                 pWL->_biases  = GetBlobFromSegment(weights, lprms.blobs["biases"]);
                 pWL->blobs["biases"] = pWL->_biases;
             }
-        } else {
-            auto pGL = dynamic_cast<GenericLayer *>(kvp.second.get());
-            if (pGL == nullptr) continue;
-            for (auto s : lprms.blobs) {
-                pGL->blobs[s.first] = GetBlobFromSegment(weights, s.second);
-            }
         }
+        auto pGL = kvp.second.get();
+        if (pGL == nullptr) continue;
+        for (auto s : lprms.blobs) {
+            pGL->blobs[s.first] = GetBlobFromSegment(weights, s.second);
+        }
+
+        // Some layer can specify additional action to prepare weights
+        if (fit->second.internalWeightSet)
+            fit->second.internalWeightSet(weights);
     }
     for (auto &kvp : _preProcessSegments) {
         const std::string &inputName = kvp.first;
@@ -450,12 +476,13 @@ void V2FormatParser::SetWeights(const TBlob<uint8_t>::Ptr& weights) {
     }
 }
 
-void V2FormatParser::ParseDims(SizeVector& dims, const pugi::xml_node &parentNode) const {
+void FormatParser::ParseDims(SizeVector& dims, const pugi::xml_node &parentNode) const {
     for (auto node = parentNode.child("dim"); !node.empty(); node = node.next_sibling("dim")) {
         unsigned int dim = 0;
         const pugi::char_t* dimVal = node.child_value();
-        if (!sscanf(dimVal, "%u", &dim) || dim == 0) {
-            THROW_IE_EXCEPTION << "dimension (" << dimVal << ") in node must be a positive integer: at offset "
+        stringstream ss(dimVal);
+        if (!(ss >> dim) || dim == 0) {
+            THROW_IE_EXCEPTION << "dimension (" << dimVal << ") in node " << node.name() << " must be a positive integer: at offset "
                 << node.offset_debug();
         }
         dims.push_back(dim);
@@ -469,7 +496,15 @@ void V2FormatParser::ParseDims(SizeVector& dims, const pugi::xml_node &parentNod
         dims.insert(dims.begin(), 1);  // for batch, in version 1, in version 2 it is already there.
 }
 
-DataPtr V2FormatParser::ParseInputData(pugi::xml_node& root) const {
+const DataPtr& FormatParser::GetDataBy(int layer_id, int port_id) const {
+    const auto id = gen_id(layer_id, port_id);
+    const auto &found = _portsToData.find(id);
+    if (found == _portsToData.end())
+        THROW_IE_EXCEPTION << "No data found for layer_id=" << layer_id << " port_id=" << port_id;
+    return found->second;
+}
+
+DataPtr FormatParser::ParseInputData(pugi::xml_node& root) const {
     auto inputNode = root.child("input");
     if (inputNode.empty()) {
         THROW_IE_EXCEPTION << "No input node in network, missing <input>";
@@ -486,7 +521,7 @@ DataPtr V2FormatParser::ParseInputData(pugi::xml_node& root) const {
     return inputData;
 }
 
-void V2FormatParser::ParsePreProcess(pugi::xml_node& root) {
+void FormatParser::ParsePreProcess(pugi::xml_node& root) {
     /*
     <pre-process mean-precision="FP32">
         <channel id = ”0”>
@@ -612,42 +647,45 @@ void V2FormatParser::ParsePreProcess(pugi::xml_node& root) {
     }
 }
 
-const std::vector<std::shared_ptr<BaseCreator> >& V2FormatParser::getCreators() const {
+const std::vector<std::shared_ptr<BaseCreator> >& FormatParser::getCreators() const {
     // these should be unique_ptr, but unique_ptr can't be used with initializer lists
     static std::vector<std::shared_ptr<BaseCreator> > creators = {
-        std::make_shared<V2LayerCreator<PowerLayer>>("Power"),
-        std::make_shared<V2LayerCreator<ConvolutionLayer>>("Convolution"),
-        std::make_shared<V2LayerCreator<DeconvolutionLayer>>("Deconvolution"),
-        std::make_shared<V2LayerCreator<PoolingLayer>>("Pooling"),
-        std::make_shared<V2LayerCreator<FullyConnectedLayer>>("InnerProduct"),
-        std::make_shared<V2LayerCreator<FullyConnectedLayer>>("FullyConnected"),
-        std::make_shared<V2LayerCreator<NormLayer>>("LRN"),
-        std::make_shared<V2LayerCreator<NormLayer>>("Norm"),
-        std::make_shared<V2LayerCreator<SoftMaxLayer>>("Softmax"),
-        std::make_shared<V2LayerCreator<GRNLayer>>("GRN"),
-        std::make_shared<V2LayerCreator<MVNLayer>>("MVN"),
-        std::make_shared<V2LayerCreator<RNNLayer>>("RNN"),
-        std::make_shared<V2LayerCreator<LSTMCell>>("LSTMCell"),
-        std::make_shared<V2LayerCreator<ReLULayer>>("ReLU"),
-        std::make_shared<V2LayerCreator<ClampLayer>>("Clamp"),
-        std::make_shared<V2LayerCreator<SplitLayer>>("Split"),
-        std::make_shared<V2LayerCreator<SplitLayer>>("Slice"),
-        std::make_shared<V2LayerCreator<ConcatLayer>>("Concat"),
-        std::make_shared<V2LayerCreator<EltwiseLayer>>("Eltwise"),
-        std::make_shared<V2LayerCreator<ScaleShiftLayer>>("ScaleShift"),
-        std::make_shared<V2LayerCreator<PReLULayer>>("PReLU"),
-        std::make_shared<V2LayerCreator<CropLayer>>("Crop"),
-        std::make_shared<V2LayerCreator<ReshapeLayer>>("Reshape"),
-        std::make_shared<V2LayerCreator<ReshapeLayer>>("Flatten"),
-        std::make_shared<V2LayerCreator<TileLayer>>("Tile"),
+        std::make_shared<LayerCreator<PowerLayer>>("Power"),
+        std::make_shared<LayerCreator<ConvolutionLayer>>("Convolution"),
+        std::make_shared<LayerCreator<DeconvolutionLayer>>("Deconvolution"),
+        std::make_shared<LayerCreator<PoolingLayer>>("Pooling"),
+        std::make_shared<LayerCreator<FullyConnectedLayer>>("InnerProduct"),
+        std::make_shared<LayerCreator<FullyConnectedLayer>>("FullyConnected"),
+        std::make_shared<LayerCreator<NormLayer>>("LRN"),
+        std::make_shared<LayerCreator<NormLayer>>("Norm"),
+        std::make_shared<LayerCreator<SoftMaxLayer>>("Softmax"),
+        std::make_shared<LayerCreator<GRNLayer>>("GRN"),
+        std::make_shared<LayerCreator<MVNLayer>>("MVN"),
+        std::make_shared<LayerCreator<ReLULayer>>("ReLU"),
+        std::make_shared<LayerCreator<ClampLayer>>("Clamp"),
+        std::make_shared<LayerCreator<SplitLayer>>("Split"),
+        std::make_shared<LayerCreator<SplitLayer>>("Slice"),
+        std::make_shared<LayerCreator<ConcatLayer>>("Concat"),
+        std::make_shared<LayerCreator<EltwiseLayer>>("Eltwise"),
+        std::make_shared<LayerCreator<GemmLayer>>("Gemm"),
+        std::make_shared<LayerCreator<PadLayer>>("Pad"),
+        std::make_shared<LayerCreator<GatherLayer>>("Gather"),
+        std::make_shared<LayerCreator<ScaleShiftLayer>>("ScaleShift"),
+        std::make_shared<LayerCreator<PReLULayer>>("PReLU"),
+        std::make_shared<LayerCreator<CropLayer>>("Crop"),
+        std::make_shared<LayerCreator<ReshapeLayer>>("Reshape"),
+        std::make_shared<LayerCreator<ReshapeLayer>>("Flatten"),
+        std::make_shared<LayerCreator<TileLayer>>("Tile"),
         std::make_shared<ActivationLayerCreator>("Activation"),
-        std::make_shared<V2LayerCreator<BatchNormalizationLayer>>("BatchNormalization"),
+        std::make_shared<LayerCreator<BatchNormalizationLayer>>("BatchNormalization"),
         std::make_shared<TILayerCreator>("TensorIterator"),
+        std::make_shared<LayerCreator<LSTMCell>>("LSTMCell"),
+        std::make_shared<LayerCreator<RNNLayer>>("RNN"),
     };
     return creators;
 }
 
-void V2FormatParser::ParseStatisticSection(const pugi::xml_node& statNode) {
+void FormatParser::ParseStatisticSection(const pugi::xml_node& statNode) {
     auto splitParseCommas = [&](const string& s) ->vector<float> {
         vector<float> res;
         stringstream ss(s);
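
The gen_id helper introduced above replaces the removed details::stringFormat("%d.%d", ...) calls for building the _portsToData keys; it simply joins the two ids with a dot:

    // gen_id(3, 1) == "3.1"   (key for layer id 3, port id 1)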
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -43,6 +42,10 @@ struct LayerParseParameters {
     std::vector<LayerPortData> outputPorts;
     std::map<std::string, WeightSegment> blobs;
 
+    std::function<void(const TBlob<uint8_t>::Ptr &weights)> internalWeightSet;
+
+    int underIRVersion = 0;
+
     void addOutputPort(const LayerPortData &port);
     void addInputPort(const LayerPortData &port);
 };
@@ -64,21 +67,23 @@ public:
     }
 };
 
-class V2FormatParser : public IFormatParser {
+class INFERENCE_ENGINE_API_CLASS(FormatParser) : public IFormatParser {
 public:
-    explicit V2FormatParser(int version);
+    explicit FormatParser(int version);
 
     CNNNetworkImplPtr Parse(pugi::xml_node& root) override;
 
     Blob::Ptr GetBlobFromSegment(const TBlob<uint8_t>::Ptr& weights, const WeightSegment & weight_segment) const;
     void SetWeights(const TBlob<uint8_t>::Ptr& weights) override;
-    void CopyBlobsByName(void* layerParsePrms, std::string name) override;
     void ParseDims(SizeVector& dims, const pugi::xml_node &node) const;
+    const DataPtr& GetDataBy(int layer_id, int port_id) const;
+
+protected:
+    std::map<std::string, LayerParseParameters> layersParseInfo;
 
 private:
     int _version;
     Precision _defPrecision;
-    std::map<std::string, LayerParseParameters> layersParseInfo;
     std::map<std::string, DataPtr> _portsToData;
 
     CNNNetworkImplPtr _network;
diff --git a/inference-engine/src/inference_engine/ie_layer_parsers.cpp b/inference-engine/src/inference_engine/ie_layer_parsers.cpp
new file mode 100644 (file)
index 0000000..886c759
--- /dev/null
@@ -0,0 +1,252 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ie_layer_parsers.h"
+#include "ie_cnn_net_reader_impl.h"
+
+#include <string>
+#include <utility>
+#include <memory>
+#include <set>
+
+namespace InferenceEngine {
+namespace details {
+
+CNNLayer::Ptr ActivationLayerCreator::CreateLayer(pugi::xml_node& node, LayerParseParameters& layerParsePrms)  {
+    pugi::xml_node dn = GetChild(node, { "data", "activation_data" }, false);
+    if (dn.empty()) {
+        THROW_IE_EXCEPTION << "Activation layer has no data node";
+    }
+
+    std::string type;
+    for (auto ait = dn.attributes_begin(); ait != dn.attributes_end(); ++ait) {
+        pugi::xml_attribute attr = *ait;
+        if (CaselessEq<std::string>()("type", attr.name())) {
+            if (!type.empty()) {
+                THROW_IE_EXCEPTION << "Activation layer has multiple types";
+            }
+            type = attr.value();
+        }
+    }
+
+    static caseless_map<std::string, std::shared_ptr<BaseCreator>> activationCreators = {
+        {"relu", std::make_shared<LayerCreator<ReLULayer>>("ReLU")},
+        {"prelu", std::make_shared<LayerCreator<PReLULayer>>("PReLU")},
+        {"clamp", std::make_shared<LayerCreator<ClampLayer>>("Clamp")},
+        {"elu", std::make_shared<LayerCreator<CNNLayer>>("ELU")},
+        {"sigmoid", std::make_shared<LayerCreator<CNNLayer>>("Sigmoid")},
+        {"tanh", std::make_shared<LayerCreator<CNNLayer>>("TanH")},
+    };
+
+    auto activationBuilder = activationCreators.find(type);
+    if (activationBuilder == activationCreators.end()) {
+        THROW_IE_EXCEPTION << "Unsupported Activation layer type: " << type;
+    }
+
+    auto activation = activationBuilder->second->CreateLayer(node, layerParsePrms);
+
+    activation->type = activationBuilder->first;
+    activation->params.erase("type");
+
+    return activation;
+}
+
+/***********************************************************************************/
+/*******  Tensor Iterator parser  **************************************************/
+/***********************************************************************************/
+
+using PortInf = std::pair<int, int>;
+using PortSet = std::set<PortInf>;
+using PortMap = std::map<PortInf, DataPtr>;
+
+static PortSet allRequiredInputs(pugi::xml_node &ti) {
+    PortSet res;  // duplicates are possible
+
+    FOREACH_CHILD(p, ti.child("port_map"), "input") {
+        int internal_layer_id = GetIntAttr(p, "internal_layer_id");
+        int internal_port_id = GetIntAttr(p, "internal_port_id");
+        res.emplace(internal_layer_id, internal_port_id);
+    }
+    FOREACH_CHILD(ec, ti.child("back_edges"), "edge") {
+        int to_layer_id = GetIntAttr(ec, "to-layer");
+        int to_port_id = GetIntAttr(ec, "to-port");
+        res.emplace(to_layer_id, to_port_id);
+    }
+    return res;
+}
+
+static PortSet allRequiredOutputs(pugi::xml_node &ti) {
+    PortSet res;  // duplicates are possible
+
+    FOREACH_CHILD(p, ti.child("port_map"), "output") {
+        int internal_layer_id = GetIntAttr(p, "internal_layer_id");
+        int internal_port_id = GetIntAttr(p, "internal_port_id");
+        res.emplace(internal_layer_id, internal_port_id);
+    }
+    FOREACH_CHILD(edge, ti.child("back_edges"), "edge") {
+        int to_layer_id = GetIntAttr(edge, "from-layer");
+        int to_port_id = GetIntAttr(edge, "from-port");
+        res.emplace(to_layer_id, to_port_id);
+    }
+    return res;
+}
+
+/***********************************************************************************/
+/*******  Body Parser Helper  ******************************************************/
+/***********************************************************************************/
+using WBlob = TBlob<uint8_t>::Ptr;
+
+class BodyParser {
+public:
+    BodyParser(pugi::xml_node &net_node, int ir_version) :
+        body(net_node), parser(FormatParser(ir_version)) {}
+
+    void parse(PortSet in_request, PortSet out_request) {
+        auto net = parser.Parse(body);
+
+        for (const auto &pi : in_request)
+            inputs[pi] = parser.GetDataBy(pi.first, pi.second);
+        for (const auto &pi : out_request)
+            outputs[pi] = parser.GetDataBy(pi.first, pi.second);
+
+        // Mark data as network output. Just for check
+        for (const auto &kvp : outputs) {
+            auto &data = kvp.second;
+            auto layer = data->creatorLayer.lock();
+            auto &outs = layer->outData;
+            auto o_idx = std::find(outs.begin(), outs.end(), data) - outs.begin();
+            auto sts = net->addOutput(layer->name, o_idx, nullptr);
+            IE_ASSERT(sts == OK) << "TI body. Cannot add output port for layer "
+                                 << layer->name << " port index " << o_idx;
+        }
+
+        // Verify that all input/output are in use
+        InputsDataMap in_info_map;
+        std::map<std::string, DataPtr> out_info_map;
+        net->getInputsInfo(in_info_map);
+        net->getOutputsInfo(out_info_map);
+
+        IE_ASSERT(in_info_map.size() == inputs.size())   << "TI body. There are unlinked inputs";
+        IE_ASSERT(out_info_map.size() == outputs.size()) << "TI body. There are unlinked outputs";
+    }
+
+    void setWeights(const WBlob &weights) {
+        parser.SetWeights(weights);
+    }
+
+    const PortMap& getInsMap()  const { return inputs;  }
+    const PortMap& getOutsMap() const { return outputs; }
+
+private:
+    pugi::xml_node &body;
+    FormatParser parser;
+
+    PortMap inputs;
+    PortMap outputs;
+};
+
+CNNLayer::Ptr TILayerCreator::CreateLayer(pugi::xml_node& node, LayerParseParameters& layerParsePrms) {
+    std::string ti_name = node.attribute("name").as_string();
+
+    auto body = node.child("body");
+    if (body.empty())
+        THROW_IE_EXCEPTION << "TensorIterator " << ti_name << " has no body";
+
+    auto all_inputs = allRequiredInputs(node);
+    auto all_outputs = allRequiredOutputs(node);
+
+    auto parser = std::make_shared<BodyParser>(body, layerParsePrms.underIRVersion);
+    parser->parse(all_inputs, all_outputs);
+
+    auto ins = parser->getInsMap();
+    auto outs = parser->getOutsMap();
+
+    // fill in/outputs and map internal port to index
+    std::map<PortInf, int> p2i;
+    std::vector<DataPtr> inputs, outputs;
+    for (const auto &p : all_inputs) {
+        IE_ASSERT(ins.find(p) != ins.end());
+        p2i[p] = inputs.size();
+        inputs.push_back(ins[p]);
+    }
+    for (const auto &p : all_outputs) {
+        IE_ASSERT(outs.find(p) != outs.end());
+        p2i[p] = outputs.size();
+        outputs.push_back(outs[p]);
+    }
+
+    // fill map external port to index
+    std::map<int, int> e2i;
+    {
+        int in_indx = 0;
+        FOREACH_CHILD(in, node.child("input"), "port") {
+            int id = GetIntAttr(in, "id");
+            e2i[id] = in_indx++;
+        }
+        int out_indx = 0;
+        FOREACH_CHILD(in, node.child("output"), "port") {
+            int id = GetIntAttr(in, "id");
+            e2i[id] = out_indx++;
+        }
+    }
+
+    std::vector<TensorIterator::PortMap> in_ports_mapping, out_ports_mapping, back_edges;
+
+    auto parse_rule = [&] (pugi::xml_node &pm) {
+        int external_port_id  = GetIntAttr(pm, "external_port_id");
+        int internal_layer_id = GetIntAttr(pm, "internal_layer_id");
+        int internal_port_id  = GetIntAttr(pm, "internal_port_id");
+
+        int axis = GetIntAttr(pm, "axis", -1);
+        int stride = GetIntAttr(pm, "stride", 1);
+        int part_size = GetIntAttr(pm, "part_size", 1);
+        int start = GetIntAttr(pm, "start", 0);
+        int end = GetIntAttr(pm, "end", -1);
+
+        TensorIterator::PortMap res;
+        res.from = e2i[external_port_id];
+        res.to   = p2i[{internal_layer_id, internal_port_id}];
+        res.axis = axis;
+        res.stride    = stride;
+        res.part_size = part_size;
+        res.start     = start;
+        res.end       = end;
+        return res;
+    };
+
+    FOREACH_CHILD(pm, node.child("port_map"), "input") {
+        in_ports_mapping.push_back(parse_rule(pm));
+    }
+    FOREACH_CHILD(pm, node.child("port_map"), "output") {
+        out_ports_mapping.push_back(parse_rule(pm));
+    }
+
+    FOREACH_CHILD(ec, node.child("back_edges"), "edge") {
+        int from_l = GetIntAttr(ec, "from-layer");
+        int from_p = GetIntAttr(ec, "from-port");
+        int to_l = GetIntAttr(ec, "to-layer");
+        int to_p = GetIntAttr(ec, "to-port");
+
+        back_edges.push_back({ p2i[{from_l, from_p}], p2i[{to_l, to_p}],
+                               -1, 1, 0, -1, 1 });
+    }
+
+    // Hold parser as a shared_ptr into callback
+    // Will be called outside to set weight blobs
+    // for internal TI body layers
+    layerParsePrms.internalWeightSet = [=] (const WBlob &w) {
+        parser->setWeights(w);
+    };
+
+    auto res = std::make_shared<TensorIterator>(layerParsePrms.prms);
+    res->body.inputs = inputs;
+    res->body.outputs = outputs;
+    res->input_port_map = in_ports_mapping;
+    res->output_port_map = out_ports_mapping;
+    res->back_edges = back_edges;
+    return res;
+}
+
+}  // namespace details
+}  // namespace InferenceEngine
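
A worked example of the TensorIterator port re-indexing above, with made-up ids:

    // Hypothetical <port_map> entry:
    //   <input external_port_id="3" internal_layer_id="0" internal_port_id="1" axis="1"/>
    // If the TI node's port with id 3 received external index e2i[3] == 2 and the
    // body pair {0, 1} received internal index p2i[{0, 1}] == 0, parse_rule yields:
    //   TensorIterator::PortMap { from = 2, to = 0, axis = 1,
    //                             stride = 1, part_size = 1, start = 0, end = -1 }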
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -7,7 +6,7 @@
 
 #include <debug.h>
 #include <memory>
-#include "v2_format_parser.h"
+#include "ie_format_parser.h"
 #include "xml_parse_utils.h"
 #include "range_iterator.hpp"
 #include "details/caseless.hpp"
@@ -31,9 +30,9 @@ using namespace XMLParseUtils;
 namespace InferenceEngine {
 namespace details {
 template<class LT>
-class V2LayerCreator : public BaseCreator {
+class LayerCreator : public BaseCreator {
 public:
-    explicit V2LayerCreator(const std::string& type) : BaseCreator(type) {}
+    explicit LayerCreator(const std::string& type) : BaseCreator(type) {}
 
     CNNLayer::Ptr CreateLayer(pugi::xml_node& node, LayerParseParameters& layerParsePrms) override {
         auto res = std::make_shared<LT>(layerParsePrms.prms);
@@ -58,19 +57,7 @@ public:
             if (dn.child("crop").empty()) {
                 for (auto ait = dn.attributes_begin(); ait != dn.attributes_end(); ++ait) {
                     pugi::xml_attribute attr = *ait;
-                    if (attr.name() == dn.attribute("region").name()) {
-                        bool isSame = std::equal(null_terminated_string(attr.value()),
-                                                 null_terminated_string_end(),
-                                                 null_terminated_string("same"),
-                                                 [](char c1, char c2) {
-                                                     return std::tolower(c1) == c2;
-                                                 });
-                        bool var = attr.empty() || !isSame;
-
-                        res->params[attr.name()] = var == 0 ? "false" : "true";
-                    } else {
-                        res->params[attr.name()] = attr.value();
-                    }
+                    res->params.emplace(attr.name(), attr.value());
                 }
             } else {
                 if (std::is_same<LT, CropLayer>::value) {
index dde5f6d..b39a054 100644 (file)
@@ -1,9 +1,9 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "ie_layers.h"
+#include "ie_layers_prv.h"
 #include "ie_layer_validators.hpp"
 #include "debug.h"
 #include "xml_parse_utils.h"
 #include <map>
 #include <vector>
 #include <ie_iextension.h>
-#include <v2_format_parser.h>
+#include <ie_format_parser.h>
+
+#include <details/ie_exception.hpp>
+
+namespace InferenceEngine {
 
-using namespace InferenceEngine;
 using namespace details;
 using std::vector;
 
+template <typename T, typename P>
+inline bool one_of(T val, P item) { return val == item; }
+template <typename T, typename P, typename... Args>
+inline bool one_of(T val, P item, Args... item_others) {
+    return val == item || one_of(val, item_others...);
+}
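The variadic one_of helper above simply folds equality over its arguments; a usage sketch:

    #include <string>
    // true: "GRU" is among the candidates; the recursion terminates at the
    // single-item overload defined above.
    bool ok = one_of(std::string("GRU"), "LSTM", "RNN", "GRU");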
+
 void CNNLayer::validateLayer() {
-    LayerValidator::Ptr validator = LayerValidators::getInstance()->getValidator(type);
-    validator->parseParams(this);
-    validator->checkParams(this);
-    InOutDims shapes;
-    getInOutShapes(this, shapes);
-    validator->checkShapes(this, shapes.inDims);
+    try {
+        LayerValidator::Ptr validator = LayerValidators::getInstance()->getValidator(type);
+        validator->parseParams(this);
+        validator->checkParams(this);
+        InOutDims shapes;
+        getInOutShapes(this, shapes);
+        validator->checkShapes(this, shapes.inDims);
+    } catch (const InferenceEngineException& ie_e) {
+        THROW_IE_EXCEPTION << "Error validating layer: " << this->name
+                           << " with type: " << this->type << ". "
+                           << ie_e.what();
+    }
 }
 
 struct WeightableParams {
@@ -73,8 +89,7 @@ void checkWeightable(const std::map<std::string, Blob::Ptr>& blobs,
     OC = params.outputs;
 
     auto it = blobs.find("weights");
-    if (it !=
-        blobs.end()) {  // TODO: return with fixing shape infer tests: THROW_IE_EXCEPTION << "Invalid blobs: no weights";
+    if (it != blobs.end()) {  // TODO: return with fixing shape infer tests: THROW_IE_EXCEPTION << "Invalid blobs: no weights";
         auto weights = it->second;
         if (weights == nullptr || weights->dims().empty()) THROW_IE_EXCEPTION << "Weights can't be empty";
 
@@ -235,15 +250,20 @@ void ConvolutionValidator::parseParams(CNNLayer* layer) {
     if (!convLayer) {
         THROW_IE_EXCEPTION << "Layer is not instance of ConvolutionLayer class";
     }
-    auto version = BaseCreator::version_;
     convLayer->_out_depth = convLayer->GetParamAsUInt("output");
 
-    if (version < 3) {
-        convLayer->_kernel.clear();
+    convLayer->_kernel.clear();
+    convLayer->_stride.clear();
+    convLayer->_padding.clear();
+    convLayer->_pads_end.clear();
+    convLayer->_dilation.clear();
+
+    vector<unsigned int> kernels = convLayer->GetParamAsUInts("kernel", {});
+    if (kernels.empty()) {
+        // IR_v == 2
         convLayer->_kernel.insert(X_AXIS, convLayer->GetParamAsUInt("kernel-x"));
         convLayer->_kernel.insert(Y_AXIS, convLayer->GetParamAsUInt("kernel-y"));
 
-        convLayer->_stride.clear();
         convLayer->_stride.insert(X_AXIS, convLayer->GetParamAsUInt("stride-x", 1u));
         convLayer->_stride.insert(Y_AXIS, convLayer->GetParamAsUInt("stride-y", 1u));
         // TODO: maybe just throw exception, why do we change IR?
@@ -256,25 +276,16 @@ void ConvolutionValidator::parseParams(CNNLayer* layer) {
             LogError("Warning! in layer %s: Stride y is 0, setting to 1", convLayer->name.c_str());
         }
 
-        convLayer->_padding.clear();
         convLayer->_padding.insert(X_AXIS, convLayer->GetParamAsUInt("pad-x", 0u));
         convLayer->_padding.insert(Y_AXIS, convLayer->GetParamAsUInt("pad-y", 0u));
 
-        convLayer->_pads_end.clear();
         convLayer->_pads_end.insert(X_AXIS, convLayer->GetParamAsUInt("pad-r", convLayer->_padding[X_AXIS]));
         convLayer->_pads_end.insert(Y_AXIS, convLayer->GetParamAsUInt("pad-b", convLayer->_padding[Y_AXIS]));
 
-        convLayer->_dilation.clear();
         convLayer->_dilation.insert(X_AXIS, convLayer->GetParamAsUInt("dilation-x", 1u));
         convLayer->_dilation.insert(Y_AXIS, convLayer->GetParamAsUInt("dilation-y", 1u));
-
-        // TODO: checks for presence of all required attributes, and that there's no extraneous parameters only.
-    } else if (version == 3) {
-        vector<unsigned int> kernels = convLayer->GetParamAsUInts("kernel");
-        if (kernels.empty()) {
-            THROW_IE_EXCEPTION << "Invalid kernel field in layer " << convLayer->name;
-        }
-        convLayer->_kernel.clear();
+    } else {
+        // IR_v > 2
         for (int i = 1; i <= kernels.size(); i++) {
             convLayer->_kernel.insert(i - 1, kernels[kernels.size() - i]);
         }
@@ -283,7 +294,6 @@ void ConvolutionValidator::parseParams(CNNLayer* layer) {
         vector<unsigned int> default_1 = vector<unsigned int> (convLayer->_kernel.size(), 1u);
 
         vector<unsigned int> strides = convLayer->GetParamAsUInts("strides", default_1);
-        convLayer->_stride.clear();
         for (int i = 1; i <= strides.size(); i++) {
             if (strides[strides.size() - i] == 0) {
                 THROW_IE_EXCEPTION << "Stride could not be 0.\nIn layer " << convLayer->name;
@@ -292,24 +302,22 @@ void ConvolutionValidator::parseParams(CNNLayer* layer) {
         }
 
         vector<unsigned int> pads_begin = convLayer->GetParamAsUInts("pads_begin", default_0);
-        convLayer->_padding.clear();
         for (int i = 1; i <= pads_begin.size(); i++) {
             convLayer->_padding.insert(i - 1, pads_begin[pads_begin.size() - i]);
         }
 
-        vector<unsigned int> pads_end = convLayer->GetParamAsUInts("pads_end", default_0);
-        convLayer->_pads_end.clear();
+        vector<unsigned int> pads_end = convLayer->GetParamAsUInts("pads_end", pads_begin);
         for (int i = 1; i <= pads_end.size(); i++) {
             convLayer->_pads_end.insert(i - 1, pads_end[pads_end.size() - i]);
         }
 
         vector<unsigned int> dilations = convLayer->GetParamAsUInts("dilations", default_1);
-        convLayer->_dilation.clear();
         for (int i = 1; i <= dilations.size(); i++) {
             convLayer->_dilation.insert(i - 1, dilations[dilations.size() - i]);
         }
     }
 
+    convLayer->_auto_pad = convLayer->GetParamAsString("auto_pad", "");
     convLayer->_group = convLayer->GetParamAsUInt("group", 1u);
 }
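The parsing loops above insert attribute values in reverse because the IR lists spatial dimensions outermost-first (..., Y, X), while PropertyVector index 0 is treated as the X axis. A standalone sketch of that reversal (helper name is illustrative):

    #include <vector>
    std::vector<unsigned> reverseToXYOrder(const std::vector<unsigned>& ir) {
        std::vector<unsigned> xy(ir.size());
        for (size_t i = 1; i <= ir.size(); i++)
            xy[i - 1] = ir[ir.size() - i];  // xy[0] == X, xy[1] == Y, ...
        return xy;
    }
    // reverseToXYOrder({3, 5}) -> {5, 3}: "kernel" = "3,5" means Y=3, X=5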
 
@@ -340,90 +348,14 @@ void DeconvolutionValidator::parseParams(CNNLayer* layer) {
     if (!deconvLayer) {
         THROW_IE_EXCEPTION << "Layer is not instance of DeconvolutionLayer class";
     }
-
-    auto version = BaseCreator::version_;
-
-    deconvLayer->_out_depth = deconvLayer->GetParamAsUInt("output");
-
-    if (version < 3) {
-        deconvLayer->_kernel.clear();
-        deconvLayer->_kernel.insert(X_AXIS, deconvLayer->GetParamAsUInt("kernel-x"));
-        deconvLayer->_kernel.insert(Y_AXIS, deconvLayer->GetParamAsUInt("kernel-y"));
-
-        deconvLayer->_stride.clear();
-        deconvLayer->_stride.insert(X_AXIS, deconvLayer->GetParamAsUInt("stride-x", 1u));
-        deconvLayer->_stride.insert(Y_AXIS, deconvLayer->GetParamAsUInt("stride-y", 1u));
-        // TODO: maybe just throw exception, why do we change IR?
-        if (0 == deconvLayer->_stride[X_AXIS]) {
-            deconvLayer->_stride[X_AXIS] = 1u;
-            LogError("Warning! in layer %s: Stride x is 0, setting to 1 ", deconvLayer->name.c_str());
-        }
-        if (0 == deconvLayer->_stride[Y_AXIS]) {
-            deconvLayer->_stride[Y_AXIS] = 1u;
-            LogError("Warning! in layer %s: Stride y is 0, setting to 1", deconvLayer->name.c_str());
-        }
-
-        deconvLayer->_padding.clear();
-        deconvLayer->_padding.insert(X_AXIS, deconvLayer->GetParamAsUInt("pad-x", 0u));
-        deconvLayer->_padding.insert(Y_AXIS, deconvLayer->GetParamAsUInt("pad-y", 0u));
-
-        deconvLayer->_pads_end.clear();
-        deconvLayer->_pads_end.insert(X_AXIS, deconvLayer->GetParamAsUInt("pad-r", deconvLayer->_padding[X_AXIS]));
-        deconvLayer->_pads_end.insert(Y_AXIS, deconvLayer->GetParamAsUInt("pad-b", deconvLayer->_padding[Y_AXIS]));
-
-        deconvLayer->_dilation.clear();
-        deconvLayer->_dilation.insert(X_AXIS, deconvLayer->GetParamAsUInt("dilation-x", 1u));
-        deconvLayer->_dilation.insert(Y_AXIS, deconvLayer->GetParamAsUInt("dilation-y", 1u));
-    } else if (version == 3) {
-        vector<unsigned int> kernels = deconvLayer->GetParamAsUInts("kernel");
-        if (kernels.empty()) {
-            THROW_IE_EXCEPTION << "Invalid kernel field in layer " << deconvLayer->name;
-        }
-        deconvLayer->_kernel.clear();
-        for (int i = 1; i <= kernels.size(); i++) {
-            deconvLayer->_kernel.insert(i - 1, kernels[kernels.size() - i]);
-        }
-
-        vector<unsigned int> default_0 = vector<unsigned int> (deconvLayer->_kernel.size(), 0u);
-        vector<unsigned int> default_1 = vector<unsigned int> (deconvLayer->_kernel.size(), 1u);
-
-        vector<unsigned int> strides = deconvLayer->GetParamAsUInts("strides", default_1);
-        deconvLayer->_stride.clear();
-        for (int i = 1; i <= strides.size(); i++) {
-            if (strides[strides.size() - i] == 0) {
-                THROW_IE_EXCEPTION << "Stride could not be 0.\nIn layer " << deconvLayer->name;
-            }
-            deconvLayer->_stride.insert(i - 1, strides[strides.size() - i]);
-        }
-
-        vector<unsigned int> pads_begin = deconvLayer->GetParamAsUInts("pads_begin", default_0);
-        deconvLayer->_padding.clear();
-        for (int i = 1; i <= pads_begin.size(); i++) {
-            deconvLayer->_padding.insert(i - 1, pads_begin[pads_begin.size() - i]);
-        }
-
-        vector<unsigned int> pads_end = deconvLayer->GetParamAsUInts("pads_end", default_0);
-        deconvLayer->_pads_end.clear();
-        for (int i = 1; i <= pads_end.size(); i++) {
-            deconvLayer->_pads_end.insert(i - 1, pads_end[pads_end.size() - i]);
-        }
-
-        vector<unsigned int> dilations = deconvLayer->GetParamAsUInts("dilations", default_1);
-        deconvLayer->_dilation.clear();
-        for (int i = 1; i <= dilations.size(); i++) {
-            deconvLayer->_dilation.insert(i - 1, dilations[dilations.size() - i]);
-        }
-    }
-
-    deconvLayer->_group = deconvLayer->GetParamAsUInt("group", 1u);
+    ConvolutionValidator::parseParams(layer);
 }
 
 void DeconvolutionValidator::checkParams(const CNNLayer* layer) {
     LayerValidator::checkParams(layer);
 }
 
-DeconvolutionValidator::DeconvolutionValidator(const std::string& _type) : LayerValidator(_type) {}
-
+DeconvolutionValidator::DeconvolutionValidator(const std::string& _type) : ConvolutionValidator(_type) {}
 
 void DeconvolutionValidator::checkCorrespondence(const CNNLayer* layer,
                                                  const std::map<std::string, Blob::Ptr>& blobs,
@@ -442,8 +374,15 @@ void PoolingValidator::parseParams(CNNLayer* layer) {
         THROW_IE_EXCEPTION << "Layer is not instance of PoolingLayer class";
     }
 
-    auto version = BaseCreator::version_;
-    if (version < 3) {
+    poolLayer->_kernel.clear();
+    poolLayer->_stride.clear();
+    poolLayer->_padding.clear();
+    poolLayer->_pads_end.clear();
+
+    poolLayer->_auto_pad = poolLayer->GetParamAsString("auto_pad", "");
+
+    vector<unsigned int> kernels = poolLayer->GetParamAsUInts("kernel", {});
+    if (kernels.empty()) {
         int kernel_x = poolLayer->GetParamAsInt("kernel-x", -1);
         /** Pooling as custom layer */
         if (kernel_x == -1) {
@@ -451,14 +390,12 @@ void PoolingValidator::parseParams(CNNLayer* layer) {
                 unsigned int kernel_size = poolLayer->GetParamAsUInt("kernel_size");
                 unsigned int kernel_w = poolLayer->GetParamAsUInt("kernel_w", 0u);
                 unsigned int kernel_h = poolLayer->GetParamAsUInt("kernel_h", 0u);
-                poolLayer->_kernel.clear();
                 poolLayer->_kernel.insert(X_AXIS, kernel_w == 0u ? kernel_size : kernel_w);
                 poolLayer->_kernel.insert(Y_AXIS, kernel_h == 0u ? kernel_size : kernel_h);
 
                 unsigned int stride = poolLayer->GetParamAsUInt("stride", 1u);
                 unsigned int stride_w = poolLayer->GetParamAsUInt("stride_w", 0u);
                 unsigned int stride_h = poolLayer->GetParamAsUInt("stride_h", 0u);
-                poolLayer->_stride.clear();
                 poolLayer->_stride.insert(X_AXIS, stride_w == 0u ? stride : stride_w);
                 poolLayer->_stride.insert(Y_AXIS, stride_h == 0u ? stride : stride_h);
 
@@ -466,11 +403,9 @@ void PoolingValidator::parseParams(CNNLayer* layer) {
                 unsigned int pad_w = poolLayer->GetParamAsUInt("pad_w", 0u);
                 unsigned int pad_h = poolLayer->GetParamAsUInt("pad_h", 0u);
 
-                poolLayer->_padding.clear();
                 poolLayer->_padding.insert(X_AXIS, pad_w == 0u ? pad : pad_w);
                 poolLayer->_padding.insert(Y_AXIS, pad_h == 0u ? pad : pad_h);
 
-                poolLayer->_pads_end.clear();
                 poolLayer->_pads_end.insert(X_AXIS, 0u);
                 poolLayer->_pads_end.insert(Y_AXIS, 0u);
             } catch (...) {
@@ -479,11 +414,9 @@ void PoolingValidator::parseParams(CNNLayer* layer) {
             std::string alg = poolLayer->GetParamAsString("pool", "caffe.PoolingParameter.MAX");
             poolLayer->_type = alg == "caffe.PoolingParameter.MAX" ? PoolingLayer::MAX : PoolingLayer::AVG;
         } else  /** Default behavior */ {
-            poolLayer->_kernel.clear();
             poolLayer->_kernel.insert(X_AXIS, poolLayer->GetParamAsUInt("kernel-x"));
             poolLayer->_kernel.insert(Y_AXIS, poolLayer->GetParamAsUInt("kernel-y"));
 
-            poolLayer->_stride.clear();
             poolLayer->_stride.insert(X_AXIS, poolLayer->GetParamAsUInt("stride-x", 1u));
             poolLayer->_stride.insert(Y_AXIS, poolLayer->GetParamAsUInt("stride-y", 1u));
             // TODO: maybe just throw exception, why do we change IR?
@@ -496,11 +429,9 @@ void PoolingValidator::parseParams(CNNLayer* layer) {
                 LogError("Warning! in layer %s: Stride y is 0, setting to 1", poolLayer->name.c_str());
             }
 
-            poolLayer->_padding.clear();
             poolLayer->_padding.insert(X_AXIS, poolLayer->GetParamAsUInt("pad-x", 0u));
             poolLayer->_padding.insert(Y_AXIS, poolLayer->GetParamAsUInt("pad-y", 0u));
 
-            poolLayer->_pads_end.clear();
             poolLayer->_pads_end.insert(X_AXIS, poolLayer->GetParamAsUInt("pad-r", poolLayer->_padding[X_AXIS]));
             poolLayer->_pads_end.insert(Y_AXIS, poolLayer->GetParamAsUInt("pad-b", poolLayer->_padding[Y_AXIS]));
 
@@ -509,15 +440,10 @@ void PoolingValidator::parseParams(CNNLayer* layer) {
             std::string alg = poolLayer->GetParamAsString("pool-method", "max");
             poolLayer->_type = alg == "avg" ? PoolingLayer::AVG : PoolingLayer::MAX;
             if (alg != "max" && alg != "avg") {
-                THROW_IE_EXCEPTION << "Layer with type `" << _type << "` has incorrect pad-type!";
+                THROW_IE_EXCEPTION << "Layer with type `" << _type << "` has incorrect pool-type!";
             }
         }
-    } else if (version == 3) {
-        vector<unsigned int> kernels = poolLayer->GetParamAsUInts("kernel");
-        if (kernels.empty()) {
-            THROW_IE_EXCEPTION << "Invalid kernel field in layer " << poolLayer->name;
-        }
-        poolLayer->_kernel.clear();
+    } else {
         for (int i = 1; i <= kernels.size(); i++) {
             poolLayer->_kernel.insert(i - 1, kernels[kernels.size() - i]);
         }
@@ -526,7 +452,6 @@ void PoolingValidator::parseParams(CNNLayer* layer) {
         vector<unsigned int> default_1 = vector<unsigned int> (poolLayer->_kernel.size(), 1u);
 
         vector<unsigned int> strides = poolLayer->GetParamAsUInts("strides", default_1);
-        poolLayer->_stride.clear();
         for (int i = 1; i <= strides.size(); i++) {
             if (strides[strides.size() - i] == 0) {
                 THROW_IE_EXCEPTION << "Stride could not be 0.\nIn layer " << poolLayer->name;
@@ -535,13 +460,11 @@ void PoolingValidator::parseParams(CNNLayer* layer) {
         }
 
         vector<unsigned int> pads_begin = poolLayer->GetParamAsUInts("pads_begin", default_0);
-        poolLayer->_padding.clear();
         for (int i = 1; i <= pads_begin.size(); i++) {
             poolLayer->_padding.insert(i - 1, pads_begin[pads_begin.size() - i]);
         }
 
-        vector<unsigned int> pads_end = poolLayer->GetParamAsUInts("pads_end", default_0);
-        poolLayer->_pads_end.clear();
+        vector<unsigned int> pads_end = poolLayer->GetParamAsUInts("pads_end", pads_begin);
         for (int i = 1; i <= pads_end.size(); i++) {
             poolLayer->_pads_end.insert(i - 1, pads_end[pads_end.size() - i]);
         }
@@ -637,72 +560,34 @@ TileValidator::TileValidator(const std::string& _type) : LayerValidator(_type) {
 
 ReshapeValidator::ReshapeValidator(const std::string& _type) : LayerValidator(_type) {}
 
-void ReshapeValidator::parseParams(CNNLayer* layer) {
-    auto casted = dynamic_cast<ReshapeLayer*>(layer);
+void ReshapeValidator::parseParams(CNNLayer *layer) {
+    auto casted = dynamic_cast<ReshapeLayer *>(layer);
     if (!casted) {
         THROW_IE_EXCEPTION << "Layer is not instance of ReshapeLayer class";
     }
-    try {
-        if (!casted->params.empty()) {
-            casted->num_axes = casted->GetParamAsInt(casted->type == "Flatten" ? "end_axis" : "num_axes", -1);
-            casted->axis = casted->GetParamAsInt("axis", 1);
+    casted->shape.clear();
+    if (!casted->params.empty()) {
+        if (casted->type == "Flatten") {
+            casted->num_axes = casted->GetParamAsInt("end_axis", -1);
+            casted->axis = casted->GetParamAsInt("axis", 0);
+        } else {
             casted->shape = casted->GetParamAsInts("dim", {});
-            calculateIn2Out(casted);
         }
-    } catch (...) {}
-}
-
-void ReshapeValidator::checkParams(const CNNLayer* layer) {
-    LayerValidator::checkParams(layer);
+    }
 }
 
-void ReshapeValidator::calculateIn2Out(ReshapeLayer* layer) {
-    if (layer->outData.empty() || layer->insData.empty())
-        return;
-
-    if (!layer->shape.empty() && std::find(layer->shape.begin(), layer->shape.end(), 0) != layer->shape.end())
-        return;
-
-    SizeVector inDims = layer->input()->getTensorDesc().getDims();
-    SizeVector outDims = layer->outData[0]->getTensorDesc().getDims();
-
-    vector<size_t> inMapped;
-    vector<size_t> outMapped;
-    for (size_t i = 0; i < inDims.size(); i++) {
-        bool mapped = false;
-        inMapped.push_back(i);
-        for (size_t j = 0; !mapped && j < outDims.size(); j++) {
-            if (outDims[j] == inDims[i] && std::find(outMapped.begin(), outMapped.end(), j) == outMapped.end()) {
-                outMapped.push_back(j);
-                mapped = true;
-            }
-        }
-
-        for (size_t j = 1; !mapped && j <= outDims.size(); j++) {
-            if (outDims[outDims.size() - j] != inDims[i] && (outDims[outDims.size() - j] % inDims[i] == 0)) {
-                outMapped.push_back(outDims.size() - j);
-                mapped = true;
-            }
-        }
-        if (!mapped) {
-            size_t outIndex = outDims.size() - 1;
-            for (size_t k = 0; k < layer->shape.size(); k++) {
-                if (layer->shape[k] < 0) {
-                    outIndex = k;
-                    break;
-                }
-            }
-            outMapped.push_back(outIndex);
-        }
-    }
-    std::string mapped_params;
-    for (size_t i = 0; i < inMapped.size(); i++) {
-        if (!mapped_params.empty())
-            mapped_params += ",";
-        mapped_params += std::to_string(inMapped[i]) + "-" + std::to_string(outMapped[i]);
+void ReshapeValidator::checkParams(const CNNLayer *layer) {
+    auto casted = dynamic_cast<const ReshapeLayer *>(layer);
+    if (!casted)
+        THROW_IE_EXCEPTION << "Layer is not instance of ReshapeLayer class";
+    size_t num = 0;
+    for (int dim : casted->shape) {
+        if (dim < -1)
+            THROW_IE_EXCEPTION << "Invalid value of Reshape mask (dim attribute):" << dim
+                               << ". Supported values: 0, -1, >0";
+        if (dim == -1) num++;
     }
-
-    layer->params["in2out"] = mapped_params;
+    if (num > 1) THROW_IE_EXCEPTION << "Invalid Reshape mask (dim attribute): at most one dimension can be `-1`";
 }
 
 void EltwiseValidator::parseParams(CNNLayer* layer) {
@@ -825,7 +710,7 @@ void NormValidator::parseParams(CNNLayer* layer) {
     casted->_k = casted->GetParamAsUInt("k", 1);
     casted->_alpha = casted->GetParamAsFloat("alpha");
     casted->_beta = casted->GetParamAsFloat("beta");
-    casted->_isAcrossMaps = casted->GetParamsAsBool("region", false);
+    casted->_isAcrossMaps = CaselessEq<std::string>()(casted->GetParamAsString("region"), "across");
 }
 
 void NormValidator::checkParams(const CNNLayer* layer) {
@@ -847,14 +732,42 @@ void SplitValidator::parseParams(CNNLayer* layer) {
     for (auto& i : layer->outData) {
         if (!out_sizes.empty())
             out_sizes += ",";
+        if (static_cast<int>(i->getTensorDesc().getDims().size()) <= casted->_axis) {
+            THROW_IE_EXCEPTION << "Internal error - dimensions are emtpy";
+        }
         out_sizes += std::to_string(i->getTensorDesc().getDims()[casted->_axis]);
     }
     if (!out_sizes.empty())
         casted->params["out_sizes"] = out_sizes;
 }
 
+void checkNumOfInput(const std::vector<SizeVector>& inShapes, const vector<int>& expected_num_of_shapes) {
+    bool shape_was_found = false;
+    for (const auto& i : expected_num_of_shapes) {
+        if (inShapes.size() == i) {
+            shape_was_found = true;
+        }
+    }
+    if (!shape_was_found) {
+        THROW_IE_EXCEPTION << "Number of inputs (" << inShapes.size() << ") is not equal to expected ones";
+    }
+}
+
 void SplitValidator::checkParams(const CNNLayer* layer) {
     LayerValidator::checkParams(layer);
+    std::vector<int> out_sizes = layer->GetParamAsInts("out_sizes", {});
+    if (out_sizes.empty()) {
+        THROW_IE_EXCEPTION << "Value of out_sizes attribute is empty";
+    }
+}
+
+void SplitValidator::checkShapes(const CNNLayer* layer, const std::vector<SizeVector>& inShapes) const {
+    auto casted = dynamic_cast<const SplitLayer*>(layer);
+    if (!casted) {
+        THROW_IE_EXCEPTION << "Layer is not instance of SplitLayer class";
+    }
+    checkNumOfInput(inShapes, {1});
 }
 
 ConcatValidator::ConcatValidator(const std::string& _type) : LayerValidator(_type) {}
@@ -870,3 +783,241 @@ void ConcatValidator::parseParams(CNNLayer* layer) {
 void ConcatValidator::checkParams(const CNNLayer* layer) {
     LayerValidator::checkParams(layer);
 }
+
+void ConcatValidator::checkShapes(const CNNLayer* layer,
+                                  const std::vector<SizeVector>& inShapes) const {
+    if (inShapes.empty())
+        THROW_IE_EXCEPTION << "Inputs are empty";
+
+    auto casted = dynamic_cast<const ConcatLayer*>(layer);
+    if (!casted) {
+        THROW_IE_EXCEPTION << "Invalid Concat layer.";
+    }
+
+    auto firstShape = inShapes[0];
+    size_t firstShapeSize = firstShape.size();
+    size_t axis = casted->_axis;
+    if (axis >= firstShapeSize)
+        THROW_IE_EXCEPTION << "Concat axis(" << axis
+                           << ") should be less the number of current input dimensions ("
+                           << firstShapeSize << ")";
+
+    for (size_t i = 1; i < inShapes.size(); i++) {
+        auto shape = inShapes[i];
+        if (shape.size() != firstShapeSize)
+            THROW_IE_EXCEPTION << "Invalid inputs for Concat layer: number of dimensions must match: "
+                               << firstShapeSize << " vs " << shape.size();
+        bool eq_part1 = std::equal(firstShape.begin(), firstShape.begin() + axis,
+                                   shape.begin());
+        bool eq_part2 = std::equal(firstShape.begin() + axis + 1, firstShape.end(),
+                                   shape.begin() + axis + 1);
+        if (!(eq_part1 && eq_part2))
+            THROW_IE_EXCEPTION << "Invalid inputs for Concat layer: dimensions should match in all"
+                               << "positions except axis (" << axis << ") one"
+                               << ") should match : [" << dumpVec(firstShape) << "] vs ["
+                               << dumpVec(shape) <<"]";
+    }
+}
+
+GemmValidator::GemmValidator(const std::string& _type) : LayerValidator(_type) {}
+
+void GemmValidator::parseParams(CNNLayer* layer) {
+    auto casted = dynamic_cast<GemmLayer*>(layer);
+    if (!casted) {
+        THROW_IE_EXCEPTION << "Layer is not instance of GemmLayer class";
+    }
+    casted->alpha = casted->GetParamAsFloat("alpha", 1);
+    casted->beta = casted->GetParamAsFloat("beta", 1);
+    casted->transpose_a = casted->GetParamsAsBool("transpose_a", false);
+    casted->transpose_b = casted->GetParamsAsBool("transpose_b", false);
+}
+
+void GemmValidator::checkParams(const CNNLayer* layer) {
+    LayerValidator::checkParams(layer);
+}
+
+void GemmValidator::checkShapes(const CNNLayer* layer, const vector<SizeVector>& inShapes) const {
+    auto casted = dynamic_cast<const GemmLayer*>(layer);
+    if (!casted) {
+        THROW_IE_EXCEPTION << "Layer is not instance of GemmLayer class";
+    }
+
+    size_t numInputs = inShapes.size();
+    if (numInputs != 2 && numInputs != 3)
+        THROW_IE_EXCEPTION << "Gemm can take only 2 or 3 inputs, but actually it has: " << numInputs;
+
+    auto dims0 = inShapes[0];
+    auto dims1 = inShapes[1];
+    if (dims0.size() < 2 || dims1.size() < 2) {
+        THROW_IE_EXCEPTION << "Gemm input shapes must have at least 2 dimensions";
+    }
+
+    unsigned long xAxis = dims0.size() - 1;
+    unsigned long yAxis = dims0.size() - 2;
+    if (dims0[xAxis] != dims1[yAxis])
+        THROW_IE_EXCEPTION << "Gemm input0 x dimension must be equal to input1 y dimension ("
+                           << dims0[xAxis] << " vs " << dims1[yAxis] << ")";
+
+    if (inShapes.size() == 3) {
+        auto dims2 = inShapes[2];
+        if (dims2.size() < 2) {
+            THROW_IE_EXCEPTION << "Gemm input shapes must have at least 2 dimensions";
+        }
+
+        if (dims2[xAxis] != dims1[xAxis])
+            THROW_IE_EXCEPTION << "Gemm input2 x dimension must be equal to input1 x dimension ("
+                               << dims2[xAxis] << " vs " << dims1[xAxis] << ")";
+
+        if (dims2[yAxis] != dims0[yAxis])
+            THROW_IE_EXCEPTION << "Gemm input2 y dimension must be equal to input0 y dimension ("
+                               << dims2[yAxis] << " vs " << dims0[yAxis] << ")";
+    }
+}
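A worked example of the shape checks above, with illustrative numbers only:

    #include <cassert>
    #include <vector>
    void gemmShapeExample() {
        std::vector<size_t> dims0{2, 3, 4}, dims1{2, 4, 5}, dims2{2, 3, 5};
        size_t xAxis = dims0.size() - 1, yAxis = dims0.size() - 2;
        assert(dims0[xAxis] == dims1[yAxis]);  // inner dims match: 4 == 4
        assert(dims2[xAxis] == dims1[xAxis]);  // third input cols: 5 == 5
        assert(dims2[yAxis] == dims0[yAxis]);  // third input rows: 3 == 3
    }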
+
+PadValidator::PadValidator(const std::string& _type) : LayerValidator(_type) {}
+
+void PadValidator::parseParams(CNNLayer* layer) {
+    auto casted = dynamic_cast<PadLayer*>(layer);
+    if (!casted) {
+        THROW_IE_EXCEPTION << "Layer is not instance of PadLayer class";
+    }
+    std::vector<uint32_t> pads_begin = casted->GetParamAsUInts("pads_begin");
+    std::vector<uint32_t> pads_end = casted->GetParamAsUInts("pads_end");
+
+    casted->pads_begin.clear();
+    for (size_t i = 0; i < pads_begin.size(); i++) {
+        casted->pads_begin.insert(i, pads_begin[i]);
+    }
+
+    casted->pads_end.clear();
+    for (size_t i = 0; i < pads_end.size(); i++) {
+        casted->pads_end.insert(i, pads_end[i]);
+    }
+
+    casted->pad_value = casted->GetParamAsFloat("pad_value", 0.0f);
+
+    std::string mode = casted->GetParamAsString("pad_mode", "constant");
+    if (mode == "constant") {
+        casted->pad_mode = PadLayer::Constant;
+    } else if (mode == "edge") {
+        casted->pad_mode = PadLayer::Edge;
+    } else if (mode == "reflect") {
+        casted->pad_mode = PadLayer::Reflect;
+    } else if (mode == "symmetric") {
+        casted->pad_mode = PadLayer::Symmetric;
+    } else {
+        THROW_IE_EXCEPTION << "Unsupported pad mode operation: " << mode;
+    }
+}
+
+void PadValidator::checkParams(const CNNLayer* layer) {
+    LayerValidator::checkParams(layer);
+}
+
+void PadValidator::checkShapes(const CNNLayer* layer, const vector<SizeVector>& inShapes) const {
+    auto casted = dynamic_cast<const PadLayer*>(layer);
+    if (!casted) {
+        THROW_IE_EXCEPTION << "Layer is not instance of PadLayer class";
+    }
+
+    size_t numInputs = inShapes.size();
+    if (numInputs != 1)
+        THROW_IE_EXCEPTION << "Pad can take only 1 input, but actually it has: " << numInputs;
+
+    if (inShapes[0].size() != casted->pads_begin.size())
+        THROW_IE_EXCEPTION << "Dimensions count mismatch in layer " << layer->name
+                           << ". Expected: " << casted->pads_begin.size() << " Got: " << inShapes[0].size();
+
+    if (inShapes[0].size() != casted->pads_end.size())
+        THROW_IE_EXCEPTION << "Dimensions count mismatch in layer " << layer->name
+                           << ". Expected: " << casted->pads_end.size() << " Got: " << inShapes[0].size();
+
+    if (casted->pad_mode == PadLayer::Symmetric || casted->pad_mode == PadLayer::Reflect) {
+        for (size_t i = 0; i < inShapes[0].size(); i++) {
+            if (inShapes[0][i] < casted->pads_begin[i]) {
+                THROW_IE_EXCEPTION << "Pad can't be grater than input shape in symmetric and reflect modes."
+                                   << " For dimension " << i << " pad_begin=" << casted->pads_begin[i]
+                                   << " in_shape="<< inShapes[0][i];
+            }
+            if (inShapes[0][i] < casted->pads_end[i]) {
+                THROW_IE_EXCEPTION << "Pad can't be grater than input shape in symmetric and reflect modes."
+                                   << " For dimension " << i << " pad_end=" << casted->pads_end[i]
+                                   << " in_shape="<< inShapes[0][i];
+            }
+        }
+    }
+}
+
+GatherValidator::GatherValidator(const std::string& _type) : LayerValidator(_type) {}
+
+void GatherValidator::parseParams(CNNLayer* layer) {
+    auto casted = dynamic_cast<GatherLayer*>(layer);
+    if (!casted) {
+        THROW_IE_EXCEPTION << "Layer is not instance of GatherLayer class";
+    }
+
+    casted->axis = casted->GetParamAsInt("axis", 0);
+}
+
+void GatherValidator::checkParams(const CNNLayer* layer) {
+    LayerValidator::checkParams(layer);
+}
+
+void GatherValidator::checkShapes(const CNNLayer* layer, const vector<SizeVector>& inShapes) const {
+    auto casted = dynamic_cast<const GatherLayer*>(layer);
+    if (!casted) {
+        THROW_IE_EXCEPTION << "Layer is not instance of GatherLayer class";
+    }
+
+    size_t numInputs = inShapes.size();
+    if (numInputs != 2)
+        THROW_IE_EXCEPTION << "Gather can take only 2 inputs, but actually it has: " << numInputs;
+
+    if (casted->axis > 0 && (inShapes[0].size() - casted->axis) < 1)
+        THROW_IE_EXCEPTION << "Incorrect input dictionary dimensions " << inShapes[0].size()
+                           << " and axis number " << casted->axis;
+    else if (casted->axis < 0 && (static_cast<int>(inShapes[0].size()) + casted->axis) < 0)
+        THROW_IE_EXCEPTION << "Incorrect input dictionary dimensions " << inShapes[0].size()
+                           << " and axis number " << casted->axis;
+}
+
+RNNValidator::RNNValidator(const std::string& _type) : LayerValidator(_type) {}
+
+void RNNValidator::parseParams(CNNLayer* layer) {
+    auto casted = dynamic_cast<RNNLayer*>(layer);
+    if (!casted)
+        THROW_IE_EXCEPTION << "Layer is not instance of RNNLayer class";
+
+    std::string cell = layer->GetParamAsString("cell_type");
+    std::string direction = layer->GetParamAsString("direction", "Forward");
+    int axis = layer->GetParamAsInt("axis", 1);
+
+    if (!one_of(cell, "LSTM", "RNN", "GRU"))
+        THROW_IE_EXCEPTION << "Unknown RNN cell type " << cell << ". "
+                           << "Expected one of [ LSTM | RNN | GRU ].";
+
+    if (!one_of(direction, "Forward", "Backward", "Bidirectional"))
+        THROW_IE_EXCEPTION << "Unknown RNN direction type " << direction << ". "
+                           << "Expected one of [ Forward | Backward | Bidirectional ].";
+
+    casted->axis = axis;
+    casted->cellType = cell;
+    casted->direction = direction == "Forward"  ? RNNLayer::RNN_FWD :
+                        direction == "Backward" ? RNNLayer::RNN_BWD :
+                                                  RNNLayer::RNN_BDR;
+}
+
+void RNNValidator::checkParams(const InferenceEngine::CNNLayer *layer) {
+    auto casted = dynamic_cast<const RNNLayer*>(layer);
+    if (!casted)
+        THROW_IE_EXCEPTION << "Layer is not instance of RNNLayer class";
+
+    if (!one_of(casted->axis, 1, 0))
+        THROW_IE_EXCEPTION << "Unsupported axis for RNN layer iterator. Only 0 and 1 axis are supported.";
+
+    // TODO: Add more RNN verification..
+}
+
+void RNNValidator::checkShapes(const CNNLayer* layer, const std::vector<SizeVector>& inShapes) const {}
+
+}  // namespace InferenceEngine
index 9079463..6361b4f 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -158,7 +157,7 @@ public:
                              const std::vector<SizeVector>& inShapes) const override;
 };
 
-class INFERENCE_ENGINE_API_CLASS(DeconvolutionValidator) : public LayerValidator {
+class INFERENCE_ENGINE_API_CLASS(DeconvolutionValidator) : public ConvolutionValidator {
 public:
     void parseParams(CNNLayer* layer) override;
 
@@ -257,9 +256,6 @@ public:
     void parseParams(CNNLayer* layer) override;
 
     void checkParams(const CNNLayer* layer) override;
-
-protected:
-    void calculateIn2Out(ReshapeLayer* layer);
 };
 
 class INFERENCE_ENGINE_API_CLASS(EltwiseValidator) : public LayerValidator {
@@ -332,6 +328,8 @@ public:
     void parseParams(CNNLayer* layer) override;
 
     void checkParams(const CNNLayer* layer) override;
+
+    void checkShapes(const CNNLayer* layer, const std::vector<SizeVector>& inShapes) const override;
 };
 
 class INFERENCE_ENGINE_API_CLASS(ConcatValidator) : public LayerValidator {
@@ -341,6 +339,52 @@ public:
     void parseParams(CNNLayer* layer) override;
 
     void checkParams(const CNNLayer* layer) override;
+
+    void checkShapes(const CNNLayer* layer, const std::vector<SizeVector>& inShapes) const override;
+};
+
+class INFERENCE_ENGINE_API_CLASS(GemmValidator) : public LayerValidator {
+public:
+    explicit GemmValidator(const std::string& _type);
+
+    void parseParams(CNNLayer* layer) override;
+
+    void checkParams(const CNNLayer* layer) override;
+
+    void checkShapes(const CNNLayer* layer, const std::vector<SizeVector>& inShapes) const override;
+};
+
+class INFERENCE_ENGINE_API_CLASS(PadValidator) : public LayerValidator {
+public:
+    explicit PadValidator(const std::string& _type);
+
+    void parseParams(CNNLayer* layer) override;
+
+    void checkParams(const CNNLayer* layer) override;
+
+    void checkShapes(const CNNLayer* layer, const std::vector<SizeVector>& inShapes) const override;
+};
+
+class INFERENCE_ENGINE_API_CLASS(GatherValidator) : public LayerValidator {
+public:
+    explicit GatherValidator(const std::string& _type);
+
+    void parseParams(CNNLayer* layer) override;
+
+    void checkParams(const CNNLayer* layer) override;
+
+    void checkShapes(const CNNLayer* layer, const std::vector<SizeVector>& inShapes) const override;
+};
+
+class INFERENCE_ENGINE_API_CLASS(RNNValidator) : public LayerValidator {
+public:
+    explicit RNNValidator(const std::string& _type);
+
+    void parseParams(CNNLayer* layer) override;
+
+    void checkParams(const CNNLayer* layer) override;
+
+    void checkShapes(const CNNLayer* layer, const std::vector<SizeVector>& inShapes) const override;
 };
 
 template<typename Validator>
@@ -378,6 +422,10 @@ REG_LAYER_VALIDATOR_FOR_TYPE(NormValidator, LRN);
 REG_LAYER_VALIDATOR_FOR_TYPE(SplitValidator, Split);
 REG_LAYER_VALIDATOR_FOR_TYPE(SplitValidator, Slice);
 REG_LAYER_VALIDATOR_FOR_TYPE(ConcatValidator, Concat);
+REG_LAYER_VALIDATOR_FOR_TYPE(GemmValidator, Gemm);
+REG_LAYER_VALIDATOR_FOR_TYPE(PadValidator, Pad);
+REG_LAYER_VALIDATOR_FOR_TYPE(GatherValidator, Gather);
+REG_LAYER_VALIDATOR_FOR_TYPE(RNNValidator, RNN);
 
 }  // namespace details
 }  // namespace InferenceEngine
index 9a9f84c..55fb626 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <map>
 #include <vector>
 #include <string>
+#include <tuple>
 #include "ie_layers_internal.hpp"
+#include "layer_transform.hpp"
 
 namespace InferenceEngine {
 
-Paddings getConvPaddings(const ConvolutionLayer &convLayer) {
-    std::string errorPrefix = "Failed to calculate padding for Convolution: ";
-    const std::map<std::string, std::string> &params = convLayer.params;
-    const std::vector<DataWeakPtr> &insData = convLayer.insData;
+template<class Layer>
+int getKernel(const Layer &layer, size_t i) {
+    if (layer._dilation.size() > i && layer._dilation[i])
+        return (layer._kernel[i] - 1) * layer._dilation[i] + 1;
+    return layer._kernel[i];
+}
+
+template<>
+int getKernel(const PoolingLayer &layer, size_t i) {
+    return layer._kernel[i];
+}
+
+template<class Layer>
+Paddings getPaddingsInternal(const Layer &layer) {
+    std::string errorPrefix = "Failed to calculate padding for " + layer.type + ": ";
     try {
+        const std::map<std::string, std::string> &params = layer.params;
+        const std::vector<DataWeakPtr> &insData = layer.insData;
         auto it = params.find("auto_pad");
-        std::string padType;
         if (it != params.end()) {
             if (it->second == "valid") {
-                return {PropertyVector<unsigned>(2, 0), PropertyVector<unsigned>(2, 0)};
+                return {PropertyVector<unsigned>(layer._kernel.size(), 0u),
+                        PropertyVector<unsigned>(layer._kernel.size(), 0u)};
             } else {
-                if (insData.size() != 1) THROW_IE_EXCEPTION << "number of inputs should be equal 1";
+                if (insData.size() != 1)
+                    THROW_IE_EXCEPTION << "number of inputs should be equal 1";
                 auto firstInput = insData[0].lock();
-                if (!firstInput) THROW_IE_EXCEPTION << "input is empty";
+                if (!firstInput)
+                    THROW_IE_EXCEPTION << "input is empty";
                 auto shape = firstInput->getTensorDesc().getDims();
-                if (shape.size() != 4) THROW_IE_EXCEPTION << "input shape must be 4D";
+                auto shape_size = shape.size();
+                if (shape_size < 4 || shape_size > 5)
+                    THROW_IE_EXCEPTION << "input shape must be 4D or 5D";
 
-                int SH = convLayer._stride[Y_AXIS];
-                int SW = convLayer._stride[X_AXIS];
+                std::vector<int> shapes;
+                shapes.push_back(shape[shape_size - 1]);
+                shapes.push_back(shape[shape_size - 2]);
+                if (shape.size() > 4)
+                    shapes.push_back(shape[shape_size - 3]);
 
-                int IH = shape[2];
-                int IW = shape[3];
+                PropertyVector<unsigned int> pad_begin, pad_end;
 
-                int KH = 0, KW = 0;
-                if (convLayer._dilation[Y_AXIS])
-                    KH = (convLayer._kernel[Y_AXIS] - 1) * convLayer._dilation[Y_AXIS] + 1;
-                else
-                    KH = convLayer._kernel[Y_AXIS];
-                if (convLayer._dilation[X_AXIS])
-                    KW = (convLayer._kernel[X_AXIS] - 1) * convLayer._dilation[X_AXIS] + 1;
-                else
-                    KW = convLayer._kernel[X_AXIS];
-                int PAH, PAW;
-                if (IH % SH == 0) {
-                    PAH = std::max(KH - SH, 0);
-                } else {
-                    PAH = std::max(KH - (IH % SH), 0);
-                }
-                if (IW % SW == 0) {
-                    PAW = std::max(KW - SW, 0);
-                } else {
-                    PAW = std::max(KW - (IW % SW), 0);
-                }
+                for (size_t i = 0; i < layer._kernel.size(); i++) {
+                    int PA = 0;
+                    int kernel = getKernel(layer, i);
 
-                unsigned top = PAH / 2;
-                unsigned bottom = PAH - top;
-                unsigned left = PAW / 2;
-                unsigned right = PAW - left;
+                    int stride = layer._stride.size() > i ? layer._stride[i] : 1;
+                    int sh = shapes[i];
+                    if (sh % stride == 0) {
+                        PA = std::max(kernel - stride, 0);
+                    } else {
+                        PA = std::max(kernel - (sh % stride), 0);
+                    }
+                    unsigned p_begin = PA / 2;
+                    unsigned p_end = PA - p_begin;
 
-                PropertyVector<unsigned int> pad_begin;
-                pad_begin.insert(X_AXIS, left);
-                pad_begin.insert(Y_AXIS, top);
+                    pad_begin.insert(i, p_begin);
+                    pad_end.insert(i, p_end);
+                }
 
-                PropertyVector<unsigned int> pad_end;
-                pad_end.insert(X_AXIS, right);
-                pad_end.insert(Y_AXIS, bottom);
                 return {pad_begin, pad_end};
             }
         }
-        return {convLayer._padding, convLayer._pads_end};
+        return {layer._padding, layer._pads_end};
     } catch (const InferenceEngine::details::InferenceEngineException &iee) {
         THROW_IE_EXCEPTION << errorPrefix << iee.what();
     }
 }
 
+class PaddingsUpdater {
+    std::reference_wrapper<Paddings> pad;
+ public:
+    explicit PaddingsUpdater(Paddings & pad) : pad(pad) {}
+    template <class T>
+    typename std::enable_if<!std::is_same<T, CNNLayer*>::value, bool>::type
+    operator () (T & layer) const {
+        pad.get() = getPaddingsInternal(*layer);
+        return true;
+    }
+    bool operator () (CNNLayer * layer) const {
+        THROW_IE_EXCEPTION << "padding calculation for layer: " << layer->name << "(" << layer->type << ") unsupported";
+    }
+};
+
+Paddings getPaddingsImpl(const CNNLayer &layer) {
+    Paddings actual;
+    details::visitActualLayer(std::tuple <DeconvolutionLayer*, ConvolutionLayer*, PoolingLayer*, CNNLayer*>(), layer, PaddingsUpdater(actual));
+    return actual;
+}
+
 }  // namespace InferenceEngine
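For auto_pad, getPaddingsInternal above computes "same"-style padding per spatial dimension. A standalone sketch of the per-dimension math with sample numbers (function name is illustrative):

    #include <algorithm>
    #include <utility>
    std::pair<unsigned, unsigned> samePad(int inDim, int kernel, int stride) {
        int PA = (inDim % stride == 0) ? std::max(kernel - stride, 0)
                                       : std::max(kernel - (inDim % stride), 0);
        unsigned begin = PA / 2;                 // smaller half goes first
        return {begin, static_cast<unsigned>(PA) - begin};
    }
    // samePad(224, 3, 1) -> {1, 1}; samePad(224, 3, 2) -> {0, 1}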
index bfe526a..296b565 100644 (file)
@@ -1,12 +1,12 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
-
+#include <tuple>
 #include "ie_api.h"
 #include "ie_layers.h"
+#include "ie_util_internal.hpp"
 
 namespace InferenceEngine {
 
@@ -16,6 +16,22 @@ public:
     PropertyVector<unsigned int> end;
 };
 
-INFERENCE_ENGINE_API_CPP(Paddings) getConvPaddings(const ConvolutionLayer &convLayer);
+/**
+ * @brief gets padding with runtime type check
+ */
+INFERENCE_ENGINE_API_CPP(Paddings) getPaddingsImpl(const CNNLayer &layer);
+
+/**
+ * @brief gets padding with compile-time type check
+ */
+template <class T>
+inline typename std::enable_if<is_one_of<T,
+                                          DeconvolutionLayer,
+                                          ConvolutionLayer,
+                                          PoolingLayer>::value, Paddings>::type
+getPaddings(const T & layer) {
+    return getPaddingsImpl(layer);
+}
+
 
 }  // namespace InferenceEngine
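The enable_if guard above restricts getPaddings to the three supported layer types at compile time. A minimal standalone analogue of the mechanism (this is_one_of is a local stand-in, not the IE utility):

    #include <type_traits>
    template <class T, class... Ts> struct is_one_of : std::false_type {};
    template <class T, class U, class... Ts>
    struct is_one_of<T, U, Ts...>
        : std::integral_constant<bool, std::is_same<T, U>::value ||
                                       is_one_of<T, Ts...>::value> {};
    static_assert(is_one_of<int, float, int>::value, "int is in the list");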
diff --git a/inference-engine/src/inference_engine/ie_layers_prv.h b/inference-engine/src/inference_engine/ie_layers_prv.h
new file mode 100644 (file)
index 0000000..9ec8c3c
--- /dev/null
@@ -0,0 +1,99 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+/**
+ * @brief a header file for internal Layers structure
+ * @file
+ */
+#pragma once
+
+#include "ie_layers.h"
+#include <string>
+
+namespace InferenceEngine {
+
+/**
+ * LSTM Cell Layer
+ *
+ * Inputs:
+ *    Xt   {N, D}
+ *    Ht-1 {N, S}
+ *    Ct-1 {N, S}
+ *
+ * Outputs:
+ *    Ht {N, S}
+ *    Ct {N, S}
+ *
+ * Weights:
+ *    W {G=4, S, D+S}
+ *    B {G=4, S}
+ *
+ * G=4 and gate order is [f,i,c,o]
+ *
+ * Semantic:
+ *
+ *   *  - matrix mult
+ *  (.) - eltwise mult
+ *  [,] - concatenation
+ *
+ *  f = sigmoid
+ *  h = tanh
+ *
+ * - ft = f(Wf*[Ht-1, Xt] + Bf)
+ * - it = f(Wi*[Ht-1, Xt] + Bi)
+ * - ct = h(Wc*[Ht-1, Xt] + Bc)
+ * - ot = f(Wo*[Ht-1, Xt] + Bo)
+ * - Ct = ft (.) Ct-1 + it (.) ct
+ * - Ht = ot (.) h(Ct)
+ */
+class LSTMCell : public WeightableLayer {
+public:
+    using WeightableLayer::WeightableLayer;
+};
+
+/**
+ * @brief This class represents RNN-Sequence layer
+ *
+ * Data shapes and meaning (cellType = "LSTM", axis = 1):
+ *   input[0] Xt - {N,T,DC} input data sequence
+ *   input[1] H0 - {N,SC}   initial hidden state
+ *   input[2] C0 - {N,SC}   initial cell state
+ *
+ *   output[0] Ht - {N,T,SC} out data sequence
+ *   output[1] HT - {N,SC}   last hidden state
+ *   output[2] CT - {N,SC}   last cell state
+ *
+ *   Recurrent formula and weight format are same as from
+ *   corresponding Cell primitive.
+ */
+class RNNLayer : public WeightableLayer {
+public:
+    /**
+     * @brief Type of RNN cell used by the sequence layer.
+     * Possible values "RNN", "LSTM", "GRU".
+     */
+    std::string cellType = "LSTM";
+
+    /**
+     * @brief An axis by which iteration is performed
+     * axis=0 means first input/output data blob dimension is sequence
+     * axis=1 means first input/output data blob dimension is batch
+     */
+    unsigned int axis = 1;
+
+    /**
+     * @brief Direction of iteration through sequence dimension
+     */
+    enum Direction {
+        RNN_FWD,  /**< Forward mode. Iterate starts from index 0 with step 1.         */
+        RNN_BWD,  /**< Backward mode. Iterate starts from last index with step -1.    */
+        RNN_BDR   /**< Bidirectional mode. First is forward pass, second is backward. */
+    };
+
+    Direction direction = RNN_FWD;
+
+    using WeightableLayer::WeightableLayer;
+};
+
+}  // namespace InferenceEngine
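A scalar sketch of the LSTM cell semantics documented above (gate order [f, i, c, o]); all names here are illustrative, not IE API:

    #include <cmath>
    struct LstmStep { float H, C; };
    LstmStep lstmStep(float x, float h_prev, float c_prev,
                      const float W[4][2], const float B[4]) {
        auto sig = [](float v) { return 1.f / (1.f + std::exp(-v)); };
        // W rows follow the documented gate order [f, i, c, o];
        // columns correspond to the concatenation [Ht-1, Xt].
        float f = sig(W[0][0] * h_prev + W[0][1] * x + B[0]);
        float i = sig(W[1][0] * h_prev + W[1][1] * x + B[1]);
        float c = std::tanh(W[2][0] * h_prev + W[2][1] * x + B[2]);
        float o = sig(W[3][0] * h_prev + W[3][1] * x + B[3]);
        float C = f * c_prev + i * c;      // Ct = ft (.) Ct-1 + it (.) ct
        return { o * std::tanh(C), C };    // Ht = ot (.) h(Ct)
    }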
index b06ab78..63cbc16 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -51,22 +50,23 @@ TensorDesc::TensorDesc(const Precision &precision, SizeVector dims, const Blocki
         : dims(dims), blockingDesc(blockDesc), precision(precision)  {
     if (dims.size() != *std::max_element(blockDesc.getOrder().begin(), blockDesc.getOrder().end()) + 1)
         THROW_IE_EXCEPTION << "Cannot create TensorDesc! Blocked dims are inconsistent with original dims.";
+
+    layout = Layout::BLOCKED;
     if (dims.size() == blockingDesc.getBlockDims().size()) {
         switch (dims.size()) {
             case 1:
                 layout = Layout::C;
-                return;
+                break;
             case 2:
                 if (blockingDesc.getOrder()[0] == 0 && blockingDesc.getOrder()[1] == 1)
                     layout = Layout::NC;
                 else
                     layout = Layout::CN;
-                return;
+                break;
             case 3:
                 if (blockingDesc.getOrder()[0] == 0 && blockingDesc.getOrder()[1] == 1 &&
                         blockingDesc.getOrder()[2] == 2) {
                     layout = Layout::CHW;
-                    return;
                 }
                 break;
             case 4:
@@ -76,15 +76,23 @@ TensorDesc::TensorDesc(const Precision &precision, SizeVector dims, const Blocki
                 } else if (blockingDesc.getOrder()[0] == 0 && blockingDesc.getOrder()[1] == 2 &&
                         blockingDesc.getOrder()[2] == 3 && blockingDesc.getOrder()[3] == 1) {
                     layout = Layout::NHWC;
-                } else {
-                    layout = Layout::BLOCKED;
                 }
-                return;
+                break;
+            case 5:
+                if (blockingDesc.getOrder()[0] == 0 && blockingDesc.getOrder()[1] == 1 &&
+                        blockingDesc.getOrder()[2] == 2 && blockingDesc.getOrder()[3] == 3 &&
+                        blockingDesc.getOrder()[4] == 4) {
+                    layout = Layout::NCDHW;
+                } else if (blockingDesc.getOrder()[0] == 0 && blockingDesc.getOrder()[1] == 2 &&
+                        blockingDesc.getOrder()[2] == 3 && blockingDesc.getOrder()[3] == 4 &&
+                        blockingDesc.getOrder()[4] == 1) {
+                    layout = Layout::NDHWC;
+                }
+                break;
             default:
                 break;
         }
     }
-    layout = Layout::BLOCKED;
 }
 
 TensorDesc::TensorDesc() {
@@ -129,6 +137,8 @@ Layout TensorDesc::getLayoutByDims(SizeVector dims) {
             return Layout::CHW;
         case 4:
             return Layout::NCHW;
+        case 5:
+            return Layout::NCDHW;
         default:
             return Layout::BLOCKED;
     }
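With the new case, getLayoutByDims maps tensor rank to a default layout; a usage sketch (assumes the TensorDesc API above is in scope and the method is callable statically):

    #include <cassert>
    void layoutByDimsExample() {
        assert(TensorDesc::getLayoutByDims({8, 3, 224, 224}) == Layout::NCHW);
        assert(TensorDesc::getLayoutByDims({8, 3, 16, 224, 224}) == Layout::NCDHW);
        assert(TensorDesc::getLayoutByDims({8, 3, 2, 2, 2, 2}) == Layout::BLOCKED);
    }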
@@ -163,8 +173,8 @@ size_t TensorDesc::offset(const SizeVector& v) const {
 size_t TensorDesc::offset(size_t l) const {
     size_t n_dims = dims.size();
     SizeVector pos(n_dims);
-    for (int rd = 0; rd < n_dims; ++rd) {
-        const size_t d = n_dims - 1 - rd;
+    for (int rd = 1; rd <= n_dims; ++rd) {
+        const size_t d = n_dims - rd;
         const size_t cur_dim = dims[d];
         pos[d] = l % cur_dim;
         l /= cur_dim;
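The rewritten loop peels a linear offset apart starting from the innermost dimension; an equivalent standalone sketch:

    #include <vector>
    std::vector<size_t> linearToPos(size_t l, const std::vector<size_t>& dims) {
        std::vector<size_t> pos(dims.size());
        for (size_t rd = 1; rd <= dims.size(); ++rd) {
            const size_t d = dims.size() - rd;  // innermost dimension first
            pos[d] = l % dims[d];
            l /= dims[d];
        }
        return pos;
    }
    // linearToPos(5, {2, 3, 4}) -> {0, 1, 1}, since (0 * 3 + 1) * 4 + 1 == 5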
@@ -249,11 +259,21 @@ BlockingDesc::BlockingDesc(const SizeVector& dims, Layout layout): offsetPadding
             l_order = {0, 1, 2, 3};
             l_dims = dims;
             break;
+        case Layout::NCDHW:
+            checkDims(dims.size(), 5);
+            l_order = {0, 1, 2, 3, 4};
+            l_dims = dims;
+            break;
         case Layout::NHWC:
             checkDims(dims.size(), 4);
             l_order = {0, 2, 3, 1};
             l_dims = {dims[0], dims[2], dims[3], dims[1]};
             break;
+        case Layout::NDHWC:
+            checkDims(dims.size(), 5);
+            l_order = {0, 2, 3, 4, 1};
+            l_dims = {dims[0], dims[2], dims[3], dims[4], dims[1]};
+            break;
         case Layout::CHW:
             checkDims(dims.size(), 3);
             l_order = {0, 1, 2};
diff --git a/inference-engine/src/inference_engine/ie_memcpy.cpp b/inference-engine/src/inference_engine/ie_memcpy.cpp
new file mode 100644 (file)
index 0000000..330c0f2
--- /dev/null
@@ -0,0 +1,21 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <stdint.h>
+#include <string.h>
+#include "ie_memcpy.h"
+
+int ie_memcpy(void* dest, size_t destsz, void const* src, size_t count) {
+    size_t i;
+    if (!dest) return -1;  // cannot copy into or zero out a null destination
+    if (!src || count > destsz ||
+        count > (dest > src ? ((uintptr_t)dest - (uintptr_t)src)
+                            : ((uintptr_t)src - (uintptr_t)dest))) {
+        // zero out dest if error detected
+        memset(dest, 0, destsz);
+        return -1;
+    }
+
+    for (i = 0; i < count; ++i) (reinterpret_cast<uint8_t*>(dest))[i] = (reinterpret_cast<const uint8_t*>(src))[i];
+    return 0;
+}
diff --git a/inference-engine/src/inference_engine/ie_memcpy.h b/inference-engine/src/inference_engine/ie_memcpy.h
new file mode 100644 (file)
index 0000000..ab174de
--- /dev/null
@@ -0,0 +1,24 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <stdlib.h>
+#include "ie_api.h"
+
+/**
+ * @brief Copies bytes between buffers with security enhancements
+ * Copies count bytes from src to dest. If the source and destination
+ * overlap, the behavior is undefined.
+ * @param dest
+ * pointer to the object to copy to
+ * @param destsz
+ * max number of bytes to modify in the destination (typically the size
+ * of the destination object)
+ * @param src
+ * pointer to the object to copy from
+ * @param count
+ * number of bytes to copy
+ * @return zero on success and non-zero value on error.
+ */
+
+INFERENCE_ENGINE_API_CPP(int) ie_memcpy(void* dest, size_t destsz, void const* src, size_t count);
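Call-site sketch for the declaration above (buffer names are illustrative):

    void ieMemcpyExample() {
        char src[5] = {'h', 'e', 'l', 'l', 'o'};
        char dst[8] = {};
        int err = ie_memcpy(dst, sizeof(dst), src, sizeof(src));
        (void)err;  // 0 on success; on error dst is zeroed and -1 returned
    }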
diff --git a/inference-engine/src/inference_engine/ie_network.cpp b/inference-engine/src/inference_engine/ie_network.cpp
new file mode 100644 (file)
index 0000000..3c92b99
--- /dev/null
@@ -0,0 +1,161 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ie_network.hpp"
+#include <details/ie_inetwork_iterator.hpp>
+#include <details/caseless.hpp>
+#include <iterator>
+#include <string>
+#include <vector>
+#include <memory>
+
+using namespace InferenceEngine;
+
+details::Network &details::Network::operator=(const details::Network &network) {
+    if (this == &network)
+        return *this;
+    name = network.getName();
+    for (const auto& layer : network) {
+        layers.push_back(Layer::Ptr(new details::Layer(*layer)));
+    }
+    for (const auto& connection : network.connections) {
+        connections.push_back(connection);
+    }
+    return *this;
+}
+
+details::Network &details::Network::operator=(const INetwork &network) {
+    if (this == &network)
+        return *this;
+    name = network.getName();
+    for (const auto& layer : network) {
+        layers.push_back(std::make_shared<details::Layer>(*layer));
+        for (const auto& newConnection : network.getLayerConnections(layer->getId())) {
+            bool connectionFound = false;
+            for (const auto& connection : connections) {
+                if (connection == newConnection) {
+                    connectionFound = true;
+                    break;
+                }
+            }
+            if (!connectionFound)
+                connections.push_back(newConnection);
+        }
+    }
+    return *this;
+}
+
+details::Network::Network(const Context& context, const std::string& name): ctx(context), name(name) {}
+
+details::Network::Network(const Context& context, const details::Network &network): ctx(context) {
+    *this = network;
+}
+
+details::Network::Network(const Context& context, const INetwork &network): ctx(context) {
+    *this = network;
+}
+
+size_t details::Network::size() const noexcept {
+    return static_cast<size_t>(std::distance(std::begin(*this), std::end(*this)));
+}
+
+const std::string& details::Network::getName() const noexcept {
+    return name;
+}
+
+std::string& details::Network::getName() noexcept {
+    return name;
+}
+
+const Context& details::Network::getContext() const noexcept {
+    return ctx;
+}
+
+const ILayer::Ptr details::Network::getLayer(size_t id) const noexcept {
+    for (const auto& layer : layers) {
+        if (layer->getId() == id)
+            return std::static_pointer_cast<ILayer>(layer);
+    }
+    return nullptr;
+}
+
+const std::vector<ILayer::Ptr> details::Network::getInputs() const noexcept {
+    std::vector<ILayer::Ptr> inputs;
+    for (const auto& layer : layers) {
+        bool isInputLayer = true;
+        for (const auto& connection : getLayerConnections(layer->getId())) {
+            if (connection.to().layerId() == layer->getId()) {
+                isInputLayer = false;
+                break;
+            }
+        }
+        if (isInputLayer) {
+            inputs.push_back(layer);
+        }
+    }
+    return inputs;
+}
+
+const std::vector<ILayer::Ptr> details::Network::getOutputs() const noexcept {
+    std::vector<ILayer::Ptr> outputs;
+    for (const auto& layer : layers) {
+        bool isOutputLayer = true;
+        for (const auto& connection : getLayerConnections(layer->getId())) {
+            if (connection.from().layerId() == layer->getId()) {
+                isOutputLayer = false;
+                break;
+            }
+        }
+        if (isOutputLayer) {
+            outputs.push_back(layer);
+        }
+    }
+    return outputs;
+}
+
+const std::vector<Connection>& details::Network::getConnections() const noexcept {
+    return connections;
+}
+
+details::Layer::Ptr details::Network::getLayer(size_t id) noexcept {
+    for (const auto& layer : layers) {
+        if (layer->getId() == id)
+            return layer;
+    }
+    return nullptr;
+}
+
+const std::vector<Connection> details::Network::getLayerConnections(idx_t layerId) const noexcept {
+    std::vector<Connection> layerConnections;
+    for (auto& connection : connections) {
+        if (connection.from().layerId() == layerId || connection.to().layerId() == layerId)
+            layerConnections.push_back(connection);
+    }
+    return layerConnections;
+}
+
+void details::Network::addLayer(const ILayer::Ptr &layer) noexcept {
+    if (layer)
+        layers.push_back(std::make_shared<Layer>(*layer));
+}
+
+void details::Network::addConnection(const Connection &connection) noexcept {
+    connections.push_back(connection);
+}
+
+INetwork::const_iterator details::Network::begin() const noexcept {
+    return INetwork::const_iterator(this);
+}
+
+INetwork::const_iterator details::Network::end() const noexcept {
+    return INetwork::const_iterator(this, true);
+}
+
+details::Network::iterator details::Network::begin() noexcept {
+    return Network::iterator(this);
+}
+
+details::Network::iterator details::Network::end() noexcept {
+    return Network::iterator(this, true);
+}
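
The getInputs()/getOutputs() implementations above classify a layer by scanning the connection list: a layer that no connection points to is an input, and a layer that no connection leaves is an output. Below is a minimal standalone sketch of that classification, with simplified types for illustration (Connection and inputLayers here are illustrative stand-ins, not the Inference Engine API):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct Connection { std::size_t from, to; };

    // A layer is an input if it never appears on the receiving end of a connection.
    std::vector<std::size_t> inputLayers(const std::vector<std::size_t>& layers,
                                         const std::vector<Connection>& connections) {
        std::vector<std::size_t> inputs;
        for (auto id : layers) {
            bool isInput = true;
            for (const auto& c : connections) {
                if (c.to == id) { isInput = false; break; }
            }
            if (isInput) inputs.push_back(id);
        }
        return inputs;
    }

    int main() {
        std::vector<std::size_t> layers = {0, 1, 2};
        std::vector<Connection> connections = {{0, 1}, {1, 2}};
        for (auto id : inputLayers(layers, connections))
            std::cout << "input layer: " << id << "\n";  // prints 0
    }
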
diff --git a/inference-engine/src/inference_engine/ie_network.hpp b/inference-engine/src/inference_engine/ie_network.hpp
new file mode 100644 (file)
index 0000000..16a80f7
--- /dev/null
@@ -0,0 +1,160 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ie_inetwork.hpp>
+#include <ie_blob.h>
+#include <memory>
+#include <string>
+#include <vector>
+#include <map>
+
+namespace InferenceEngine {
+namespace details {
+
+class Network;
+
+class Parameters: public IParameters {
+public:
+    using Ptr = std::shared_ptr<Parameters>;
+
+    const std::map<std::string, Parameter>& getParameters() const noexcept override {
+        return params;
+    }
+    const std::map<std::string, Blob::CPtr>& getConstantData() const noexcept override {
+        return constData;
+    }
+
+    std::map<std::string, Parameter>& getParameters() {
+        return params;
+    }
+    std::map<std::string, Blob::CPtr>& getConstantData() noexcept {
+        return constData;
+    }
+private:
+    std::map<std::string, Parameter> params;
+    std::map<std::string, InferenceEngine::Blob::CPtr> constData;
+};
+
+class Layer: public ILayer {
+public:
+    using Ptr = std::shared_ptr<Layer>;
+
+    explicit Layer(size_t id): id(id), params(new Parameters()) {}
+    Layer(const Layer& layer) {
+        this->outputs = layer.getOutputPorts();
+        this->inputs = layer.getInputPorts();
+        this->params = layer.getParameters();
+        this->subGraph = layer.getGraph();
+        this->name = layer.getName();
+        this->type = layer.getType();
+        this->id = layer.getId();
+    }
+    explicit Layer(const ILayer& layer) {
+        this->outputs = layer.getOutputPorts();
+        this->inputs = layer.getInputPorts();
+        this->params = layer.getParameters();
+        this->subGraph = layer.getGraph();
+        this->name = layer.getName();
+        this->type = layer.getType();
+        this->id = layer.getId();
+    }
+
+    size_t getId() const noexcept override {
+        return id;
+    }
+    const std::string& getName() const noexcept override {
+        return name;
+    }
+    const std::string& getType() const noexcept override {
+        return type;
+    }
+    const INetwork::Ptr& getGraph() const noexcept override {
+        return subGraph;
+    }
+    const IParameters::Ptr& getParameters() const noexcept override {
+        return params;
+    }
+    const std::vector<Port>& getInputPorts() const noexcept override {
+        return inputs;
+    }
+    const std::vector<Port>& getOutputPorts() const noexcept override {
+        return outputs;
+    }
+
+    std::string& getName() noexcept {
+        return name;
+    }
+
+    std::string& getType() noexcept {
+        return type;
+    }
+    std::shared_ptr<Network> getGraph() noexcept {
+        return std::dynamic_pointer_cast<Network>(subGraph);
+    }
+    void setGraph(const INetwork::Ptr& graph) noexcept {
+        subGraph = graph;
+    }
+    Parameters::Ptr getParameters() noexcept {
+        return std::dynamic_pointer_cast<Parameters>(params);
+    }
+    std::vector<Port>& getInputPorts() noexcept {
+        return inputs;
+    }
+    std::vector<Port>& getOutputPorts() noexcept {
+        return outputs;
+    }
+
+private:
+    idx_t id;
+    std::string name;
+    std::string type;
+    INetwork::Ptr subGraph;
+    IParameters::Ptr params;
+    std::vector<Port> inputs;
+    std::vector<Port> outputs;
+};
+
+class Network: public INetwork {
+public:
+    using Ptr = std::shared_ptr<Network>;
+    using iterator = details::INetworkIterator<Network, Layer>;
+
+    explicit Network(const Context& context, const std::string& name = "");
+    Network(const Context& context, const INetwork& network);
+    Network(const Context& context, const Network& network);
+
+    Network& operator=(const Network& network);
+    Network& operator=(const INetwork& network);
+
+    const_iterator begin() const noexcept override;
+    const_iterator end() const noexcept override;
+    iterator begin() noexcept;
+    iterator end() noexcept;
+
+    const ILayer::Ptr getLayer(size_t id) const noexcept override;
+    const std::vector<ILayer::Ptr> getInputs() const noexcept override;
+    const std::vector<ILayer::Ptr> getOutputs() const noexcept override;
+    const std::vector<Connection> getLayerConnections(idx_t layerId) const noexcept override;
+    size_t size() const noexcept override;
+    const std::string& getName() const noexcept override;
+    const Context& getContext() const noexcept override;
+
+    const std::vector<Connection>& getConnections() const noexcept;
+    Layer::Ptr getLayer(size_t id) noexcept;
+    std::string& getName() noexcept;
+
+    void addLayer(const ILayer::Ptr& layer) noexcept;
+    void addConnection(const Connection& connection) noexcept;
+
+private:
+    const Context ctx;
+    std::string name;
+    std::vector<Layer::Ptr> layers;
+    std::vector<Connection> connections;
+};
+
+}  // namespace details
+}  // namespace InferenceEngine
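
Note that Layer's copy constructor above copies the params and subGraph shared pointers rather than their pointees, so copies of a Layer share their parameters and sub-graph, much like the shared PIMPL explicitly warned about in ie_preprocess_data.hpp further below. A minimal sketch of that sharing behavior (LayerLike and Params are illustrative stand-ins, not IE classes):

    #include <iostream>
    #include <map>
    #include <memory>
    #include <string>

    struct Params { std::map<std::string, int> values; };

    struct LayerLike {
        // The default copy constructor copies the shared_ptr, not the Params object
        std::shared_ptr<Params> params{new Params()};
    };

    int main() {
        LayerLike a;
        LayerLike b = a;              // b.params aliases a.params
        b.params->values["x"] = 1;
        std::cout << a.params->values["x"] << "\n";  // prints 1: state is shared
    }
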
index 13380c7..11c3f9e 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -9,10 +8,12 @@
 #ifdef HAVE_SSE
 #include "ie_preprocess_data_sse42.hpp"
 #endif
+#include "ie_preprocess_gapi.hpp"
 
 #include <algorithm>
 
 namespace InferenceEngine {
+
 namespace Resize {
 
 template<typename data_t> static inline data_t saturate_cast(float res);
@@ -27,7 +28,7 @@ template<> inline uint8_t saturate_cast(float res) {
 }
 
 template<typename data_t = float>
-void resize_bilinear_fp32(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
+void resize_bilinear(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
     Border border = {BORDER_REPLICATE, 0};
 
     auto dstDims = outBlob->getTensorDesc().getDims();
@@ -61,8 +62,8 @@ void resize_bilinear_fp32(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* bu
     auto scale_x = static_cast<float>(src_full_width) / dst_full_width;
     auto scale_y = static_cast<float>(src_full_height) / dst_full_height;
 
-    auto* xofs = reinterpret_cast<int16_t*>(buffer);
-    auto* yofs = reinterpret_cast<int32_t*>(xofs + dwidth);
+    auto* xofs = reinterpret_cast<int32_t*>(buffer);
+    auto* yofs = xofs + dwidth;
     auto* alpha = reinterpret_cast<float*>(yofs + dheight);
     auto* beta = alpha + dwidth;
     auto* tptr = beta + dheight;
@@ -83,7 +84,7 @@ void resize_bilinear_fp32(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* bu
             sx0 = (std::max)(src_full_width - 2, 0);
         }
 
-        xofs[dx - dst_go_x] = (int16_t)(sx0 - src_go_x);
+        xofs[dx - dst_go_x] = sx0 - src_go_x;
         alpha[dx - dst_go_x] = fx;
     }
 
@@ -103,7 +104,7 @@ void resize_bilinear_fp32(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* bu
             sy0 = (std::max)(src_full_height - 2, 0);
         }
 
-        yofs[dy - dst_go_y] = (sy0 - src_go_y);
+        yofs[dy - dst_go_y] = sy0 - src_go_y;
         beta[dy - dst_go_y] = fy;
     }
 
@@ -282,7 +283,7 @@ int computeResizeAreaTabFP32(int src_go, int dst_go, int ssize, int dsize, float
 }
 
 template<typename data_t = float>
-void resize_area_fp32_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
+void resize_area_downscale(const Blob::Ptr inBlob, Blob::Ptr outBlob, uint8_t* buffer) {
     auto dstDims = outBlob->getTensorDesc().getDims();
     auto srcDims = inBlob->getTensorDesc().getDims();
 
@@ -591,54 +592,93 @@ size_t resize_get_buffer_size(Blob::Ptr inBlob, Blob::Ptr outBlob, const ResizeA
     float scale_x = static_cast<float>(dstDims[3]) / srcDims[3];
     float scale_y = static_cast<float>(dstDims[2]) / srcDims[2];
 
-    size_t buffer_size;
-    if ((scale_x >= 1 || scale_y >= 1) && algorithm == RESIZE_AREA) {
-        buffer_size = (dstDims[3] + dstDims[2])*(sizeof(int) + sizeof(float)*2) + 2*dstDims[3] * sizeof(float);
-    } else if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
-        if (algorithm == RESIZE_BILINEAR) {
-            buffer_size = (sizeof(int16_t) * 4 + sizeof(uint8_t *)) * dstDims[3] +
-                          (sizeof(int32_t) + sizeof(int16_t)) * dstDims[2] +
-                          sizeof(uint32_t) * dstDims[3] +
-                          (((srcDims[3] + 7) / 8) * 8 * 8) +
-                          sizeof(uint8_t) * 12;
-        } else {
-            const int dwidth = dstDims[3];
-            const int dheight = dstDims[2];
-            const int swidth = srcDims[3];
+    auto resize_bilinear_u8_buffer_size = [&]() {
+        size_t buffer_size = (sizeof(int16_t) * 4 + sizeof(uint8_t *)) * dstDims[3] +
+                             (sizeof(int32_t) + sizeof(int16_t)) * dstDims[2] +
+                             sizeof(uint32_t) * dstDims[3] +
+                             (((srcDims[3] + 7) / 8) * 8 * 8) +
+                             sizeof(uint8_t) * 12;
+
+        return buffer_size;
+    };
+
+    auto resize_bilinear_fp32_buffer_size = [&]() {
+        size_t buffer_size = (sizeof(float) + sizeof(float *)) * dstDims[3] +
+                             (sizeof(int32_t) + sizeof(float)) * dstDims[2] +
+                             (((srcDims[3] + 1) / 2) * 2 * 2) * sizeof(float);
+
+        return buffer_size;
+    };
+
+    auto resize_area_u8_downscale_sse_buffer_size = [&]() {
+        const int dwidth = dstDims[3];
+        const int dheight = dstDims[2];
+        const int swidth = srcDims[3];
 
-            const int dst_go_x = 0;
-            const int dst_go_y = 0;
+        const int dst_go_x = 0;
+        const int dst_go_y = 0;
 
-            int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width, dwidth, static_cast<float>(src_full_width) / dst_full_width) + 1;
-            int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, static_cast<float>(src_full_height) / dst_full_height) + 1;
+        int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width, dwidth, static_cast<float>(src_full_width) / dst_full_width) + 1;
+        int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, static_cast<float>(src_full_height) / dst_full_height) + 1;
 
-            size_t si_buf_size = sizeof(uint16_t) * dwidth + sizeof(uint16_t) * dheight;
-            size_t alpha_buf_size =
-                    sizeof(uint16_t) * (dwidth * x_max_count + 8 * 16) + sizeof(uint16_t) * dheight * y_max_count;
-            size_t vert_sum_buf_size = sizeof(uint16_t) * (swidth * 2);
-            size_t alpha_array_buf_size = sizeof(uint16_t) * 4 * dwidth;
-            size_t sxid_array_buf_size = sizeof(uint16_t) * 4 * 4 * dwidth;
+        size_t si_buf_size = sizeof(uint16_t) * dwidth + sizeof(uint16_t) * dheight;
+        size_t alpha_buf_size =
+                sizeof(uint16_t) * (dwidth * x_max_count + 8 * 16) + sizeof(uint16_t) * dheight * y_max_count;
+        size_t vert_sum_buf_size = sizeof(uint16_t) * (swidth * 2);
+        size_t alpha_array_buf_size = sizeof(uint16_t) * 4 * dwidth;
+        size_t sxid_array_buf_size = sizeof(uint16_t) * 4 * 4 * dwidth;
 
-            buffer_size = si_buf_size +
-                          alpha_buf_size +
-                          vert_sum_buf_size +
-                          alpha_array_buf_size +
-                          sxid_array_buf_size;
+        size_t buffer_size = si_buf_size +
+                             alpha_buf_size +
+                             vert_sum_buf_size +
+                             alpha_array_buf_size +
+                             sxid_array_buf_size;
+
+        return buffer_size;
+    };
+
+    auto resize_area_downscale_buffer_size = [&]() {
+        size_t buffer_size = sizeof(float) * (srcDims[3]) +
+                             sizeof(uint32_t) * (dstDims[3] * 2 + 1) +
+                             sizeof(float) * ((srcDims[3] + srcDims[2]) * 4) +
+                             sizeof(float) * ((srcDims[3] + srcDims[2]) * 2);
+
+        return buffer_size;
+    };
+
+    auto resize_area_upscale_buffer_size = [&]() {
+        size_t buffer_size = (dstDims[3] + dstDims[2])*(sizeof(int) + sizeof(float)*2) + 2*dstDims[3] * sizeof(float);
+
+        return buffer_size;
+    };
+
+    if (algorithm == RESIZE_BILINEAR) {
+        if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
+            return resize_bilinear_u8_buffer_size();
+        } else {
+            return resize_bilinear_fp32_buffer_size();
         }
-    } else {
-        if (algorithm == RESIZE_BILINEAR) {
-            buffer_size = (sizeof(float) + sizeof(float *)) * dstDims[3] +
-                          (sizeof(int32_t) + sizeof(float)) * dstDims[2] +
-                          (((srcDims[3] + 1) / 2) * 2 * 2) * sizeof(float);
+    } else if (algorithm == RESIZE_AREA) {
+        if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
+            if (scale_x <= 1 && scale_y <= 1) {
+#ifdef HAVE_SSE
+                if (with_cpu_x86_sse42() && scale_x < 1 && scale_y < 1)
+                    return resize_area_u8_downscale_sse_buffer_size();
+                else
+#endif
+                    return resize_area_downscale_buffer_size();
+            } else {
+                return resize_area_upscale_buffer_size();
+            }
         } else {
-            buffer_size = sizeof(float) * (srcDims[3]) +
-                          sizeof(uint32_t) * (dstDims[3] * 2 + 1) +
-                          sizeof(float) * ((srcDims[3] + srcDims[2]) * 4) +
-                          sizeof(float) * ((srcDims[3] + srcDims[2]) * 2);
+            if (scale_x <= 1 && scale_y <= 1)
+                return resize_area_downscale_buffer_size();
+            else
+                return resize_area_upscale_buffer_size();
         }
     }
 
-    return buffer_size;
+    return 0;
 }
 
 void resize(Blob::Ptr inBlob, Blob::Ptr outBlob, const ResizeAlgorithm &algorithm) {
@@ -670,25 +710,25 @@ void resize(Blob::Ptr inBlob, Blob::Ptr outBlob, const ResizeAlgorithm &algorith
                 Resize::resize_bilinear_u8(inBlob, outBlob, buffer);
             else
 #endif
-                resize_bilinear_fp32<uint8_t>(inBlob, outBlob, buffer);
+                resize_bilinear<uint8_t>(inBlob, outBlob, buffer);
         } else {
-            resize_bilinear_fp32(inBlob, outBlob, buffer);
+            resize_bilinear<float>(inBlob, outBlob, buffer);
         }
     } else if (algorithm == RESIZE_AREA) {
         if (inBlob->getTensorDesc().getPrecision() == Precision::U8) {
-            if (scale_x < 1 && scale_y < 1) {
+            if (scale_x <= 1 && scale_y <= 1) {
 #ifdef HAVE_SSE
-                if (with_cpu_x86_sse42())
+                if (with_cpu_x86_sse42() && scale_x < 1 && scale_y < 1)
                     Resize::resize_area_u8_downscale(inBlob, outBlob, buffer);
                 else
 #endif
-                    resize_area_fp32_downscale<uint8_t>(inBlob, outBlob, buffer);
+                    resize_area_downscale<uint8_t>(inBlob, outBlob, buffer);
             } else {
                 resize_area_upscale<uint8_t>(inBlob, outBlob, buffer);
             }
         } else {
-            if (scale_x < 1 && scale_y < 1)
-                resize_area_fp32_downscale(inBlob, outBlob, buffer);
+            if (scale_x <= 1 && scale_y <= 1)
+                resize_area_downscale<float>(inBlob, outBlob, buffer);
             else
                 resize_area_upscale<float>(inBlob, outBlob, buffer);
         }
@@ -711,7 +751,7 @@ Blob::Ptr PreProcessData::getRoiBlob() const {
     return _roiBlob;
 }
 
-void PreProcessData::execute(Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm) {
+void PreProcessData::execute(Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm, bool serial) {
     IE_PROFILING_AUTO_SCOPE_TASK(perf_preprocessing)
 
     if (algorithm == NO_RESIZE) {
@@ -722,6 +762,13 @@ void PreProcessData::execute(Blob::Ptr &outBlob, const ResizeAlgorithm &algorith
         THROW_IE_EXCEPTION << "Input pre-processing is called without ROI blob set";
     }
 
+    if (!_preproc) {
+        _preproc.reset(new PreprocEngine);
+    }
+    if (_preproc->preprocessWithGAPI(_roiBlob, outBlob, algorithm, serial)) {
+        return;
+    }
+
     Blob::Ptr res_in, res_out;
     if (_roiBlob->getTensorDesc().getLayout() == NHWC) {
         if (!_tmp1 || _tmp1->size() != _roiBlob->size()) {
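
The refactoring above replaces the nested if/else buffer-size computation with named lambdas and an explicit dispatch on algorithm, precision, and scale direction. A condensed standalone sketch of that decision tree (the SSE4.2 check and the actual size formulas are omitted; names are illustrative):

    #include <cstdio>

    enum class Algo { BILINEAR, AREA };
    enum class Prec { U8, FP32 };

    // Mirrors the dispatch order of the refactored resize_get_buffer_size():
    // bilinear splits on precision; area splits on precision and up/downscale.
    const char* pickBufferKind(Algo a, Prec p, float scale_x, float scale_y) {
        if (a == Algo::BILINEAR)
            return (p == Prec::U8) ? "bilinear_u8" : "bilinear_fp32";
        if (scale_x <= 1 && scale_y <= 1)
            return (p == Prec::U8) ? "area_u8_downscale" : "area_downscale";
        return "area_upscale";
    }

    int main() {
        std::printf("%s\n", pickBufferKind(Algo::AREA, Prec::FP32, 0.5f, 0.5f));
        // prints "area_downscale"
    }
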
index 77b1866..f5a7730 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -7,6 +6,7 @@
 
 #include <map>
 #include <string>
+#include <memory>
 
 #include "ie_blob.h"
 #include "ie_input_info.hpp"
@@ -14,6 +14,8 @@
 
 namespace InferenceEngine {
 
+class PreprocEngine;
+
 /**
  * @brief This class stores pre-process information for exact input
  */
@@ -25,6 +27,12 @@ class INFERENCE_ENGINE_API_CLASS(PreProcessData) {
     Blob::Ptr _tmp1 = nullptr;
     Blob::Ptr _tmp2 = nullptr;
 
+    /**
+     * @brief Pointer-to-implementation (PIMPL) hiding preprocessing implementation details.
+     * BEWARE! Will be shared among copies!
+     */
+    std::shared_ptr<PreprocEngine> _preproc;
+
     InferenceEngine::ProfilingTask perf_resize {"Resize"};
     InferenceEngine::ProfilingTask perf_reorder_before {"Reorder before"};
     InferenceEngine::ProfilingTask perf_reorder_after {"Reorder after"};
@@ -48,7 +56,7 @@ public:
      * @param outBlob pre-processed output blob to be used for inference.
      * @param algorithm resize algorithm.
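+     * @param serial if true, the preprocessing is executed in a single thread
+     *        (disables OpenMP parallelism for this call).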
      */
-    void execute(Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm);
+    void execute(Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm, bool serial);
 };
 
 //----------------------------------------------------------------------
diff --git a/inference-engine/src/inference_engine/ie_preprocess_gapi.cpp b/inference-engine/src/inference_engine/ie_preprocess_gapi.cpp
new file mode 100644 (file)
index 0000000..31f5983
--- /dev/null
@@ -0,0 +1,374 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <utility>
+#include <vector>
+#include <algorithm>
+#include <tuple>
+#include <string>
+
+// Careful reader, don't worry -- it is not the whole OpenCV,
+// it is just a single stand-alone component of it
+#include <opencv2/gapi.hpp>
+#include <opencv2/gapi/util/util.hpp>
+
+#include "ie_blob.h"
+#include "ie_input_info.hpp"
+#include "ie_preprocess_gapi.hpp"
+#include "ie_preprocess_gapi_kernels.hpp"
+
+#include "ie_parallel.hpp"
+
+#include <opencv2/gapi/fluid/gfluidkernel.hpp>  // GFluidOutputRois
+
+namespace InferenceEngine {
+namespace {
+namespace G {
+    struct Strides {int N; int C; int H; int W;};
+    struct Dims    {int N; int C; int H; int W;};
+    struct Desc    {Dims d; Strides s;};
+
+    void fix_strides_nhwc(const Dims &d, Strides &s) {
+        if (s.W > d.C) {
+            s.C = 1;
+            s.W = s.C*d.C;
+            s.H = s.W*d.W;
+            s.N = s.H*d.H;
+        }
+    }
+
+    Desc decompose(Blob::Ptr &blob) {
+        const auto& ie_desc     = blob->getTensorDesc();
+        const auto& ie_blk_desc = ie_desc.getBlockingDesc();
+        const auto& ie_dims     = ie_desc.getDims();
+        const auto& ie_strides  = ie_blk_desc.getStrides();
+
+        Dims d = {
+            static_cast<int>(ie_dims[0]),
+            static_cast<int>(ie_dims[1]),
+            static_cast<int>(ie_dims[2]),
+            static_cast<int>(ie_dims[3])
+        };
+
+        Strides s = {
+            static_cast<int>(ie_strides[0]),
+            static_cast<int>(blob->layout() == NHWC ? ie_strides[3] : ie_strides[1]),
+            static_cast<int>(blob->layout() == NHWC ? ie_strides[1] : ie_strides[2]),
+            static_cast<int>(blob->layout() == NHWC ? ie_strides[2] : ie_strides[3]),
+        };
+
+        if (blob->layout() == NHWC) fix_strides_nhwc(d, s);
+
+        return Desc{d, s};
+    }
+}  // namespace G
+
+inline int get_cv_depth(const InferenceEngine::TensorDesc &ie_desc) {
+    switch (ie_desc.getPrecision()) {
+    case Precision::U8:   return CV_8U;
+    case Precision::FP32: return CV_32F;
+    default: THROW_IE_EXCEPTION << "Unsupported data type";
+    }
+}
+
+std::vector<cv::gapi::own::Mat> bind_to_blob(Blob::Ptr &blob) {
+    const auto& ie_desc     = blob->getTensorDesc();
+    const auto& ie_desc_blk = ie_desc.getBlockingDesc();
+    const auto     desc     = G::decompose(blob);
+    const auto cv_depth     = get_cv_depth(ie_desc);
+    const auto stride       = desc.s.H*blob->element_size();
+    const auto planeSize    = cv::gapi::own::Size(desc.d.W, desc.d.H);
+
+
+    uint8_t* ptr = static_cast<uint8_t*>(blob->buffer());
+    ptr += blob->element_size()*ie_desc_blk.getOffsetPadding();
+
+    std::vector<cv::gapi::own::Mat> result;
+    if (blob->layout() == NHWC) {
+        result.emplace_back(planeSize.height, planeSize.width, CV_MAKETYPE(cv_depth, desc.d.C), ptr, stride);
+    } else {  // NCHW
+        const auto planeType = CV_MAKETYPE(cv_depth, 1);
+        for (size_t ch = 0; ch < desc.d.C; ch++) {
+            cv::gapi::own::Mat plane(planeSize.height, planeSize.width, planeType, ptr + ch*desc.s.C*blob->element_size(), stride);
+            result.emplace_back(plane);
+        }
+    }
+    return result;
+}
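+
+// (Illustrative example: an NHWC U8 blob with C == 3 binds to a single
+// interleaved CV_8UC3 Mat, while an NCHW FP32 blob with C == 3 binds to
+// three planar CV_32FC1 Mats, one per channel plane.)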
+
+template<typename... Ts, int... IIs>
+std::vector<cv::GMat> to_vec_impl(std::tuple<Ts...> &&gmats, cv::detail::Seq<IIs...>) {
+    return { std::get<IIs>(gmats)... };
+}
+
+template<typename... Ts>
+std::vector<cv::GMat> to_vec(std::tuple<Ts...> &&gmats) {
+    return to_vec_impl(std::move(gmats), typename cv::detail::MkSeq<sizeof...(Ts)>::type());
+}
+
+cv::GComputation buildGraph(const G::Desc &in_desc,
+                            const G::Desc &out_desc,
+                            InferenceEngine::Layout in_layout,
+                            InferenceEngine::Layout out_layout,
+                            InferenceEngine::ResizeAlgorithm algorithm,
+                            int precision) {
+    if ((in_layout == NHWC) && (in_desc.d.C == 3) && (precision == CV_8U) && (algorithm == RESIZE_BILINEAR)) {
+        const auto input_sz = cv::gapi::own::Size(in_desc.d.W, in_desc.d.H);
+        const auto scale_sz = cv::gapi::own::Size(out_desc.d.W, out_desc.d.H);
+        std::vector<cv::GMat> inputs(1);
+        std::vector<cv::GMat> outputs;
+
+        if (out_layout == NHWC) {
+            outputs.resize(1);
+            auto planes = to_vec(gapi::ScalePlanes::on(inputs[0], precision, input_sz, scale_sz, cv::INTER_LINEAR));
+            outputs[0] = gapi::Merge3::on(planes[0], planes[1], planes[2]);
+        } else {
+            outputs = to_vec(gapi::ScalePlanes::on(inputs[0], precision, input_sz, scale_sz, cv::INTER_LINEAR));
+        }
+        return cv::GComputation(inputs, outputs);
+    }
+
+    std::vector<cv::GMat> inputs;  // 1 element if NHWC, C elements if NCHW
+    std::vector<cv::GMat> planes;
+
+    // Convert input blob to planar format, if it is not yet planar
+    if (in_layout == NHWC) {
+        // interleaved input blob needs to be decomposed into distinct planes
+        inputs.resize(1);
+        switch (in_desc.d.C) {
+        case 1: planes = { inputs[0] };                       break;
+        case 2: planes = to_vec(gapi::Split2::on(inputs[0])); break;
+        case 3: planes = to_vec(gapi::Split3::on(inputs[0])); break;
+        case 4: planes = to_vec(gapi::Split4::on(inputs[0])); break;
+        default:
+            for (int chan = 0; chan < in_desc.d.C; chan++)
+                planes.emplace_back(gapi::ChanToPlane::on(inputs[0], chan));
+            break;
+        }
+    } else if (in_layout == NCHW) {
+        // planar blob can be passed to resize as-is
+        inputs.resize(in_desc.d.C);
+        planes = inputs;
+    }
+
+    // Resize every plane
+    std::vector<cv::GMat> out_planes;
+    const int interp_type = [](const ResizeAlgorithm &ar) {
+        switch (ar) {
+        case RESIZE_AREA:     return cv::INTER_AREA;
+        case RESIZE_BILINEAR: return cv::INTER_LINEAR;
+        default: THROW_IE_EXCEPTION << "Unsupported resize operation";
+        }
+    } (algorithm);
+    const auto input_sz  = cv::gapi::own::Size(in_desc.d.W, in_desc.d.H);
+    const auto scale_sz  = cv::gapi::own::Size(out_desc.d.W, out_desc.d.H);
+    const auto scale_fcn = std::bind(&gapi::ScalePlane::on,
+                                     std::placeholders::_1,
+                                     precision,
+                                     input_sz, scale_sz, interp_type);
+    std::transform(planes.begin(), planes.end(), std::back_inserter(out_planes), scale_fcn);
+
+    // Convert to expected layout, if required
+    std::vector<cv::GMat> outputs;  // 1 element if NHWC, C elements if NCHW
+    if (out_layout == NHWC) {
+        outputs.resize(1);
+        if      (out_desc.d.C == 1) outputs[0] = out_planes[0];
+        else if (out_desc.d.C == 2) outputs[0] = gapi::Merge2::on(out_planes[0], out_planes[1]);
+        else if (out_desc.d.C == 3) outputs[0] = gapi::Merge3::on(out_planes[0], out_planes[1], out_planes[2]);
+        else if (out_desc.d.C == 4) outputs[0] = gapi::Merge4::on(out_planes[0], out_planes[1], out_planes[2], out_planes[3]);
+        else    THROW_IE_EXCEPTION << "Output channels >4 are not supported for HWC [by G-API]";
+    } else {
+        outputs = out_planes;
+    }
+
+    return cv::GComputation(inputs, outputs);
+}
+}  // anonymous namespace
+
+InferenceEngine::PreprocEngine::PreprocEngine() : _lastComp(parallel_get_max_threads()) {}
+
+InferenceEngine::PreprocEngine::Update InferenceEngine::PreprocEngine::needUpdate(const CallDesc &newCallOrig) const {
+    // Given our knowledge about Fluid, full graph rebuild is required
+    // if and only if:
+    // 0. This is the first call ever
+    // 1. precision has changed (affects kernel versions)
+    // 2. layout has changed (affects graph topology)
+    // 3. algorithm has changed (affects kernel version)
+    // 4. dimensions have changed from downscale to upscale or
+    // vice-versa if interpolation is AREA.
+    if (!_lastCall) {
+        return Update::REBUILD;
+    }
+
+    BlobDesc last_in;
+    BlobDesc last_out;
+    ResizeAlgorithm last_algo;
+    std::tie(last_in, last_out, last_algo) = *_lastCall;
+
+    CallDesc newCall = newCallOrig;
+    BlobDesc new_in;
+    BlobDesc new_out;
+    ResizeAlgorithm new_algo;
+    std::tie(new_in, new_out, new_algo) = newCall;
+
+    // Declare two empty size vectors for each call
+    SizeVector last_in_size;
+    SizeVector last_out_size;
+    SizeVector new_in_size;
+    SizeVector new_out_size;
+
+    // Swap them with the size fields of the in/out descriptors; after this,
+    // last_in/last_out contain everything but the sizes
+    last_in_size.swap(std::get<2>(last_in));
+    last_out_size.swap(std::get<2>(last_out));
+    new_in_size.swap(std::get<2>(new_in));
+    new_out_size.swap(std::get<2>(new_out));
+
+    // If anything (except input sizes) changes, rebuild is required
+    if (last_in != new_in || last_out != new_out || last_algo != new_algo) {
+        return Update::REBUILD;
+    }
+
+    // If output sizes change, graph should be regenerated (resize
+    // ratio is taken from parameters)
+    if (last_out_size != new_out_size) {
+        return Update::REBUILD;
+    }
+
+    // If interpolation is AREA and sizes change upscale/downscale
+    // mode, rebuild is required
+    if (last_algo == RESIZE_AREA) {
+        // 0123 == NCHW
+        const auto is_upscale = [](const SizeVector &in, const SizeVector &out) -> bool {
+            return in[2] < out[2] || in[3] < out[3];
+        };
+        const bool old_upscale = is_upscale(last_in_size, last_out_size);
+        const bool new_upscale = is_upscale(new_in_size, new_out_size);
+        if (old_upscale != new_upscale) {
+            return Update::REBUILD;
+        }
+    }
+
+    // If only the input sizes change (considering the above exception),
+    // a reshape is enough
+    if (last_in_size != new_in_size) {
+        return Update::RESHAPE;
+    }
+
+    return Update::NOTHING;
+}
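+
+// (Illustrative examples of the policy above: changing only the input
+// width/height yields RESHAPE; changing precision, layout, algorithm or
+// the output size yields REBUILD; a call identical to the last one
+// yields NOTHING.)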
+
+bool InferenceEngine::PreprocEngine::preprocessWithGAPI(Blob::Ptr &inBlob, Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm, bool omp_serial) {
+    static const bool NO_GAPI = [](const char *str) -> bool {
+        std::string var(str ? str : "");
+        return var == "N" || var == "NO" || var == "OFF" || var == "0";
+    } (std::getenv("USE_GAPI"));
+
+    if (NO_GAPI)
+        return false;
+
+    const auto &in_desc_ie = inBlob->getTensorDesc();
+    const auto &out_desc_ie = outBlob->getTensorDesc();
+    auto supports_layout = [](Layout l) { return l == Layout::NCHW || l == Layout::NHWC; };
+    if (!supports_layout(inBlob->layout()) || !supports_layout(outBlob->layout())
+        || in_desc_ie.getDims().size() != 4 || out_desc_ie.getDims().size() != 4) {
+        THROW_IE_EXCEPTION << "Preprocessing supports NCHW/NHWC only";
+    }
+
+    const G::Desc
+        in_desc = G::decompose(inBlob),
+        out_desc = G::decompose(outBlob);
+
+    CallDesc thisCall = CallDesc{ BlobDesc{ in_desc_ie.getPrecision(),
+                                            inBlob->layout(),
+                                            in_desc_ie.getDims() },
+                                  BlobDesc{ out_desc_ie.getPrecision(),
+                                            outBlob->layout(),
+                                            out_desc_ie.getDims() },
+                                  algorithm };
+    const Update update = needUpdate(thisCall);
+
+    std::vector<cv::gapi::own::Mat> input_plane_mats  = bind_to_blob(inBlob);
+    std::vector<cv::gapi::own::Mat> output_plane_mats = bind_to_blob(outBlob);
+
+    Opt<cv::GComputation> _lastComputation;
+    if (Update::REBUILD == update || Update::RESHAPE == update) {
+        _lastCall = cv::util::make_optional(std::move(thisCall));
+
+        if (Update::REBUILD == update) {
+            //  rebuild the graph
+            IE_PROFILING_AUTO_SCOPE_TASK(_perf_graph_building);
+            _lastComputation = cv::util::make_optional(buildGraph(in_desc,
+                                                                  out_desc,
+                                                                  inBlob->layout(),
+                                                                  outBlob->layout(),
+                                                                  algorithm,
+                                                                  get_cv_depth(in_desc_ie)));
+        }
+    }
+
+    const int thread_num =
+            #if IE_THREAD == IE_THREAD_OMP
+                omp_serial ? 1 :    // disable threading for OpenMP if requested
+            #endif
+                0;                  // use all available threads
+
+    // to suppress unused warnings
+    (void)(omp_serial);
+
+    // Split the whole graph into `total_slices` slices, where
+    // `total_slices` is provided by the parallel runtime and assumed
+    // to be the number of threads used. However, the actual number of
+    // threads is not guaranteed to match this assumption, so it is
+    // possible that all slices end up processed by the same thread.
+    //
+    parallel_nt_static(thread_num, [&, this](int slice_n, const int total_slices){
+        IE_PROFILING_AUTO_SCOPE_TASK(_perf_exec_tile);
+
+        auto& compiled = _lastComp[slice_n];
+        if (Update::REBUILD == update || Update::RESHAPE == update) {
+            //  need to compile (or reshape) own object for a particular ROI
+            IE_PROFILING_AUTO_SCOPE_TASK(_perf_graph_compiling);
+
+            auto meta_of = [](std::vector<cv::gapi::own::Mat> const& ins){
+                std::vector<cv::GMetaArg> rslt;
+                rslt.reserve(ins.size());
+                for (auto& m : ins) {
+                    rslt.emplace_back(descr_of(m));
+                }
+                return rslt;
+            };
+
+            using cv::gapi::own::Rect;
+
+            const auto lines_per_thread = output_plane_mats[0].rows / total_slices;
+            const auto remainder = output_plane_mats[0].rows - total_slices * lines_per_thread;
+            const auto roi_height = lines_per_thread + ((slice_n == total_slices -1) ?  remainder : 0);
+
+            auto roi = Rect{0, slice_n * lines_per_thread, output_plane_mats[0].cols, roi_height};
+            std::vector<Rect> rois(output_plane_mats.size(), roi);
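+
+            // (Illustrative numbers: with 100 output rows and
+            //  total_slices == 3, lines_per_thread == 33 and remainder == 1,
+            //  so the ROIs cover 33, 33 and 34 rows respectively.)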
+
+            // TODO: make a ROI a runtime argument to avoid
+            // recompilations
+            auto args = cv::compile_args(gapi::preprocKernels(), cv::GFluidOutputRois{std::move(rois)});
+            if (Update::REBUILD == update) {
+                auto& computation = _lastComputation.value();
+                compiled = computation.compile(meta_of(input_plane_mats), std::move(args));
+            } else {
+                IE_ASSERT(compiled);
+                compiled.reshape(meta_of(input_plane_mats), std::move(args));
+            }
+        }
+
+        cv::GRunArgs call_ins;
+        cv::GRunArgsP call_outs;
+        for (const auto & m : input_plane_mats) { call_ins.emplace_back(m);}
+        for (auto & m : output_plane_mats) { call_outs.emplace_back(&m);}
+
+        IE_PROFILING_AUTO_SCOPE_TASK(_perf_exec_graph);
+        compiled(std::move(call_ins), std::move(call_outs));
+    });
+
+    return true;
+}
+}  // namespace InferenceEngine
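
As preprocessWithGAPI() above shows, the G-API preprocessing path can be switched off at runtime by setting the USE_GAPI environment variable to an "off" value ("N", "NO", "OFF" or "0"), in which case the function returns false and the caller falls back to the legacy path. A standalone sketch of the same opt-out pattern (envDisabled is an illustrative helper, not part of the code base):

    #include <cstdlib>
    #include <string>

    // True if the given environment variable is set to an "off" value,
    // mirroring the NO_GAPI check in preprocessWithGAPI().
    bool envDisabled(const char* name) {
        const char* str = std::getenv(name);
        std::string var(str ? str : "");
        return var == "N" || var == "NO" || var == "OFF" || var == "0";
    }

    int main() {
        if (envDisabled("USE_GAPI"))
            return 1;  // caller would fall back to the legacy preprocessing path
        // ... otherwise run the G-API based preprocessing ...
        return 0;
    }
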
diff --git a/inference-engine/src/inference_engine/ie_preprocess_gapi.hpp b/inference-engine/src/inference_engine/ie_preprocess_gapi.hpp
new file mode 100644 (file)
index 0000000..5d9168a
--- /dev/null
@@ -0,0 +1,42 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_blob.h"
+#include "ie_input_info.hpp"
+
+#include <tuple>
+#include <vector>
+#include <opencv2/gapi/gcompiled.hpp>
+#include <opencv2/gapi/util/optional.hpp>
+#include "ie_profiling.hpp"
+
+// FIXME: Move this definition back to ie_preprocess_data,
+// also free ie_preprocess_gapi of these details
+
+namespace InferenceEngine {
+
+class PreprocEngine {
+    using BlobDesc = std::tuple<Precision, Layout, SizeVector>;
+    using CallDesc = std::tuple<BlobDesc, BlobDesc, ResizeAlgorithm>;
+    template<typename T> using Opt = cv::util::optional<T>;
+
+    Opt<CallDesc> _lastCall;
+    std::vector<cv::GCompiled> _lastComp;
+
+    ProfilingTask _perf_graph_building {"Preproc Graph Building"};
+    ProfilingTask _perf_exec_tile  {"Preproc Calc Tile"};
+    ProfilingTask _perf_exec_graph {"Preproc Exec Graph"};
+    ProfilingTask _perf_graph_compiling {"Preproc Graph compiling"};
+
+    enum class Update { REBUILD, RESHAPE, NOTHING };
+    Update needUpdate(const CallDesc &newCall) const;
+
+public:
+    PreprocEngine();
+    bool preprocessWithGAPI(Blob::Ptr &inBlob, Blob::Ptr &outBlob, const ResizeAlgorithm &algorithm, bool omp_serial);
+};
+
+}  // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.cpp b/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.cpp
new file mode 100644 (file)
index 0000000..4910a2a
--- /dev/null
@@ -0,0 +1,1544 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ie_preprocess_gapi_kernels.hpp"
+#include "ie_preprocess_gapi_kernels_impl.hpp"
+
+// NOTE: must be included after "ie_preprocess_gapi_kernels_impl.hpp",
+// since MANUAL_SIMD is defined there
+#if MANUAL_SIMD
+  #include "cpu_detector.hpp"
+  #include "ie_preprocess_gapi_kernels_sse42.hpp"
+#endif
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/fluid/gfluidkernel.hpp>
+#include <opencv2/gapi/gcompoundkernel.hpp>
+
+#include <algorithm>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+namespace InferenceEngine {
+namespace gapi {
+
+namespace kernels {
+
+template<typename T, int chs> static
+void mergeRow(const std::array<const uint8_t*, chs>& ins, uint8_t* out, int length) {
+#if MANUAL_SIMD
+    if (with_cpu_x86_sse42()) {
+        if (std::is_same<T, uint8_t>::value && chs == 2) {
+            mergeRow_8UC2(ins[0], ins[1], out, length);
+            return;
+        }
+
+        if (std::is_same<T, uint8_t>::value && chs == 3) {
+            mergeRow_8UC3(ins[0], ins[1], ins[2], out, length);
+            return;
+        }
+
+        if (std::is_same<T, uint8_t>::value && chs == 4) {
+            mergeRow_8UC4(ins[0], ins[1], ins[2], ins[3], out, length);
+            return;
+        }
+
+        if (std::is_same<T, float>::value && chs == 2) {
+            mergeRow_32FC2(reinterpret_cast<const float*>(ins[0]),
+                           reinterpret_cast<const float*>(ins[1]),
+                           reinterpret_cast<float*>(out), length);
+            return;
+        }
+
+        if (std::is_same<T, float>::value && chs == 3) {
+            mergeRow_32FC3(reinterpret_cast<const float*>(ins[0]),
+                           reinterpret_cast<const float*>(ins[1]),
+                           reinterpret_cast<const float*>(ins[2]),
+                           reinterpret_cast<float*>(out), length);
+            return;
+        }
+
+        if (std::is_same<T, float>::value && chs == 4) {
+            mergeRow_32FC4(reinterpret_cast<const float*>(ins[0]),
+                           reinterpret_cast<const float*>(ins[1]),
+                           reinterpret_cast<const float*>(ins[2]),
+                           reinterpret_cast<const float*>(ins[3]),
+                           reinterpret_cast<float*>(out), length);
+            return;
+        }
+    }
+#endif
+
+    const T* insT[chs];
+    for (int c = 0; c < chs; c++) {
+        insT[c] = reinterpret_cast<const T*>(ins[c]);
+    }
+    auto outT = reinterpret_cast<T*>(out);
+
+    for (int x = 0; x < length; x++) {
+        for (int c = 0; c < chs; c++) {
+            outT[chs*x + c] = insT[c][x];
+        }
+    }
+}
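+
+// (Illustrative example: with chs == 3 and length == 2, the input rows
+//  {R0, R1}, {G0, G1}, {B0, B1} are interleaved into
+//  out == {R0, G0, B0, R1, G1, B1}.)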
+
+template<typename T, int chs> static
+void splitRow(const uint8_t* in, std::array<uint8_t*, chs>& outs, int length) {
+#if MANUAL_SIMD
+    if (with_cpu_x86_sse42()) {
+        if (std::is_same<T, uint8_t>::value && chs == 2) {
+            splitRow_8UC2(in, outs[0], outs[1], length);
+            return;
+        }
+
+        if (std::is_same<T, uint8_t>::value && chs == 3) {
+            splitRow_8UC3(in, outs[0], outs[1], outs[2], length);
+            return;
+        }
+
+        if (std::is_same<T, uint8_t>::value && chs == 4) {
+            splitRow_8UC4(in, outs[0], outs[1], outs[2], outs[3], length);
+            return;
+        }
+
+        if (std::is_same<T, float>::value && chs == 2) {
+            splitRow_32FC2(reinterpret_cast<const float*>(in),
+                           reinterpret_cast<float*>(outs[0]),
+                           reinterpret_cast<float*>(outs[1]),
+                           length);
+            return;
+        }
+
+        if (std::is_same<T, float>::value && chs == 3) {
+            splitRow_32FC3(reinterpret_cast<const float*>(in),
+                           reinterpret_cast<float*>(outs[0]),
+                           reinterpret_cast<float*>(outs[1]),
+                           reinterpret_cast<float*>(outs[2]),
+                           length);
+            return;
+        }
+
+        if (std::is_same<T, float>::value && chs == 4) {
+            splitRow_32FC4(reinterpret_cast<const float*>(in),
+                           reinterpret_cast<float*>(outs[0]),
+                           reinterpret_cast<float*>(outs[1]),
+                           reinterpret_cast<float*>(outs[2]),
+                           reinterpret_cast<float*>(outs[3]),
+                           length);
+            return;
+        }
+    }
+#endif
+
+    auto inT = reinterpret_cast<const T*>(in);
+
+    T* outsT[chs];
+    for (int c = 0; c < chs; c++) {
+        outsT[c] = reinterpret_cast<T*>(outs[c]);
+    }
+
+    for (int x = 0; x < length; x++) {
+        for (int c = 0; c < chs; c++) {
+            outsT[c][x] = inT[chs*x + c];
+        }
+    }
+}
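+
+// (Illustrative example, the inverse of mergeRow: with chs == 3 and
+//  length == 2, in == {R0, G0, B0, R1, G1, B1} is split into
+//  {R0, R1}, {G0, G1} and {B0, B1}.)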
+
+GAPI_FLUID_KERNEL(FMerge2, Merge2, false) {
+    static const int LPI = 4;
+    static const int Window = 1;
+    static void run(const cv::gapi::fluid::View& a,
+                    const cv::gapi::fluid::View& b,
+                          cv::gapi::fluid::Buffer& out) {
+        const auto rowFunc = (a.meta().depth == CV_8U) ? &mergeRow<uint8_t, 2> : &mergeRow<float, 2>;
+        for (int l = 0; l < out.lpi(); l++) {
+            rowFunc({a.InLineB(l), b.InLineB(l)}, out.OutLineB(l), a.length());
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FMerge3, Merge3, false) {
+    static const int LPI = 4;
+    static const int Window = 1;
+    static void run(const cv::gapi::fluid::View& a,
+                    const cv::gapi::fluid::View& b,
+                    const cv::gapi::fluid::View& c,
+                          cv::gapi::fluid::Buffer& out) {
+        const auto rowFunc = (a.meta().depth == CV_8U) ? &mergeRow<uint8_t, 3> : &mergeRow<float, 3>;
+        for (int l = 0; l < out.lpi(); l++) {
+            rowFunc({a.InLineB(l), b.InLineB(l), c.InLineB(l)}, out.OutLineB(l), a.length());
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FMerge4, Merge4, false) {
+    static const int LPI = 4;
+    static const int Window = 1;
+    static void run(const cv::gapi::fluid::View& a,
+                    const cv::gapi::fluid::View& b,
+                    const cv::gapi::fluid::View& c,
+                    const cv::gapi::fluid::View& d,
+                          cv::gapi::fluid::Buffer& out) {
+        const auto rowFunc = (a.meta().depth == CV_8U) ? &mergeRow<uint8_t, 4> : &mergeRow<float, 4>;
+        for (int l = 0; l < out.lpi(); l++) {
+            rowFunc({a.InLineB(l), b.InLineB(l), c.InLineB(l), d.InLineB(l)}, out.OutLineB(l), a.length());
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FSplit2, Split2, false) {
+    static const int LPI = 4;
+    static const int Window = 1;
+    static void run(const cv::gapi::fluid::View  & in,
+                          cv::gapi::fluid::Buffer& out1,
+                          cv::gapi::fluid::Buffer& out2) {
+        GAPI_DbgAssert(2 == in.meta().chan);
+        GAPI_DbgAssert(1 == out1.meta().chan);
+        GAPI_DbgAssert(1 == out2.meta().chan);
+        GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
+        GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
+        GAPI_DbgAssert(CV_8U == in.meta().depth || CV_32F == in.meta().depth);
+        const auto rowFunc = (in.meta().depth == CV_8U) ?
+                             &splitRow<uint8_t, 2> :
+                             &splitRow<float  , 2>;
+        for (int i = 0, lpi = out1.lpi(); i < lpi; i++) {
+            std::array<uint8_t*, 2> outs = {out1.OutLineB(i), out2.OutLineB(i)};
+            rowFunc(in.InLineB(i), outs, in.length());
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FSplit3, Split3, false) {
+    static const int LPI = 4;
+    static const int Window = 1;
+    static void run(const cv::gapi::fluid::View  & in,
+                          cv::gapi::fluid::Buffer& out1,
+                          cv::gapi::fluid::Buffer& out2,
+                          cv::gapi::fluid::Buffer& out3) {
+        GAPI_DbgAssert(3 == in.meta().chan);
+        GAPI_DbgAssert(1 == out1.meta().chan);
+        GAPI_DbgAssert(1 == out2.meta().chan);
+        GAPI_DbgAssert(1 == out3.meta().chan);
+        GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
+        GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
+        GAPI_DbgAssert(in.meta().depth == out3.meta().depth);
+        GAPI_DbgAssert(CV_8U == in.meta().depth || CV_32F == in.meta().depth);
+        const auto rowFunc = (in.meta().depth == CV_8U) ?
+                             &splitRow<uint8_t, 3> :
+                             &splitRow<float  , 3>;
+        for (int i = 0, lpi = out1.lpi(); i < lpi; i++) {
+            std::array<uint8_t*, 3> outs = {out1.OutLineB(i), out2.OutLineB(i),
+                                            out3.OutLineB(i)};
+            rowFunc(in.InLineB(i), outs, in.length());
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FSplit4, Split4, false) {
+    static const int LPI = 4;
+    static const int Window = 1;
+    static void run(const cv::gapi::fluid::View  & in,
+                          cv::gapi::fluid::Buffer& out1,
+                          cv::gapi::fluid::Buffer& out2,
+                          cv::gapi::fluid::Buffer& out3,
+                          cv::gapi::fluid::Buffer& out4) {
+        GAPI_DbgAssert(4 == in.meta().chan);
+        GAPI_DbgAssert(1 == out1.meta().chan);
+        GAPI_DbgAssert(1 == out2.meta().chan);
+        GAPI_DbgAssert(1 == out3.meta().chan);
+        GAPI_DbgAssert(1 == out4.meta().chan);
+        GAPI_DbgAssert(in.meta().depth == out1.meta().depth);
+        GAPI_DbgAssert(in.meta().depth == out2.meta().depth);
+        GAPI_DbgAssert(in.meta().depth == out3.meta().depth);
+        GAPI_DbgAssert(in.meta().depth == out4.meta().depth);
+        GAPI_DbgAssert(CV_8U == in.meta().depth || CV_32F == in.meta().depth);
+        const auto rowFunc = (in.meta().depth == CV_8U) ?
+                             &splitRow<uint8_t, 4> :
+                             &splitRow<float  , 4>;
+        for (int i = 0, lpi = out1.lpi(); i < lpi; i++) {
+            std::array<uint8_t*, 4> outs = {out1.OutLineB(i), out2.OutLineB(i),
+                                            out3.OutLineB(i), out4.OutLineB(i)};
+            rowFunc(in.InLineB(i), outs, in.length());
+        }
+    }
+};
+
+//----------------------------------------------------------------------
+
+template<typename T>
+static void chanToPlaneRow(const uint8_t* in, int chan, int chs, uint8_t* out, int length) {
+    const auto inT  = reinterpret_cast<const T*>(in);
+          auto outT = reinterpret_cast<      T*>(out);
+
+    for (int x = 0; x < length; x++) {
+        outT[x] = inT[x*chs + chan];
+    }
+}
+
+//    GAPI_OCV_KERNEL(OCVChanToPlane, ChanToPlane) {
+//        static void run(const cv::Mat &in, int chan, cv::Mat &out) {
+//            out.create(in.rows, in.cols, in.depth());
+//            const auto rowFunc = (in.depth() == CV_8U) ? &chanToPlaneRow<uint8_t> : &chanToPlaneRow<float>;
+
+//            for (int y = 0; y < out.rows; y++)
+//            {
+//                rowFunc(in.data + y*in.step, chan, in.channels(), out.data + y*out.step, in.cols);
+//            }
+//        }
+//    };
+
+//    GAPI_OCV_KERNEL(OCVScalePlane, ScalePlane) {
+//        static void run(const cv::Mat &in, int /*type*/, const Size &sz, int interp, cv::Mat &out) {
+//            cv::resize(in, out, sz, 0, 0, interp);
+//        }
+//    };
+
+//    GAPI_OCV_KERNEL(OCVMerge2, Merge2) {
+//        static void run(const cv::Mat &a, const cv::Mat &b, cv::Mat out) {
+//            out.create(a.rows, a.cols, CV_MAKETYPE(a.depth(), 2));
+//            const auto rowFunc = (a.depth() == CV_8U) ? &mergeRow<uint8_t, 2> : &mergeRow<float, 2>;
+
+//            for (int y = 0; y < out.rows; y++)
+//            {
+//                rowFunc({a.data + y*a.step, b.data + y*b.step}, out.data + out.step, a.cols);
+//            }
+//        }
+//    };
+
+GAPI_FLUID_KERNEL(FChanToPlane, ChanToPlane, false) {
+    static const int Window = 1;
+    static void run(const cv::gapi::fluid::View& in, int chan,
+                    cv::gapi::fluid::Buffer& out) {
+        const auto rowFunc = (in.meta().depth == CV_8U) ? &chanToPlaneRow<uint8_t> : &chanToPlaneRow<float>;
+        rowFunc(in.InLineB(0), chan, in.meta().chan, out.OutLineB(), in.length());
+    }
+};
+
+//----------------------------------------------------------------------
+
+G_TYPED_KERNEL(ScalePlane8u, <cv::GMat(cv::GMat, Size, int)>, "com.intel.ie.scale_plane_8u") {
+    static cv::GMatDesc outMeta(const cv::GMatDesc &in, const Size &sz, int) {
+        GAPI_DbgAssert(in.depth == CV_8U && in.chan == 1);
+        return in.withSize(sz);
+    }
+};
+
+G_TYPED_KERNEL(ScalePlane32f, <cv::GMat(cv::GMat, Size, int)>, "com.intel.ie.scale_plane_32f") {
+    static cv::GMatDesc outMeta(const cv::GMatDesc &in, const Size &sz, int) {
+        GAPI_DbgAssert(in.depth == CV_32F && in.chan == 1);
+        return in.withSize(sz);
+    }
+};
+
+G_TYPED_KERNEL(UpscalePlaneArea8u, <cv::GMat(cv::GMat, Size, int)>, "com.intel.ie.upscale_plane_area_8u") {
+    static cv::GMatDesc outMeta(const cv::GMatDesc &in, const Size &sz, int) {
+        GAPI_DbgAssert(in.depth == CV_8U && in.chan == 1);
+        GAPI_DbgAssert(in.size.width < sz.width || in.size.height < sz.height);
+        return in.withSize(sz);
+    }
+};
+
+G_TYPED_KERNEL(UpscalePlaneArea32f, <cv::GMat(cv::GMat, Size, int)>, "com.intel.ie.upscale_plane_area_32f") {
+    static cv::GMatDesc outMeta(const cv::GMatDesc &in, const Size &sz, int) {
+        GAPI_DbgAssert(in.depth == CV_32F && in.chan == 1);
+        GAPI_DbgAssert(in.size.width < sz.width || in.size.height < sz.height);
+        return in.withSize(sz);
+    }
+};
+
+G_TYPED_KERNEL(ScalePlaneArea8u, <cv::GMat(cv::GMat, Size, int)>, "com.intel.ie.scale_plane_area_8u") {
+    static cv::GMatDesc outMeta(const cv::GMatDesc &in, const Size &sz, int) {
+        GAPI_DbgAssert(in.depth == CV_8U && in.chan == 1);
+        GAPI_DbgAssert(in.size.width >= sz.width && in.size.height >= sz.height);
+        return in.withSize(sz);
+    }
+};
+
+G_TYPED_KERNEL(ScalePlaneArea32f, <cv::GMat(cv::GMat, Size, int)>, "com.intel.ie.scale_plane_area_32f") {
+    static cv::GMatDesc outMeta(const cv::GMatDesc &in, const Size &sz, int) {
+        GAPI_DbgAssert(in.depth == CV_32F && in.chan == 1);
+        GAPI_DbgAssert(in.size.width >= sz.width && in.size.height >= sz.height);
+        return in.withSize(sz);
+    }
+};
+
+GAPI_COMPOUND_KERNEL(FScalePlane, ScalePlane) {
+    static cv::GMat expand(cv::GMat in, int type, const Size& szIn, const Size& szOut, int interp) {
+        GAPI_DbgAssert(CV_8UC1 == type || CV_32FC1 == type);
+        GAPI_DbgAssert(cv::INTER_AREA == interp || cv::INTER_LINEAR == interp);
+
+        if (cv::INTER_AREA == interp) {
+            bool upscale = szIn.width < szOut.width || szIn.height < szOut.height;
+            if (CV_8UC1 == type) {
+                if (upscale)
+                    return UpscalePlaneArea8u::on(in, szOut, interp);
+                else
+                    return   ScalePlaneArea8u::on(in, szOut, interp);
+            }
+            if (CV_32FC1 == type) {
+                if (upscale)
+                    return UpscalePlaneArea32f::on(in, szOut, interp);
+                else
+                    return   ScalePlaneArea32f::on(in, szOut, interp);
+            }
+        }
+
+        if (cv::INTER_LINEAR == interp) {
+            if (CV_8UC1 == type) {
+                return ScalePlane8u::on(in, szOut, interp);
+            }
+            if (CV_32FC1 == type) {
+                return ScalePlane32f::on(in, szOut, interp);
+            }
+        }
+
+        GAPI_Assert(!"unsupported parameters");
+        return {};
+    }
+};
+
+static inline double invRatio(int inSz, int outSz) {
+    return static_cast<double>(outSz) / inSz;
+}
+
+static inline double ratio(int inSz, int outSz) {
+    return 1 / invRatio(inSz, outSz);
+}
+
+template<typename T, typename Mapper, int chanNum>
+struct linearScratchDesc {
+    using alpha_t = typename Mapper::alpha_type;
+    using index_t = typename Mapper::index_type;
+
+    alpha_t* alpha;
+    alpha_t* clone;
+    index_t* mapsx;
+    alpha_t* beta;
+    index_t* mapsy;
+    T*       tmp;
+
+    linearScratchDesc(int /*inW*/, int /*inH*/, int outW, int outH,  void* data) {
+        alpha = reinterpret_cast<alpha_t*>(data);
+        clone = reinterpret_cast<alpha_t*>(alpha + outW);
+        mapsx = reinterpret_cast<index_t*>(clone + outW*4);
+        beta  = reinterpret_cast<alpha_t*>(mapsx + outW);
+        mapsy = reinterpret_cast<index_t*>(beta  + outH);
+        tmp   = reinterpret_cast<T*>      (mapsy + outH*2);
+    }
+
+    static int bufSize(int inW, int inH, int outW, int outH, int lpi) {
+        auto size = outW * sizeof(alpha_t)     +
+                    outW * sizeof(alpha_t) * 4 +  // alpha clones // previous alpha is redundant?
+                    outW * sizeof(index_t)     +
+                    outH * sizeof(alpha_t)     +
+                    outH * sizeof(index_t) * 2 +
+                     inW * sizeof(T) * lpi * chanNum;
+
+        return static_cast<int>(size);
+    }
+};
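+
+// (Scratch buffer layout, as laid out by the constructor above:
+//  [alpha: outW][clone: 4*outW][mapsx: outW][beta: outH][mapsy: 2*outH][tmp],
+//  where tmp holds up to inW * lpi * chanNum elements of type T.)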
+
+template<typename T, typename Mapper, int chanNum = 1>
+static void initScratchLinear(const cv::GMatDesc& in,
+                              const         Size& outSz,
+                         cv::gapi::fluid::Buffer& scratch,
+                                             int  lpi) {
+    using alpha_type = typename Mapper::alpha_type;
+    using index_type = typename Mapper::index_type;
+    static const auto unity = Mapper::unity;
+
+    auto inSz = in.size;
+    auto sbufsize = linearScratchDesc<T, Mapper, chanNum>::bufSize(inSz.width, inSz.height, outSz.width, outSz.height, lpi);
+
+    Size scratch_size{sbufsize, 1};
+
+    cv::GMatDesc desc;
+    desc.chan = 1;
+    desc.depth = CV_8UC1;
+    desc.size = scratch_size;
+
+    cv::gapi::fluid::Buffer buffer(desc);
+    scratch = std::move(buffer);
+
+    double hRatio = ratio(in.size.width, outSz.width);
+    double vRatio = ratio(in.size.height, outSz.height);
+
+    linearScratchDesc<T, Mapper, chanNum> scr(inSz.width, inSz.height, outSz.width, outSz.height, scratch.OutLineB());
+
+    auto *alpha = scr.alpha;
+    auto *clone = scr.clone;
+    auto *index = scr.mapsx;
+
+    for (int x = 0; x < outSz.width; x++) {
+        auto map = Mapper::map(hRatio, 0, in.size.width, x);
+        auto alpha0 = map.alpha0;
+        auto index0 = map.index0;
+
+        // TRICK:
+        // The algorithm takes a pair of input pixels, the sx0'th and the
+        // sx1'th, and computes the result as alpha0*src[sx0] + alpha1*src[sx1].
+        // By definition, either sx1 == sx0 + 1 or sx1 == sx0, and
+        // alpha0 + alpha1 == unity (scaled appropriately).
+        // Here we modify the formulas for alpha0 and sx1: we assume that
+        // sx1 == sx0 + 1 always, and patch alpha0 so that the result
+        // remains intact.
+        // Note that we need in.size.width >= 2, so that both sx0 and
+        // sx0+1 index pixels inside the input's width.
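+        //
+        // (Illustrative example: if the mapping yields index1 == index0 == sx0
+        //  with unity == 1.0 and alpha0 == 0.3, the intended result
+        //  0.3*src[sx0] + 0.7*src[sx0] equals 1.0*src[sx0]; setting
+        //  alpha0 = unity while keeping sx1 = sx0 + 1 produces the same value.)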
+        if (map.index1 != map.index0 + 1) {
+            GAPI_DbgAssert(map.index1 == map.index0);
+            GAPI_DbgAssert(in.size.width >= 2);
+            if (map.index0 < in.size.width-1) {
+                // sx1=sx0+1 fits inside row,
+                // make sure alpha0=unity and alpha1=0,
+                // so that result equals src[sx0]*unity
+                alpha0 = saturate_cast<alpha_type>(unity);
+            } else {
+                // shift sx0 to left by 1 pixel,
+                // and make sure that alpha0=0 and alpha1==1,
+                // so that result equals to src[sx0+1]*unity
+                alpha0 = 0;
+                index0--;
+            }
+        }
+
+        alpha[x] = alpha0;
+        index[x] = index0;
+
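+        // alpha is replicated 4x into clone[], presumably so the SSE4.2 path
+        // (calcRowLinear_8U) can load one coefficient copy per lane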
+        for (int l = 0; l < 4; l++) {
+            clone[4*x + l] = alpha0;
+        }
+    }
+
+    auto *beta    = scr.beta;
+    auto *index_y = scr.mapsy;
+
+    for (int y = 0; y < outSz.height; y++) {
+        auto mapY = Mapper::map(vRatio, 0, in.size.height, y);
+        beta[y] = mapY.alpha0;
+        index_y[y] = mapY.index0;
+        index_y[outSz.height + y] = mapY.index1;
+    }
+}
+
+template<typename T, class Mapper>
+static void calcRowLinear(const cv::gapi::fluid::View  & in,
+                                cv::gapi::fluid::Buffer& out,
+                                cv::gapi::fluid::Buffer& scratch) {
+    using alpha_type = typename Mapper::alpha_type;
+
+    auto  inSz =  in.meta().size;
+    auto outSz = out.meta().size;
+
+    auto inY = in.y();
+    int length = out.length();
+    int outY = out.y();
+    int lpi = out.lpi();
+    GAPI_DbgAssert(outY + lpi <= outSz.height);
+
+    GAPI_DbgAssert(lpi <= 4);
+
+    linearScratchDesc<T, Mapper, 1> scr(inSz.width, inSz.height, outSz.width, outSz.height, scratch.OutLineB());
+
+    const auto *alpha = scr.alpha;
+    const auto *clone = scr.clone;
+    const auto *mapsx = scr.mapsx;
+    const auto *beta0 = scr.beta;
+    const auto *mapsy = scr.mapsy;
+    auto *tmp         = scr.tmp;
+
+    const auto *beta = beta0 + outY;
+    const T *src0[4];
+    const T *src1[4];
+    T *dst[4];
+
+    for (int l = 0; l < lpi; l++) {
+        auto index0 = mapsy[outY + l] - inY;
+        auto index1 = mapsy[outSz.height + outY + l] - inY;
+        src0[l] = in.InLine<const T>(index0);
+        src1[l] = in.InLine<const T>(index1);
+        dst[l] = out.OutLine<T>(l);
+    }
+
+#if MANUAL_SIMD
+    if (with_cpu_x86_sse42()) {
+        if (std::is_same<T, uint8_t>::value) {
+            if (inSz.width >= 16 && outSz.width >= 8) {
+                calcRowLinear_8U(reinterpret_cast<uint8_t**>(dst),
+                                 reinterpret_cast<const uint8_t**>(src0),
+                                 reinterpret_cast<const uint8_t**>(src1),
+                                 reinterpret_cast<const short*>(alpha),
+                                 reinterpret_cast<const short*>(clone),
+                                 reinterpret_cast<const short*>(mapsx),
+                                 reinterpret_cast<const short*>(beta),
+                                 reinterpret_cast<uint8_t*>(tmp),
+                                 inSz, outSz, lpi);
+                return;
+            }
+        }
+
+        if (std::is_same<T, float>::value) {
+            calcRowLinear_32F(reinterpret_cast<float**>(dst),
+                              reinterpret_cast<const float**>(src0),
+                              reinterpret_cast<const float**>(src1),
+                              reinterpret_cast<const float*>(alpha),
+                              reinterpret_cast<const int*>(mapsx),
+                              reinterpret_cast<const float*>(beta),
+                              reinterpret_cast<float*>(tmp),
+                              inSz, outSz, lpi);
+            return;
+        }
+    }
+#endif
+
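+    // Scalar fallback: blend the two source rows vertically with beta,
+    // then blend the two resulting pixels horizontally with alpha.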
+    for (int l = 0; l < lpi; l++) {
+        constexpr static const auto unity = Mapper::unity;
+
+        auto beta0 =                                   beta[l];
+        auto beta1 = saturate_cast<alpha_type>(unity - beta[l]);
+
+        for (int x = 0; x < length; x++) {
+            auto alpha0 =                                   alpha[x];
+            auto alpha1 = saturate_cast<alpha_type>(unity - alpha[x]);
+            auto sx0 = mapsx[x];
+            auto sx1 = sx0 + 1;
+            T tmp0 = calc(beta0, src0[l][sx0], beta1, src1[l][sx0]);
+            T tmp1 = calc(beta0, src0[l][sx1], beta1, src1[l][sx1]);
+            dst[l][x] = calc(alpha0, tmp0, alpha1, tmp1);
+        }
+    }
+}
+
+template<typename T, class Mapper>
+static void calcRowLinearC3(const cv::gapi::fluid::View  & in,
+                                  cv::gapi::fluid::Buffer& out0,
+                                  cv::gapi::fluid::Buffer& out1,
+                                  cv::gapi::fluid::Buffer& out2,
+                                  cv::gapi::fluid::Buffer& scratch) {
+    using alpha_type = typename Mapper::alpha_type;
+
+    auto  inSz =  in.meta().size;
+    auto outSz = out0.meta().size;
+
+    auto inY  = in.y();
+    auto outY = out0.y();
+    auto lpi  = out0.lpi();
+
+    GAPI_DbgAssert(outY + lpi <= outSz.height);
+    GAPI_DbgAssert(lpi <= 4);
+
+    linearScratchDesc<T, Mapper, 3> scr(inSz.width, inSz.height, outSz.width, outSz.height, scratch.OutLineB());
+
+    const auto *alpha = scr.alpha;
+    const auto *clone = scr.clone;
+    const auto *mapsx = scr.mapsx;
+    const auto *beta0 = scr.beta;
+    const auto *mapsy = scr.mapsy;
+    auto *tmp         = scr.tmp;
+
+    const auto *beta = beta0 + outY;
+    const T *src0[4];
+    const T *src1[4];
+    std::array<std::array<T*, 4>, 3> dst;
+
+    for (int l = 0; l < lpi; l++) {
+        auto index0 = mapsy[outY + l] - inY;
+        auto index1 = mapsy[outSz.height + outY + l] - inY;
+        src0[l] = in.InLine<const T>(index0);
+        src1[l] = in.InLine<const T>(index1);
+        dst[0][l] = out0.OutLine<T>(l);
+        dst[1][l] = out1.OutLine<T>(l);
+        dst[2][l] = out2.OutLine<T>(l);
+    }
+
+#if MANUAL_SIMD
+    if (with_cpu_x86_sse42()) {
+        if (inSz.width >= 16 && outSz.width >= 8) {
+            calcRowLinear_8UC3(dst,
+                               reinterpret_cast<const uint8_t**>(src0),
+                               reinterpret_cast<const uint8_t**>(src1),
+                               reinterpret_cast<const short*>(alpha),
+                               reinterpret_cast<const short*>(clone),
+                               reinterpret_cast<const short*>(mapsx),
+                               reinterpret_cast<const short*>(beta),
+                               reinterpret_cast<uint8_t*>(tmp),
+                               inSz, outSz, lpi);
+            return;
+        }
+    }
+#endif
+
+    auto length = out0.length();
+
+    for (int l = 0; l < lpi; l++) {
+        constexpr static const auto unity = Mapper::unity;
+
+        auto beta0 =                                   beta[l];
+        auto beta1 = saturate_cast<alpha_type>(unity - beta[l]);
+
+        for (int x = 0; x < length; x++) {
+            auto alpha0 =                                   alpha[x];
+            auto alpha1 = saturate_cast<alpha_type>(unity - alpha[x]);
+            auto sx0 = mapsx[x];
+            auto sx1 = sx0 + 1;
+
+            for (int c = 0; c < 3; c++) {
+                auto idx0 = 3*sx0 + c;
+                auto idx1 = 3*sx1 + c;
+                T tmp0 = calc(beta0, src0[l][idx0], beta1, src1[l][idx0]);
+                T tmp1 = calc(beta0, src0[l][idx1], beta1, src1[l][idx1]);
+                dst[c][l][x] = calc(alpha0, tmp0, alpha1, tmp1);
+            }
+        }
+    }
+}
+
+
+//------------------------------------------------------------------------------
+
+namespace linear {
+struct Mapper {
+    typedef short alpha_type;
+    typedef short index_type;
+    constexpr static const int unity = ONE;
+
+    typedef MapperUnit<short, short> Unit;
+
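+    // Worked example (illustrative): for a 2x downscale, ratio == 2.0 and
+    // outCoord == 0 give f == 0.5, s == 0; hence index0 == 0, index1 == 1,
+    // and alpha0 == alpha1 == ONE/2 == 16384.
+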
+    static inline Unit map(double ratio, int start, int max, int outCoord) {
+        float f = ((outCoord + 0.5f) * ratio - 0.5f);
+        int s = cvFloor(f);
+        f -= s;
+
+        Unit u;
+
+        u.index0 = std::max(s - start, 0);
+        u.index1 = ((f == 0.0) || s + 1 >= max) ? s - start : s - start + 1;
+
+        u.alpha0 = saturate_cast<short>(ONE * (1.0f - f));
+        u.alpha1 = saturate_cast<short>(ONE *         f);
+
+        return u;
+    }
+};
+}  // namespace linear
+
+namespace linear32f {
+struct Mapper {
+    typedef float alpha_type;
+    typedef int   index_type;
+    constexpr static const float unity = 1;
+
+    typedef MapperUnit<float, int> Unit;
+
+    static inline Unit map(double ratio, int start, int max, int outCoord) {
+        float f = ((outCoord + 0.5f) * ratio - 0.5f);
+        int s = cvFloor(f);
+        f -= s;
+
+        Unit u;
+
+        u.index0 = std::max(s - start, 0);
+        u.index1 = ((f == 0.0) || s + 1 >= max) ? s - start : s - start + 1;
+
+        u.alpha0 = 1.f - f;
+        u.alpha1 =       f;
+
+        return u;
+    }
+};
+}  // namespace linear32f
+
+namespace areaUpscale {
+struct Mapper {
+    typedef short alpha_type;
+    typedef short index_type;
+    constexpr static const int unity = ONE;
+
+    typedef MapperUnit<short, short> Unit;
+
+    static inline Unit map(double ratio, int start, int max, int outCoord) {
+        int s = cvFloor(outCoord*ratio);
+        float f = static_cast<float>((outCoord+1) - (s+1)/ratio);
+        f = f <= 0 ? 0.f : f - cvFloor(f);
+
+        Unit u;
+
+        u.index0 = std::max(s - start, 0);
+        u.index1 = ((f == 0.0) || s + 1 >= max) ? s - start : s - start + 1;
+
+        u.alpha0 = saturate_cast<short>(ONE * (1.0f - f));
+        u.alpha1 = saturate_cast<short>(ONE *         f);
+
+        return u;
+    }
+};
+}  // namespace areaUpscale
+
+namespace areaUpscale32f {
+struct Mapper {
+    typedef float alpha_type;
+    typedef int   index_type;
+    constexpr static const float unity = 1;
+
+    typedef MapperUnit<float, int> Unit;
+
+    static inline Unit map(double ratio, int start, int max, int outCoord) {
+        int s = cvFloor(outCoord*ratio);
+        float f = static_cast<float>((outCoord+1) - (s+1)/ratio);
+        f = f <= 0 ? 0.f : f - cvFloor(f);
+
+        Unit u;
+
+        u.index0 = std::max(s - start, 0);
+        u.index1 = ((f == 0.0) || s + 1 >= max) ? s - start : s - start + 1;
+
+        u.alpha0 = 1.0f - f;
+        u.alpha1 =        f;
+
+        return u;
+    }
+};
+}  // namespace areaUpscale32f
+
+//------------------------------------------------------------------------------
+
+template<typename A, typename I, typename W>
+struct AreaDownMapper {
+    typedef A alpha_type;
+    typedef I index_type;
+    typedef W  work_type;
+
+    typedef MapperUnit<alpha_type, index_type> Unit;
+
+    inline Unit map(int outCoord) {
+        double inCoord0 =  outCoord      * ratio;
+        double inCoord1 = (outCoord + 1) * ratio;
+
+        double index0 = std::floor(inCoord0 + 0.001);
+        double index1 =  std::ceil(inCoord1 - 0.001);
+
+        double alpha0 =   (index0 + 1 - inCoord0) * inv_ratio;
+        double alpha1 = - (index1 - 1 - inCoord1) * inv_ratio;
+
+        GAPI_Assert(0 <= outCoord && outCoord <= outSz-1);
+        GAPI_Assert(0 <= index0 && index0 < index1 && index1 <= inSz);
+
+        Unit unit;
+
+        unit.index0 = checked_cast<index_type>(index0);
+        unit.index1 = checked_cast<index_type>(index1);
+
+        unit.alpha0 = convert_cast<alpha_type>(alpha0);
+        unit.alpha1 = convert_cast<alpha_type>(alpha1);
+
+        return unit;
+    }
+
+    int    inSz, outSz;
+    double ratio, inv_ratio;
+
+    alpha_type  alpha;  // == inv_ratio, rounded
+
+    void init(int _inSz, int _outSz) {
+        inSz  = _inSz;
+        outSz = _outSz;
+
+        inv_ratio = invRatio(inSz, outSz);
+        ratio     = 1.0 / inv_ratio;
+
+        alpha = convert_cast<alpha_type>(inv_ratio);
+    }
+};
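+
+// Worked example (illustrative): for inSz == 10 and outSz == 4, ratio == 2.5
+// and inv_ratio == 0.4; map(3) covers input pixels [7.5, 10): index0 == 7,
+// index1 == 10, alpha0 == 0.2, alpha1 == 0.4, and the middle pixel is weighted
+// by alpha == 0.4, so the weights sum to 1.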
+
+namespace areaDownscale32f {
+struct Mapper: public AreaDownMapper<float, int, float> {
+    Mapper(int _inSz, int _outSz) {
+        init(_inSz, _outSz);
+    }
+};
+}
+
+namespace areaDownscale8u {
+struct Mapper: public AreaDownMapper<Q0_16, short, Q8_8> {
+    Mapper(int _inSz, int _outSz) {
+        init(_inSz, _outSz);
+    }
+};
+}
+
+template<typename Mapper>
+static void initScratchArea(const cv::GMatDesc& in, const Size& outSz,
+                            cv::gapi::fluid::Buffer &scratch) {
+    using Unit = typename Mapper::Unit;
+    using alpha_type = typename Mapper::alpha_type;
+    using index_type = typename Mapper::index_type;
+
+    // compute the chunk of input pixels for each output pixel,
+    // along with the coefficients for taking the weighted sum
+
+    Size inSz = in.size;
+    Mapper mapper(inSz.width, outSz.width);
+
+    std::vector<Unit> xmaps(outSz.width);
+    int  maxdif = 0;
+
+    for (int w = 0; w < outSz.width; w++) {
+        Unit map = mapper.map(w);
+        xmaps[w] = map;
+
+        int dif = map.index1 - map.index0;
+        if (dif > maxdif)
+            maxdif = dif;
+    }
+
+    // This assertion is critical for our trick with chunk sizes:
+    // we expand a chunk if it is smaller than the maximal size
+    GAPI_Assert(inSz.width >= maxdif);
+
+    // pack the input chunks positions and coefficients into scratch-buffer,
+    // along with the maximal size of chunk (note that chunk size may vary)
+
+    size_t scratch_bytes =               sizeof(int)
+                         + outSz.width * sizeof(index_type)
+                         + outSz.width * sizeof(alpha_type) * maxdif
+                         +  inSz.width * sizeof(alpha_type);
+    Size scratch_size{static_cast<int>(scratch_bytes), 1};
+
+    cv::GMatDesc desc;
+    desc.chan = 1;
+    desc.depth = CV_8UC1;
+    desc.size = scratch_size;
+
+    cv::gapi::fluid::Buffer buffer(desc);
+    scratch = std::move(buffer);
+
+    auto *maxdf =  scratch.OutLine<int>();
+    auto *index = reinterpret_cast<index_type*>(maxdf + 1);
+    auto *alpha = reinterpret_cast<alpha_type*>(index + outSz.width);
+//  auto *vbuf  = reinterpret_cast<work_type *>(alpha + outSz.width * maxdif);
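+    // scratch layout: [ maxdif : int ][ index : outW ][ alpha : outW*maxdif ][ vbuf : inW work_type ]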
+
+    for (int w = 0; w < outSz.width; w++) {
+        // adjust input indices so that:
+        // - data chunk is exactly maxdif pixels
+        // - data chunk fits inside input width
+        int index0 = xmaps[w].index0;
+        int index1 = xmaps[w].index1;
+        int i0 = index0, i1 = index1;
+        i1 = (std::min)(i0 + maxdif, in.size.width);
+        i0 =            i1 - maxdif;
+        GAPI_DbgAssert(i0 >= 0);
+
+        // fill in the coefficients for the data chunk,
+        // padding with zeros for any extra pixels
+        alpha_type *alphaw = &alpha[w * maxdif];
+        for (int i = 0; i < maxdif; i++) {
+            if (i + i0 == index0) {
+                alphaw[i] = xmaps[w].alpha0;
+
+            } else if (i + i0 == index1 - 1) {
+                alphaw[i] = xmaps[w].alpha1;
+
+            } else if (i + i0 > index0 && i + i0 < index1 - 1) {
+                alphaw[i] = mapper.alpha;
+
+            } else {
+                alphaw[i] = 0;
+            }
+        }
+
+        // start input chunk with adjusted position
+        index[w] = i0;
+    }
+
+    *maxdf = maxdif;
+}
+
+template<typename T, typename Mapper>
+static void calcAreaRow(const cv::gapi::fluid::View& in, cv::gapi::fluid::Buffer& out,
+                              cv::gapi::fluid::Buffer& scratch) {
+    using Unit = typename Mapper::Unit;
+    using alpha_type = typename Mapper::alpha_type;
+    using index_type = typename Mapper::index_type;
+    using  work_type = typename Mapper::work_type;
+
+    Size inSz  =  in.meta().size;
+    Size outSz = out.meta().size;
+
+    // this method is valid only for down-scale
+    GAPI_DbgAssert(inSz.width  >= outSz.width);
+    GAPI_DbgAssert(inSz.height >= outSz.height);
+
+//  Mapper xmapper(inSz.width,  outSz.width);
+    Mapper ymapper(inSz.height, outSz.height);
+
+    auto *xmaxdf = scratch.OutLine<const int>();
+    auto  maxdif = xmaxdf[0];
+
+    auto *xindex = reinterpret_cast<const index_type*>(xmaxdf + 1);
+    auto *xalpha = reinterpret_cast<const alpha_type*>(xindex + outSz.width);
+    auto *vbuf_c = reinterpret_cast<const  work_type*>(xalpha + outSz.width * maxdif);
+
+    auto *vbuf = const_cast<work_type*>(vbuf_c);
+
+    int iny = in.y();
+    int y = out.y();
+
+    int lpi = out.lpi();
+    GAPI_DbgAssert(y + lpi <= outSz.height);
+
+    for (int l = 0; l < lpi; l++) {
+        Unit ymap = ymapper.map(y + l);
+
+        const T *src[32];
+        GAPI_Assert(ymap.index1 - ymap.index0 <= 32);
+        for (int yin = ymap.index0; yin < ymap.index1; yin++) {
+            src[yin - ymap.index0] = in.InLine<const T>(yin - iny);
+        }
+
+        auto dst = out.OutLine<T>(l);
+
+#if MANUAL_SIMD
+        if (with_cpu_x86_sse42()) {
+            if (std::is_same<T, uchar>::value) {
+                calcRowArea_8U(reinterpret_cast<uchar*>(dst),
+                               reinterpret_cast<const uchar**>(src),
+                               inSz, outSz,
+                               static_cast<Q0_16>(ymapper.alpha),
+                               reinterpret_cast<const MapperUnit8U&>(ymap),
+                               xmaxdf[0],
+                               reinterpret_cast<const short*>(xindex),
+                               reinterpret_cast<const Q0_16*>(xalpha),
+                               reinterpret_cast<Q8_8*>(vbuf));
+                continue;  // next l = 0, ..., lpi-1
+            }
+
+            if (std::is_same<T, float>::value) {
+                calcRowArea_32F(reinterpret_cast<float*>(dst),
+                                reinterpret_cast<const float**>(src),
+                                inSz, outSz,
+                                static_cast<float>(ymapper.alpha),
+                                reinterpret_cast<const MapperUnit32F&>(ymap),
+                                xmaxdf[0],
+                                reinterpret_cast<const int*>(xindex),
+                                reinterpret_cast<const float*>(xalpha),
+                                reinterpret_cast<float*>(vbuf));
+                continue;
+            }
+        }
+#endif
+
+        // vertical pass
+        int y_1st = ymap.index0;
+        int ylast = ymap.index1 - 1;
+        if (y_1st < ylast) {
+            for (int w = 0; w < inSz.width; w++) {
+                vbuf[w] = mulas(ymap.alpha0, src[0][w])        // Q8_8 = Q0_16 * U8
+                        + mulas(ymap.alpha1, src[ylast - y_1st][w]);
+            }
+
+            for (int i = 1; i < ylast - y_1st; i++) {
+                for (int w = 0; w < inSz.width; w++) {
+                    vbuf[w] += mulas(ymapper.alpha, src[i][w]);
+                }
+            }
+        } else {
+            for (int w = 0; w < inSz.width; w++) {
+                vbuf[w] = convert_cast<work_type>(src[0][w]);  // Q8_8 = U8
+            }
+        }
+
+        // horizontal pass
+        for (int x = 0; x < outSz.width; x++) {
+            work_type sum = 0;
+
+            auto        index =  xindex[x];
+            const auto *alpha = &xalpha[x * maxdif];
+
+            for (int i = 0; i < maxdif; i++) {
+                sum +=  mulaw(alpha[i], vbuf[index + i]);      // Q8_8 = Q0_16 * Q8_8
+            }
+
+            dst[x] = convert_cast<T>(sum);                     // U8 = Q8_8
+        }
+    }
+}
+
+//----------------------------------------------------------------------
+#if USE_CVKL
+
+// taken from: ie_preprocess_data.cpp
+static int getResizeAreaTabSize(int dst_go, int ssize, int dsize, float scale) {
+    static const float threshold = 1e-3f;
+    int max_count = 0;
+
+    for (int col = dst_go; col < dst_go + dsize; col++) {
+        int count = 0;
+
+        float fsx1 = col * scale;
+        float fsx2 = fsx1 + scale;
+
+        int sx1 = ceil(fsx1);
+        int sx2 = floor(fsx2);
+
+        sx2 = (std::min)(sx2, ssize - 1);
+        sx1 = (std::min)(sx1, sx2);
+
+        if (sx1 - fsx1 > threshold) {
+            count++;
+        }
+
+        for (int sx = sx1; sx < sx2; sx++) {
+            count++;
+        }
+
+        if (fsx2 - sx2 > threshold) {
+            count++;
+        }
+        max_count = (std::max)(max_count, count);
+    }
+
+    return max_count;
+}
+
+// taken from: ie_preprocess_data.cpp
+static void computeResizeAreaTab(int src_go, int dst_go, int ssize, int dsize, float scale,
+                                 uint16_t* si, uint16_t* alpha, int max_count) {
+    static const float threshold = 1e-3f;
+    int k = 0;
+
+    for (int col = dst_go; col < dst_go + dsize; col++) {
+        int count = 0;
+
+        float fsx1 = col * scale;
+        float fsx2 = fsx1 + scale;
+        float cellWidth = (std::min)(scale, ssize - fsx1);
+
+        int sx1 = ceil(fsx1);
+        int sx2 = floor(fsx2);
+
+        sx2 = (std::min)(sx2, ssize - 1);
+        sx1 = (std::min)(sx1, sx2);
+
+        si[col - dst_go] = (uint16_t)(sx1 - src_go);
+
+        if (sx1 - fsx1 > threshold) {
+            si[col - dst_go] = (uint16_t)(sx1 - src_go - 1);
+            alpha[k++] = (uint16_t)((1 << 16) * ((sx1 - fsx1) / cellWidth));
+            count++;
+        }
+
+        for (int sx = sx1; sx < sx2; sx++) {
+            alpha[k++] = (uint16_t)((1 << 16) * (1.0f / cellWidth));
+            count++;
+        }
+
+        if (fsx2 - sx2 > threshold) {
+            alpha[k++] = (uint16_t)((1 << 16) * ((std::min)((std::min)(fsx2 - sx2, 1.f), cellWidth) / cellWidth));
+            count++;
+        }
+
+        if (count != max_count) {
+            alpha[k++] = 0;
+        }
+    }
+}
+
+// taken from: ie_preprocess_data.cpp
+static void generate_alpha_and_id_arrays(int x_max_count, int dcols, const uint16_t* xalpha, uint16_t* xsi,
+                                         uint16_t** alpha, uint16_t** sxid) {
+    if (x_max_count <= 4) {
+        for (int col = 0; col < dcols; col++) {
+            for (int x = 0; x < x_max_count; x++) {
+                alpha[x][col] = xalpha[col*x_max_count + x];
+            }
+        }
+    }
+    if (x_max_count <= 4) {
+        for (int col = 0; col <= dcols - 8; col += 8) {
+            for (int chunk_num_h = 0; chunk_num_h < x_max_count; chunk_num_h++) {
+                for (int i = 0; i < 128 / 16; i++) {
+                    int id_diff = xsi[col + i] - xsi[col];
+
+                    for (int chunk_num_v = 0; chunk_num_v < x_max_count; chunk_num_v++) {
+                        uint16_t* sxidp = sxid[chunk_num_v] + col * x_max_count + chunk_num_h * 8;
+
+                        int id0 = (id_diff + chunk_num_v) * 2 + 0;
+                        int id1 = (id_diff + chunk_num_v) * 2 + 1;
+
+                        (reinterpret_cast<int8_t*>(sxidp + i))[0] = static_cast<int8_t>(id0 >= (chunk_num_h * 16) && id0 < (chunk_num_h + 1) * 16 ? id0 : -1);
+                        (reinterpret_cast<int8_t*>(sxidp + i))[1] = static_cast<int8_t>(id1 >= (chunk_num_h * 16) && id1 < (chunk_num_h + 1) * 16 ? id1 : -1);
+                    }
+                }
+            }
+        }
+    }
+}
+
+// taken from: ie_preprocess_data.cpp
+// (and simplified specifically for the area 8u downscale case)
+static size_t resize_get_buffer_size(const Size& inSz, const Size& outSz) {
+    int dst_full_width  = outSz.width;
+    int dst_full_height = outSz.height;
+    int src_full_width  =  inSz.width;
+    int src_full_height =  inSz.height;
+
+    auto resize_area_u8_downscale_sse_buffer_size = [&]() {
+        const int dwidth  = outSz.width;
+        const int dheight = outSz.height;
+        const int swidth  =  inSz.width;
+
+        const int dst_go_x = 0;
+        const int dst_go_y = 0;
+
+        int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width,  dwidth,  static_cast<float>(src_full_width)  / dst_full_width)  + 1;
+        int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, static_cast<float>(src_full_height) / dst_full_height) + 1;
+
+        size_t si_buf_size = sizeof(uint16_t) * dwidth + sizeof(uint16_t) * dheight;
+        size_t alpha_buf_size =
+                sizeof(uint16_t) * (dwidth * x_max_count + 8 * 16) + sizeof(uint16_t) * dheight * y_max_count;
+        size_t vert_sum_buf_size = sizeof(uint16_t) * (swidth * 2);
+        size_t alpha_array_buf_size = sizeof(uint16_t) * 4 * dwidth;
+        size_t sxid_array_buf_size = sizeof(uint16_t) * 4 * 4 * dwidth;
+
+        size_t buffer_size = si_buf_size +
+                             alpha_buf_size +
+                             vert_sum_buf_size +
+                             alpha_array_buf_size +
+                             sxid_array_buf_size;
+
+        return buffer_size;
+    };
+
+    return resize_area_u8_downscale_sse_buffer_size();
+}
+
+// buffer filling is taken from: ie_preprocess_data_sse42.cpp
+static void initScratchArea_CVKL_U8(const cv::GMatDesc & in,
+                                    const       Size   & outSz,
+                               cv::gapi::fluid::Buffer & scratch) {
+    const Size& inSz = in.size;
+
+    // estimate buffer size
+    size_t scratch_bytes = resize_get_buffer_size(inSz, outSz);
+
+    // allocate buffer
+
+    Size scratch_size{static_cast<int>(scratch_bytes), 1};
+
+    cv::GMatDesc desc;
+    desc.chan = 1;
+    desc.depth = CV_8UC1;
+    desc.size = scratch_size;
+
+    cv::gapi::fluid::Buffer buffer(desc);
+    scratch = std::move(buffer);
+
+    // fill the buffer
+    {
+        // this code is taken from: ie_preprocess_data_sse42.cpp
+        // (and simplified for 1-channel cv::Mat instead of blob)
+
+        auto dwidth  = outSz.width;
+        auto dheight = outSz.height;
+        auto swidth  =  inSz.width;
+        auto sheight =  inSz.height;
+
+        const int src_go_x = 0;
+        const int src_go_y = 0;
+        const int dst_go_x = 0;
+        const int dst_go_y = 0;
+
+        auto src_full_width  = swidth;
+        auto src_full_height = sheight;
+        auto dst_full_width  = dwidth;
+        auto dst_full_height = dheight;
+
+        float scale_x = static_cast<float>(src_full_width)  / dst_full_width;
+        float scale_y = static_cast<float>(src_full_height) / dst_full_height;
+
+        int x_max_count = getResizeAreaTabSize(dst_go_x, src_full_width,  dwidth,  scale_x);
+        int y_max_count = getResizeAreaTabSize(dst_go_y, src_full_height, dheight, scale_y);
+
+        auto* maxdif = scratch.OutLine<int>();
+        auto* xsi = reinterpret_cast<uint16_t*>(maxdif + 2);
+        auto* ysi = xsi + dwidth;
+        auto* xalpha = ysi + dheight;
+        auto* yalpha = xalpha + dwidth*x_max_count + 8*16;
+    //  auto* vert_sum = yalpha + dheight*y_max_count;
+
+        maxdif[0] = x_max_count;
+        maxdif[1] = y_max_count;
+
+        computeResizeAreaTab(src_go_x, dst_go_x, src_full_width,   dwidth, scale_x, xsi, xalpha, x_max_count);
+        computeResizeAreaTab(src_go_y, dst_go_y, src_full_height, dheight, scale_y, ysi, yalpha, y_max_count);
+
+        int vert_sum_size = 2*swidth;
+        uint16_t* vert_sum = yalpha + dheight*y_max_count;
+        uint16_t* alpha0 = vert_sum + vert_sum_size;
+        uint16_t* alpha1 = alpha0 + dwidth;
+        uint16_t* alpha2 = alpha1 + dwidth;
+        uint16_t* alpha3 = alpha2 + dwidth;
+        uint16_t* sxid0 = alpha3 + dwidth;
+        uint16_t* sxid1 = sxid0 + 4*dwidth;
+        uint16_t* sxid2 = sxid1 + 4*dwidth;
+        uint16_t* sxid3 = sxid2 + 4*dwidth;
+
+        uint16_t* alpha[] = {alpha0, alpha1, alpha2, alpha3};
+        uint16_t* sxid[] = {sxid0, sxid1, sxid2, sxid3};
+        generate_alpha_and_id_arrays(x_max_count, dwidth, xalpha, xsi, alpha, sxid);
+    }
+}
+
+static void calcAreaRow_CVKL_U8(const cv::gapi::fluid::View   & in,
+                                      cv::gapi::fluid::Buffer & out,
+                                      cv::gapi::fluid::Buffer & scratch) {
+    Size inSz  =  in.meta().size;
+    Size outSz = out.meta().size;
+
+    // this method is valid only for down-scale
+    GAPI_DbgAssert(inSz.width  >= outSz.width);
+    GAPI_DbgAssert(inSz.height >= outSz.height);
+
+    int dwidth  = outSz.width;
+    int dheight = outSz.height;
+
+    auto* maxdif = scratch.OutLine<int>();
+    int x_max_count = maxdif[0];
+    int y_max_count = maxdif[1];
+
+    auto* xsi = reinterpret_cast<uint16_t*>(maxdif + 2);
+    auto* ysi    = xsi + dwidth;
+    auto* xalpha = ysi + dheight;
+    auto* yalpha = xalpha + dwidth*x_max_count + 8*16;
+    auto* vert_sum = yalpha + dheight*y_max_count;
+
+    int iny =  in.y();
+    int   y = out.y();
+
+    int lpi = out.lpi();
+    GAPI_DbgAssert(y + lpi <= outSz.height);
+
+    for (int l = 0; l < lpi; l++) {
+        int yin0 = ysi[y + l];
+        int yin1 = yin0 + y_max_count;
+
+        GAPI_Assert(yin1 - yin0 <= 32);
+        const uint8_t *src[32];
+
+        for (int yin = yin0; yin < yin1 && yin < inSz.height; yin++) {
+            if (yalpha[(y+l)*y_max_count + yin - yin0] == 0) {
+                src[yin - yin0] = in.InLine<const uint8_t>(yin - iny - 1);
+            } else {
+                src[yin - yin0] = in.InLine<const uint8_t>(yin - iny);
+            }
+        }
+
+        uint8_t *dst = out.OutLine<uint8_t>(l);
+
+        calcRowArea_CVKL_U8_SSE42(src, dst, inSz, outSz, y + l, xsi, ysi,
+                      xalpha, yalpha, x_max_count, y_max_count, vert_sum);
+    }
+}
+
+#endif  // CVKL
+//----------------------------------------------------------------------
+
+GAPI_FLUID_KERNEL(FScalePlane8u, ScalePlane8u, true) {
+    static const int Window = 1;
+    static const int LPI = 4;
+    static const auto Kind = cv::GFluidKernel::Kind::Resize;
+
+    static void initScratch(const cv::GMatDesc& in,
+                            Size outSz, int /*interp*/,
+                            cv::gapi::fluid::Buffer &scratch) {
+        initScratchLinear<uchar, linear::Mapper>(in, outSz, scratch, LPI);
+    }
+
+    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {
+    }
+
+    static void run(const cv::gapi::fluid::View& in, Size /*sz*/, int /*interp*/,
+                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch) {
+        calcRowLinear<uint8_t, linear::Mapper>(in, out, scratch);
+    }
+};
+
+GAPI_FLUID_KERNEL(FScalePlanes, ScalePlanes, true) {
+    static const int Window = 1;
+    static const int LPI = 4;
+    static const auto Kind = cv::GFluidKernel::Kind::Resize;
+
+    static void initScratch(const cv::GMatDesc& in, int, Size,
+                            Size outSz, int /*interp*/,
+                            cv::gapi::fluid::Buffer &scratch) {
+        initScratchLinear<uchar, linear::Mapper, 3>(in, outSz, scratch, LPI);
+    }
+
+    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {
+    }
+
+    static void run(const cv::gapi::fluid::View& in, int, Size, Size/*sz*/, int /*interp*/,
+                    cv::gapi::fluid::Buffer& out1,
+                    cv::gapi::fluid::Buffer& out2,
+                    cv::gapi::fluid::Buffer& out3,
+                    cv::gapi::fluid::Buffer& scratch) {
+        calcRowLinearC3<uint8_t, linear::Mapper>(in, out1, out2, out3, scratch);
+    }
+};
+
+GAPI_FLUID_KERNEL(FUpscalePlaneArea8u, UpscalePlaneArea8u, true) {
+    static const int Window = 1;
+    static const int LPI = 4;
+    static const auto Kind = cv::GFluidKernel::Kind::Resize;
+
+    static void initScratch(const cv::GMatDesc& in,
+                            Size outSz, int /*interp*/,
+                            cv::gapi::fluid::Buffer &scratch) {
+        initScratchLinear<uchar, areaUpscale::Mapper>(in, outSz, scratch, LPI);
+    }
+
+    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {
+    }
+
+    static void run(const cv::gapi::fluid::View& in, Size /*sz*/, int /*interp*/,
+                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch) {
+        calcRowLinear<uint8_t, areaUpscale::Mapper>(in, out, scratch);
+    }
+};
+
+GAPI_FLUID_KERNEL(FUpscalePlaneArea32f, UpscalePlaneArea32f, true) {
+    static const int Window = 1;
+    static const int LPI = 4;
+    static const auto Kind = cv::GFluidKernel::Kind::Resize;
+
+    static void initScratch(const cv::GMatDesc& in,
+                            Size outSz, int /*interp*/,
+                            cv::gapi::fluid::Buffer &scratch) {
+        initScratchLinear<float, areaUpscale32f::Mapper>(in, outSz, scratch, 0);
+    }
+
+    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {
+    }
+
+    static void run(const cv::gapi::fluid::View& in, Size /*sz*/, int /*interp*/,
+                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch) {
+        calcRowLinear<float, areaUpscale32f::Mapper>(in, out, scratch);
+    }
+};
+
+GAPI_FLUID_KERNEL(FScalePlane32f, ScalePlane32f, true) {
+    static const int Window = 1;
+    static const int LPI = 4;
+    static const auto Kind = cv::GFluidKernel::Kind::Resize;
+
+    static void initScratch(const cv::GMatDesc& in,
+                            Size outSz, int /*interp*/,
+                            cv::gapi::fluid::Buffer &scratch) {
+        GAPI_DbgAssert(in.depth == CV_32F && in.chan == 1);
+
+        initScratchLinear<float, linear32f::Mapper>(in, outSz, scratch, 0);
+    }
+
+    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {
+    }
+
+    static void run(const cv::gapi::fluid::View& in, Size /*sz*/, int /*interp*/,
+                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch) {
+        calcRowLinear<float, linear32f::Mapper>(in, out, scratch);
+    }
+};
+
+//----------------------------------------------------------------------
+
+GAPI_FLUID_KERNEL(FScalePlaneArea32f, ScalePlaneArea32f, true) {
+    static const int Window = 1;
+    static const int LPI = 4;
+    static const auto Kind = cv::GFluidKernel::Kind::Resize;
+
+    static void initScratch(const cv::GMatDesc& in,
+                            Size outSz, int /*interp*/,
+                            cv::gapi::fluid::Buffer &scratch) {
+        initScratchArea<areaDownscale32f::Mapper>(in, outSz, scratch);
+    }
+
+    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {
+    }
+
+    static void run(const cv::gapi::fluid::View& in, Size /*sz*/, int /*interp*/,
+                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch) {
+        calcAreaRow<float, areaDownscale32f::Mapper>(in, out, scratch);
+    }
+};
+
+GAPI_FLUID_KERNEL(FScalePlaneArea8u, ScalePlaneArea8u, true) {
+    static const int Window = 1;
+    static const int LPI = 4;
+    static const auto Kind = cv::GFluidKernel::Kind::Resize;
+
+    static void initScratch(const cv::GMatDesc& in,
+                            Size outSz, int /*interp*/,
+                            cv::gapi::fluid::Buffer &scratch) {
+    #if USE_CVKL
+        if (with_cpu_x86_sse42()) {
+            const Size& inSz = in.size;
+            if (inSz.width > outSz.width && inSz.height > outSz.height) {
+                // CVKL code we use supports only downscale
+                initScratchArea_CVKL_U8(in, outSz, scratch);
+                return;
+            }
+        }
+    #endif
+
+        initScratchArea<areaDownscale8u::Mapper>(in, outSz, scratch);
+    }
+
+    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/) {
+    }
+
+    static void run(const cv::gapi::fluid::View& in, Size /*sz*/, int /*interp*/,
+                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch) {
+    #if USE_CVKL
+        if (with_cpu_x86_sse42()) {
+            auto  inSz =  in.meta().size;
+            auto outSz = out.meta().size;
+            if (inSz.width > outSz.width && inSz.height > outSz.height) {
+                // CVKL's code supports only downscale
+                calcAreaRow_CVKL_U8(in, out, scratch);
+                return;
+            }
+        }
+    #endif
+
+        calcAreaRow<uint8_t, areaDownscale8u::Mapper>(in, out, scratch);
+    }
+};
+
+}  // namespace kernels
+
+//----------------------------------------------------------------------
+
+using namespace kernels;
+
+cv::gapi::GKernelPackage preprocKernels() {
+    return cv::gapi::kernels
+        < FChanToPlane
+        , FScalePlanes
+        , FScalePlane
+        , FScalePlane32f
+        , FScalePlane8u
+        , FUpscalePlaneArea8u
+        , FUpscalePlaneArea32f
+        , FScalePlaneArea8u
+        , FScalePlaneArea32f
+        , FMerge2
+        , FMerge3
+        , FMerge4
+        , FSplit2
+        , FSplit3
+        , FSplit4
+        >();
+}
+
+}  // namespace gapi
+}  // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.hpp b/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels.hpp
new file mode 100644 (file)
index 0000000..f4875e6
--- /dev/null
@@ -0,0 +1,93 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <tuple>
+
+#include <opencv2/gapi.hpp>
+#include <opencv2/gapi/gmat.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+
+namespace InferenceEngine {
+namespace gapi {
+    using Size = cv::gapi::own::Size;
+
+    using GMat2 = std::tuple<cv::GMat, cv::GMat>;
+    using GMat3 = std::tuple<cv::GMat, cv::GMat, cv::GMat>;
+    using GMat4 = std::tuple<cv::GMat, cv::GMat, cv::GMat, cv::GMat>;
+
+    G_TYPED_KERNEL(ChanToPlane, <cv::GMat(cv::GMat, int)>, "com.intel.ie.chan_to_plane") {
+        static cv::GMatDesc outMeta(const cv::GMatDesc &in, int) {
+            return in.withType(in.depth, 1);
+        }
+    };
+
+    G_TYPED_KERNEL(ScalePlane, <cv::GMat(cv::GMat, int, Size, Size, int)>, "com.intel.ie.scale_plane") {
+        static cv::GMatDesc outMeta(const cv::GMatDesc &in, int type, const Size &szIn, const Size &szOut, int) {
+            GAPI_Assert(type == in.depth);
+            return in.withSize(szOut);
+        }
+    };
+
+    G_TYPED_KERNEL_M(ScalePlanes, <GMat3(cv::GMat, int, Size, Size, int)>, "com.intel.ie.scale_planes") {
+        static std::tuple<cv::GMatDesc, cv::GMatDesc, cv::GMatDesc> outMeta(const cv::GMatDesc &in, int /*type*/, const Size &szIn, const Size &szOut, int) {
+            cv::GMatDesc out_desc;
+            out_desc.depth = in.depth;
+            out_desc.chan  = 1;
+            out_desc.size = szOut;
+            return std::make_tuple(out_desc, out_desc, out_desc);
+        }
+    };
+
+    G_TYPED_KERNEL(Merge2, <cv::GMat(cv::GMat, cv::GMat)>, "com.intel.ie.merge2") {
+        static cv::GMatDesc outMeta(const cv::GMatDesc &in, const cv::GMatDesc &) {
+            // FIXME: check a/b are equal!
+            return in.withType(in.depth, 2);
+        }
+    };
+
+    G_TYPED_KERNEL(Merge3, <cv::GMat(cv::GMat, cv::GMat, cv::GMat)>, "com.intel.ie.merge3") {
+        static cv::GMatDesc outMeta(const cv::GMatDesc &in, const cv::GMatDesc &, const cv::GMatDesc &) {
+            // FIXME: check a/b are equal!
+            return in.withType(in.depth, 3);
+        }
+    };
+
+    G_TYPED_KERNEL(Merge4, <cv::GMat(cv::GMat, cv::GMat, cv::GMat, cv::GMat)>, "com.intel.ie.merge4") {
+        static cv::GMatDesc outMeta(const cv::GMatDesc& in,
+                                    const cv::GMatDesc&, const cv::GMatDesc&, const cv::GMatDesc&) {
+            // FIXME: check a/b are equal!
+            return in.withType(in.depth, 4);
+        }
+    };
+
+    G_TYPED_KERNEL_M(Split2, <GMat2(cv::GMat)>, "com.intel.ie.split2") {
+        static std::tuple<cv::GMatDesc, cv::GMatDesc> outMeta(const cv::GMatDesc& in) {
+            const auto out_depth = in.depth;
+            const auto out_desc  = in.withType(out_depth, 1);
+            return std::make_tuple(out_desc, out_desc);
+        }
+    };
+
+    G_TYPED_KERNEL_M(Split3, <GMat3(cv::GMat)>, "com.intel.ie.split3") {
+        static std::tuple<cv::GMatDesc, cv::GMatDesc, cv::GMatDesc> outMeta(const cv::GMatDesc& in) {
+            const auto out_depth = in.depth;
+            const auto out_desc  = in.withType(out_depth, 1);
+            return std::make_tuple(out_desc, out_desc, out_desc);
+        }
+    };
+
+    G_TYPED_KERNEL_M(Split4, <GMat4(cv::GMat)>, "com.intel.ie.split4") {
+        static std::tuple<cv::GMatDesc, cv::GMatDesc, cv::GMatDesc, cv::GMatDesc> outMeta(const cv::GMatDesc& in) {
+            const auto out_depth = in.depth;
+            const auto out_desc  = in.withType(out_depth, 1);
+            return std::make_tuple(out_desc, out_desc, out_desc, out_desc);
+        }
+    };
+
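+    // Typical use (a sketch, not part of this header): pass the package as a
+    // compile argument when compiling a G-API computation, e.g.
+    //   comp.apply(ins, outs, cv::compile_args(preprocKernels()));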
+    cv::gapi::GKernelPackage preprocKernels();
+
+}  // namespace gapi
+}  // namespace InferenceEngine
diff --git a/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels_impl.hpp b/inference-engine/src/inference_engine/ie_preprocess_gapi_kernels_impl.hpp
new file mode 100644 (file)
index 0000000..11530dc
--- /dev/null
@@ -0,0 +1,89 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#ifdef HAVE_SSE
+  #define MANUAL_SIMD 1  // 1=call manually vectorized code, 0=don't
+#else
+  #define MANUAL_SIMD 0
+#endif
+
+#if MANUAL_SIMD
+  #define USE_CVKL 1     // 1=reuse CVKL code for Resize, 0=don't
+#else
+  #define USE_CVKL 0
+#endif
+
+#include <algorithm>
+#include <climits>
+#include <cmath>
+#include <cstdint>
+
+namespace InferenceEngine {
+namespace gapi {
+namespace kernels {
+
+template<typename DST, typename SRC> static inline DST saturate_cast(SRC x);
+template<> inline short saturate_cast(int x) { return (std::min)(SHRT_MAX, (std::max)(SHRT_MIN, x)); }
+template<> inline short saturate_cast(float x) { return saturate_cast<short>(static_cast<int>(std::rint(x))); }
+template<> inline float saturate_cast(float x) { return x; }
+template<> inline short saturate_cast(short x) { return x; }
+template<> inline uint16_t saturate_cast(int x) { return (std::min)(USHRT_MAX, (std::max)(0, x)); }
+
+//------------------------------------------------------------------------------
+
+constexpr static const int ONE = 1 << 15;
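+// Fixed-point "one" for interpolation coefficients: 1.0 is represented as 1 << 15.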
+
+inline static uint8_t calc(short alpha0, uint8_t src0, short alpha1, uint8_t src1) {
+    constexpr static const int half = 1 << 14;
+    return (src0 * alpha0 + src1 * alpha1 + half) >> 15;
+}
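+// e.g. calc(16384, 100, 16384, 200) == 150: an equal-weight (0.5/0.5) blend;
+// the "half" term makes the >>15 round to nearest.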
+
+inline static float calc(float alpha0, float src0, float alpha1, float src1) {
+    return src0 * alpha0 + src1 * alpha1;
+}
+
+//------------------------------------------------------------------------------
+
+// Variants:
+// - F=float, I=int
+// - F=short, I=short (e.g. F is Q1.7.8 encoded with short)
+template<typename F, typename I>
+struct MapperUnit {
+    F alpha0, alpha1;
+    I index0, index1;
+};
+
+//------------------------------------------------------------------------------
+
+typedef uint16_t Q0_16;  // value in [0..1)   with 16 fractional bits
+typedef uint16_t Q8_8;   // value in [0..256) with  8 fractional bits
+typedef uint8_t  U8;     // value in [0..255]
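+// e.g. 0.5 in Q0_16 is 32768 (0.5 * 65536); 1.5 in Q8_8 is 384 (1.5 * 256).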
+
+template<typename DST, typename SRC> static inline DST convert_cast(SRC x);
+template<> inline uint8_t convert_cast(uint8_t x) { return x; }
+template<> inline uint8_t convert_cast(float x) { return static_cast<uint8_t>(x); }
+template<> inline float convert_cast(float  x) { return x; }
+template<> inline float convert_cast(double x) { return static_cast<float>(x); }
+template<> inline Q0_16 convert_cast(double x) {
+    int ix = static_cast<int>(std::rint(x * (1 << 16)));
+    return saturate_cast<Q0_16>(ix);
+}
+template<> inline Q8_8 convert_cast(uchar x) { return x << 8; }
+template<> inline uchar convert_cast(Q8_8 x) { return x >> 8; }
+
+template<typename DST, typename SRC> static inline DST checked_cast(SRC x) {
+    DST dx = static_cast<DST>(x);
+    GAPI_Assert(x == dx);  // check that the value survives the conversion
+    return dx;
+}
+
+static inline Q8_8 mulas(Q0_16 a, U8   s) { return static_cast<Q8_8>((a * s) >>  8); }
+static inline Q8_8 mulaw(Q0_16 a, Q8_8 w) { return static_cast<Q8_8>((a * w) >> 16); }
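+// e.g. mulas(32768 /* 0.5 */, 200) == 25600 (100.0 in Q8_8);
+//      mulaw(32768 /* 0.5 */, 25600) == 12800 (50.0 in Q8_8).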
+
+static inline float mulas(float a, float s) { return a * s; }
+static inline float mulaw(float a, float w) { return a * w; }
+
+}  // namespace kernels
+}  // namespace gapi
+}  // namespace InferenceEngine
index d10dfe1..540255b 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -264,4 +263,11 @@ inline static void annotateEnd(IttStatic&, IttProfilingTask& t) {
 
 #define IE_PROFILING_AUTO_SCOPE_TASK(PROFILING_TASK) IE_ITT_TASK_SCOPE(PROFILING_TASK); IE_TIMER_SCOPE(PROFILING_TASK.name);
 
+inline static void anotateSetThreadName(const char* name) {
+    #if ENABLE_PROFILING_ITT
+    __itt_thread_set_name(name);
+    #endif
+    // to suppress "unused" warning
+    (void)(name);
+}
 }  // namespace InferenceEngine
index d6269a0..44be1b5 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -8,6 +7,7 @@
 #include "details/caseless.hpp"
 #include "ie_utils.hpp"
 #include "ie_icnn_network_stats.hpp"
+#include "details/ie_cnn_network_tools.h"
 
 #include <ie_layers.h>
 
@@ -134,6 +134,9 @@ CNNLayerPtr clonelayer(const CNNLayer& source) {
         &layerCloneImpl<ReshapeLayer           >,
         &layerCloneImpl<CropLayer              >,
         &layerCloneImpl<EltwiseLayer           >,
+        &layerCloneImpl<GemmLayer              >,
+        &layerCloneImpl<PadLayer               >,
+        &layerCloneImpl<GatherLayer            >,
         &layerCloneImpl<ClampLayer             >,
         &layerCloneImpl<ReLULayer              >,
         &layerCloneImpl<SoftMaxLayer           >,
@@ -265,6 +268,8 @@ details::CNNNetworkImplPtr cloneNet(const std::vector<CNNLayerPtr>& layers,
             clonedLayer->outData.push_back(clonedData);
             for (auto&& inp : data->getInputTo()) {
                 auto layer = inp.second;
+                // TODO(amalyshe) is it the best place to check priorbox and remove
+                // such edge from outputs?
                 if (std::find(layers.begin(), layers.end(), layer) == layers.end() &&
                     !(CaselessEq<string>()(layer->type, "priorbox") ||
                       CaselessEq<string>()(layer->type, "PriorBoxClustered"))) {
@@ -296,7 +301,7 @@ details::CNNNetworkImplPtr cloneNet(const std::vector<CNNLayerPtr>& layers,
             if (nullptr == layer) {
                 LayerParams params;
                 params.name = data->getName();
-                params.precision = data->precision;
+                params.precision = data->getPrecision();
                 params.type = "Input";
                 layer = std::make_shared<CNNLayer>(params);
                 // this place should be transactional
@@ -486,7 +491,7 @@ struct NodePrinter {
         dims_ss << ']';
 
         printed_properties.emplace_back("dims", dims_ss.str());
-        printed_properties.emplace_back("precision", data->precision.name());
+        printed_properties.emplace_back("precision", data->getPrecision().name());
 
         printNode(node_name, data->name, node_properties, printed_properties);
     }
@@ -534,10 +539,17 @@ void saveGraphToDot(InferenceEngine::ICNNNetwork &network, std::ostream &out, pr
         }
     }
 
+    std::vector<std::pair<CNNLayerPtr, std::string>> perf_info;
+    auto store_perf_info = [&](CNNLayerPtr layer) {
+        auto perf = layer->params.find("perf");
+        if (perf != layer->params.end()) perf_info.push_back({layer, perf->second});
+    };
+
     out << "strict digraph Network {\n";
     // Traverse graph and print nodes
-    CNNNetForestDFS(inputs, [&](CNNLayerPtr layer) {
+    for (const auto &layer : details::CNNNetSortTopologically(network)) {
         printer.printLayerNode(layer);
+        store_perf_info(layer);
 
         // Print output Data Object
         for (auto &dataptr : layer->outData) {
@@ -558,7 +570,28 @@ void saveGraphToDot(InferenceEngine::ICNNNetwork &network, std::ostream &out, pr
             // to remove duplicate edges
             printer.printEdge(layer, dataptr, true);
         }
-    }, true);
+    }
+
+    if (!perf_info.empty()) {
+        out << "// Performance statistic" << std::endl;
+        out << "node [shape=plain, fontsize=24]" << std::endl;
+
+        for (auto &p : perf_info) {
+            auto &perf = p.second;
+            auto &name = p.first->name;
+            auto layer_name = "layer_" + name;
+            auto perf_name = "perf_" + name;
+            // {rank=same; perf_conv1 [label="133  mcs"]; layer_conv1;}
+            out << "{rank=same; " << perf_name << " [label=\"" << perf << "\"]; "
+                << layer_name << ";}" << std::endl;
+        }
+
+        out << std::endl << "edge[style=invis];" << std::endl;
+        auto p = perf_info.begin();
+        out << "perf_" + p->first->name;
+        for (++p; p != perf_info.end(); p++)  // the first node was already printed above
+            out << " -> perf_" + p->first->name;
+    }
 
     out << "}" << std::endl;
 }
index 46907d8..1f6e9f6 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -15,6 +14,9 @@
 
 #include <cpp/ie_cnn_network.h>
 #include <cnn_network_impl.hpp>
+#include <tuple>
+#include <type_traits>
+
 
 namespace InferenceEngine {
 
@@ -33,6 +35,27 @@ bool contains(const C& container, const T& element) {
 }
 
 /**
+ * @brief checks that given type is one of specified in variadic template list
+ * @tparam ...
+ */
+template <typename...>
+struct is_one_of {
+    static constexpr bool value = false;
+};
+
+/**
+ * @brief checks that given type is one of specified in variadic template list
+ * @tparam ...
+ */
+template <typename F, typename S, typename... T>
+struct is_one_of<F, S, T...> {
+    static constexpr bool value =
+        std::is_same<F, S>::value || is_one_of<F, T...>::value;
+};
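+// e.g. is_one_of<int, float, int>::value == true;
+//      is_one_of<int, float>::value == false.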
+
+
+
+/**
  * @brief Split graph into subgraphs using provided splitter object
  *
  * @param network - Source network
index 3dc77c2..aa8e009 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 93d8bdd..cca54cc 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 25039ba..fd51793 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -7,7 +6,9 @@
 
 #include <tuple>
 #include <memory>
+#include <utility>
 #include "ie_layers.h"
+#include "ie_layers_prv.h"
 
 namespace InferenceEngine {
 
@@ -27,6 +28,9 @@ using AllLayers = std::tuple <
     ConvolutionLayer *,
     PoolingLayer*,
     FullyConnectedLayer*,
+    GemmLayer*,
+    PadLayer*,
+    GatherLayer*,
     ConcatLayer*,
     SplitLayer*,
     NormLayer*,
@@ -43,6 +47,9 @@ using AllLayers = std::tuple <
     PowerLayer*,
     BatchNormalizationLayer*,
     ClampLayer*,
+    TensorIterator*,
+    LSTMCell*,
+    RNNLayer*,
     WeightableLayer*,
     CNNLayer*
 >;
@@ -63,11 +70,11 @@ void dynamic_cast_layer(const CNNLayer &source, CNNLayerPtr &target, T & /*, Inj
 
 template<class Visitor, std::size_t I = 0, typename... Tp>
 inline typename std::enable_if<I == sizeof...(Tp), void>::type
-visitActualLayer(std::tuple<Tp...> &t, const CNNLayer &sourceLayer, const Visitor & v) {}
+visitActualLayer(std::tuple<Tp...> &&t, const CNNLayer &sourceLayer, const Visitor & v) {}
 
 template<class Visitor, std::size_t I = 0, typename... Tp>
 inline typename std::enable_if < I < sizeof...(Tp), void>::type
-visitActualLayer(std::tuple<Tp...> &t, const CNNLayer &sourceLayer, const Visitor & visitor) {
+visitActualLayer(std::tuple<Tp...> &&t, const CNNLayer &sourceLayer, const Visitor & visitor) {
     using EType = typename std::tuple_element<I, std::tuple<Tp...>>::type;
     auto casted = dynamic_cast<EType>(const_cast<CNNLayer *>(&sourceLayer));
 
@@ -78,7 +85,7 @@ visitActualLayer(std::tuple<Tp...> &t, const CNNLayer &sourceLayer, const Visito
         }
     }
 
-    visitActualLayer<Visitor, I + 1, Tp...>(t, sourceLayer, visitor);
+    visitActualLayer<Visitor, I + 1, Tp...>(std::move(t), sourceLayer, visitor);
 }
 
 template<class InjectedType, std::size_t I = 0, typename... Tp>
@@ -164,8 +171,7 @@ inline CNNLayerPtr injectData(CNNLayerPtr sourceLayer, const InjectType & value
  */
 template<class Transformer>
 inline void transformLayer(const CNNLayer & sourceLayer, const Transformer & transformer) {
-    details::AllLayers layers;
-    details::visitActualLayer<Transformer>(layers, sourceLayer, transformer);
+    details::visitActualLayer<Transformer>(std::move(details::AllLayers()), sourceLayer, transformer);
 }
 
 template<class Transformer>
diff --git a/inference-engine/src/inference_engine/net_pass.cpp b/inference-engine/src/inference_engine/net_pass.cpp
new file mode 100644 (file)
index 0000000..96ceb63
--- /dev/null
@@ -0,0 +1,179 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "net_pass.h"
+#include "ie_layers_prv.h"
+#include "graph_tools.hpp"
+
+#include <algorithm>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+template <typename T, typename P>
+inline bool one_of(T val, P item) { return val == item; }
+template <typename T, typename P, typename... Args>
+inline bool one_of(T val, P item, Args... item_others) {
+    return val == item || one_of(val, item_others...);
+}
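+// e.g. one_of(stride, 1, -1) is true iff stride is 1 or -1.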
+
+namespace InferenceEngine {
+namespace NetPass {
+
+inline bool is_full_ranged(const TensorIterator::PortMap& rule, const DataPtr &data) {
+    if (!data)
+        THROW_IE_EXCEPTION << "Internal error. data == nullptr";
+
+    if (rule.axis == -1 || !one_of(rule.stride, 1, -1))
+        return false;
+
+    auto &shape = data->getDims();
+    int size = shape[rule.axis];
+
+    int begin = rule.start >= 0 ? rule.start : size + rule.start + 1;
+    int end = rule.end >= 0 ? rule.end : size + rule.end + 1;
+
+    return (rule.stride == 1)
+        ? begin == 0 && end == size
+        : begin == size && end == 0;
+}
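+// e.g. a forward rule (stride == 1) over an axis of size N is full-ranged iff
+// it spans exactly [0, N); a backward rule (stride == -1) must span it in reverse.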
+
+bool convertToLSTMSequence(CNNLayerPtr cur) {
+    if (cur->type != "TensorIterator") return false;
+    auto ti = std::dynamic_pointer_cast<TensorIterator>(cur);
+
+    IE_ASSERT(ti) << "Cannot cast object with type TensorIterator to TensorIterator object";
+
+    // Topological order
+    std::vector<CNNLayerPtr> all_body_layers;
+    CNNNetForestDFS(ti->body.inputs, [&](CNNLayerPtr  current){
+        all_body_layers.push_back(current);
+    }, false);
+    std::reverse(all_body_layers.begin(), all_body_layers.end());
+
+    // Check if body is:  squeeze -> lstm_cell -> unsqueeze
+    if (all_body_layers.size() != 3
+        || all_body_layers[0]->type != "Reshape"
+        || all_body_layers[1]->type != "LSTMCell"
+        || all_body_layers[2]->type != "Reshape")
+        return false;
+
+    auto &rsp1 = all_body_layers[0];
+    auto &lstm = all_body_layers[1];
+    auto &rsp2 = all_body_layers[2];
+
+    IE_ASSERT(lstm->insData.size() == 3);  // {data, hidden, cell}
+    IE_ASSERT(lstm->outData.size() == 2);  // {hidden, cell}
+
+    if (lstm->insData[0].lock()->creatorLayer.lock() != rsp1 ||
+        lstm->outData[0]->inputTo.begin()->second != rsp2)
+        return false;
+
+    // Check port mapping
+    auto _indx_in = [&] (const std::vector<DataPtr> &scope,  const DataPtr &data) {
+        int indx = std::find(scope.begin(), scope.end(), data) - scope.begin();
+        return indx == scope.size() ? -1 : indx;
+    };
+
+    int in_hs_idx = _indx_in(ti->body.inputs, lstm->insData[1].lock());
+    int in_cs_idx = _indx_in(ti->body.inputs, lstm->insData[2].lock());
+    int in_dt_idx = _indx_in(ti->body.inputs, rsp1->insData[0].lock());
+
+    int out_hs_idx = _indx_in(ti->body.outputs, lstm->outData[0]);
+    int out_cs_idx = _indx_in(ti->body.outputs, lstm->outData[1]);
+    int out_dt_idx = _indx_in(ti->body.outputs, rsp2->outData[0]);
+
+    // indexes should be [0,1,2] : sum == 3
+    if (in_hs_idx + in_cs_idx + in_dt_idx != 3 || out_hs_idx + out_cs_idx + out_dt_idx != 3)
+        return false;
+
+    std::map<int, TensorIterator::PortMap> i2map, o2map, be2map;
+    for (auto &m : ti->input_port_map) i2map[m.to] = m;
+    for (auto &m : ti->output_port_map) o2map[m.to] = m;
+    for (auto &m : ti->back_edges) be2map[m.to] = m;
+
+    if (!one_of(i2map.size(), 3, 1) ||
+        !one_of(o2map.size(), 3, 1) ||
+        !one_of(be2map.size(), 2))
+        return false;
+
+
+    auto in_iter_rule = i2map[in_dt_idx];
+    auto in_iter_data = ti->insData[in_iter_rule.from].lock();
+
+    auto out_iter_rule = o2map[out_dt_idx];
+    auto out_iter_data = ti->outData[out_iter_rule.from];
+
+    // TI iterates only for full range of tensor
+    if (!is_full_ranged(in_iter_rule, in_iter_data) ||
+        !is_full_ranged(out_iter_rule, out_iter_data))
+        return false;
+
+    // only the same axis and stride are supported for the in/out data tensors
+    if (in_iter_rule.axis != out_iter_rule.axis ||
+        in_iter_rule.stride != out_iter_rule.stride)
+        return false;
+
+    // only the first and second dims are supported for LSTM-Sequence
+    if (!one_of(in_iter_rule.axis, 0, 1))
+        return false;
+
+    bool no_init_state = i2map.size() == 1;
+    bool no_last_state = o2map.size() == 1;
+
+    if (!no_init_state && ( i2map[in_hs_idx].axis != -1 || i2map[in_cs_idx].axis != -1 ))
+        return false;
+    if (!no_last_state && ( o2map[out_hs_idx].axis != -1 || o2map[out_cs_idx].axis != -1 ))
+        return false;
+
+    auto i_order = no_init_state
+            ? std::vector<int>{i2map[in_dt_idx].from}
+            : std::vector<int>{i2map[in_dt_idx].from,
+                               i2map[in_hs_idx].from,
+                               i2map[in_cs_idx].from};
+    auto o_order = no_last_state
+            ? std::vector<int>{o2map[out_dt_idx].from}
+            : std::vector<int>{o2map[out_dt_idx].from,
+                               o2map[out_hs_idx].from,
+                               o2map[out_cs_idx].from};
+
+    // need to swap the i/o ports if they are not in natural order
+    std::string name = lstm->name + "_sequence";
+    auto rnn  = std::make_shared<RNNLayer>(LayerParams{ name, "RNN",  Precision::FP32 });
+    rnn->cellType = "LSTM";
+    rnn->axis = in_iter_rule.axis;
+    rnn->direction = in_iter_rule.stride == 1
+            ? RNNLayer::RNN_FWD
+            : RNNLayer::RNN_BWD;
+
+    rnn->_weights = dynamic_cast<WeightableLayer*>(lstm.get())->_weights;
+    rnn->blobs["weights"] = lstm->blobs["weights"];
+    rnn->_biases = dynamic_cast<WeightableLayer*>(lstm.get())->_biases;
+    rnn->blobs["biases"] = lstm->blobs["biases"];
+
+    for (int i : i_order) {
+        rnn->insData.push_back(ti->insData[i]);
+        rnn->insData.back().lock()->inputTo[ti->name] = rnn;
+    }
+    for (int i : o_order) {
+        rnn->outData.push_back(ti->outData[i]);
+        rnn->outData.back()->creatorLayer = rnn;
+    }
+
+    return true;
+}
+
+bool CombineLSTMSeq(const ICNNNetwork &net) {
+    // Apply action for all nodes
+    CNNNetForestDFS(CNNNetGetAllInputLayers(net), &convertToLSTMSequence, true);
+    return true;
+}
+
+bool UnrollTI(const ICNNNetwork &net) {
+    return false;
+}
+
+}  // namespace NetPass
+}  // namespace InferenceEngine
+
diff --git a/inference-engine/src/inference_engine/net_pass.h b/inference-engine/src/inference_engine/net_pass.h
new file mode 100644 (file)
index 0000000..8b19286
--- /dev/null
@@ -0,0 +1,31 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_icnn_network.hpp"
+
+#include <vector>
+#include <string>
+#include <map>
+
+namespace InferenceEngine {
+namespace NetPass {
+
+/**
+ * Try to detect the LSTM Sequence pattern inside TensorIterator layers and convert them
+ * @param net network to modify
+ * @return true if all TensorIterator layers were converted
+ */
+INFERENCE_ENGINE_API_CPP(bool) CombineLSTMSeq(const ICNNNetwork &net);
+
+/**
+ * Unroll all TensorIterator layers present in the network
+ * @param net network to modify
+ * @return true if all TensorIterator layers were unrolled successfully
+ */
+INFERENCE_ENGINE_API_CPP(bool) UnrollTI(const ICNNNetwork &net);
+
+}  // namespace NetPass
+}  // namespace InferenceEngine
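A minimal usage sketch of the two passes declared above, assuming a network already loaded through the usual Inference Engine flow; only `CombineLSTMSeq` and `UnrollTI` come from this header, the wrapper function is hypothetical:

```cpp
#include "net_pass.h"

// "net" is assumed to be a loaded ICNNNetwork.
void applyTIPasses(InferenceEngine::ICNNNetwork &net) {
    // Collapse TI bodies that match squeeze -> LSTMCell -> unsqueeze into a
    // single LSTM-sequence ("RNN") layer where the pattern is recognized.
    InferenceEngine::NetPass::CombineLSTMSeq(net);

    // Unroll any remaining TensorIterators. Note that in this revision
    // UnrollTI is still a stub that returns false.
    InferenceEngine::NetPass::UnrollTI(net);
}
```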
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -7,18 +6,16 @@
 #include <map>
 #include <vector>
 #include <string>
-#include <ie_precision.hpp>
+
 #include "details/ie_cnn_network_tools.h"
 #include "details/caseless.hpp"
-#include "ie_layers_property.hpp"
 #include "network_serializer.h"
-#include "../common/samples/common.hpp"
+#include "xml_parse_utils.h"
 
 using namespace InferenceEngine;
 using namespace details;
 
-template<typename T>
-std::string arrayToIRProperty(const T& property) {
+template<typename T> std::string arrayToIRProperty(const T& property) {
     std::string sProperty;
     for (size_t i = 0; i < property.size(); i++) {
         sProperty = sProperty + std::to_string(property[i]) +
@@ -27,8 +24,7 @@ std::string arrayToIRProperty(const T& property) {
     return sProperty;
 }
 
-template<typename T>
-std::string arrayRevertToIRProperty(const T& property) {
+template<typename T> std::string arrayRevertToIRProperty(const T& property) {
     std::string sProperty;
     for (size_t i = 0; i < property.size(); i++) {
         sProperty = sProperty + std::to_string(property[property.size() - i - 1]) +
@@ -38,41 +34,37 @@ std::string arrayRevertToIRProperty(const T& property) {
 }
 
 
-void CNNNetworkSerializer::Serialize(const std::string &xmlPath, const std::string &binPath,
-                                     ICNNNetwork &network) {
+void NetworkSerializer::serialize(
+    const std::string &xmlPath,
+    const std::string &binPath,
+    const InferenceEngine::ICNNNetwork& network) {
+
     std::ofstream ofsBin(binPath, std::ofstream::out | std::ofstream::binary);
+    if (!ofsBin) {
+        THROW_IE_EXCEPTION << "File '" << binPath << "' is not opened as out file stream";
+    }
 
     pugi::xml_document doc;
-
     pugi::xml_node net = doc.append_child("net");
-
-    char name[1024];
-    network.getName(name, 1024);
-
-    net.append_attribute("name").set_value(name);
+    net.append_attribute("name").set_value(network.getName().c_str());
     net.append_attribute("version").set_value("3");
-    net.append_attribute("batch").set_value("1");
+    net.append_attribute("batch").set_value(network.getBatchSize());
 
     pugi::xml_node layers = net.append_child("layers");
 
-    size_t dataOffset = 0;
-
-    std::string dataName = "data";
-
-    std::vector<CNNLayerPtr> ordered;
-
-    ordered = CNNNetSortTopologically(network);
-
+    const std::vector<CNNLayerPtr> ordered = CNNNetSortTopologically(network);
     std::map<CNNLayer::Ptr, int> matching;
     for (size_t i = 0; i < ordered.size(); i++) {
         matching[ordered[i]] = i;
     }
 
-    for (size_t i = 0; i < ordered.size(); i++) {
-        CNNLayerPtr node = ordered[i];
+    const std::string dataName = "data";
+    size_t dataOffset = 0;
+    for (size_t i = 0; i < ordered.size(); ++i) {
+        const CNNLayerPtr node = ordered[i];
 
         pugi::xml_node layer = layers.append_child("layer");
-        Precision precision = node->precision;
+        const Precision precision = node->precision;
         layer.append_attribute("name").set_value(node->name.c_str());
         layer.append_attribute("type").set_value(node->type.c_str());
         layer.append_attribute("precision").set_value(precision.name());
@@ -80,12 +72,11 @@ void CNNNetworkSerializer::Serialize(const std::string &xmlPath, const std::stri
 
         updateStdLayerParams(node);
 
-        auto &params = node->params;
-
+        const auto &params = node->params;
         if (params.size()) {
             pugi::xml_node data = layer.append_child(dataName.c_str());
 
-            for (auto it : params) {
+            for (const auto it : params) {
                 data.append_attribute(it.first.c_str()).set_value(it.second.c_str());
             }
         }
@@ -94,7 +85,7 @@ void CNNNetworkSerializer::Serialize(const std::string &xmlPath, const std::stri
             pugi::xml_node input = layer.append_child("input");
 
             for (size_t iport = 0; iport < node->insData.size(); iport++) {
-                DataPtr d = node->insData[iport].lock();
+                const DataPtr d = node->insData[iport].lock();
                 pugi::xml_node port = input.append_child("port");
 
                 port.append_attribute("id").set_value(iport);
@@ -111,14 +102,14 @@ void CNNNetworkSerializer::Serialize(const std::string &xmlPath, const std::stri
 
                 port.append_attribute("id").set_value(node->insData.size() + oport);
 
-                for (auto dim : node->outData[oport]->getDims()) {
+                for (const auto dim : node->outData[oport]->getDims()) {
                     port.append_child("dim").text().set(dim);
                 }
             }
         }
         if (node->blobs.size()) {
             auto blobsNode = layer.append_child("blobs");
-            for (auto dataIt : node->blobs) {
+            for (const auto dataIt : node->blobs) {
                 const char *dataPtr = dataIt.second->buffer().as<char*>();
 
                 size_t dataSize = dataIt.second->byteSize();
@@ -128,14 +119,22 @@ void CNNNetworkSerializer::Serialize(const std::string &xmlPath, const std::stri
 
                 dataOffset += dataSize;
                 ofsBin.write(dataPtr, dataSize);
+                if (!ofsBin.good()) {
+                    THROW_IE_EXCEPTION << "Error while writing to '" << binPath << "'";
+                }
             }
         }
     }
 
+    ofsBin.close();
+    if (!ofsBin.good()) {
+        THROW_IE_EXCEPTION << "Error during '" << binPath << "' closing";
+    }
+
     pugi::xml_node edges = net.append_child("edges");
 
     for (size_t i = 0; i < ordered.size(); i++) {
-        CNNLayer::Ptr node = ordered[i];
+        const CNNLayer::Ptr node = ordered[i];
 
         if (node->outData.size()) {
             auto itFrom = matching.find(node);
@@ -143,7 +142,7 @@ void CNNNetworkSerializer::Serialize(const std::string &xmlPath, const std::stri
                 THROW_IE_EXCEPTION << "Internal error, cannot find " << node->name << " in matching container during serialization of IR";
             }
             for (size_t oport = 0; oport < node->outData.size(); oport++) {
-                DataPtr outData = node->outData[oport];
+                const DataPtr outData = node->outData[oport];
                 for (auto inputTo : outData->inputTo) {
                     auto itTo = matching.find(inputTo.second);
                     if (itTo == matching.end()) {
@@ -178,7 +177,7 @@ void CNNNetworkSerializer::Serialize(const std::string &xmlPath, const std::stri
 
     // assuming that we have preprocess only for one input
     for (auto ii : inputInfo) {
-        auto pp = ii.second->getPreProcess();
+        const PreProcessInfo& pp = ii.second->getPreProcess();
         size_t  nInChannels = pp.getNumberOfChannels();
         if (nInChannels) {
             pugi::xml_node preproc = net.append_child("pre-process");
@@ -187,7 +186,7 @@ void CNNNetworkSerializer::Serialize(const std::string &xmlPath, const std::stri
             preproc.append_attribute("mean-precision").set_value(Precision(Precision::FP32).name());
 
             for (size_t ch = 0; ch < nInChannels; ch++) {
-                PreProcessChannel::Ptr &preProcessChannel = pp[ch];
+                const PreProcessChannel::Ptr &preProcessChannel = pp[ch];
                 auto channel = preproc.append_child("channel");
                 channel.append_attribute("id").set_value(ch);
 
@@ -207,9 +206,9 @@ void CNNNetworkSerializer::Serialize(const std::string &xmlPath, const std::stri
     ICNNNetworkStats* netNodesStats = nullptr;
     auto stats = net.append_child("statistics");
     network.getStats(&netNodesStats, nullptr);
-    NetworkStatsMap statsmap =  netNodesStats->getNodesStats();
+    const NetworkStatsMap statsmap =  netNodesStats->getNodesStats();
 
-    auto joinCommas = [&](std::vector<float>& v) -> std::string {
+    auto joinCommas = [&](const std::vector<float>& v) -> std::string {
         std::string res;
 
         for (size_t i = 0; i < v.size(); ++i) {
@@ -222,7 +221,7 @@ void CNNNetworkSerializer::Serialize(const std::string &xmlPath, const std::stri
         return res;
     };
 
-    for (auto itStats : statsmap) {
+    for (const auto itStats : statsmap) {
         auto layer = stats.append_child("layer");
 
         layer.append_child("name").text().set(itStats.first.c_str());
@@ -231,13 +230,14 @@ void CNNNetworkSerializer::Serialize(const std::string &xmlPath, const std::stri
         layer.append_child("max").text().set(joinCommas(itStats.second->_maxOutputs).c_str());
     }
 
-    doc.save_file(xmlPath.c_str());
+    if (!doc.save_file(xmlPath.c_str())) {
+        THROW_IE_EXCEPTION << "file '" << xmlPath << "' was not serialized";
+    }
 }
 
 
-void CNNNetworkSerializer::updateStdLayerParams(CNNLayer::Ptr layer) {
+void NetworkSerializer::updateStdLayerParams(const CNNLayer::Ptr layer) {
     auto layerPtr = layer.get();
-    auto type = layer->type;
     auto &params = layer->params;
 
     if (CaselessEq<std::string>()(layer->type, "power")) {
@@ -305,8 +305,6 @@ void CNNNetworkSerializer::updateStdLayerParams(CNNLayer::Ptr layer) {
     } else if (CaselessEq<std::string>()(layer->type, "reshape")) {
         // need to add support for the Flatten layer here if it is created via the API
         ReshapeLayer *lr = dynamic_cast<ReshapeLayer *>(layerPtr);
-        params["axis"] = std::to_string(lr->axis);
-        params["num_axes"] = std::to_string(lr->num_axes);
         params["dim"] = arrayToIRProperty(lr->shape);
     } else if (CaselessEq<std::string>()(layer->type, "Eltwise")) {
         EltwiseLayer *lr = dynamic_cast<EltwiseLayer *>(layerPtr);
@@ -378,4 +376,4 @@ void CNNNetworkSerializer::updateStdLayerParams(CNNLayer::Ptr layer) {
             pwlayer->blobs["biases"] = pwlayer->_biases;
         }
     }
-}
\ No newline at end of file
+}
diff --git a/inference-engine/src/inference_engine/network_serializer.h b/inference-engine/src/inference_engine/network_serializer.h
new file mode 100644 (file)
index 0000000..a67f4f4
--- /dev/null
@@ -0,0 +1,24 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+
+namespace InferenceEngine {
+namespace details {
+
+/**
+* Class for serializing a model, represented as an ICNNNetwork, to disk
+*/
+class NetworkSerializer {
+public:
+    static void serialize(const std::string &xmlPath, const std::string &binPath, const InferenceEngine::ICNNNetwork& network);
+
+private:
+    static void updateStdLayerParams(InferenceEngine::CNNLayer::Ptr layer);
+};
+
+}  // namespace details
+}  // namespace InferenceEngine
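A sketch of the intended call, with placeholder paths; `serialize` throws on any I/O failure, as the implementation above shows:

```cpp
#include "network_serializer.h"

// "network" is assumed to be a fully built ICNNNetwork; the paths are placeholders.
void saveIR(const InferenceEngine::ICNNNetwork &network) {
    // Writes the topology as IR v3 XML plus the weights as one raw binary file.
    InferenceEngine::details::NetworkSerializer::serialize("model.xml", "model.bin", network);
}
```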
index 5056803..acfe776 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -8,8 +7,6 @@
 #include <ie_icnn_network.hpp>
 #include "cnn_network_impl.hpp"
 
-#include <string>
-
 namespace pugi {
 class xml_node;
 }  // namespace pugi
@@ -22,8 +19,6 @@ struct IFormatParser {
     virtual CNNNetworkImplPtr Parse(pugi::xml_node &root) = 0;
 
     virtual void SetWeights(const TBlob<uint8_t>::Ptr &weights) = 0;
-
-    virtual void CopyBlobsByName(void* layerParsePrms, std::string name) = 0;
 };
 }  // namespace details
 }  // namespace InferenceEngine
index 7dc58bc..9988693 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -9,9 +8,14 @@
 #include <ie_blob.h>
 #include "inference_engine.hpp"
 
-using namespace InferenceEngine;
+namespace InferenceEngine {
+namespace PrecisionUtils {
 
-void PrecisionUtils::f16tof32Arrays(float *dst, const short *src, size_t nelem, float scale, float bias) {
+INFERENCE_ENGINE_API_CPP(void) f16tof32Arrays(float *dst,
+                                              const short *src,
+                                              size_t nelem,
+                                              float scale,
+                                              float bias) {
     const ie_fp16 *_src = reinterpret_cast<const ie_fp16 *>(src);
 
     for (size_t i = 0; i < nelem; i++) {
@@ -19,7 +23,11 @@ void PrecisionUtils::f16tof32Arrays(float *dst, const short *src, size_t nelem,
     }
 }
 
-void PrecisionUtils::f32tof16Arrays(short *dst, const float *src, size_t nelem, float scale, float bias) {
+INFERENCE_ENGINE_API_CPP(void) f32tof16Arrays(short *dst,
+                                              const float *src,
+                                              size_t nelem,
+                                              float scale,
+                                              float bias) {
     for (size_t i = 0; i < nelem; i++) {
         dst[i] = PrecisionUtils::f32tof16(src[i] * scale + bias);
     }
@@ -31,14 +39,13 @@ void PrecisionUtils::f32tof16Arrays(short *dst, const float *src, size_t nelem,
 #define EXP_MASK_F32 0x7F800000U
 #define EXP_MASK_F16     0x7C00U
 
-
 // small helper function to represent uint32_t value as float32
 inline float asfloat(uint32_t v) {
     return *reinterpret_cast<float *>(&v);
 }
 
 // Function to convert F16 into F32
-float PrecisionUtils::f16tof32(ie_fp16 x) {
+INFERENCE_ENGINE_API_CPP(float) f16tof32(ie_fp16 x) {
     // this is storage for output result
     uint32_t u = x;
 
@@ -80,7 +87,7 @@ float PrecisionUtils::f16tof32(ie_fp16 x) {
 
 // This function converts f32 to f16, rounding to the nearest value to minimize error
 // the denormal values are converted to 0.
-ie_fp16 PrecisionUtils::f32tof16(float x) {
+INFERENCE_ENGINE_API_CPP(ie_fp16) f32tof16(float x) {
     // create minimal positive normal f16 value in f32 format
     // exp:-14,mantissa:0 -> 2^-14 * 1.0
     static float min16 = asfloat((127 - 14) << 23);
@@ -143,3 +150,7 @@ ie_fp16 PrecisionUtils::f32tof16(float x) {
 
     return v.u | s;
 }
+
+}  // namespace PrecisionUtils
+}  // namespace InferenceEngine
+
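A round-trip sketch for the helpers above, with identity scale/bias; the header name is an assumption:

```cpp
#include "precision_utils.h"  // assumed header for these declarations
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> src = {0.1f, -2.5f, 65504.f /* largest normal fp16 */};
    std::vector<short> f16(src.size());
    std::vector<float> back(src.size());

    // f32 -> f16 with round-to-nearest; denormal results flush to zero.
    InferenceEngine::PrecisionUtils::f32tof16Arrays(f16.data(), src.data(), src.size(), 1.f, 0.f);
    // f16 -> f32; scale = 1 and bias = 0 leave values unchanged apart from precision loss.
    InferenceEngine::PrecisionUtils::f16tof32Arrays(back.data(), f16.data(), back.size(), 1.f, 0.f);

    for (size_t i = 0; i < src.size(); ++i)
        std::printf("%g -> %g\n", src[i], back[i]);
    return 0;
}
```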
index 14e89df..0781df1 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -16,6 +15,7 @@
 #include "ie_eltwise_shape_infer.hpp"
 #include "ie_permute_shape_infer.hpp"
 #include "ie_reshape_shape_infer.hpp"
+#include "ie_flatten_shape_infer.hpp"
 #include "ie_proposal_shape_infer.hpp"
 #include "ie_priorbox_shape_infer.hpp"
 #include "ie_upsampling_shape_infer.hpp"
@@ -32,6 +32,9 @@
 #include "ie_resample_shape_infer.hpp"
 #include "ie_interp_shape_infer.hpp"
 #include "ie_argmax_shape_infer.hpp"
+#include "ie_gemm_shape_infer.hpp"
+#include "ie_pad_shape_infer.hpp"
+#include "ie_gather_shape_infer.hpp"
 #include <algorithm>
 #include <memory>
 #include <string>
@@ -77,9 +80,10 @@ BuiltInShapeInferHolder::getShapeInferImpl(IShapeInferImpl::Ptr& impl, const cha
 void BuiltInShapeInferHolder::SetLogCallback(InferenceEngine::IErrorListener& listener) noexcept {}
 
 // Register without implementation just to protect from adding custom implementation for them
-REG_SHAPE_INFER_FOR_TYPE(DoNothingShapeProp, Input);
-REG_SHAPE_INFER_FOR_TYPE(DoNothingShapeProp, Memory);
-REG_SHAPE_INFER_FOR_TYPE(DoNothingShapeProp, Const);
+REG_SHAPE_INFER_FOR_TYPE(EqualShapeProp, Input);
+REG_SHAPE_INFER_FOR_TYPE(DoNothingShapeProp, Output);
+REG_SHAPE_INFER_FOR_TYPE(EqualShapeProp, Memory);
+REG_SHAPE_INFER_FOR_TYPE(EqualShapeProp, Const);
 
 // Outputs = Inputs
 REG_SHAPE_INFER_FOR_TYPE(EqualShapeProp, Activation);
@@ -113,7 +117,7 @@ REG_SHAPE_INFER_FOR_TYPE(InnerProductShapeProp, FullyConnected);
 REG_SHAPE_INFER_FOR_TYPE(SplitShapeProp, Split);
 REG_SHAPE_INFER_FOR_TYPE(SplitShapeProp, Slice);
 REG_SHAPE_INFER_FOR_TYPE(PermuteShapeProp, Permute);
-REG_SHAPE_INFER_FOR_TYPE(ReshapeShapeProp, Flatten);
+REG_SHAPE_INFER_FOR_TYPE(FlattenShapeProp, Flatten);
 REG_SHAPE_INFER_FOR_TYPE(ReshapeShapeProp, Reshape);
 REG_SHAPE_INFER_FOR_TYPE(DetectionOutputShapeProp, DetectionOutput);
 REG_SHAPE_INFER_FOR_TYPE(PriorBoxClusteredShapeProp, PriorBoxClustered);
@@ -133,6 +137,9 @@ REG_SHAPE_INFER_FOR_TYPE(ProposalShapeProp, Proposal);
 REG_SHAPE_INFER_FOR_TYPE(ReorgYoloShapeProp, ReorgYolo);
 REG_SHAPE_INFER_FOR_TYPE(RegionYoloShapeProp, RegionYolo);
 REG_SHAPE_INFER_FOR_TYPE(ArgMaxShapeProp, ArgMax);
+REG_SHAPE_INFER_FOR_TYPE(GemmShapeProp, Gemm);
+REG_SHAPE_INFER_FOR_TYPE(PadShapeProp, Pad);
+REG_SHAPE_INFER_FOR_TYPE(GatherShapeProp, Gather);
 
 }  // namespace ShapeInfer
 }  // namespace InferenceEngine
index 6d96cd5..7c1751f 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -13,7 +12,7 @@
 #include <vector>
 #include <debug.h>
 #include <cmath>
-#include <v2_format_parser.h>
+#include <ie_format_parser.h>
 
 namespace InferenceEngine {
 namespace ShapeInfer {
@@ -55,9 +54,7 @@ public:
         size_t PH = convLayer._padding[Y_AXIS];
         size_t PW = convLayer._padding[X_AXIS];
         size_t OC = convLayer._out_depth;
-        auto it = convLayer.params.find("auto_pad");
-        std::string padType;
-        if (it != convLayer.params.end()) padType = it->second;
+        std::string padType = convLayer._auto_pad;
         if (padType == "valid") {
             OH_temp = std::ceil((IH - KH + 1.f) / SH);
             OW_temp = std::ceil((IW - KW + 1.f) / SW);
@@ -68,26 +65,10 @@ public:
             OH_temp = std::floor(1.f * IH / SH);
             OW_temp = std::floor(1.f * IW / SW);
         } else {
-            auto ir_version = details::BaseCreator::version_;
-            bool isEndPaddingsSet = false;
-            try {
-                if (ir_version == 3) {
-                    auto pads_end = convLayer.GetParamAsUInts("pads_end");
-                    PR = pads_end[pads_end.size() - 1 - X_AXIS];
-                    PB = pads_end[pads_end.size() - 1 - Y_AXIS];
-                } else if (ir_version < 3) {
-                    PR = convLayer.GetParamAsInt("pad-r");
-                    PB = convLayer.GetParamAsInt("pad-b");
-                }
-                isEndPaddingsSet = true;
-            } catch (...) {}
-            if (!isEndPaddingsSet) {
-                OH_temp = std::floor((IH + 2.f * PH - KH) / SH) + 1.f;
-                OW_temp = std::floor((IW + 2.f * PW - KW) / SW) + 1.f;
-            } else {
-                OH_temp = std::floor(1.f * (IH + PH + PB - KH) / SH) + 1.f;
-                OW_temp = std::floor(1.f * (IW + PW + PR - KW) / SW) + 1.f;
-            }
+            PR = convLayer._pads_end[X_AXIS];
+            PB = convLayer._pads_end[Y_AXIS];
+            OH_temp = std::floor(1.f * (IH + PH + PB - KH) / SH) + 1.f;
+            OW_temp = std::floor(1.f * (IW + PW + PR - KW) / SW) + 1.f;
         }
         if (OH_temp < 0 || OW_temp < 0)
             THROW_IE_EXCEPTION << "New shapes " << details::dumpVec(dims) << " make output shape negative";
index ec5665e..c4f130a 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -11,7 +10,7 @@
 #include <memory>
 #include <string>
 #include <vector>
-#include <v2_format_parser.h>
+#include <ie_format_parser.h>
 
 namespace InferenceEngine {
 namespace ShapeInfer {
@@ -52,9 +51,7 @@ public:
         size_t PH = deconvLayer._padding[Y_AXIS];
         size_t PW = deconvLayer._padding[X_AXIS];
         size_t OC = deconvLayer._out_depth;
-        auto it = deconvLayer.params.find("auto_pad");
-        std::string padType;
-        if (it != deconvLayer.params.end()) padType = it->second;
+        std::string padType = deconvLayer._auto_pad;
         if (padType == "valid") {
             OHTemp = IH * SH + KH - 1;
             OWTemp = IW * SW + KW - 1;
@@ -62,26 +59,10 @@ public:
             OHTemp = IH * SH;
             OWTemp = IW * SW;
         } else {
-            auto ir_version = details::BaseCreator::version_;
-            bool isEndPaddingsSet = false;
-            try {
-                if (ir_version == 3) {
-                    auto pads_end = deconvLayer.GetParamAsUInts("pads_end");
-                    PR = pads_end[pads_end.size() - 1 - X_AXIS];
-                    PB = pads_end[pads_end.size() - 1 - Y_AXIS];
-                } else if (ir_version < 3) {
-                    PR = deconvLayer.GetParamAsInt("pad-r");
-                    PB = deconvLayer.GetParamAsInt("pad-b");
-                }
-                isEndPaddingsSet = true;
-            } catch (...) {}
-            if (!isEndPaddingsSet) {
-                OHTemp = SH * (IH - 1) + KH - 2 * PH;
-                OWTemp = SW * (IW - 1) + KW - 2 * PW;
-            } else {
-                OHTemp = SH * (IH - 1) + KH - PH - PB;
-                OWTemp = SW * (IW - 1) + KW - PW - PR;
-            }
+            PR = deconvLayer._pads_end[X_AXIS];
+            PB = deconvLayer._pads_end[Y_AXIS];
+            OHTemp = SH * (IH - 1) + KH - PH - PB;
+            OWTemp = SW * (IW - 1) + KW - PW - PR;
         }
         if (OHTemp < 0 || OWTemp < 0)
             THROW_IE_EXCEPTION << "New shapes " << details::dumpVec(dims) << " make output shape negative";
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_flatten_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_flatten_shape_infer.hpp
new file mode 100644 (file)
index 0000000..bdde976
--- /dev/null
@@ -0,0 +1,68 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <description_buffer.hpp>
+#include "ie_built_in_impl.hpp"
+#include <ie_layers.h>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <debug.h>
+#include <functional>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for Flatten layer
+ */
+class FlattenShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit FlattenShapeProp(const std::string &type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<SizeVector> &inShapes,
+                         const std::map<std::string, std::string> &params,
+                         const std::map<std::string, Blob::Ptr> &blobs,
+                         std::vector<SizeVector> &outShapes) override {
+        LayerParams lp{};
+        ReshapeLayer reshapeLayer(lp);
+        reshapeLayer.params = params;
+        reshapeLayer.type = _type;
+        validate(&reshapeLayer, inShapes, params, blobs);
+
+        auto inputShape = inShapes[0];
+        size_t inputShapeTotal = std::accumulate(inputShape.begin(), inputShape.end(), 1lu, std::multiplies<size_t>());
+        SizeVector outShape;
+
+        int numAxes = reshapeLayer.num_axes;
+        int axis = reshapeLayer.axis;
+        size_t notFlatten = 1;
+        if (numAxes == -1 && axis == 0) {
+            outShape = {inputShapeTotal};
+        } else {
+            if (axis > 0) {
+                for (int i = 0; i < axis; i++) {
+                    notFlatten *= inputShape[i];
+                    outShape.push_back(inputShape[i]);
+                }
+            }
+            outShape.push_back(1);
+            if (numAxes > 0) {
+                for (int i = numAxes + 1; i < inputShape.size(); i++) {
+                    notFlatten *= inputShape[i];
+                    outShape.push_back(inputShape[i]);
+                }
+            }
+            outShape[axis] = inputShapeTotal / notFlatten;
+        }
+
+        outShapes.emplace_back(outShape);
+    }
+};
+
+}  // namespace ShapeInfer
+}  // namespace InferenceEngine
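A standalone re-statement of the Flatten rule implemented above (not the IE class itself), with one worked example:

```cpp
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

// axis/num_axes semantics as in FlattenShapeProp: dims before "axis" are kept,
// dims from "axis" up to "num_axes" are collapsed into one.
std::vector<size_t> flattenShape(const std::vector<size_t> &in, int axis, int numAxes) {
    size_t total = std::accumulate(in.begin(), in.end(), (size_t)1, std::multiplies<size_t>());
    if (numAxes == -1 && axis == 0) return {total};
    std::vector<size_t> out;
    size_t notFlatten = 1;
    for (int i = 0; i < axis; i++) { notFlatten *= in[i]; out.push_back(in[i]); }
    out.push_back(1);
    if (numAxes > 0)
        for (size_t i = numAxes + 1; i < in.size(); i++) { notFlatten *= in[i]; out.push_back(in[i]); }
    out[axis] = total / notFlatten;
    return out;
}

int main() {
    for (size_t d : flattenShape({2, 3, 4, 5}, /*axis=*/1, /*num_axes=*/-1))
        std::printf("%zu ", d);  // prints: 2 60
    return 0;
}
```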
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_gather_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_gather_shape_infer.hpp
new file mode 100644 (file)
index 0000000..41641cb
--- /dev/null
@@ -0,0 +1,52 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_built_in_impl.hpp"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for Gather layer
+ */
+class GatherShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit GatherShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        LayerParams lp{};
+        GatherLayer gatherLayer(lp);
+        gatherLayer.params = params;
+        gatherLayer.type = _type;
+        validate(&gatherLayer, inShapes, params, blobs);
+
+        int axis = gatherLayer.axis;
+        if (axis < 0)
+            axis += inShapes[0].size();
+
+        outShapes.resize(1);
+        outShapes[0].resize(inShapes[0].size() + inShapes[1].size() - 1);
+        for (size_t i = 0; i < axis; i++)
+            outShapes[0][i] = inShapes[0][i];
+
+        for (size_t i = 0; i < inShapes[1].size(); i++)
+            outShapes[0][i + axis] = inShapes[1][i];
+
+        for (size_t i = axis + 1; i < inShapes[0].size(); i++)
+            outShapes[0][i + inShapes[1].size() - 1] = inShapes[0][i];
+    }
+};
+
+}  // namespace ShapeInfer
+}  // namespace InferenceEngine
+
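The resulting rule: the output shape is the data shape with the gathered axis replaced by the whole indices shape. A standalone re-statement with a worked example:

```cpp
#include <cstdio>
#include <vector>

std::vector<size_t> gatherShape(std::vector<size_t> data, const std::vector<size_t> &idx, int axis) {
    if (axis < 0) axis += (int)data.size();            // negative axis counts from the back
    std::vector<size_t> out(data.begin(), data.begin() + axis);
    out.insert(out.end(), idx.begin(), idx.end());     // indices shape replaces the axis dim
    out.insert(out.end(), data.begin() + axis + 1, data.end());
    return out;
}

int main() {
    for (size_t d : gatherShape({3, 4, 5}, {2, 6}, /*axis=*/1))
        std::printf("%zu ", d);  // prints: 3 2 6 5
    return 0;
}
```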
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_gemm_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_gemm_shape_infer.hpp
new file mode 100644 (file)
index 0000000..5cac2f5
--- /dev/null
@@ -0,0 +1,62 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <description_buffer.hpp>
+#include "ie_built_in_impl.hpp"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <debug.h>
+#include <cmath>
+#include <algorithm>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for Gemm layer
+ */
+class GemmShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit GemmShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        LayerParams lp{};
+        GemmLayer gemmLayer(lp);
+        gemmLayer.params = params;
+        gemmLayer.type = _type;
+        validate(&gemmLayer, inShapes, params, blobs);
+
+        auto dims0 = inShapes[0];
+        auto dims1 = inShapes[1];
+
+        SizeVector shapes;
+        for (int idx = 0; idx < dims0.size() - 2; idx++) {
+            unsigned long max_dim = dims0[idx] > dims1[idx] ? dims0[idx] : dims1[idx];
+
+            if (inShapes.size() == 3) {
+                auto dims2 = inShapes[2];
+                max_dim = max_dim > dims2[idx] ? max_dim : dims2[idx];
+            }
+
+            shapes.push_back(max_dim);
+        }
+
+        unsigned long xAxis = dims0.size() - 1;
+        unsigned long yAxis = dims0.size() - 2;
+
+        shapes.push_back(dims0[yAxis]);
+        shapes.push_back(dims1[xAxis]);
+        outShapes.push_back(shapes);
+    }
+};
+
+}  // namespace ShapeInfer
+}  // namespace InferenceEngine
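In effect the leading (batch) dims are broadcast by taking the elementwise maximum over the inputs, and the trailing two dims become [rows of the first input, columns of the second]. A re-statement that ignores the optional third (bias) input:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

std::vector<size_t> gemmShape(const std::vector<size_t> &a, const std::vector<size_t> &b) {
    std::vector<size_t> out;
    for (size_t i = 0; i + 2 < a.size(); i++)    // batch dims: elementwise max
        out.push_back(std::max(a[i], b[i]));
    out.push_back(a[a.size() - 2]);              // M rows from the first input
    out.push_back(b[b.size() - 1]);              // N columns from the second input
    return out;
}

int main() {
    for (size_t d : gemmShape({2, 3, 4}, {2, 4, 5}))
        std::printf("%zu ", d);  // prints: 2 3 5
    return 0;
}
```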
diff --git a/inference-engine/src/inference_engine/shape_infer/built-in/ie_pad_shape_infer.hpp b/inference-engine/src/inference_engine/shape_infer/built-in/ie_pad_shape_infer.hpp
new file mode 100644 (file)
index 0000000..2fb1c49
--- /dev/null
@@ -0,0 +1,41 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_built_in_impl.hpp"
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace InferenceEngine {
+namespace ShapeInfer {
+
+/**
+ *@brief Implementation of Shape inference for Pad layer
+ */
+class PadShapeProp : public BuiltInShapeInferImpl {
+public:
+    explicit PadShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+
+    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
+                         const std::map<std::string, std::string>& params,
+                         const std::map<std::string, Blob::Ptr>& blobs,
+                         std::vector<SizeVector>& outShapes) override {
+        LayerParams lp{};
+        PadLayer padLayer(lp);
+        padLayer.params = params;
+        padLayer.type = _type;
+        validate(&padLayer, inShapes, params, blobs);
+
+        outShapes.push_back(inShapes[0]);
+        for (size_t i = 0; i < outShapes[0].size(); i++) {
+            outShapes[0][i] += padLayer.pads_begin[i] + padLayer.pads_end[i];
+        }
+    }
+};
+
+}  // namespace ShapeInfer
+}  // namespace InferenceEngine
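The rule is simply out[i] = in[i] + pads_begin[i] + pads_end[i], per dimension. A worked example with assumed values:

```cpp
#include <cstdio>
#include <vector>

int main() {
    // Hypothetical Pad: NCHW input 1x3x32x32 with one pixel of spatial padding on each side.
    std::vector<size_t> in = {1, 3, 32, 32};
    std::vector<size_t> begin = {0, 0, 1, 1}, end = {0, 0, 1, 1};
    for (size_t i = 0; i < in.size(); i++)
        std::printf("%zu ", in[i] + begin[i] + end[i]);  // prints: 1 3 34 34
    return 0;
}
```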
index 28f43f0..4850c8a 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -11,7 +10,7 @@
 #include <string>
 #include <vector>
 #include <cmath>
-#include <v2_format_parser.h>
+#include <ie_format_parser.h>
 
 namespace InferenceEngine {
 namespace ShapeInfer {
@@ -47,9 +46,7 @@ public:
         size_t PH = poolLayer._padding[Y_AXIS];
         size_t PW = poolLayer._padding[X_AXIS];
 
-        auto it = poolLayer.params.find("auto_pad");
-        std::string padType;
-        if (it != poolLayer.params.end()) padType = it->second;
+        std::string padType = poolLayer._auto_pad;
         if (padType == "valid") {
             OHTemp = std::ceil((IH - KH + 1.f) / SH);
             OWTemp = std::ceil((IW - KW + 1.f) / SW);
@@ -60,7 +57,7 @@ public:
             OHTemp = std::floor(1.f * IH / SH);
             OWTemp = std::floor(1.f * IW / SW);
         } else {
-            it = std::find_if(
+            auto it = std::find_if(
                 poolLayer.params.begin(),
                 poolLayer.params.end(),
                 [](decltype(*poolLayer.params.begin()) & lhs) {
@@ -70,27 +67,10 @@ public:
             if (it != poolLayer.params.end()) {
                 if (it->second == "floor") isCeil = false;
             }
-
-            auto ir_version = details::BaseCreator::version_;
-            bool isEndPaddingsSet = false;
-            try {
-                if (ir_version == 3) {
-                    auto pads_end = poolLayer.GetParamAsUInts("pads_end");
-                    PR = pads_end[pads_end.size() - 1 - X_AXIS];
-                    PB = pads_end[pads_end.size() - 1 - Y_AXIS];
-                } else if (ir_version < 3) {
-                    PR = poolLayer.GetParamAsInt("pad-r");
-                    PB = poolLayer.GetParamAsInt("pad-b");
-                }
-                isEndPaddingsSet = true;
-            } catch (...) {}
-            if (!isEndPaddingsSet) {
-                OHTemp += (IH + 2.f * PH - KH) / SH;
-                OWTemp += (IW + 2.f * PW - KW) / SW;
-            } else {
-                OHTemp += 1.f * (IH + PH + PB - KH) / SH;
-                OWTemp += 1.f * (IW + PW + PR - KW) / SW;
-            }
+            PR = poolLayer._pads_end[X_AXIS];
+            PB = poolLayer._pads_end[Y_AXIS];
+            OHTemp += 1.f * (IH + PH + PB - KH) / SH;
+            OWTemp += 1.f * (IW + PW + PR - KW) / SW;
             if (isCeil) {
                 OHTemp = std::ceil(OHTemp);
                 OWTemp = std::ceil(OWTemp);
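Assuming OHTemp/OWTemp start at 1 (their initialization sits outside this hunk), the branch computes OH = 1 + (IH + PH + PB - KH) / SH, rounded up or down according to the layer's rounding attribute (isCeil is only cleared when the attribute says "floor"). An arithmetic check where ceil and floor differ:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // Hypothetical pooling: 14-pixel axis, kernel 3, stride 2, no padding.
    const float IH = 14, KH = 3, SH = 2, PH = 0, PB = 0;
    const float t = (IH + PH + PB - KH) / SH;  // 5.5
    std::printf("ceil: %g, floor: %g\n",
                1 + std::ceil(t),    // 7 (rounding type "ceil")
                1 + std::floor(t));  // 6 (rounding type "floor")
    return 0;
}
```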
index a1f96d3..97b6571 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -12,6 +11,8 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include <debug.h>
+#include <functional>
 
 namespace InferenceEngine {
 namespace ShapeInfer {
@@ -21,83 +22,49 @@ namespace ShapeInfer {
  */
 class ReshapeShapeProp : public BuiltInShapeInferImpl {
 public:
-    explicit ReshapeShapeProp(const std::string& type) : BuiltInShapeInferImpl(type) {}
+    explicit ReshapeShapeProp(const std::string &type) : BuiltInShapeInferImpl(type) {}
 
-    void inferShapesImpl(const std::vector<SizeVector>& inShapes,
-                         const std::map<std::string, std::string>& params,
-                         const std::map<std::string, Blob::Ptr>& blobs,
-                         std::vector<SizeVector>& outShapes) override {
+    void inferShapesImpl(const std::vector<SizeVector> &inShapes,
+                         const std::map<std::string, std::string> &params,
+                         const std::map<std::string, Blob::Ptr> &blobs,
+                         std::vector<SizeVector> &outShapes) override {
         LayerParams lp{};
         ReshapeLayer reshapeLayer(lp);
         reshapeLayer.params = params;
         reshapeLayer.type = _type;
         validate(&reshapeLayer, inShapes, params, blobs);
-        std::string in2out = reshapeLayer.GetParamAsString("in2out", "");
 
-        auto firstInputShape = inShapes[0];
+        auto inputShape = inShapes[0];
+        size_t inputShapeTotal = std::accumulate(inputShape.begin(), inputShape.end(), 1lu, std::multiplies<size_t>());
         SizeVector outShape;
-        if (!reshapeLayer.shape.empty()) {
-            for (size_t i = 0; i < reshapeLayer.shape.size(); i++) {
-                outShape.push_back(reshapeLayer.shape[i] < 0 ?
-                                   0 :
-                                   ((reshapeLayer.shape[i] == 0) ?
-                                    firstInputShape[i] :
-                                    static_cast<size_t>(reshapeLayer.shape[i])));
-            }
-        } else {
-            for (size_t i = 0; i < reshapeLayer.axis; i++) {
-                outShape.push_back(firstInputShape[i]);
-            }
-            size_t shapeTill = reshapeLayer.num_axes < 0 ? firstInputShape.size() : reshapeLayer.num_axes;
-            outShape.push_back(1);
-
-            for (size_t i = shapeTill; i < firstInputShape.size(); i++) {
-                outShape.push_back(firstInputShape[i]);
-            }
-        }
-
-        if (details::product(firstInputShape) != details::product(outShape)) {
-            std::istringstream stream(in2out);
-            std::string str;
-            std::vector<int> inMap;
-            std::vector<int> outMap;
-            while (getline(stream, str, ',')) {
-                std::istringstream num_stream(str);
-                std::string num;
-                getline(num_stream, num, '-');
-                inMap.push_back(std::stoi(num));
-                getline(num_stream, num, '-');
-                outMap.push_back(std::stoi(num));
-            }
+        std::vector<int> reshapeMask = reshapeLayer.shape;
 
-            std::vector<bool> changedField;
-            for (const auto& dim : outShape) {
-                changedField.push_back(false);
-            }
-            for (size_t i = 0; i < inMap.size(); i++) {
-                if (firstInputShape[inMap[i]]) {
-                    if (outShape[outMap[i]] == 0)
-                        continue;
-                    if (!changedField[outMap[i]])
-                        outShape[outMap[i]] = 1;
-                    outShape[outMap[i]] *= firstInputShape[inMap[i]];
-                    changedField[outMap[i]] = true;
+        if (reshapeMask.empty()) {
+            outShape = {inputShapeTotal};
+        } else {
+            size_t res = 1;
+            for (int i = 0; i < reshapeMask.size(); i++) {
+                if (reshapeMask[i] == 0) {
+                    res *= inputShape[i];
+                } else if (reshapeMask[i] != -1) {
+                    res *= reshapeMask[i];
                 }
             }
-
-            for (size_t& i : outShape) {
-                if (!i) {
-                    size_t outShapeMul(1), totalMul(1);
-                    for (auto& dim : outShape) {
-                        if (dim)
-                            outShapeMul *= dim;
-                    }
-                    for (auto& dim : firstInputShape) {
-                        totalMul *= dim;
-                    }
-                    i = totalMul / outShapeMul;
+            size_t newDim = inputShapeTotal / res;
+            for (int i = 0; i < reshapeMask.size(); i++) {
+                if (reshapeMask[i] == 0) {
+                    outShape.push_back(inputShape[i]);
+                } else if (reshapeMask[i] == -1) {
+                    outShape.push_back(newDim);
+                } else {
+                    outShape.push_back(reshapeMask[i]);
                 }
             }
+            size_t outputShapeTotal = std::accumulate(outShape.begin(), outShape.end(), 1lu, std::multiplies<size_t>());
+            if (inputShapeTotal != outputShapeTotal)
+                THROW_IE_EXCEPTION << "Invalid reshape mask (dim attribute): number of elements in input: "
+                                   << details::dumpVec(inputShape) << " and output: " << details::dumpVec(outShape)
+                                   << " mismatch";
         }
         outShapes.emplace_back(outShape);
     }
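The rewritten logic treats the `dim` attribute as a mask: 0 copies the input dim at the same position, -1 is inferred so the totals match, anything else is taken literally. A standalone re-statement (validation omitted) with a worked example:

```cpp
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

std::vector<size_t> reshapeByMask(const std::vector<size_t> &in, const std::vector<int> &mask) {
    size_t total = std::accumulate(in.begin(), in.end(), (size_t)1, std::multiplies<size_t>());
    size_t known = 1;  // product of all dims fixed by the mask
    for (size_t i = 0; i < mask.size(); i++) {
        if (mask[i] == 0) known *= in[i];
        else if (mask[i] != -1) known *= mask[i];
    }
    std::vector<size_t> out;
    for (size_t i = 0; i < mask.size(); i++) {
        if (mask[i] == 0) out.push_back(in[i]);
        else if (mask[i] == -1) out.push_back(total / known);  // inferred dim
        else out.push_back((size_t)mask[i]);
    }
    return out;
}

int main() {
    for (size_t d : reshapeByMask({2, 3, 4}, {0, -1}))
        std::printf("%zu ", d);  // prints: 2 12
    return 0;
}
```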
index 8c9d00f..c2651a0 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -10,6 +9,7 @@
 #include <vector>
 #include <map>
 #include <set>
+#include <details/ie_exception.hpp>
 #include "shape_infer/ie_reshape_launcher.hpp"
 #include "shape_infer/ie_reshape_io_controllers.hpp"
 
@@ -37,8 +37,15 @@ OutputController* DefaultInitializer::createOutputController(const CNNLayer* lay
 ReshapeLauncher::ReshapeLauncher(const CNNLayer* layer, const IShapeInferImpl::Ptr& impl,
                                  const DefaultInitializer::Ptr& initializer) : _layer(layer), _impl(impl) {
     initializer->check(layer, impl);
-    _iController = initializer->createInputController(layer);
-    _oController = initializer->createOutputController(layer);
+    try {
+        _iController = initializer->createInputController(layer);
+        _oController = initializer->createOutputController(layer);
+    } catch (...) {
+        auto exception = std::current_exception();
+        delete _iController;
+        delete _oController;
+        std::rethrow_exception(exception);
+    }
 }
 
 ReshapeLauncher::~ReshapeLauncher() {
index d7043ce..89dd72e 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "shape_infer/ie_reshaper.hpp"
 #include "details/caseless.hpp"
 #include "details/ie_cnn_network_tools.h"
+#include "ie_reshaper.hpp"
 
 using namespace InferenceEngine;
 using namespace InferenceEngine::details;
 using namespace ShapeInfer;
 
+Reshaper::Reshaper(const Context &context, Network::Ptr& network): ctx(context), network(network) {}
+
 Reshaper::Reshaper(ICNNNetwork& network, const LauncherCreator::Ptr& launcherCreator) {
     auto builtIn = std::make_shared<BuiltInShapeInferHolder>();
     _allTypes = getTypeNamesFromExtension(builtIn);
@@ -51,6 +53,12 @@ Reshaper::Reshaper(ICNNNetwork& network, const LauncherCreator::Ptr& launcherCre
 
 void Reshaper::AddExtension(const IShapeInferExtensionPtr& extension) {
     if (!extension) THROW_IE_EXCEPTION << "Failed to add empty shape infer extension";
+
+    if (network) {
+        ctx.addExtension(extension);
+        return;
+    }
+
     auto newLayerTypes = getTypeNamesFromExtension(extension);
     std::string badLayerTypes;
     for (const auto& type : newLayerTypes) {
@@ -103,7 +111,10 @@ ReshapeLauncher::Ptr Reshaper::getLauncherByLayerName(const std::string& layerNa
     return *foundLauncher;
 }
 
-void Reshaper::run(const std::map<std::string, SizeVector>& inputShapes) {
+StatusCode Reshaper::run(const std::map<std::string, SizeVector>& inputShapes, ResponseDesc* resp) {
+    if (network) {
+        return networkShapeInfer(inputShapes, resp);
+    }
     // Reset all shapes from previous run
     for (const auto& launcher : _launchers) {
         launcher->reset();
@@ -135,6 +146,79 @@ void Reshaper::run(const std::map<std::string, SizeVector>& inputShapes) {
         auto foundLauncher = getLauncherByLayerName(layer->name);
         foundLauncher->applyChanges(layer.get());
     }
+    return OK;
+}
+
+StatusCode Reshaper::networkShapeInfer(const std::map<std::string, SizeVector>& inputShapes, ResponseDesc* resp) {
+    if (!network)
+        return DescriptionBuffer(GENERAL_ERROR, resp) << "Cannot infer shapes! Network is not loaded.";
+    std::vector<Layer> propagatedLayers;
+    Network propagatedNetwork(*network);
+
+    // Set new input shapes
+    for (auto& layer : propagatedNetwork) {
+        if (inputShapes.find(layer->getName()) == inputShapes.end() ||
+                details::CaselessEq<std::string>()(layer->getType(), "Const"))
+            continue;
+
+        if (layer->getOutputPorts().size() != 1)
+            return DescriptionBuffer(GENERAL_ERROR, resp) << "Cannot infer shapes! Input layers can have only one output port.";
+
+        layer->getOutputPorts()[0].shape() = inputShapes.find(layer->getName())->second;
+    }
+
+    // Try to propagate shapes
+    for (auto& layer : propagatedNetwork) {
+        const auto impl = ctx.getShapeInferImpl(layer->getType());
+        if (!impl)
+            return DescriptionBuffer(NOT_FOUND, resp) <<
+                        "Cannot infer shapes! Shape infer implementation was not found for type " << layer->getType() << ".";
+        std::vector<SizeVector> inShapes;
+        std::vector<SizeVector> outShapes;
+        std::map<std::string, std::string> params;
+        std::map<std::string, Blob::Ptr> blobs;
+
+        for (const auto& inPort : layer->getInputPorts().empty() ? layer->getOutputPorts() : layer->getInputPorts()) {
+            inShapes.push_back(inPort.shape());
+        }
+        if (layer->getParameters()) {
+            for (const auto& it  : layer->getParameters()->getParameters()) {
+                params[it.first] = it.second;
+            }
+            for (const auto& it  : layer->getParameters()->getConstantData()) {
+                blobs[it.first] = std::const_pointer_cast<Blob>(it.second);
+            }
+        }
+
+        StatusCode sts = impl->inferShapes(inShapes, params, blobs, outShapes, resp);
+        if (sts != OK)
+            return sts;
+
+        if (outShapes.size() != layer->getOutputPorts().size())
+            return DescriptionBuffer(GENERAL_ERROR, resp) << "Cannot infer shapes! The number of output shapes is not equal the number of output ports.";
+
+        for (size_t i = 0; i < outShapes.size(); i++) {
+            layer->getOutputPorts()[i].shape() = outShapes[i];
+        }
+        for (const auto& connection : propagatedNetwork.getLayerConnections(layer->getId())) {
+            if (connection.from().layerId() != layer->getId())
+                continue;
+            auto nextLayer = propagatedNetwork.getLayer(connection.to().layerId());
+            nextLayer->getInputPorts()[connection.to().portId()].shape() = outShapes[connection.from().portId()];
+        }
+    }
+
+    // Apply new shapes
+    for (auto& layer : *network) {
+        const auto& propagatedLayer = propagatedNetwork.getLayer(layer->getId());
+        for (size_t i = 0; i < layer->getInputPorts().size(); i++) {
+            layer->getInputPorts()[i].shape() = propagatedLayer->getInputPorts()[i].shape();
+        }
+        for (size_t i = 0; i < layer->getOutputPorts().size(); i++) {
+            layer->getOutputPorts()[i].shape() = propagatedLayer->getOutputPorts()[i].shape();
+        }
+    }
+    return OK;
 }
 
 caseless_set<std::string> Reshaper::getTypeNamesFromExtension(const IShapeInferExtensionPtr& extension) {
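A hypothetical caller of the new StatusCode-returning `run`; the input name and shape are placeholders:

```cpp
#include <map>
#include "shape_infer/ie_reshaper.hpp"

InferenceEngine::StatusCode reshapeToBatch4(InferenceEngine::ShapeInfer::Reshaper &reshaper) {
    std::map<std::string, InferenceEngine::SizeVector> inputShapes;
    inputShapes["data"] = {4, 3, 224, 224};  // placeholder input name and shape

    InferenceEngine::ResponseDesc resp;
    // Builder-based networks are dispatched to networkShapeInfer(); legacy
    // ICNNNetwork instances go through the launcher path, as run() shows above.
    return reshaper.run(inputShapes, &resp);
}
```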
index 00550a7..4f18507 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <memory>
 
 #include <ie_layers.h>
+#include <ie_context.hpp>
+#include "../ie_network.hpp"
 #include "details/caseless.hpp"
-#include "shape_infer/built-in/ie_built_in_holder.hpp"
 #include "ie_reshape_launcher.hpp"
+#include "ie_icnn_network.hpp"
 
 namespace InferenceEngine {
 namespace ShapeInfer {
@@ -61,6 +62,8 @@ public:
     explicit Reshaper(ICNNNetwork& network,
                       const LauncherCreator::Ptr& creator = std::make_shared<LauncherCreator>());
 
+    Reshaper(const Context& context, details::Network::Ptr& network);
+
     virtual ~Reshaper() = default;
 
     /**
@@ -74,20 +77,22 @@ public:
      * Throws if shape infer failed without corruption of original shapes
      * @param inputShapes - Map of input names (data) to their input shapes.
      */
-    void run(const std::map<std::string, SizeVector>& inputShapes);
-
-    using Ptr = std::shared_ptr<Reshaper>;
+    StatusCode run(const std::map<std::string, SizeVector>& inputShapes, ResponseDesc* resp = nullptr);
 private:
     ReshapeLauncher::Ptr getLauncherByLayerName(const std::string& layerName) const;
 
+    StatusCode networkShapeInfer(const std::map<std::string, SizeVector>& inputShapes, ResponseDesc* resp);
+
     static InferenceEngine::details::caseless_set<std::string> getTypeNamesFromExtension(const IShapeInferExtensionPtr& extension);
 
-private:
     std::vector<IShapeInferExtensionPtr> _extensions;
     std::set<ReshapeLauncher::Ptr> _launchers;
     std::vector<CNNLayerPtr> _allSortedLayers{};
     std::set<CNNLayerPtr> _inputLayers{};
     InferenceEngine::details::caseless_set<std::string> _allTypes;
+
+    Context ctx;
+    details::Network::Ptr network;
 };
 
 }  // namespace ShapeInfer
diff --git a/inference-engine/src/inference_engine/v2_layer_parsers.cpp b/inference-engine/src/inference_engine/v2_layer_parsers.cpp
deleted file mode 100644 (file)
index b7160e8..0000000
+++ /dev/null
@@ -1,182 +0,0 @@
-// Copyright (C) 2018 Intel Corporation
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include "v2_layer_parsers.h"
-#include "ie_cnn_net_reader_impl.h"
-
-using namespace InferenceEngine;
-using namespace InferenceEngine::details;
-
-CNNLayer::Ptr ActivationLayerCreator::CreateLayer(pugi::xml_node& node, LayerParseParameters& layerParsePrms)  {
-    pugi::xml_node dn = GetChild(node, { "data", "activation_data" }, false);
-    if (dn.empty()) {
-        THROW_IE_EXCEPTION << "Activation layer has no data node";
-    }
-
-    std::string type;
-    for (auto ait = dn.attributes_begin(); ait != dn.attributes_end(); ++ait) {
-        pugi::xml_attribute attr = *ait;
-        if (CaselessEq<std::string>()("type", attr.name())) {
-            if (!type.empty()) {
-                THROW_IE_EXCEPTION << "Activation layer has multiple types";
-            }
-            type = attr.value();
-        }
-    }
-
-    static caseless_map<std::string, std::shared_ptr<BaseCreator>> activationCreators = {
-        {"relu", std::make_shared<V2LayerCreator<ReLULayer>>("ReLU")},
-        {"prelu", std::make_shared<V2LayerCreator<PReLULayer>>("PReLU")},
-        {"clamp", std::make_shared<V2LayerCreator<ClampLayer>>("Clamp")},
-        {"elu", std::make_shared<V2LayerCreator<CNNLayer>>("ELU")},
-        {"sigmoid", std::make_shared<V2LayerCreator<CNNLayer>>("Sigmoid")},
-        {"tanh", std::make_shared<V2LayerCreator<CNNLayer>>("TanH")},
-    };
-
-    auto activationBuilder = activationCreators.find(type);
-    if (activationBuilder == activationCreators.end()) {
-        THROW_IE_EXCEPTION << "Unsupported Activation layer type: " << type;
-    }
-
-    auto activation = activationBuilder->second->CreateLayer(node, layerParsePrms);
-
-    activation->type = activationBuilder->first;
-    activation->params.erase("type");
-
-    return activation;
-}
-
-CNNLayer::Ptr TILayerCreator::CreateLayer(pugi::xml_node& node, LayerParseParameters& layerParsePrms) {
-    std::string ti_name = node.attribute("name").as_string();
-
-    auto bn = node.child("body");
-    if (bn.empty()) {
-        THROW_IE_EXCEPTION << "TensorIterator " << ti_name << " has no body";
-    }
-
-    std::vector<TensorIterator::Port> _input_ports;
-    std::vector<TensorIterator::Port> _output_ports;
-    std::vector<TensorIterator::BackEdge> _backEdges;
-
-    pugi::xml_node bedges = node.child("back_edges");
-    FOREACH_CHILD(_ec, bedges, "edge") {
-        int fromLayer = GetIntAttr(_ec, "from-layer");
-        int fromPort = GetIntAttr(_ec, "from-port");
-        int toLayer = GetIntAttr(_ec, "to-layer");
-        int toPort = GetIntAttr(_ec, "to-port");
-
-        _backEdges.push_back({ fromLayer, fromPort, toLayer, toPort });
-    }
-
-    pugi::xml_node ports = node.child("port_map");
-    for (auto p = ports.first_child(); p; p = p.next_sibling()) {
-        int external_port_id = GetIntAttr(p, "external_port_id");
-        int internal_layer_id = GetIntAttr(p, "internal_layer_id");
-        int internal_port_id = GetIntAttr(p, "internal_port_id");
-
-        int axis = GetIntAttr(p, "axis", -1);
-        int part_size = GetIntAttr(p, "part_size", -1);
-        int stride = GetIntAttr(p, "stride", 0);
-
-        TensorIterator::Port port{ external_port_id, internal_layer_id, internal_port_id, axis, part_size, stride };
-
-        std::string pname(p.name());
-        if ( pname == "input" ) {
-            _input_ports.push_back(port);
-        } else if (pname == "output") {
-            _output_ports.push_back(port);
-        } else {
-            THROW_IE_EXCEPTION << "Unknown item {" << pname << "} in port map of TensorIterator " << ti_name;
-        }
-    }
-
-    int prev_ir_version = BaseCreator::version_;
-    auto pReader = std::make_shared<CNNNetReaderImpl>(std::make_shared<V2FormatParserCreator>());
-
-    StatusCode status = pReader->ReadSubNetwork(bn);
-
-    ResponseDesc resp;
-    auto pNet = dynamic_cast<CNNNetworkImpl*>(pReader->getNetwork(&resp));
-
-    bool recognized = false;
-    unsigned axis_cand = 16;
-
-    size_t layerCount = pNet->layerCount();
-
-    if (layerCount == 3) {
-        auto _layers = pNet->allLayers();
-
-        for (auto &item : _layers) {
-            auto cell = dynamic_cast<LSTMCell*> (item.second.get());
-
-            if (cell != nullptr) {
-                for (auto inputData : cell->insData) {
-                    auto prevData = inputData.lock();
-                    if (prevData == nullptr) {
-                        THROW_IE_EXCEPTION << "No input reshape for LSTM cell " << cell->name;
-                    }
-                    auto inReshape  = dynamic_cast<ReshapeLayer*> (prevData->creatorLayer.lock().get());
-                    auto outReshape = dynamic_cast<ReshapeLayer*> (cell->outData[0]->getInputTo().begin()->second.get());
-
-                    if (inReshape != nullptr && outReshape != nullptr) {
-                        layerParsePrms.prms.type = "RNN";
-                        pReader->CopyBlobs(&layerParsePrms, cell->name);
-
-                        // axis analysis
-                        unsigned input_axis = _input_ports[0].axis;
-                        size_t input_dims = layerParsePrms.inputPorts[0].dims.size();
-                        unsigned output_axis = _output_ports[0].axis;
-                        size_t output_dims = layerParsePrms.outputPorts[0].dims.size();
-                        if ( input_axis == output_axis && input_dims == output_dims && input_axis < input_dims ) {
-                            axis_cand = input_axis;
-                        }
-
-                        recognized = true;
-                        break;
-                    }
-                }
-                break;
-            }
-        }
-    }
-
-    // Global var. Need to restore after TI parsing.
-    BaseCreator::version_ = prev_ir_version;
-
-    if (recognized) {
-        auto res = std::make_shared<RNNLayer>(layerParsePrms.prms);
-        res->cellType = LSTM;
-
-        /*** WA */
-        {
-            int d_ind = 0;
-            int s1_ind = 0;
-            int s2_ind = 0;
-            if (_input_ports[1].internal_layer_id == _input_ports[2].internal_layer_id) {
-                d_ind = 0; s1_ind = 1; s2_ind = 2;
-            } else if (_input_ports[0].internal_layer_id == _input_ports[2].internal_layer_id) {
-                d_ind = 1; s1_ind = 0; s2_ind = 2;
-            } else if (_input_ports[0].internal_layer_id == _input_ports[1].internal_layer_id) {
-                d_ind = 2; s1_ind = 0; s2_ind = 1;
-            }
-            res->params["swap_state"] = _input_ports[s1_ind].internal_port_id > _input_ports[s2_ind].internal_port_id ?
-                    "YES" : "NO";
-        }
-        /*** end of WA */
-
-        if (axis_cand < layerParsePrms.inputPorts[0].dims.size()) {
-            res->_axis = axis_cand;
-        }
-        return res;
-    } else {
-        auto res = std::make_shared<TensorIterator>(layerParsePrms.prms);
-        res->reader = pReader;
-        res->input_ports = _input_ports;
-        res->output_ports = _output_ports;
-        res->backEdges = _backEdges;
-        return res;
-    }
-}
-
index 44aaf47..e5243db 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 931bd26..5064580 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -13,6 +12,7 @@
 #include <process.h>
 #include <direct.h>
 #include <io.h>
+#include <chrono>
 
 #define strncasecmp _strnicmp
 #define getcwd _getcwd
 
 #define SecuredGetEnv GetEnvironmentVariableA
 
-static void usleep(long microSecs) { Sleep(microSecs / 1000); }
+#if defined usleep
+#undef usleep
+#endif
+
+#define usleep(m) std::this_thread::sleep_for(std::chrono::microseconds(m))
+
 #else
 
 #include <unistd.h>
index a833eeb..82327e8 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -15,6 +14,17 @@ int XMLParseUtils::GetIntAttr(const pugi::xml_node &node, const char *str) {
     return atoi(attr.value());
 }
 
+uint64_t XMLParseUtils::GetUInt64Attr(const pugi::xml_node &node, const char *str) {
+    auto attr = node.attribute(str);
+    if (attr.empty())
+        THROW_IE_EXCEPTION << "node <" << node.name() << "> is missing mandatory attribute: " << str << " at offset "
+                           << node.offset_debug();
+    int64_t value = atoll(attr.value());
+    if (value < 0)
+        THROW_IE_EXCEPTION << "node <" << node.name() << "> has incorrect parameter: " << str << " at offset "
+                           << node.offset_debug();
+    return static_cast<uint64_t>(value);
+}
 
 unsigned int XMLParseUtils::GetUIntAttr(const pugi::xml_node &node, const char *str) {
     auto attr = node.attribute(str);
@@ -71,6 +81,16 @@ int XMLParseUtils::GetIntAttr(const pugi::xml_node &node, const char *str, int d
     return atoi(attr.value());
 }
 
+uint64_t XMLParseUtils::GetUInt64Attr(const pugi::xml_node &node, const char *str, uint64_t defVal) {
+    auto attr = node.attribute(str);
+    if (attr.empty()) return defVal;
+    int64_t value = atoll(attr.value());
+    if (value < 0)
+        THROW_IE_EXCEPTION << "node <" << node.name() << "> has incorrect parameter: " << str << " at offset "
+                           << node.offset_debug();
+    return static_cast<uint64_t>(value);
+}
+
 unsigned int XMLParseUtils::GetUIntAttr(const pugi::xml_node &node, const char *str, unsigned int defVal) {
     auto attr = node.attribute(str);
     if (attr.empty()) return defVal;
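Editor's note: both GetUInt64Attr overloads follow the same parse-and-validate pattern; a condensed, standalone sketch of the underlying conversion (without the pugixml and THROW_IE_EXCEPTION plumbing):

    // Sketch: the signed-parse-then-reject-negatives pattern used by
    // GetUInt64Attr above. atoll() yields a signed 64-bit value, so a
    // negative result signals an invalid attribute.
    #include <cstdint>
    #include <cstdlib>
    #include <stdexcept>
    #include <string>

    static uint64_t parseUInt64(const std::string &text) {
        int64_t value = atoll(text.c_str());
        if (value < 0)
            throw std::runtime_error("incorrect unsigned attribute: " + text);
        return static_cast<uint64_t>(value);
    }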
index aec0ff9..3d2750b 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -21,6 +20,10 @@ INFERENCE_ENGINE_API_CPP(int) GetIntAttr(const pugi::xml_node &node, const char
 
 INFERENCE_ENGINE_API_CPP(int) GetIntAttr(const pugi::xml_node &node, const char *str, int defVal);
 
+INFERENCE_ENGINE_API_CPP(uint64_t) GetUInt64Attr(const pugi::xml_node &node, const char *str);
+
+INFERENCE_ENGINE_API_CPP(uint64_t) GetUInt64Attr(const pugi::xml_node &node, const char *str, uint64_t defVal);
+
 INFERENCE_ENGINE_API_CPP(unsigned int) GetUIntAttr(const pugi::xml_node &node, const char *str);
 
 INFERENCE_ENGINE_API_CPP(unsigned int) GetUIntAttr(const pugi::xml_node &node, const char *str, unsigned int defVal);
index 79551f6..5997f7d 100644 (file)
@@ -1,6 +1,7 @@
 # Copyright (C) 2018 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
+
 set(TARGET_NAME "MKLDNNPlugin")
 
 if (UNIX AND NOT APPLE)
@@ -25,9 +26,7 @@ file(GLOB HEADERS
 
 addVersionDefines(mkldnn_plugin.cpp CI_BUILD_NUMBER MKL_VERSION)
 
-if(WIN32)
-    add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN)
-endif()
+add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN)
 
 include_directories(
         ${IE_MAIN_SOURCE_DIR}/include
@@ -38,39 +37,30 @@ include_directories(
         ${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/include
 )
 
+if (GEMM STREQUAL "MKL")
+    log_rpath_from_dir(MKL "${MKL}/lib")
+endif()
+
 add_library(${TARGET_NAME} SHARED ${SOURCES} ${HEADERS})
+set_ie_threading_interface_for(${TARGET_NAME})
 
 if (THREADING STREQUAL "TBB")
-    target_compile_definitions(${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_TBB -DMKLDNN_THR=MKLDNN_THR_TBB)
-    target_include_directories(${TARGET_NAME} PUBLIC ${TBB_INCLUDE_DIRS})
-    target_link_libraries(${TARGET_NAME} debug ${TBB_LIBRARIES_RELEASE} optimized ${TBB_LIBRARIES_RELEASE})
+    set(MKLDNN_THR MKLDNN_THR_TBB)
 elseif (THREADING STREQUAL "OMP")
-    target_compile_definitions(${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_OMP -DMKLDNN_THR=MKLDNN_THR_OMP)
-    enable_omp()
-    if(ENABLE_INTEL_OMP)
-        target_link_libraries(${TARGET_NAME} ${intel_omp_lib})
-    endif()
+    set(MKLDNN_THR MKLDNN_THR_OMP)
 else()
-    target_compile_definitions(${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_SEQ -DMKLDNN_THR=MKLDNN_THR_SEQ)
+    set(MKLDNN_THR MKLDNN_THR_SEQ)
 endif()
 
-target_link_libraries(${TARGET_NAME} inference_engine ${INTEL_ITT_LIBS} mkldnn)
+target_compile_definitions(${TARGET_NAME} PUBLIC -DMKLDNN_THR=${MKLDNN_THR})
+target_link_libraries(${TARGET_NAME} PRIVATE inference_engine ${INTEL_ITT_LIBS} mkldnn)
+
 set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME})
 
 add_library(test_${TARGET_NAME} STATIC ${SOURCES} ${HEADERS})
+set_ie_threading_interface_for(test_${TARGET_NAME})
 
-if (THREADING STREQUAL "TBB")
-    target_compile_definitions(test_${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_TBB -DMKLDNN_THR=MKLDNN_THR_TBB)
-    target_include_directories(test_${TARGET_NAME} PUBLIC ${TBB_INCLUDE_DIRS})
-    target_link_libraries(test_${TARGET_NAME} debug ${TBB_LIBRARIES_RELEASE} optimized ${TBB_LIBRARIES_RELEASE})
-elseif (THREADING STREQUAL "OMP")
-    target_compile_definitions(test_${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_OMP -DMKLDNN_THR=MKLDNN_THR_OMP)
-    if(ENABLE_INTEL_OMP)
-        target_link_libraries(test_${TARGET_NAME} ${intel_omp_lib})
-    endif()
-else()
-    target_compile_definitions(test_${TARGET_NAME} PUBLIC -DIE_THREAD=IE_THREAD_SEQ -DMKLDNN_THR=MKLDNN_THR_SEQ)
-endif()
+target_compile_definitions(test_${TARGET_NAME} PUBLIC -DMKLDNN_THR=${MKLDNN_THR})
+target_link_libraries(test_${TARGET_NAME} PRIVATE inference_engine_s mkldnn)
 
-target_link_libraries(test_${TARGET_NAME} inference_engine_s mkldnn)
 set_target_properties(test_${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME test_${TARGET_NAME})
index 57c8dc9..4ef10ee 100644 (file)
@@ -1,16 +1,23 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
+// avoid a clash between the Windows "max" macro and std::max
+#define NOMINMAX
+
 #include "config.h"
 #include "ie_plugin_config.hpp"
 #include "ie_common.h"
 
 #include <string>
+#include <cstring>
 #include <map>
 #include <algorithm>
+#include <stdexcept>
+
 #include <cpp_interfaces/exception2status.hpp>
+#include <thread>
+#include "mkldnn/omp_manager.h"
 
 namespace MKLDNNPlugin {
 
@@ -44,6 +51,42 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
             else
                 THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_EXCLUSIVE_ASYNC_REQUESTS
                                    << ". Expected only YES/NO";
+        } else if (key == PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS) {
+            if (val == PluginConfigParams::CPU_THROUGHPUT_NUMA) {
+                throughputStreams = MKLDNNPlugin::cpu::getNumberOfCPUSockets();
+            } else if (val == PluginConfigParams::CPU_THROUGHPUT_AUTO) {
+                // smallest number of streams that evenly divides the available number of cores
+                const int num_cores = std::thread::hardware_concurrency();
+                if (0 == num_cores % 4)
+                    throughputStreams = std::max(4, num_cores / 4);
+                else if (0 == num_cores % 5)
+                    throughputStreams = std::max(5, num_cores / 5);
+                else if (0 == num_cores % 3)
+                    throughputStreams = std::max(3, num_cores / 3);
+                else  // the user may have disabled some cores (e.g. in BIOS), leaving a core count with no even split
+                    throughputStreams = 1;
+            } else {
+                int val_i;
+                try {
+                    val_i = std::stoi(val);
+                } catch (const std::exception&) {
+                    THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS
+                                       << ". Expected only positive numbers (#streams) or "
+                                       << "PluginConfigParams::CPU_THROUGHPUT_NUMA/CPU_THROUGHPUT_AUTO";
+                }
+                if (val_i > 0)
+                    throughputStreams = val_i;
+            }
+        } else if (key == PluginConfigParams::KEY_CPU_THREADS_NUM) {
+            int val_i;
+            try {
+                val_i = std::stoi(val);
+            } catch (const std::exception&) {
+                THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_CPU_THREADS_NUM
+                                   << ". Expected only positive numbers (#threads)";
+            }
+            if (val_i > 0)
+                threadsNum = val_i;
         } else if (key.compare(PluginConfigParams::KEY_DYN_BATCH_ENABLED) == 0) {
             if (val.compare(PluginConfigParams::YES) == 0)
                 enableDynamicBatch = true;
@@ -52,10 +95,15 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
             else
                 THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_DYN_BATCH_ENABLED
                 << ". Expected only YES/NO";
+        } else if (key.compare(PluginConfigParams::KEY_DUMP_EXEC_GRAPH_AS_DOT) == 0) {
+            // empty string means that dumping is switched off
+            dumpToDot = val;
         } else {
             THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported property " << key << " by CPU plugin";
         }
     }
+    if (exclusiveAsyncRequests)  // Exclusive request feature disables the streams
+        throughputStreams = 1;
 }
 
 }  // namespace MKLDNNPlugin
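Editor's note: a standalone sketch of the CPU_THROUGHPUT_AUTO heuristic above, which looks for a small divisor of the core count (trying 4, then 5, then 3) so that streams split the cores evenly:

    // Sketch: pick a stream count that evenly divides the number of cores,
    // preferring ~4 cores per stream; falls back to a single stream when
    // the core count has no convenient divisor (e.g. cores disabled in BIOS).
    #include <algorithm>
    #include <thread>

    static int autoStreams() {
        const int num_cores = static_cast<int>(std::thread::hardware_concurrency());
        if (num_cores % 4 == 0) return std::max(4, num_cores / 4);
        if (num_cores % 5 == 0) return std::max(5, num_cores / 5);
        if (num_cores % 3 == 0) return std::max(3, num_cores / 3);
        return 1;  // no even split available
    }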
index 0bb390c..558ac87 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -15,7 +14,10 @@ struct Config {
     bool collectPerfCounters = false;
     bool exclusiveAsyncRequests = false;
     bool enableDynamicBatch = false;
+    std::string dumpToDot = "";
     int batchLimit = 0;
+    int throughputStreams = 1;
+    int threadsNum = 0;
 
     void readProperties(const std::map<std::string, std::string> &config);
 };
index ff87e14..f1ac17e 100644 (file)
@@ -1,10 +1,10 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "mean_image.h"
 #include "ie_parallel.hpp"
+#include "ie_memcpy.h"
 
 using namespace MKLDNNPlugin;
 using namespace InferenceEngine;
@@ -54,7 +54,8 @@ void MeanImage::Load(const MKLDNNDims& inputDims, InputInfo::Ptr inputInfo) {
                     THROW_IE_EXCEPTION << "mean image size does not match expected network input, expecting " << meanWidth << " x " << meanHeight;
                 }
                 // todo: cast to TBlob and make sure it is floats
-                memcpy(meanBuffer->data() + channel*meanBlob->size(), meanBlob->buffer(), meanBlob->byteSize());
+                ie_memcpy(meanBuffer->data() + channel*meanBlob->size(), meanBuffer->byteSize() - channel*meanBlob->byteSize(),
+                          meanBlob->buffer(), meanBlob->byteSize());
             }
         }
             break;
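Editor's note: ie_memcpy is used here as a bounds-checked copy, receiving the bytes remaining in the destination as its second argument. A sketch of that contract (the signature is inferred from the call sites in this patch, not from the ie_memcpy header):

    // Sketch (assumed semantics): copy src into dst only if it fits,
    // mirroring how ie_memcpy is invoked above with the remaining
    // destination capacity as the second argument.
    #include <cstddef>
    #include <cstring>

    static int checked_memcpy(void *dst, size_t dst_size,
                              const void *src, size_t copy_size) {
        if (copy_size > dst_size)
            return -1;  // would overflow the destination buffer
        memcpy(dst, src, copy_size);
        return 0;
    }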
index c27d667..24dc816 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index d0b9117..57b6edc 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 
 namespace mkldnn {
 
-template <> struct handle_traits<mkldnn_primitive_desc_iterator_t> {
-    static constexpr auto destructor = &mkldnn_primitive_desc_iterator_destroy;
-};
-
 struct primitive_desc_iterator : public handle<mkldnn_primitive_desc_iterator_t> {
     template <typename T>
     primitive_desc_iterator(const T &adesc, const mkldnn::primitive_attr &aattr, const engine &aengine) {
index 834f8bd..ff3616a 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -33,6 +32,7 @@ impl_desc_type MKLDNNPlugin::parse_impl_name(std::string impl_desc_name) {
     res = static_cast<impl_desc_type>(res | impl_desc_type::_key);
 
     SEARCH_WORD_2(nchw, ref);
+    SEARCH_WORD_2(ncdhw, ref);
     SEARCH_WORD_2(wino, winograd);
 #undef SEARCH_WORD_2
 
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp b/inference-engine/src/mkldnn_plugin/mkldnn/omp_manager.cpp
new file mode 100644 (file)
index 0000000..19bc513
--- /dev/null
@@ -0,0 +1,47 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <cstdlib>
+#include <cstring>
+#include "ie_parallel.hpp"
+#include "omp_manager.h"
+
+using namespace MKLDNNPlugin;
+namespace MKLDNNPlugin {
+namespace cpu {
+
+static const char *openMpEnvVars[] = {
+        "OMP_CANCELLATION", "OMP_DISPLAY_ENV", "OMP_DEFAULT_DEVICE", "OMP_DYNAMIC",
+        "OMP_MAX_ACTIVE_LEVELS", "OMP_MAX_TASK_PRIORITY", "OMP_NESTED",
+        "OMP_NUM_THREADS", "OMP_PROC_BIND", "OMP_PLACES", "OMP_STACKSIZE",
+        "OMP_SCHEDULE", "OMP_THREAD_LIMIT", "OMP_WAIT_POLICY", "GOMP_CPU_AFFINITY",
+        "GOMP_DEBUG", "GOMP_STACKSIZE", "GOMP_SPINCOUNT", "GOMP_RTEMS_THREAD_POOLS",
+        "KMP_AFFINITY", "KMP_NUM_THREADS", "MIC_KMP_AFFINITY",
+        "MIC_OMP_NUM_THREADS", "MIC_OMP_PROC_BIND", "PHI_KMP_AFFINITY",
+        "PHI_OMP_NUM_THREADS", "PHI_KMP_PLACE_THREADS", "MKL_NUM_THREADS",
+        "MKL_DYNAMIC", "MKL_DOMAIN_NUM_THREADS"
+};
+
+static const unsigned numberOfOpenMpEnvVars =
+        sizeof(openMpEnvVars) / sizeof(openMpEnvVars[0]);
+
+bool checkOpenMpEnvVars(bool includeOMPNumThreads) {
+    for (unsigned i = 0; i < numberOfOpenMpEnvVars; i++) {
+        if (getenv(openMpEnvVars[i])) {
+            if (0 != strcmp(openMpEnvVars[i], "OMP_NUM_THREADS") || includeOMPNumThreads)
+                return true;
+        }
+    }
+    return false;
+}
+
+#if !(defined(__APPLE__) || defined(_WIN32))
+// getNumberOfCPUSockets/getNumberOfCPUCores are implemented in the lin_omp_manager.cpp
+#else
+int getNumberOfCPUSockets() {return 1;}
+int getNumberOfCPUCores()   {return parallel_get_max_threads();}
+#endif
+
+}  // namespace cpu
+}  // namespace MKLDNNPlugin
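Editor's note: a hypothetical caller for checkOpenMpEnvVars() — the plugin should only impose its own thread count when the user has not already configured OpenMP through the environment (OMP_NUM_THREADS can optionally be ignored via the flag). setThreadLimit is a hypothetical stand-in for the plugin's parallel_set_num_threads() call:

    // Sketch: respect user-provided OpenMP environment settings and only
    // apply a plugin-side thread limit when none are present.
    #include "omp_manager.h"  // assumed to be on the include path

    void applyThreadLimit(int requestedThreads, void (*setThreadLimit)(int)) {
        using MKLDNNPlugin::cpu::checkOpenMpEnvVars;
        if (!checkOpenMpEnvVars(/*includeOMPNumThreads=*/true))
            setThreadLimit(requestedThreads);
    }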
index 26cba00..65cc216 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -9,10 +8,15 @@
  */
 #pragma once
 
-#ifdef _WIN32
-    #include "mkldnn/os/win/win_omp_manager.h"
-#elif defined(__APPLE__)
-    #include "mkldnn/os/osx/osx_omp_manager.h"
-#else
-    #include "mkldnn/os/lin/lin_omp_manager.h"
-#endif
+namespace MKLDNNPlugin {
+namespace cpu {
+
+bool checkOpenMpEnvVars(bool includeOMPNumThreads = true);
+// number of CPU sockets in the machine (on Linux); 1 on all other OSes
+int getNumberOfCPUSockets();
+// number of physical CPU cores on Linux (considered more performance-friendly for servers);
+// (on other OSes it relies on the original parallel API of choice, which usually reports logical cores)
+int getNumberOfCPUCores();
+
+}  // namespace cpu
+}  // namespace MKLDNNPlugin
\ No newline at end of file
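Editor's note: a hypothetical consumer of this header, mirroring how the CPU_THROUGHPUT_NUMA branch of the plugin config (above) uses the socket count as the stream count:

    // Sketch: one throughput stream per NUMA node (CPU socket), as done for
    // the CPU_THROUGHPUT_NUMA config value elsewhere in this patch.
    #include "mkldnn/omp_manager.h"  // assumed to be on the include path

    int numaStreams() {
        return MKLDNNPlugin::cpu::getNumberOfCPUSockets();  // 1 on non-Linux OSes
    }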
index 75f2e4c..14c3e1d 100644 (file)
@@ -1,10 +1,8 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "lin_omp_manager.h"
-#include "ie_parallel.hpp"
 #include <fstream>
 #include <set>
 #include <string>
@@ -19,20 +17,13 @@ namespace cpu {
 Processor::Processor() {
     processor = 0;
     physicalId = 0;
-    siblings = 0;
-    coreId = 0;
     cpuCores = 0;
-    speedMHz = 0;
 }
 
 CpuInfo::CpuInfo() {
     loadContentFromFile("/proc/cpuinfo");
 }
 
-CpuInfo::CpuInfo(const char *content) {
-    loadContent(content);
-}
-
 void CpuInfo::loadContentFromFile(const char *fileName) {
     std::ifstream file(fileName);
     std::string content(
@@ -98,10 +89,6 @@ Collection::Collection(CpuInfoInterface *cpuInfo) : cpuInfo(*cpuInfo) {
     collectBasicCpuInformation();
 }
 
-unsigned Collection::getProcessorSpeedMHz() {
-    return processors.size() ? processors[0].speedMHz : 0;
-}
-
 unsigned Collection::getTotalNumberOfSockets() {
     return totalNumberOfSockets;
 }
@@ -114,10 +101,6 @@ unsigned Collection::getNumberOfProcessors() {
     return processors.size();
 }
 
-const Processor &Collection::getProcessor(unsigned processorId) {
-    return processors[processorId];
-}
-
 void Collection::parseCpuInfo() {
     const char *cpuInfoLine = cpuInfo.getFirstLine();
     for (; cpuInfoLine; cpuInfoLine = cpuInfo.getNextLine()) {
@@ -148,21 +131,9 @@ void Collection::parseValue(const char *fieldName, const char *valueString) {
         currentProcessor->physicalId = parseInteger(valueString);
     }
 
-    if (beginsWith(fieldName, "siblings")) {
-        currentProcessor->siblings = parseInteger(valueString);
-    }
-
-    if (beginsWith(fieldName, "core id")) {
-        currentProcessor->coreId = parseInteger(valueString);
-    }
-
     if (beginsWith(fieldName, "cpu cores")) {
         currentProcessor->cpuCores = parseInteger(valueString);
     }
-
-    if (beginsWith(fieldName, "model name")) {
-        currentProcessor->speedMHz = extractSpeedFromModelName(valueString);
-    }
 }
 
 void Collection::appendNewProcessor() {
@@ -184,32 +155,6 @@ unsigned Collection::parseInteger(const char *text) const {
     return atol(text);
 }
 
-/* Function extracts CPU speed from model name. If unit is not set it is
-   assumed that values below 100 are specified in GHz, otherwise MHz */
-unsigned Collection::extractSpeedFromModelName(const char *text) const {
-    text = strstr(text, "@");
-    if (!text) {
-        return 0;
-    }
-
-    char *unit;
-    double speed = strtod(&text[1], &unit);
-
-    while (isspace(*unit)) {
-        unit++;
-    }
-
-    bool isMHz = !strncmp(unit, "MHz", 3);
-    bool isGHz = !strncmp(unit, "GHz", 3);
-    bool isGHzPossible = (speed < 100);
-
-    if (isGHz || (isGHzPossible && !isMHz)) {
-        return 1000 * speed + 0.5;
-    } else {
-        return speed + 0.5;
-    }
-}
-
 void Collection::collectBasicCpuInformation() {
     std::set<unsigned> uniquePhysicalId;
     std::vector<Processor>::iterator processor = processors.begin();
@@ -229,120 +174,27 @@ void Collection::updateCpuInformation(const Processor &processor,
     totalNumberOfCpuCores += processor.cpuCores;
 }
 
-
-/* The OpenMpManager class is responsible for determining a set of all of
-   available CPU cores and delegating each core to perform other tasks. The
-   first of available cores is delegated for background threads, while other
-   remaining cores are dedicated for OpenMP threads. Each OpenMP thread owns
-   one core for exclusive use. The number of OpenMP threads is then limited
-   to the number of available cores minus one. The amount of CPU cores may
-   be limited by system eg. when numactl was used. */
 #include <sched.h>
 
-static const char *openMpEnvVars[] = {
-        "OMP_CANCELLATION", "OMP_DISPLAY_ENV", "OMP_DEFAULT_DEVICE", "OMP_DYNAMIC",
-        "OMP_MAX_ACTIVE_LEVELS", "OMP_MAX_TASK_PRIORITY", "OMP_NESTED",
-        "OMP_NUM_THREADS", "OMP_PROC_BIND", "OMP_PLACES", "OMP_STACKSIZE",
-        "OMP_SCHEDULE", "OMP_THREAD_LIMIT", "OMP_WAIT_POLICY", "GOMP_CPU_AFFINITY",
-        "GOMP_DEBUG", "GOMP_STACKSIZE", "GOMP_SPINCOUNT", "GOMP_RTEMS_THREAD_POOLS",
-        "KMP_AFFINITY", "KMP_NUM_THREADS", "MIC_KMP_AFFINITY",
-        "MIC_OMP_NUM_THREADS", "MIC_OMP_PROC_BIND", "PHI_KMP_AFFINITY",
-        "PHI_OMP_NUM_THREADS", "PHI_KMP_PLACE_THREADS", "MKL_NUM_THREADS",
-        "MKL_DYNAMIC", "MKL_DOMAIN_NUM_THREADS"
-};
-
-static const unsigned numberOfOpenMpEnvVars =
-        sizeof(openMpEnvVars) / sizeof(openMpEnvVars[0]);
-
-OpenMpManager::OpenMpManager(Collection *collection) :
-        collection(*collection), isGpuEnabled(false) {
-    getOpenMpEnvVars();
-    getCurrentCpuSet();
-    getCurrentCoreSet();
-}
-
-OpenMpManager &OpenMpManager::getInstance() {
+int getNumberOfCPUSockets() {
     static CpuInfo cpuInfo;
     static Collection collection(&cpuInfo);
-    static OpenMpManager openMpManager(&collection);
-    return openMpManager;
-}
-
-void OpenMpManager::setGpuEnabled() {
-    OpenMpManager &openMpManager = getInstance();
-    openMpManager.isGpuEnabled = true;
-}
-
-void OpenMpManager::setGpuDisabled() {
-    OpenMpManager &openMpManager = getInstance();
-    openMpManager.isGpuEnabled = false;
-}
-
-// Ideally bind given thread to secondary logical core, if
-// only one thread exists then bind to primary one
-void OpenMpManager::bindCurrentThreadToNonPrimaryCoreIfPossible() {
-    OpenMpManager &openMpManager = getInstance();
-    if (openMpManager.isThreadsBindAllowed()) {
-        int totalNumberOfAvailableCores = CPU_COUNT(&openMpManager.currentCoreSet);
-        int logicalCoreToBindTo = totalNumberOfAvailableCores > 1 ? 1 : 0;
-        openMpManager.bindCurrentThreadToLogicalCoreCpus(logicalCoreToBindTo);
-    }
+    return collection.getTotalNumberOfSockets();
 }
 
-void OpenMpManager::bindOpenMpThreads(int env_cores) {
-    OpenMpManager &openMpManager = getInstance();
-
-    if (!openMpManager.isThreadsBindAllowed())
-        return;
-
-    openMpManager.setOpenMpThreadNumberLimit(env_cores);
-    InferenceEngine::parallel_nt(0, [&] (unsigned logicalCoreId, int nthr) {
-        openMpManager.bindCurrentThreadToLogicalCoreCpu(logicalCoreId);
-    });
-}
-
-int OpenMpManager::getOpenMpThreadNumber() {
-    OpenMpManager &openMpManager = getInstance();
-
-    return openMpManager.getCoreNumber();
-}
-
-
-void OpenMpManager::getOpenMpEnvVars() {
-    isAnyOpenMpEnvVarSpecified = false;
-    for (unsigned i = 0; i < numberOfOpenMpEnvVars; i++) {
-        if (getenv(openMpEnvVars[i])) {
-            isAnyOpenMpEnvVarSpecified = true;
-        }
-    }
-}
-
-void OpenMpManager::getCurrentCpuSet() {
-    if (sched_getaffinity(0, sizeof(currentCpuSet), &currentCpuSet)) {
-        getDefaultCpuSet(&currentCpuSet);
-    }
-}
-
-void OpenMpManager::getDefaultCpuSet(cpu_set_t *defaultCpuSet) {
-    CPU_ZERO(defaultCpuSet);
-    unsigned numberOfProcessors = collection.getNumberOfProcessors();
-    for (int processorId = 0; processorId < numberOfProcessors; processorId++) {
-        CPU_SET(processorId, defaultCpuSet);
-    }
-}
-
-/* Function getCurrentCoreSet() fills currentCoreSet variable with a set of
-   available CPUs, where only one CPU per core is chosen. When multiple CPUs
-   of single core are used, function is selecting only first one of all
-   available. */
-void OpenMpManager::getCurrentCoreSet() {
+int getNumberOfCPUCores() {
+    static CpuInfo cpuInfo;
+    static Collection collection(&cpuInfo);
     unsigned numberOfProcessors = collection.getNumberOfProcessors();
     unsigned totalNumberOfCpuCores = collection.getTotalNumberOfCpuCores();
 
-    cpu_set_t usedCoreSet;
+    cpu_set_t usedCoreSet, currentCoreSet, currentCpuSet;
+    CPU_ZERO(&currentCpuSet);
     CPU_ZERO(&usedCoreSet);
     CPU_ZERO(&currentCoreSet);
 
+    sched_getaffinity(0, sizeof(currentCpuSet), &currentCpuSet);
+
     for (int processorId = 0; processorId < numberOfProcessors; processorId++) {
         if (CPU_ISSET(processorId, &currentCpuSet)) {
             unsigned coreId = processorId % totalNumberOfCpuCores;
@@ -352,70 +204,9 @@ void OpenMpManager::getCurrentCoreSet() {
             }
         }
     }
-}
-
-void OpenMpManager::selectAllCoreCpus(cpu_set_t *set, unsigned physicalCoreId) {
-    unsigned numberOfProcessors = collection.getNumberOfProcessors();
-    unsigned totalNumberOfCpuCores = collection.getTotalNumberOfCpuCores();
-
-    int processorId = physicalCoreId % totalNumberOfCpuCores;
-    while (processorId < numberOfProcessors) {
-        if (CPU_ISSET(processorId, &currentCpuSet)) {
-            CPU_SET(processorId, set);
-        }
-
-        processorId += totalNumberOfCpuCores;
-    }
-}
-
-unsigned OpenMpManager::getPhysicalCoreId(unsigned logicalCoreId) {
-    unsigned numberOfProcessors = collection.getNumberOfProcessors();
-
-    for (int processorId = 0; processorId < numberOfProcessors; processorId++) {
-        if (CPU_ISSET(processorId, &currentCoreSet)) {
-            if (!logicalCoreId--) {
-                return processorId;
-            }
-        }
-    }
-
-    std::cerr << "This should never happen!";
-    return 0;
-}
-
-bool OpenMpManager::isThreadsBindAllowed() {
-    return !isAnyOpenMpEnvVarSpecified && !isGpuEnabled;
-}
-
-// Limit of threads to number of logical cores available
-void OpenMpManager::setOpenMpThreadNumberLimit(int env_cores) {
-    parallel_set_num_threads(env_cores == 0 ? CPU_COUNT(&currentCoreSet) : 0);
-}
-
-int OpenMpManager::getCoreNumber() {
     return CPU_COUNT(&currentCoreSet);
 }
 
-void OpenMpManager::bindCurrentThreadToLogicalCoreCpu(unsigned logicalCoreId) {
-    unsigned physicalCoreId = getPhysicalCoreId(logicalCoreId);
-#if IE_THREAD == IE_THREAD_OMP
-    cpu_set_t set;
-    CPU_ZERO(&set);
-    CPU_SET(physicalCoreId, &set);
-    sched_setaffinity(0, sizeof(set), &set);
-#endif
-}
-
-void OpenMpManager::bindCurrentThreadToLogicalCoreCpus(unsigned logicalCoreId) {
-    unsigned physicalCoreId = getPhysicalCoreId(logicalCoreId);
-#if IE_THREAD == IE_THREAD_OMP
-    cpu_set_t set;
-    CPU_ZERO(&set);
-    selectAllCoreCpus(&set, physicalCoreId);
-    sched_setaffinity(0, sizeof(set), &set);
-#endif
-}
-
 #endif  // #ifndef APPLE
 }  // namespace cpu
 }  // namespace MKLDNNPlugin
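Editor's note: getNumberOfCPUCores() keeps one logical CPU per physical core. Assuming the common Linux enumeration where hyper-thread siblings are offset by the total core count, processorId % totalNumberOfCpuCores folds both siblings onto the same core slot and only the first sibling seen is kept. A condensed sketch of that selection (the inner loop body is reconstructed under that assumption):

    // Sketch: count one logical CPU per physical core, assuming sibling
    // hyper-threads are enumerated totalCores apart (as in the code above).
    #include <sched.h>

    static int countPhysicalCores(const cpu_set_t &affinity,
                                  unsigned numProcessors, unsigned totalCores) {
        cpu_set_t used, selected;
        CPU_ZERO(&used);
        CPU_ZERO(&selected);
        for (unsigned p = 0; p < numProcessors; p++) {
            if (!CPU_ISSET(p, &affinity)) continue;
            unsigned core = p % totalCores;     // fold siblings onto one slot
            if (!CPU_ISSET(core, &used)) {      // first sibling wins
                CPU_SET(core, &used);
                CPU_SET(p, &selected);
            }
        }
        return CPU_COUNT(&selected);
    }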
index d39329a..dfd69bb 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -20,10 +19,7 @@ namespace cpu {
 struct Processor {
     unsigned processor;
     unsigned physicalId;
-    unsigned siblings;
-    unsigned coreId;
     unsigned cpuCores;
-    unsigned speedMHz;
 
     Processor();
 };
@@ -41,8 +37,6 @@ class CpuInfo : public CpuInfoInterface {
 public:
     CpuInfo();
 
-    explicit CpuInfo(const char *content);
-
     virtual ~CpuInfo();
 
     virtual const char *getFirstLine();
@@ -64,32 +58,17 @@ private:
 class CollectionInterface {
 public:
     virtual ~CollectionInterface() {}
-
-    virtual unsigned getProcessorSpeedMHz() = 0;
-
     virtual unsigned getTotalNumberOfSockets() = 0;
-
-    virtual unsigned getTotalNumberOfCpuCores() = 0;
-
-    virtual unsigned getNumberOfProcessors() = 0;
-
-    virtual const Processor &getProcessor(unsigned processorId) = 0;
 };
 
 class Collection : public CollectionInterface {
 public:
     explicit Collection(CpuInfoInterface *cpuInfo);
 
-    virtual unsigned getProcessorSpeedMHz();
-
     virtual unsigned getTotalNumberOfSockets();
-
     virtual unsigned getTotalNumberOfCpuCores();
-
     virtual unsigned getNumberOfProcessors();
 
-    virtual const Processor &getProcessor(unsigned processorId);
-
 private:
     CpuInfoInterface &cpuInfo;
     unsigned totalNumberOfSockets;
@@ -113,70 +92,11 @@ private:
 
     unsigned parseInteger(const char *text) const;
 
-    unsigned extractSpeedFromModelName(const char *text) const;
-
     void collectBasicCpuInformation();
 
     void updateCpuInformation(const Processor &processor,
                               unsigned numberOfUniquePhysicalId);
 };
-
-
-class OpenMpManager {
-public:
-    static void setGpuEnabled();
-
-    static void setGpuDisabled();
-
-    static void bindCurrentThreadToNonPrimaryCoreIfPossible();
-
-    static void bindOpenMpThreads(int env_cores = 0);
-
-    static int getOpenMpThreadNumber();
-
-    static void printVerboseInformation();
-
-    static bool isMajorThread(int currentThread);
-
-private:
-    Collection &collection;
-
-    bool isGpuEnabled;
-    bool isAnyOpenMpEnvVarSpecified;
-    cpu_set_t currentCpuSet;
-    cpu_set_t currentCoreSet;
-
-    explicit OpenMpManager(Collection *collection);
-
-    OpenMpManager(const OpenMpManager &openMpManager);
-
-    OpenMpManager &operator=(const OpenMpManager &openMpManager);
-
-    static OpenMpManager &getInstance();
-
-    void getOpenMpEnvVars();
-
-    void getCurrentCpuSet();
-
-    int getCoreNumber();
-
-    void getDefaultCpuSet(cpu_set_t *defaultCpuSet);
-
-    void getCurrentCoreSet();
-
-    void selectAllCoreCpus(cpu_set_t *set, unsigned physicalCoreId);
-
-    unsigned getPhysicalCoreId(unsigned logicalCoreId);
-
-    bool isThreadsBindAllowed();
-
-    void setOpenMpThreadNumberLimit(int env_cores);
-
-    void bindCurrentThreadToLogicalCoreCpu(unsigned logicalCoreId);
-
-    void bindCurrentThreadToLogicalCoreCpus(unsigned logicalCoreId);
-};
-
 #endif  // #ifndef __APPLE__
 }  // namespace cpu
 }  // namespace MKLDNNPlugin
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/osx/osx_omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/os/osx/osx_omp_manager.h
deleted file mode 100644 (file)
index 0484bb5..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (C) 2018 Intel Corporation
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-
-/**
-* @brief WINAPI based code
-* @file win_omp_manager.h
-*/
-
-#pragma once
-
-#include <thread>
-#include <vector>
-
-namespace MKLDNNPlugin {
-namespace cpu {
-
-class OpenMpManager {
-public:
-    static int getOpenMpThreadNumber() {
-        return getCoreNumber();
-    }
-
-    static int getCoreNumber() {
-        return 4;
-    }
-};
-
-}  // namespace cpu
-}  // namespace MKLDNNPlugin
-
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn/os/win/win_omp_manager.h b/inference-engine/src/mkldnn_plugin/mkldnn/os/win/win_omp_manager.h
deleted file mode 100644 (file)
index d598916..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (C) 2018 Intel Corporation
-//
-// SPDX-License-Identifier: Apache-2.0
-//
-
-/**
-* @brief WINAPI based code
-* @file win_omp_manager.h
-*/
-
-#pragma once
-
-#include <thread>
-#include <vector>
-#include <windows.h>
-
-namespace MKLDNNPlugin {
-namespace cpu {
-
-class OpenMpManager {
-public:
-    static int getOpenMpThreadNumber() {
-        return getCoreNumber();
-    }
-
-    static int getCoreNumber() {
-        int num_cores = std::thread::hardware_concurrency();
-        unsigned long size = 0;
-
-        if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &size)) {
-            if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
-                std::vector<char> buf(size);
-                SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* info
-                        = reinterpret_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*>(&buf.front());
-                SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* ptr = info;
-                if (GetLogicalProcessorInformationEx(RelationProcessorCore, info, &size)) {
-                    if (GetLastError() == ERROR_SUCCESS) {
-                        int num = 0;
-                        unsigned long offset = 0;
-                        while (offset < size) {
-                            num++;
-                            offset += ptr->Size;
-                            ptr = reinterpret_cast<SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*>(
-                                    reinterpret_cast<byte*>(ptr) + ptr->Size);
-                        }
-                        num_cores = num;
-                    }
-                }
-            }
-        }
-        return num_cores;
-    }
-};
-
-}  // namespace cpu
-}  // namespace MKLDNNPlugin
-
index f707f26..06616a8 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 102955f..92c8c5a 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 91c586b..f5364f6 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 983fc2b..9c079ef 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -11,6 +10,7 @@
 #include <limits>
 #include <fstream>
 #include <unordered_map>
+#include <memory>
 #include "details/caseless.hpp"
 
 #include "mkldnn_graph.h"
@@ -24,7 +24,6 @@
 #include "mkldnn_extension_utils.h"
 #include "mkldnn_extension_mngr.h"
 #include "mkldnn/omp_manager.h"
-#include "ie_parallel.hpp"
 #include <graph_tools.hpp>
 #include <cpp_interfaces/ie_executor_manager.hpp>
 #include "ie_algorithm.hpp"
 #include "mkldnn_async_infer_request.h"
 #include <blob_factory.hpp>
 #include <ie_util_internal.hpp>
+#include <net_pass.h>
+
+#include <mkldnn_graph_dumper.h>
 
 #include <data_stats.h>
-#include "../inference_engine/cnn_network_int8_normalizer.hpp"
+#include "cnn_network_int8_normalizer.hpp"
+#include "ie_memcpy.h"
 
 #define XBYAK_NO_OP_NAMES
 #define XBYAK_UNDEF_JNL
 #include "../../thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_util.h"
 
 #include "cnn_network_stats_impl.hpp"
-// #define DEBUG_DUMP_PATH "/temp/path/dump/"
-// #define DEBUG_DUMP_NEW_FOLDER_PER_INFER
-#ifdef DEBUG_DUMP_PATH
-#include "../../thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp"
-#include <iomanip>
-// #define DEBUG_BMP_OUTPUT 1
+
+#include "utils/blob_dump.h"
+
+/*****************************************************
+ * Dump capability
+ * Specify the path to the dump folder in BLOB_DUMP_PATH
+ *****************************************************/
+// #define BLOB_DUMP_PATH "dump"
+
+#ifdef BLOB_DUMP_PATH
+#   define DUMP_DIR        BLOB_DUMP_PATH
+#   define ENABLE_DUMP(_x) { _x ;}
+#else
+#   define DUMP_DIR ""
+#   define ENABLE_DUMP(_x)
 #endif
 
 using namespace mkldnn;
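Editor's note: ENABLE_DUMP is a compile-time toggle — with BLOB_DUMP_PATH undefined, the wrapped statement disappears entirely, so release builds carry no dump overhead. A minimal sketch of the same pattern with hypothetical names (TRACE_PATH, trace):

    // Sketch: statement-level compile-time toggle, as used for blob dumping
    // above. Define TRACE_PATH to enable; leave undefined for zero overhead.
    #include <cstdio>

    // #define TRACE_PATH "dump"
    #ifdef TRACE_PATH
    #   define ENABLE_TRACE(_x) { _x; }
    #else
    #   define ENABLE_TRACE(_x)
    #endif

    static void trace(int i) { std::printf("step %d\n", i); }  // hypothetical dump hook

    int main() {
        for (int i = 0; i < 3; i++) {
            ENABLE_TRACE(trace(i));  // compiled out unless TRACE_PATH is defined
        }
        return 0;
    }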
@@ -56,37 +68,11 @@ using namespace MKLDNNPlugin::cpu;
 using namespace InferenceEngine;
 using namespace InferenceEngine::details;
 
-void BindThreads(mkldnn::engine eng) {
-    static bool alreadyBind = false;
-    if (!alreadyBind) {
-#if IE_THREAD == IE_THREAD_OMP
-        int env_cores = 0;
-        if (getenv("OMP_NUM_THREADS") != nullptr) {
-            try {
-                env_cores = std::stoi(std::string(getenv("OMP_NUM_THREADS")));
-            } catch (...) {
-                env_cores = 0;
-            }
-        }
-#if !(defined(__APPLE__) || defined(_WIN32))
-        OpenMpManager::setGpuDisabled();
-        OpenMpManager::bindOpenMpThreads(env_cores);
-#else
-        int num_cores = env_cores == 0 ? OpenMpManager::getOpenMpThreadNumber() : env_cores;
-        parallel_set_num_threads(num_cores);
-#endif
-#endif
-        alreadyBind = true;
-    }
-}
-
-void MKLDNNGraph::CreateGraph(ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) {
+void MKLDNNGraph::CreateGraph(const ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr) {
     if (IsReady()) {
         ForgetGraphData();
     }
 
-    if (config.useThreadBinding) BindThreads(eng);
-
     // go over the inputs and create input primitives
     InputsDataMap inputs;
     network.getInputsInfo(inputs);
@@ -273,6 +259,9 @@ void MKLDNNGraph::CreateGraph(ICNNNetwork &network, const MKLDNNExtensionManager
 
     CreatePrimitives();
 
+    // Do this before cleanup, because cleanup loses the original layer information
+    if (!config.dumpToDot.empty()) dumpToDotFile(config.dumpToDot + "_init.dot");
+
     for (auto &graphNode : graphNodes) {
         graphNode->cleanup();
     }
@@ -378,15 +367,31 @@ void MKLDNNGraph::ParseNode(const CNNLayerPtr& cnnLayer, MKLDNNNodePtr& parent,
     if (exists)
         return;
 
+    if (cnnLayer->blobs.find("ext-scale") != cnnLayer->blobs.end())
+        node->ext_scales = cnnLayer->blobs["ext-scale"];
+
     graphNodes.push_back(node);
 
     size_t count_out = 0;
+    std::vector<ParsedLayer> remaining;
     for (const auto &layer : cnnLayer->outData) {
+        bool first = true;
         for (const auto &data : layer->getInputTo()) {
-            queuelayers.push_back({node, data.second, count_out});
+            if (first) {
+                queuelayers.push_back({node, data.second, count_out});
+                first = false;
+            } else {
+                // TODO: workaround to hide a bug with port ordering.
+                //       On the first pass we visit only the first
+                //       connection of each port; all remaining
+                //       connections are visited afterwards.
+                //
+                // Connections other than the first one of a port are stored here
+                remaining.push_back({node, data.second, count_out});
+            }
         }
         count_out++;
     }
+    queuelayers.insert(queuelayers.end(), remaining.begin(), remaining.end());
 }
 
 void MKLDNNGraph::InitNodes() {
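Editor's note: the reordering above is a two-pass queue build — the first consumer of every output port is enqueued before any secondary consumers. A condensed, standalone sketch of the pattern:

    // Sketch: enqueue the first consumer of each output port before any
    // secondary consumers, preserving port ordering (the workaround above).
    #include <vector>

    template <typename Item>
    std::vector<Item> stableTwoPass(const std::vector<std::vector<Item>> &ports) {
        std::vector<Item> queue, remaining;
        for (const auto &consumers : ports) {
            bool first = true;
            for (const auto &c : consumers) {
                if (first) { queue.push_back(c); first = false; }
                else       { remaining.push_back(c); }
            }
        }
        queue.insert(queue.end(), remaining.begin(), remaining.end());
        return queue;
    }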
@@ -416,58 +421,6 @@ void MKLDNNGraph::InitEdges() {
         if (MKLDNNMemoryDesc(parentDesc).getFormat() != MKLDNNMemoryDesc(childDesc).getFormat()) {
             inArgs += (inArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(parentDesc).getFormat());
             outArgs += (outArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(childDesc).getFormat());
-        } else if (inArgs.empty() && outArgs.empty()) {
-            // This detailed name disabled by request from ICV team
-#if 0
-            auto parentBlk = parentDesc.getBlockingDesc();
-            auto childBlk = childDesc.getBlockingDesc();
-            std::string order_in, order_out, stride_in, stride_out, dims_in, dims_out, off_in, off_out;
-            for (size_t i = 0; i < parentBlk.getBlockDims().size(); i++) {
-                if (i) {
-                    stride_in += ",";
-                    order_in += ",";
-                    dims_in += ",";
-                    off_in += ",";
-                }
-                stride_in += std::to_string(parentBlk.getStrides()[i]);
-                order_in += std::to_string(parentBlk.getOrder()[i]);
-                dims_in += std::to_string(parentBlk.getBlockDims()[i]);
-                off_in += std::to_string(parentBlk.getOffsetPaddingToData()[i]);
-            }
-            for (size_t i = 0; i < childBlk.getBlockDims().size(); i++) {
-                if (i) {
-                    stride_out += ",";
-                    order_out += ",";
-                    dims_out += ",";
-                    off_out += ",";
-                }
-                stride_out += std::to_string(childBlk.getStrides()[i]);
-                order_out += std::to_string(childBlk.getOrder()[i]);
-                dims_out += std::to_string(childBlk.getBlockDims()[i]);
-                off_out += std::to_string(childBlk.getOffsetPaddingToData()[i]);
-            }
-
-            if (parentBlk.getOffsetPadding() != childBlk.getOffsetPadding()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("off:") + std::to_string(parentBlk.getOffsetPadding());
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("off:") + std::to_string(childBlk.getOffsetPadding());
-            }
-            if (parentBlk.getStrides() != childBlk.getStrides()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("str:") + stride_in;
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("str:") + stride_out;
-            }
-            if (parentBlk.getOrder() != childBlk.getOrder()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("ord:") + order_in;
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("ord:") + order_out;
-            }
-            if (parentBlk.getBlockDims() != childBlk.getBlockDims()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("dim:") + dims_in;
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("dim:") + dims_out;
-            }
-            if (parentBlk.getOffsetPaddingToData() != childBlk.getOffsetPaddingToData()) {
-                inArgs += (inArgs.empty() ? "" : "_") + std::string("offs:") + off_in;
-                outArgs += (outArgs.empty() ? "" : "_") + std::string("offs:") + off_out;
-            }
-#endif
         }
         return inArgs + "_" + outArgs;
     };
@@ -529,7 +482,7 @@ static inline bool isConstOutput(MKLDNNEdgePtr edge) {
 void MKLDNNGraph::AllocateWithReuse() {
     std::vector<std::vector<MKLDNNEdgePtr>> edge_clasters;
 
-    // detect edge clasters which are view on one.
+    // detect edge clusters which are view on one.
     for (auto &edge : graphEdges) {
         MKLDNNEdgePtr par = (edge->getStatus() == MKLDNNEdge::Status::NotAllocated)
                             ? edge->getSharedEdge()
@@ -606,7 +559,7 @@ void MKLDNNGraph::AllocateWithReuse() {
 
             int e_size = block_desk.getOffsetPadding() + 1;  // size in elements (from begin of data to last element)
             for (int j = 0; j < block_desk.getBlockDims().size(); j++)
-                e_size += (block_desk.getBlockDims()[j] - 1 ) * block_desk.getStrides()[j];
+                e_size += (block_desk.getBlockDims()[j] - 1) * block_desk.getStrides()[j];
 
             box.start = std::min(e_start, box.start);
             box.finish = std::max(e_finish, box.finish);
@@ -754,139 +707,9 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) {
             MB_to_process = std::min<int>(config.batchLimit, MB_to_process);
         size_t size_to_copy = intr_blob.GetSize() * MB_to_process / MB;
 
-        memcpy(ext_blob_ptr, intr_blob_ptr, size_to_copy);
-    }
-}
-
-#ifdef DEBUG_BMP_OUTPUT
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-#include "../../thirdparty/stb_lib/stb_image_write.h"
-
-#if defined(_WIN32)
-#define mkdir(dir, mode) _mkdir(dir)
-#endif
-
-void dump_as_bitmaps(const std::string name, const float* data,
-                     const SizeVector& cdims,
-                    mkldnn::impl::memory_format_t format = mkldnn::impl::memory_format::nchw) {
-    std::string dir_name = name + "_bmp_dir/";
-    mkdir(dir_name.c_str(), 0755);
-
-    std::ofstream layer_bmp_log;
-    layer_bmp_log.open(dir_name + "bmp_dump_log.txt");
-    layer_bmp_log << "Format " << format << std::endl;
-
-    if (cdims.size() == 1) {
-        layer_bmp_log << "Only one dimension: " << cdims[0] << std::endl;
-        layer_bmp_log.close();
-        return;
-    }
-
-    SizeVector dims(cdims.rbegin(), cdims.rend());
-
-    size_t x = dims[0], y = dims[1], total_images = 1;
-    size_t img_sz = x*y;
-
-    for (size_t k = 0; k < dims.size(); ++k)
-        if (dims[k])
-            total_images *= dims[k];
-
-    total_images /= img_sz;
-
-    //  sanity checks
-    if (img_sz < 100) {
-        layer_bmp_log << "Image size is too small" << std::endl;
-        layer_bmp_log.close();
-        return;
-    } else if (x < 10 || y < 10 || x > 2048 || y > 2048) {
-        layer_bmp_log << "Dimensions are unapropriate to dump - " << y << "x" << x << std::endl;
-        layer_bmp_log.close();
-        return;
-    } else {
-        float ratio = static_cast<float>(x) / static_cast<float>(y);
-        if (ratio < 1.0) ratio = 1.0 / ratio;
-
-        if (ratio > 8.f) {
-            layer_bmp_log << "Suspicious aspect ratio - " << ratio << std::endl;
-            layer_bmp_log.close();
-            return;
-        }
-    }
-
-    layer_bmp_log << total_images << " images to write ..." << std::endl;
-
-    const float* dataPtr = data;
-    for (size_t img = 0; img < total_images; img++) {
-        std::string img_name = "img" + std::to_string(img) + ".bmp";
-
-        //  copy image plane to separate buffer,
-        //  normalize and convert to 3-channel 8-bit bmp
-        std::vector<float> imgbuf(img_sz);
-        int stride = 1;
-        switch (format) {
-        case mkldnn::impl::memory_format::nChw8c:
-            stride = 8;
-            break;
-        case mkldnn::impl::memory_format::nChw16c:
-            stride = 16;
-            break;
-        case mkldnn::impl::memory_format::nchw:
-        default:
-            break;
-        }
-
-        float maxval = -FLT_MAX, minval = FLT_MAX;
-        for (size_t i = 0; i < y; i++)
-            for (size_t j = 0; j < x; j++) {
-                float val = dataPtr[(i*x + j) * stride];
-                if (val > maxval) maxval = val;
-                if (val < minval) minval = val;
-                imgbuf[i*x + j] = val;
-            }
-
-        if (minval >= 0.f && maxval <= 0.f) {
-            layer_bmp_log << img_name << " all zero." << std::endl;
-        } else {
-            const float mult = 256.f / (maxval - minval);
-            std::vector<unsigned char> bmpbuf(img_sz * 3);
-            unsigned char* bmp_ptr = bmpbuf.data();
-
-            for (int i = 0; i < imgbuf.size(); i++, bmp_ptr += 3) {
-                if (imgbuf[i] >= 0.f && imgbuf[i] <= 0.f) {
-                    bmp_ptr[0] = 65;
-                    bmp_ptr[1] = bmp_ptr[2] = 0;
-                } else {
-                    bmp_ptr[0] = bmp_ptr[1] = bmp_ptr[2] = (unsigned char)((imgbuf[i] - minval) * mult);
-                }
-            }
-
-            //  write bmp file
-            std::string full_name = dir_name + img_name;
-            stbi_write_bmp(full_name.c_str(), x, y, 3, (const void *)bmpbuf.data());
-        }
-
-        switch (format) {
-        case mkldnn::impl::memory_format::nChw8c:
-            if ( ( img & 7 ) < 7 )   dataPtr++;
-            else                dataPtr += img_sz * 8;
-            break;
-        case mkldnn::impl::memory_format::nChw16c:
-            if ( ( img & 15 ) < 15 )    dataPtr++;
-            else                    dataPtr += img_sz * 16;
-            break;
-        case mkldnn::impl::memory_format::nchw:
-        default:
-            dataPtr += img_sz;
-            break;
-        }
+        ie_memcpy(ext_blob_ptr, ext_blob->byteSize(), intr_blob_ptr, size_to_copy);
     }
-
-    layer_bmp_log.close();
 }
-#endif
 
 void MKLDNNGraph::Infer(int batch) {
     if (!IsReady()) {
@@ -894,175 +717,20 @@ void MKLDNNGraph::Infer(int batch) {
     }
 
     mkldnn::stream stream = mkldnn::stream(stream::kind::eager);
-#ifdef DEBUG_DUMP_NEW_FOLDER_PER_INFER
-        static int folderIdx = 0;
-        folderIdx++;
-#endif
     for (int i = 0; i < graphNodes.size(); i++) {
         PERF(graphNodes[i]);
 
         if (batch > 0)
             graphNodes[i]->setDynamicBatchLim(batch);
 
+        ENABLE_DUMP(do_before(DUMP_DIR, graphNodes[i]));
+
         if (!graphNodes[i]->isConstant()) {
             IE_PROFILING_AUTO_SCOPE_TASK(graphNodes[i]->profilingTask)
             graphNodes[i]->execute(stream);
         }
 
-#ifdef DEBUG_DUMP_PATH
-        {
-            auto folderName = std::string(DEBUG_DUMP_PATH) +
-#ifdef DEBUG_DUMP_NEW_FOLDER_PER_INFER
-            std::to_string(folderIdx - 1) +
-#endif
-            "/";
-            std::cout << "Try to create logs for " << graphNodes[i]->getName() << std::endl;
-            std::string nodeName = graphNodes[i]->name;
-            std::replace(nodeName.begin(), nodeName.end(), '/', '_');
-            std::ofstream layer_data_dump;
-            for (size_t j = 0; j < graphNodes[i]->getChildEdges().size(); j++) {
-                auto childEdge = graphNodes[i]->getChildEdgeAt(j);
-                std::string childName = graphNodes[i]->getChildEdgeAt(j)->getChild()->getName();
-                std::replace(childName.begin(), childName.end(), '/', '_');
-
-                //  std::string fname = DEBUG_DUMP_PATH + nodeName + "_dst_" + childName + "_" + std::to_string(j) + ".txt";
-                std::string tname = folderName + nodeName + "_dst_" + childName + "_" + std::to_string(j);
-                std::string fname = tname + ".txt";
-                if (graphNodes[i]->getChildEdges().size() == 1) {
-                    fname = folderName + nodeName + "_dst.txt";
-                }
-                layer_data_dump.open(fname);
-                if (layer_data_dump.is_open()) {
-                    float *data = static_cast<float *>(childEdge->getMemory().GetData());
-                    mkldnn::impl::memory_desc_wrapper dst_d(childEdge->getMemory().GetDescriptor().data);
-    #ifdef DEBUG_BMP_OUTPUT
-                    dump_as_bitmaps(tname, data, childEdge->getDims().ToSizeVector(), dst_d.format());
-    #endif
-
-                    layer_data_dump << "shape: ";
-                    for (size_t d = 0; d < childEdge->getDims().ndims(); d++)
-                        layer_data_dump << childEdge->getDims()[d] << " ";
-                    layer_data_dump << "(" << dst_d.nelems() << ")" << std::endl;
-                    if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::FP32) {
-                        float *data = childEdge->getBlob()->buffer();
-                        for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
-                            layer_data_dump << std::fixed << std::setprecision(3) << data[dst_d.off_l(bs)] << std::endl;
-                    }
-                    } else if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::I8) {
-                        int8_t *data = childEdge->getBlob()->buffer();
-                        for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
-                            layer_data_dump << static_cast<int>(data[dst_d.off_l(bs)]) << std::endl;
-                        }
-                    } else if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::U8) {
-                        uint8_t *data = childEdge->getBlob()->buffer();
-                        for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
-                            layer_data_dump << static_cast<int>(data[dst_d.off_l(bs)]) << std::endl;
-                        }
-                    } else if (childEdge->getBlob()->getTensorDesc().getPrecision() == Precision::I32) {
-                        int32_t *data = childEdge->getBlob()->buffer();
-                        for (size_t bs = 0; bs < dst_d.nelems(); bs++) {
-                            layer_data_dump << static_cast<int>(data[dst_d.off_l(bs)]) << std::endl;
-                        }
-                    }
-
-                    layer_data_dump.close();
-                } else {
-                    std::cout << "Cannot create file " << fname << std::endl;
-                }
-            }
-
-            for (size_t p = 0 ; p < graphNodes[i]->getParentEdges().size(); p++) {
-                auto parentEdge = graphNodes[i]->getParentEdgeAt(p);
-                auto parent = parentEdge->getParent();
-                std::string parentName = parent->getName();
-                std::replace(parentName.begin(), parentName.end(), '/', '_');
-                //  std::string fname = folderName + nodeName + "_src_" + parentName + "_" + std::to_string(p) + ".txt";
-                std::string tname = folderName + nodeName + "_src_" + parentName + "_" + std::to_string(p);
-                std::string fname = tname + ".txt";
-                layer_data_dump.open(fname);
-                if (layer_data_dump.is_open()) {
-                    size_t dataSize = graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetSize();
-                    mkldnn::impl::memory_desc_wrapper src_d(graphNodes[i]->getParentEdges()[p]
-                                                                    .lock()->getMemory().GetDescriptor().data);
-    #ifdef DEBUG_BMP_OUTPUT
-                    dump_as_bitmaps(tname, data, parentEdge->getDims().ToSizeVector(), src_d.format());
-    #endif
-                    layer_data_dump << "shape: ";
-                    for (size_t d = 0; d < parentEdge->getDims().ndims(); d++)
-                        layer_data_dump << parentEdge->getDims()[d] << " ";
-                    layer_data_dump << "(" << src_d.nelems() << ")"<< std::endl;
-                    auto precision = graphNodes[i]->getParentEdges()[p].lock()->outputDesc.getPrecision();
-                    if (precision == Precision::FP32) {
-                        float *data = static_cast<float *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
-                        for (size_t bs = 0; bs < dataSize; bs++) {
-                            layer_data_dump << std::fixed << std::setprecision(3) << data[src_d.off_l(bs)] << std::endl;
-                        }
-                    } else if (precision == Precision::I8) {
-                        int8_t *data = static_cast<int8_t *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
-                        for (size_t bs = 0; bs < dataSize; bs++) {
-                            layer_data_dump << static_cast<int>(data[src_d.off_l(bs)]) << std::endl;
-                        }
-                    } else if (graphNodes[i]->getParentEdges()[p].lock()->outputDesc.getPrecision() == Precision::U8) {
-                        uint8_t *data = static_cast<uint8_t *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
-                        for (size_t bs = 0; bs < dataSize; bs++) {
-                            layer_data_dump << static_cast<int>(data[src_d.off_l(bs)]) << std::endl;
-                        }
-                    } else if (graphNodes[i]->getParentEdges()[p].lock()->outputDesc.getPrecision() == Precision::I32) {
-                        int32_t *data = static_cast<int32_t *>(graphNodes[i]->getParentEdges()[p].lock()->getMemory().GetData());
-                        for (size_t bs = 0; bs < dataSize; bs++) {
-                            layer_data_dump << static_cast<int>(data[src_d.off_l(bs)]) << std::endl;
-                        }
-                    } else {
-                        layer_data_dump << "Unsupported precision: " << precision.name() << std::endl;
-                    }
-
-                    layer_data_dump.close();
-                } else {
-                    std::cout << "Cannot create file " << fname << std::endl;
-                }
-            }
-
-            GenericLayer* genericLayer = dynamic_cast<GenericLayer*>(graphNodes[i]->getCnnLayer().get());
-            if (genericLayer != nullptr) {
-                for (auto blob : genericLayer->blobs) {
-                    layer_data_dump.open(folderName + nodeName + "_blob-" + blob.first + ".txt");
-                    if (layer_data_dump.is_open()) {
-                        layer_data_dump << "shape: ";
-                        for (size_t d = 0; d < blob.second->dims().size(); d++)
-                            layer_data_dump << blob.second->dims()[d] << " ";
-                        layer_data_dump << "(" << blob.second->size() << ")"<< std::endl;
-                        if (blob.second->getTensorDesc().getPrecision() == Precision::FP32) {
-                        float *data = blob.second->buffer();
-                        for (size_t bs = 0; bs < blob.second->size(); bs++) {
-                            layer_data_dump << std::fixed << std::setprecision(3) << data[bs] << std::endl;
-                        }
-                        } else if (blob.second->getTensorDesc().getPrecision() == Precision::I8) {
-                            int8_t *data = blob.second->buffer();
-                            for (size_t bs = 0; bs < blob.second->size(); bs++) {
-                                layer_data_dump << static_cast<int>(data[bs]) << std::endl;
-                            }
-                        } else if (blob.second->getTensorDesc().getPrecision() == Precision::U8) {
-                            uint8_t *data = blob.second->buffer();
-                            for (size_t bs = 0; bs < blob.second->size(); bs++) {
-                                layer_data_dump << static_cast<int>(data[bs]) << std::endl;
-                            }
-                        } else if (blob.second->getTensorDesc().getPrecision() == Precision::I32) {
-                            int32_t *data = blob.second->buffer();
-                            for (size_t bs = 0; bs < blob.second->size(); bs++) {
-                                layer_data_dump << static_cast<int>(data[bs]) << std::endl;
-                            }
-                        } else {
-                            layer_data_dump << "Unsupported precision: " << blob.second->getTensorDesc().getPrecision().name() << std::endl;
-                        }
-                        layer_data_dump.close();
-                    } else {
-                        std::cout << "Cannot create file " << folderName << nodeName
-                                  << "_" << blob.first << ".txt" << std::endl;
-                    }
-                }
-            }
-        }
-#endif
+        ENABLE_DUMP(do_after(DUMP_DIR, graphNodes[i]));
     }
 }
 
@@ -1153,6 +821,8 @@ void MKLDNNGraph::GetPerfData(std::map<std::string, InferenceEngine::InferenceEn
     for (int i = 1; i < graphNodes.size(); i++) {
         getPerfMapFor(perfMap, graphNodes[i]);
     }
+
+    if (!config.dumpToDot.empty()) dumpToDotFile(config.dumpToDot + "_perf.dot");
 }
 
 void MKLDNNGraph::setConfig(const Config &cfg) {
@@ -1257,7 +927,56 @@ void MKLDNNGraph::RemoveDroppedEdges() {
     }
 }
 
-bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network) const {
+void MKLDNNGraph::dumpToDotFile(std::string file) const {
+    std::ofstream dot;
+    dot.open(file);
+    if (!dot.is_open()) THROW_IE_EXCEPTION << "CPU Plugin cannot create dot file " << file << ".";
+
+    dump_graph_as_dot(*this, dot);
+    dot.close();
+}
+
+void MKLDNNGraph::do_before(const std::string &dir, const MKLDNNNodePtr &node) {
+    auto exec_order = std::to_string(node->execIndex);
+    std::string nodeName = node->name;
+    std::replace(nodeName.begin(), nodeName.end(), '/', '_');
+
+    auto num_ports = node->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size();
+    for (size_t i = 0; i < num_ports; i++) {
+        auto prEdge = node->getParentEdgeAt(i);
+        auto pr = prEdge->getParent();
+
+        auto dump_file = dir + "/#" + exec_order + "_" +  nodeName + "_in" + std::to_string(i) + ".ieb";
+        TensorDesc desc = prEdge->getDesc();
+        Blob::Ptr blob = make_blob_with_precision(desc, prEdge->getMemoryPtr()->GetData());
+
+        BlobDumper dumper(blob);
+        if (pr->ext_scales) dumper.withScales(pr->ext_scales);
+        dumper.dump(dump_file);
+    }
+}
+
+void MKLDNNGraph::do_after(const std::string &dir, const MKLDNNNodePtr &node) {
+    auto exec_order = std::to_string(node->execIndex);
+    auto nodeName = node->name;
+    std::replace(nodeName.begin(), nodeName.end(), '/', '_');
+
+    auto num_ports = node->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size();
+    for (size_t i = 0; i < num_ports; i++) {
+        auto childEdge = node->getChildEdgeAt(i);
+
+        auto dump_file = dir + "/#" + exec_order + "_" +  nodeName + "_out" + std::to_string(i) + ".ieb";
+        TensorDesc desc = childEdge->getDesc();
+        Blob::Ptr blob = make_blob_with_precision(desc, childEdge->getMemoryPtr()->GetData());
+
+        BlobDumper dumper(blob);
+        if (node->ext_scales) dumper.withScales(node->ext_scales);
+
+        dumper.dump(dump_file);
+    }
+}
+
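For reference, the dump-file naming scheme used by `do_before`/`do_after` above, extracted into a tiny self-contained helper (a sketch mirroring the code, not part of the patch):

```cpp
#include <algorithm>
#include <string>

// "<dir>/#<execIndex>_<node name with '/' -> '_'>_{in|out}<port>.ieb"
std::string dump_file_name(const std::string &dir, int execIndex,
                           std::string nodeName, size_t port, bool input) {
    std::replace(nodeName.begin(), nodeName.end(), '/', '_');
    return dir + "/#" + std::to_string(execIndex) + "_" + nodeName +
           (input ? "_in" : "_out") + std::to_string(port) + ".ieb";
}
```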
+bool MKLDNNExecNetwork::CanProcessDynBatch(const InferenceEngine::ICNNNetwork &network) const {
     InputsDataMap inputs;
     network.getInputsInfo(inputs);
 
@@ -1274,6 +993,11 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network
     bool check_result = true;
     details::UnorderedDFS(allLayers, secondLayers.begin()->second, [&](CNNLayerPtr layer) {
         auto type = TypeFromName(layer->type);
+        // This is a WA for the Tile layer: tiling over a non-batch axis (axis != 0) does not block dynamic batch
+        auto tileLayer = dynamic_cast<TileLayer *>(layer.get());
+        if (tileLayer && tileLayer->axis)
+            return;
+
         if (type != Input &&
             type != Output &&
             type != Convolution &&
@@ -1283,6 +1007,7 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network
             type != Lrn &&
             type != Pooling &&
             type != FullyConnected &&
+            type != Gemm &&
             type != SoftMax &&
             type != Split &&
             type != Concatenation &&
@@ -1301,55 +1026,87 @@ bool MKLDNNExecNetwork::CanProcessDynBatch(InferenceEngine::ICNNNetwork &network
 InferenceEngine::InferRequestInternal::Ptr
 MKLDNNExecNetwork::CreateInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
                                           InferenceEngine::OutputsDataMap networkOutputs) {
-    return std::make_shared<MKLDNNInferRequest>(networkInputs, networkOutputs);
+    if (graphs.size() > 1)  // streams use special requests that are not bound to a particular graph
+        return std::make_shared<MKLDNNGraphlessInferRequest>(networkInputs, networkOutputs);
+    else
+        return std::make_shared<MKLDNNInferRequest>(networkInputs, networkOutputs);
 }
 
-MKLDNNExecNetwork::MKLDNNExecNetwork(InferenceEngine::ICNNNetwork &network,
+MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network,
                                      const Config &cfg,
                                      const MKLDNNExtensionManager::Ptr& extMgr) : extensionManager(extMgr) {
-    graph.reset(new MKLDNNGraph());
-    graph->setConfig(cfg);
+    ICNNNetworkStats* pstats = nullptr;
+    StatusCode s = network.getStats(&pstats, nullptr);
+    // Clone the network only if we have statistics and can transform it;
+    // otherwise pass the original network, in particular because LSTM networks
+    // are not cloned properly.
+    details::CNNNetworkImplPtr clonedNetwork;
+    if (s == StatusCode::OK && pstats && !pstats->isEmpty()) {
+        CNNNetworkInt8Normalizer cnnorm;
+        clonedNetwork = cloneNet(network);
+        cnnorm.NormalizeNetwork(*clonedNetwork, *pstats);
+    }
+    bool ti_proc_ok = NetPass::CombineLSTMSeq(network) || NetPass::UnrollTI(network);
+    if (!ti_proc_ok)
+        THROW_IE_EXCEPTION << "Plugin doesn't support Tensor Iterator in pure form. "
+                              "None TI optimization pattern has been applied successfully";
+
 
     if (cfg.batchLimit > 1) {
         // check topology for applicability
-        if (!CanProcessDynBatch(network)) {
+        if (!CanProcessDynBatch(clonedNetwork ? *clonedNetwork : network)) {
             THROW_IE_EXCEPTION << "MKLDNNGraph::CreateGraph: such topology cannot be compiled for dynamic batch!";
         }
     }
+    // check whether any (affinity-related) env variables are set and whether the user requested thread binding
+    const bool bPinningRequested = !check_env_variables() && cfg.useThreadBinding;
+    // general #threads logic
+    const int env_threads = parallel_get_env_threads();
+    // streams need all (logical) cores, while the single-stream case does better with just the physical cores (e.g. on servers)
+    const int hw_cores = cfg.throughputStreams > 1 ? parallel_get_max_threads() : getNumberOfCPUCores();
+    const int threads = cfg.threadsNum ? cfg.threadsNum : (env_threads ? env_threads : hw_cores);
+    const int threads_per_stream = std::max(1, threads/cfg.throughputStreams);
+
+    // graph(s) are initialized in taskExecutor threads (one per stream); with multiple streams this happens in parallel
+    std::vector<Task::Ptr> tasks;
+
+    for (int n = 0; n < cfg.throughputStreams; n++) {
+        MKLDNNGraph::Ptr _graph = std::make_shared<MKLDNNGraph>();
+        graphs.push_back(_graph);
+        auto task = std::make_shared<InferenceEngine::Task>([=, &cfg, &network]() {
+            _graph->CreateArena(threads_per_stream);
+
+            if (bPinningRequested) {
+                _graph->CreateObserver(n, threads_per_stream);
+            }
 
-    if (graph->getProperty().exclusiveAsyncRequests) {
-        ExecutorManager *executorManager = ExecutorManager::getInstance();
-        _taskExecutor = executorManager->getExecutor(TargetDeviceInfo::name(TargetDevice::eCPU));
+            _graph->setConfig(cfg);
+            _graph->CreateGraph(clonedNetwork ? *clonedNetwork : network, extensionManager);
+            if (cfg.throughputStreams > 1)  // for streams, each worker thread has its own graph
+                MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph = _graph;
+        });
+        tasks.push_back(task);
     }
 
-    // initialization in taskExecutor thread
-    auto task = std::make_shared<InferenceEngine::Task>([&]() {
-        // we are cloning network if we have statistics and we can transform network
-        // in other case we pass original network. Especially because LSTM networks
-        // are not cloned properly
-        ICNNNetworkStats* pstats = nullptr;
-        StatusCode s = network.getStats(&pstats, nullptr);
-        Xbyak::util::Cpu cpu;
-        // Enable int8 only for avx512
-        if (s == StatusCode::OK && pstats && !pstats->isEmpty() && cpu.has(Xbyak::util::Cpu::tAVX512F)) {
-            details::CNNNetworkImplPtr clonnedNetwork = cloneNet(network);
-            CNNNetworkInt8Normalizer cnnorm;
-            cnnorm.NormalizeNetwork(*clonnedNetwork, *pstats);
-            graph->CreateGraph(*clonnedNetwork, extensionManager);
-        } else {
-            graph->CreateGraph(network, extensionManager);
+    if (cfg.throughputStreams > 1) {
+        // special executor with as many threads as the requested #streams, each running its own initialization task
+        _taskExecutor = std::make_shared<MultiWorkerTaskExecutor>(tasks);
+    } else {
+        if (cfg.exclusiveAsyncRequests) {
+            // special case when all InferRequests are muxed into a single queue
+            ExecutorManager *executorManager = ExecutorManager::getInstance();
+            _taskExecutor = executorManager->getExecutor(TargetDeviceInfo::name(TargetDevice::eCPU));
         }
-    });
-
-    _taskExecutor->startTask(task);
-    Task::Status sts = task->wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
-
-    if (sts == Task::TS_ERROR) task->checkException();
+        _taskExecutor->startTask(tasks[0]);
+        tasks[0]->wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
+    }
+    for (auto t : tasks)
+        t->checkException();
 }
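The thread-count selection above reads: an explicit `threadsNum` wins, then the environment, then the hardware default (all logical cores for multi-stream, physical cores otherwise), divided evenly across streams. A small self-contained sketch with the same precedence, plus two worked values (core counts are illustrative):

```cpp
#include <algorithm>

int threads_per_stream(int threadsNum, int envThreads, int hwCores, int streams) {
    const int threads = threadsNum ? threadsNum : (envThreads ? envThreads : hwCores);
    return std::max(1, threads / streams);
}
// e.g. threads_per_stream(0, 0, 16, 1) == 16  (single stream, physical cores)
//      threads_per_stream(0, 0, 32, 4) == 8   (4 streams over logical cores)
```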
 
 void MKLDNNExecNetwork::setProperty(const std::map<std::string, std::string> &properties) {
-    if (graph)  // TODO: graph field cannot be empty
-        graph->setProperty(properties);
+    for (auto g : graphs)
+        g->setProperty(properties);
 }
 
 void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) {
@@ -1362,13 +1119,10 @@ void MKLDNNExecNetwork::CreateInferRequest(InferenceEngine::IInferRequest::Ptr &
 
     asyncRequestImpl->SetPointerToPublicInterface(asyncRequest);
 
-    auto mkldnnSyncRequest = dynamic_cast<MKLDNNInferRequest *>(syncRequestImpl.get());
-    if (!mkldnnSyncRequest)
-        THROW_IE_EXCEPTION << " Cannot get mkldnn sync request.";
-    mkldnnSyncRequest->SetGraph(graph);
-}
-
-MKLDNNExecNetwork::~MKLDNNExecNetwork() {
-    graph.reset();
-    extensionManager.reset();
+    if (graphs.size() == 1) {  // single-stream (legacy/hetero) case - single graph for all requests
+        auto mkldnnSyncRequest = dynamic_cast<MKLDNNInferRequest *>(syncRequestImpl.get());
+        if (!mkldnnSyncRequest)
+            THROW_IE_EXCEPTION << " Cannot get mkldnn sync request.";
+        mkldnnSyncRequest->SetGraph(graphs[0]);
+    }
 }
index d1fdb0f..de026b5 100644
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -11,6 +10,7 @@
 #include <memory>
 #include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp>
 
+#include "ie_parallel.hpp"
 #include "mkldnn_memory.h"
 #include "config.h"
 #include "perf_count.h"
@@ -19,6 +19,7 @@
 #include "mkldnn_node.h"
 #include "mkldnn_edge.h"
 #include "mkldnn_extension_utils.h"
+#include "mkldnn_streams.h"
 
 namespace MKLDNNPlugin {
 
@@ -48,7 +49,7 @@ public:
     void getInputBlobs(InferenceEngine::BlobMap &in_map);
     void getOutputBlobs(InferenceEngine::BlobMap &out_map);
 
-    void CreateGraph(InferenceEngine::ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr);
+    void CreateGraph(const InferenceEngine::ICNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr);
 
     bool hasMeanImageFor(const std::string& name) {
         return _meanImages.find(name) != _meanImages.end();
@@ -81,6 +82,35 @@ public:
     void RemoveDroppedEdges();
     void DropNode(const MKLDNNNodePtr& node);
 
+    void CreateArena(int threads_per_stream) {
+        #if IE_THREAD == IE_THREAD_OMP
+        omp_set_num_threads(threads_per_stream);
+        #elif IE_THREAD == IE_THREAD_TBB
+        ptrArena = std::unique_ptr<tbb::task_arena>(new tbb::task_arena(threads_per_stream));
+        #endif
+    }
+
+    void CreateObserver(int _stream_id, int _threads_per_stream, int _pinning_step = 1) {
+        #if IE_THREAD == IE_THREAD_TBB
+        ptrObserver
+                = std::unique_ptr<tbb::task_scheduler_observer>(
+                new pinning_observer(*ptrArena.get(), _stream_id, _threads_per_stream, _pinning_step));
+        #else
+        cpu_set_t *process_mask = nullptr;
+        int ncpus = 0;
+        get_process_mask(ncpus, process_mask);
+            #if IE_THREAD == IE_THREAD_OMP
+            #pragma omp parallel for
+            for (int thread_index = 0; thread_index < _threads_per_stream; thread_index++) {
+                pin_thread_to_vacant_core(_stream_id * _threads_per_stream + thread_index, 1, ncpus, process_mask);
+            }
+            #elif IE_THREAD == IE_THREAD_SEQ
+            pin_thread_to_vacant_core(_stream_id * _threads_per_stream, 1, ncpus, process_mask);
+            #endif
+        CPU_FREE(process_mask);
+        #endif
+    }
+
 protected:
     MKLDNNNodePtr FindNodeWithName(const std::string& name) const;
     void VisitNode(MKLDNNNodePtr node, std::vector<MKLDNNNodePtr>& sortedNodes);
@@ -108,6 +138,10 @@ protected:
 
     std::map<std::string, MeanImage> _meanImages;
 
+    #if IE_THREAD == IE_THREAD_TBB
+    std::unique_ptr<tbb::task_arena> ptrArena;
+    std::unique_ptr<tbb::task_scheduler_observer> ptrObserver;
+    #endif
     mkldnn::engine eng;
 
     void InitNodes();
@@ -116,13 +150,15 @@ protected:
     void AllocateWithReuse();
     void CreatePrimitives();
 
-    void BreakEdgeInsertScaleShift(MKLDNNPlugin::MKLDNNEdgePtr edgeToBreak,
-                                   InferenceEngine::CNNLayerPtr ssCnnLayer);
-    void AddScaleShiftBeforeAndAfterInt8(InferenceEngine::CNNNetwork& net);
+    void do_before(const std::string &dir, const MKLDNNNodePtr &node);
+    void do_after(const std::string &dir, const MKLDNNNodePtr &node);
 
     friend class MKLDNNInferRequest;
+    friend class MKLDNNGraphlessInferRequest;
+    friend std::shared_ptr<InferenceEngine::ICNNNetwork> dump_graph_as_ie_net(const MKLDNNGraph &graph);
 
 private:
+    void dumpToDotFile(std::string file) const;
     struct ParsedLayer {
         MKLDNNNodePtr parent;
         InferenceEngine::CNNLayerPtr cnnLayer;
@@ -142,18 +178,21 @@ public:
 
     void CreateInferRequest(InferenceEngine::IInferRequest::Ptr &asyncRequest) override;
 
-    MKLDNNExecNetwork(InferenceEngine::ICNNNetwork &network, const Config &cfg,
+    MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network, const Config &cfg,
                       const MKLDNNExtensionManager::Ptr& extMgr);
 
-    ~MKLDNNExecNetwork() override;
+    ~MKLDNNExecNetwork() {
+        graphs.clear();
+        extensionManager.reset();
+    }
 
     void setProperty(const std::map<std::string, std::string> &properties);
 
 protected:
-    MKLDNNGraph::Ptr graph;
+    std::vector<MKLDNNGraph::Ptr> graphs;
     MKLDNNExtensionManager::Ptr extensionManager;
 
-    bool CanProcessDynBatch(InferenceEngine::ICNNNetwork &network) const;
+    bool CanProcessDynBatch(const InferenceEngine::ICNNNetwork &network) const;
 };
 
 }  // namespace MKLDNNPlugin
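The `CreateArena`/`CreateObserver` helpers above bound each stream to `threads_per_stream` workers and pin the t-th worker of stream s to vacant core index `s * threads_per_stream + t`. A standalone sketch of that layout (the mapping to an actual CPU is then done by `pin_thread_to_vacant_core` against the process mask):

```cpp
#include <cstdio>

int main() {
    const int streams = 2, threads_per_stream = 4;
    for (int s = 0; s < streams; ++s)
        for (int t = 0; t < threads_per_stream; ++t)
            std::printf("stream %d, worker %d -> vacant core index %d\n",
                        s, t, s * threads_per_stream + t);
}
```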
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.cpp
new file mode 100644
index 0000000..ae24579
--- /dev/null
@@ -0,0 +1,207 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "mkldnn_graph_dumper.h"
+#include "cnn_network_impl.hpp"
+#include "ie_util_internal.hpp"
+
+#include <vector>
+#include <string>
+#include <memory>
+#include <map>
+
+using namespace InferenceEngine;
+
+namespace MKLDNNPlugin {
+
+static void copy_node_metadata(const MKLDNNNodePtr &, CNNLayer::Ptr &);
+static void drawer_callback(const InferenceEngine::CNNLayerPtr, ordered_properties &, ordered_properties &);
+
+CNNLayer::Ptr convert_node(const MKLDNNNodePtr &node) {
+    CNNLayer::Ptr layer(new CNNLayer({"name", "type", Precision::FP32}));
+    copy_node_metadata(node, layer);
+
+    auto &cfg = node->getSelectedPrimitiveDescriptor()->getConfig();
+    layer->insData.resize(cfg.inConfs.size());
+    layer->outData.resize(cfg.outConfs.size());
+
+    return layer;
+}
+
+std::shared_ptr<ICNNNetwork> dump_graph_as_ie_net(const MKLDNNGraph &graph) {
+    auto net = std::make_shared<details::CNNNetworkImpl>();
+
+    net->setPrecision(Precision::FP32);
+    net->setName("internal_cpu_graph");
+    std::map<MKLDNNNodePtr, CNNLayerPtr> node2layer;
+
+    // Copy all nodes to network
+    for (auto &node : graph.graphNodes) {
+        auto layer = convert_node(node);
+        node2layer[node] = layer;
+        net->addLayer(layer);
+    }
+
+    // Copy all edges to network
+    for (auto &node : graph.graphNodes) {
+        auto pr = node2layer[node];
+        auto ch_edges = node->getChildEdges();
+
+        for (int i = 0; i < ch_edges.size(); i++) {
+            auto edge = node->getChildEdgeAt(i);
+            int out_port = edge->getInputNum();
+            int in_port = edge->getOutputNum();
+            auto ch_node = edge->getChild();
+            auto ch  = node2layer[ch_node];
+
+            DataPtr data;
+            if (i < pr->outData.size()) {
+                std::string data_name = node->getName() + "_out" + std::to_string(i);
+                pr->outData[i] = std::make_shared<Data>(data_name, edge->getDesc());
+                data = pr->outData[i];
+                data->creatorLayer = pr;
+            } else {
+                data = pr->outData[0];
+            }
+
+            data->inputTo[ch->name] = ch;
+            ch->insData[in_port] = data;
+        }
+    }
+
+    // Specify inputs data
+    for (auto kvp : graph.inputNodes) {
+        auto in_node = kvp.second;
+        auto in_layer = node2layer[in_node];
+
+        auto in_info = std::make_shared<InputInfo>();
+        in_info->setInputData(in_layer->outData[0]);
+        net->setInputInfo(in_info);
+    }
+
+    return net;
+}
+
+void dump_graph_as_dot(const MKLDNNGraph &graph, std::ostream &out) {
+    auto dump_net = dump_graph_as_ie_net(graph);
+    InferenceEngine::saveGraphToDot(*dump_net, out, drawer_callback);
+}
+
+//**********************************
+// Special converters of metadata
+//**********************************
+
+static std::map<Type, std::string> type_n2l {
+    {Unknown, "Unknown"},
+    {Generic, "Unknown"},
+    {Reorder, "Reorder"},
+    {Copy, "Reorder"},
+    {Input, "Input"},
+    {Output, "Output"},
+    {Convolution, "Conv"},
+    {Deconvolution, "Deconv"},
+    {Convolution_Sum, "Conv_Eltw"},
+    {Convolution_Activation, "Conv_Activ"},
+    {Convolution_Sum_Activation, "Conv_Eltw_Activ"},
+    {Activation, "Activation"},
+    {Depthwise, "Depthwise"},
+    {Lrn, "Lrn"},
+    {Pooling, "Pool"},
+    {FullyConnected, "FC"},
+    {SoftMax, "SoftMax"},
+    {Split, "Split"},
+    {Concatenation, "Concat"},
+    {Power, "Power"},
+    {Eltwise, "Eltwise"},
+    {Crop, "Crop"},
+    {Reshape, "Reshape"},
+    {Tile, "Tile"},
+    {SimplerNMS, "Proposal"},
+    {ROIPooling, "ROIPooling"},
+    {BatchNormalization, "BatchNorm"},
+    {Flatten, "Flatten"},
+    {Permute, "Permute"},
+    {MemoryOutput, "MemoryIn"},
+    {MemoryInput, "MemoryOut"}
+};
+
+static const std::string ORIGIN_NAMES = "origin";
+static const std::string IMPL_TYPE    = "impl";
+static const std::string PRECISION    = "prec";
+static const std::string PERF_COUNTER = "perf";
+
+static const std::string BLUE  = "#D8D9F1";
+static const std::string GREEN = "#D9EAD3";
+
+void copy_node_metadata(const MKLDNNNodePtr &node, CNNLayer::Ptr &layer) {
+    layer->type = type_n2l[node->getType()];
+    layer->name = node->getName();  // the node name serves as a unique ID
+
+    if (node->getCnnLayer()) {
+        // Original layer names
+        std::vector<MKLDNNNodePtr> internal = node->getFusedWith();
+        auto &merged = node->getMergeWith();
+        internal.insert(internal.end(), merged.begin(), merged.end());
+
+        std::string orig_names = node->getCnnLayer()->name;
+        for (auto &sub_node : internal)
+            orig_names += " " + sub_node->getCnnLayer()->name;
+
+        layer->params[ORIGIN_NAMES] = orig_names;
+    }
+
+    // Implementation type name
+    layer->params[IMPL_TYPE] = node->getPrimitiveDescriptorType();
+
+    // Precision
+    // TODO: this mapping from node type to precision is not fully correct.
+    std::string precision = "FP32";
+    auto desc = node->getSelectedPrimitiveDescriptor();
+    if (desc == nullptr) {
+        THROW_IE_EXCEPTION << "Internal error - descriptor is empty";
+    }
+    impl_desc_type impl_type = desc->getImplementationType();
+
+    if (impl_type == gemm_blas &&
+        node->getParentEdgeAt(0)->getDesc().getPrecision() == Precision::U8)  precision = "INT8";
+
+    if (impl_type & jit && impl_type & avx512 &&
+        node->getParentEdgeAt(0)->getDesc().getPrecision() == Precision::U8)  precision = "INT8";
+
+    layer->params[PRECISION] = precision;
+
+    // Performance
+    if (node->PerfCounter().avg() != 0) {
+        layer->params[PERF_COUNTER] = std::to_string(node->PerfCounter().avg())+ " mcs";
+    }
+}
+
+void drawer_callback(const InferenceEngine::CNNLayerPtr layer,
+        ordered_properties &printed_properties,
+        ordered_properties &node_properties) {
+    const auto &params = layer->params;
+
+    // Implementation
+    auto impl = params.find(IMPL_TYPE);
+    if (impl != params.end()) {
+        printed_properties.push_back({"impl", impl->second});
+    }
+
+    // Original names
+    auto orig = params.find(ORIGIN_NAMES);
+    if (orig != params.end()) {
+        printed_properties.push_back({"originals", orig->second});
+    }
+
+    // Precision
+    auto prec = params.find(PRECISION);
+    if (prec != params.end()) {
+        printed_properties.push_back({"precision", prec->second});
+    }
+
+    // Set color (guard against a missing precision entry before dereferencing the iterator)
+    node_properties.push_back({"fillcolor",
+            prec != params.end() && prec->second == "FP32" ? GREEN : BLUE});
+}
+
+}  // namespace MKLDNNPlugin
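A minimal usage sketch for the new dumper (assuming access to a constructed `MKLDNNGraph g`, which normally only the plugin has; note the `friend` access granted to `dump_graph_as_ie_net` in the graph header above):

```cpp
#include <fstream>

// Serialize the internal CPU graph, then render it offline with Graphviz:
//   dot -Tsvg cpu_graph.dot -o cpu_graph.svg
void dump(const MKLDNNPlugin::MKLDNNGraph &g) {
    std::ofstream out("cpu_graph.dot");
    MKLDNNPlugin::dump_graph_as_dot(g, out);
}
```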
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h b/inference-engine/src/mkldnn_plugin/mkldnn_graph_dumper.h
new file mode 100644
index 0000000..6ec5ffc
--- /dev/null
@@ -0,0 +1,18 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_icnn_network.hpp"
+#include "mkldnn_graph.h"
+
+#include <memory>
+
+namespace MKLDNNPlugin {
+
+    void dump_graph_as_dot(const MKLDNNGraph &graph, std::ostream &out);
+
+    std::shared_ptr<InferenceEngine::ICNNNetwork> dump_graph_as_ie_net(const MKLDNNGraph &graph);
+
+}  // namespace MKLDNNPlugin
index 3be1fbf..6c88ebd 100644
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -144,20 +143,27 @@ void MKLDNNGraphOptimizer::FuseBatchNormWithScale(MKLDNNGraph &graph) {
 }
 
 void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) {
+    auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
+        for (auto a : algs) {
+            if (alg == a) {
+                return true;
+            }
+        }
+        return false;
+    };
+
     auto& graphNodes = graph.GetNodes();
 
-    auto isFusingSupported = [&](MKLDNNNodePtr node) {
-        if (!node->getCnnLayer())
+    auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) {
+        if (!activation->getCnnLayer())
             return false;
 
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
+        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get());
 
         return activationNode &&
-               (activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_relu           ||
-                activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_elu            ||
-                activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_logistic       ||
-                activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_bounded_relu   ||
-                activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_clamp);
+            (activationNode->getAlgorithm() == eltwise_relu ||
+            (conv->getCnnLayer()->precision == Precision::FP32 &&
+             isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp})));
     };
 
     for (int i = 0; i < graphNodes.size(); i++) {
@@ -172,13 +178,13 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) {
             if (conv->getChildEdges().size() == 1) {
                 auto ch1 = conv->getChildEdgeAt(0)->getChild();
 
-                if (isFusingSupported(ch1)) {
+                if (isFusingSupported(conv, ch1)) {
                     fuse(ch1);
 
                     if (ch1->getChildEdges().size() == 1) {
                         auto ch2 = ch1->getChildEdgeAt(0)->getChild();
 
-                        if (isFusingSupported(ch2)) {
+                        if (isFusingSupported(conv, ch2)) {
                             fuse(ch2);
                             graph.DropNode(ch2);
                         }
@@ -193,7 +199,7 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) {
 
                         if (is_max_pool && pool->getChildEdges().size() == 1) {
                             auto ch2 = pool->getChildEdgeAt(0)->getChild();
-                            if (isFusingSupported(ch2)) {
+                            if (isFusingSupported(conv, ch2)) {
                                 fuse(ch2);
                                 graph.DropNode(ch2);
                             }
@@ -274,8 +280,12 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
 
     auto isSutableChildConvolution = [](MKLDNNNodePtr node) {
         auto* layer = dynamic_cast<ConvolutionLayer*>(node->getCnnLayer().get());
-        auto allPads = getConvPaddings(*layer);
+        auto allPads = getPaddings(*layer);
         bool isSupportedParams = layer->_out_depth == layer->_group &&
+                                 // Depthwise convolution output should be multiple of 8
+                                 layer->_out_depth != 1 &&
                                  layer->_kernel[X_AXIS] == 3 && layer->_kernel[Y_AXIS] == 3 &&
                                  allPads.begin[X_AXIS] == 1 && allPads.begin[Y_AXIS] == 1 &&
                                  layer->_dilation[X_AXIS] == 1 && layer->_dilation[Y_AXIS] == 1 &&
@@ -379,18 +389,25 @@ static bool is_data_dependency(const std::shared_ptr<MKLDNNNode> &parent,
 void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph) {
     std::vector<MKLDNNNodePtr> &graphNodes = graph.GetNodes();
 
-    auto isFusingSupported = [&](MKLDNNNodePtr node) {
-        if (!node->getCnnLayer())
+    auto isOneOf = [&](mkldnn::algorithm alg, std::vector<mkldnn::algorithm> algs) {
+        for (auto a : algs) {
+            if (alg == a) {
+                return true;
+            }
+        }
+        return false;
+    };
+
+    auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) {
+        if (!activation->getCnnLayer())
             return false;
 
-        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(node.get());
+        auto* activationNode = dynamic_cast<MKLDNNActivationNode *>(activation.get());
 
         return activationNode &&
-               (activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_relu           ||
-                activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_elu            ||
-                activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_logistic       ||
-                activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_bounded_relu   ||
-                activationNode->getAlgorithm() == mkldnn::algorithm::eltwise_clamp);
+            (activationNode->getAlgorithm() == eltwise_relu ||
+            (conv->getCnnLayer()->precision == Precision::FP32 &&
+             isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp})));
     };
 
     for (auto &graphNode : graphNodes) {
@@ -411,6 +428,10 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
 
         auto mergedConv = (parent1->getType() == Convolution) ? parent1 : parent2;
         auto peerNode = (parent1->getType() == Convolution) ? parent2 : parent1;
+        if (peerNode->getType() == Convolution && mergedConv->getChildEdges().size() != 1) {
+            mergedConv = parent2;
+            peerNode = parent1;
+        }
         auto sum = graphNode;
         auto lastNode = sum;
 
@@ -431,7 +452,7 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
         if (!fuse_allowed) continue;
 
         if (graphNode->getChildEdges().size() == 1 &&
-                isFusingSupported(graphNode->getChildEdgeAt(0)->getChild())) {
+                isFusingSupported(graphNode, graphNode->getChildEdgeAt(0)->getChild())) {
             auto relu_shared = graphNode->getChildEdgeAt(0)->getChild();
             lastNode = relu_shared;
             mergedConv->setType(Convolution_Sum_Activation);
@@ -472,29 +493,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
     }
 }
 
-/**
- *  Convert LSTM layer format with combined state blob
- */
-void MKLDNNGraphOptimizer::SLTMTransform(MKLDNNGraph& graph) {
-    auto &all_nodes = graph.GetNodes();
-
-    for (auto &lstm : all_nodes) {
-        if (lstm->getType() != RNN)
-            continue;
-
-        auto layer = lstm->getCnnLayer();
-        auto in_datas = layer->insData;
-        auto out_datas = layer->outData;
-
-        if (in_datas.size() == 3) {
-            assert(lstm->getParentEdges().size() == 3);
-            // Concatenate 2 states into one blob
-            // TODO: TBD
-        } else if ((in_datas.size() != 1)) {
-            THROW_IE_EXCEPTION << "Unsupported mode for LSTM cell. Expected two state blobs";
-        }
-    }
-}
 
 void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) {
     for (MKLDNNNodePtr& node : graph.GetNodes()) {
@@ -520,8 +518,11 @@ void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) {
 
 void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) {
     std::set<MKLDNNNodePtr> processed;
+    std::vector<MKLDNNNodePtr> newNodes;
     for (MKLDNNNodePtr& node : graph.GetNodes()) {
-        if (processed.find(node) == processed.end() && node->getType() == Reorder && node->getChildEdgeAt(0)->getChild()->getType() == Reorder) {
+        if (processed.find(node) == processed.end() && node->getType() == Reorder
+            && node->getChildEdges().size() == 1
+            && node->getChildEdgeAt(0)->getChild()->getType() == Reorder ) {
             auto nextNode = node->getChildEdgeAt(0)->getChild();
             MKLDNNReorderNode* n = dynamic_cast<MKLDNNReorderNode*>(node.get());
             MKLDNNReorderNode* nn = dynamic_cast<MKLDNNReorderNode*>(nextNode.get());
@@ -590,10 +591,13 @@ void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) {
             afterNode->getDesc();
             graph.GetEdges().push_back(afterNode);
 
-            graph.GetNodes().push_back(newReorder);
+            newNodes.push_back(newReorder);
             graph.GetEdges().erase(std::remove(graph.GetEdges().begin(), graph.GetEdges().end(), edge), graph.GetEdges().end());
         }
     }
+    for (MKLDNNNodePtr& node : newNodes) {
+        graph.GetNodes().push_back(node);
+    }
 }
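The `newNodes` vector introduced above exists because `DropDoubleReorders` iterates `graph.GetNodes()` with a range-for; pushing into the same vector during iteration can reallocate its storage and invalidate the loop. A generic sketch of the collect-then-append pattern (using `shared_ptr` elements, as `MKLDNNNodePtr` is):

```cpp
#include <memory>
#include <vector>

template <class T, class MakeReplacement>
void append_deferred(std::vector<std::shared_ptr<T>> &nodes, MakeReplacement make) {
    std::vector<std::shared_ptr<T>> created;
    for (auto &n : nodes)                       // safe: 'nodes' is not modified here
        if (auto fresh = make(n)) created.push_back(fresh);
    nodes.insert(nodes.end(), created.begin(), created.end());
}
```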
 
 void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
@@ -603,7 +607,7 @@ void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
 
             auto cur = l->insData[0].lock();
             if (cur == nullptr) {
-                THROW_IE_EXCEPTION << "[MKLDNN] shared_ptr l->insData[0].lock() returned nullptr";
+                THROW_IE_EXCEPTION << "[MKLDNN] error - invalid input data";
             }
             if (cur->precision != l->outData[0]->precision) {
                 if (node->name.find("_iScaleShift_") != std::string::npos) {
index 338ed72..95e8039 100644
@@ -1,10 +1,10 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "mkldnn_infer_request.h"
 #include "mkldnn_extension_utils.h"
+#include "mkldnn_streams.h"
 #include <vector>
 #include <string>
 #include <map>
@@ -36,83 +36,97 @@ void MKLDNNPlugin::MKLDNNInferRequest::InferImpl() {
     if (!graph || !graph->IsReady()) {
         THROW_IE_EXCEPTION << "Network not loaded.";
     }
-
-    // execute input pre-processing.
-    execDataPreprocessing(_inputs);
-
-    changeDefaultPtr();
-    // need to retain converted blobs until infer finish
-    std::vector<InferenceEngine::Blob::Ptr> convertedInputs;
-    for (auto input : _inputs) {
-        if (!_networkInputs[input.first]) {
-            THROW_IE_EXCEPTION <<
-                               "input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name "
-                               << input.first;
-        }
-        /*if (_networkInputs[input.first]->getInputPrecision() != input.second->precision()) {
-            THROW_IE_EXCEPTION << "Different input precision for input " << input.first
-                               << " registered in IInferencePlugin::LoadNetwork network and IInferencePlugin::Infer. "
-                               << _networkInputs[input.first]->getInputPrecision() << " vs "
-                               << input.second->precision();
-        }*/
+    auto infer = [this] {
+        // execute input pre-processing.
+        execDataPreprocessing(_inputs);
+
+        changeDefaultPtr();
+        // need to retain converted blobs until infer finish
+        std::vector<InferenceEngine::Blob::Ptr> convertedInputs;
+        for (auto input : _inputs) {
+            if (!_networkInputs[input.first]) {
+                THROW_IE_EXCEPTION <<
+                                   "input blobs map contains a blob with name " << input.first
+                                   << " that was not registered during IInferencePlugin::LoadNetwork";
+            }
+            /*if (_networkInputs[input.first]->getInputPrecision() != input.second->precision()) {
+                THROW_IE_EXCEPTION << "Different input precision for input " << input.first
+                                   << " registered in IInferencePlugin::LoadNetwork network and IInferencePlugin::Infer. "
+                                   << _networkInputs[input.first]->getInputPrecision() << " vs "
+                                   << input.second->precision();
+            }*/
 
 
 
-        InferenceEngine::Blob::Ptr iconv;
-        InferenceEngine::TBlob<float> *in_f = nullptr;
-        switch (input.second->precision()) {
-            case InferenceEngine::Precision::FP32:
-                pushInput<float>(input.first, input.second);
-                break;
-            case InferenceEngine::Precision::U16:
-                // U16 is unsupported by mkldnn, so here we convert the blob and send FP32
-                iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
-                        InferenceEngine::Precision::FP32,
-                        input.second->getTensorDesc().getLayout(), input.second->dims());
-                convertedInputs.push_back(iconv);
-                iconv->allocate();
-                in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
-                InferenceEngine::copyToFloat<uint16_t>(in_f->data(), input.second.get());
-                pushInput<float>(input.first, iconv);
-                break;
-            case InferenceEngine::Precision::I16:
-                if (graph->hasMeanImageFor(input.first)) {
-                    // If a mean image exists, we convert the blob and send FP32
-                    iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
-                            InferenceEngine::Precision::FP32,
-                            input.second->getTensorDesc().getLayout(), input.second->dims());
-                    convertedInputs.push_back(iconv);
-                    iconv->allocate();
-                    in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
-                    InferenceEngine::copyToFloat<int16_t>(in_f->data(), input.second.get());
-                    pushInput<float>(input.first, iconv);
-                } else {
-                    // Instead we can send I16 directly
-                    pushInput<int16_t>(input.first, input.second);
-                }
-                break;
-            case InferenceEngine::Precision::U8:
-                if (graph->hasMeanImageFor(input.first)) {
-                    // If a mean image exists, we convert the blob and send FP32
+            InferenceEngine::Blob::Ptr iconv;
+            InferenceEngine::TBlob<float> *in_f = nullptr;
+            switch (input.second->precision()) {
+                case InferenceEngine::Precision::FP32:
+                    pushInput<float>(input.first, input.second);
+                    break;
+                case InferenceEngine::Precision::I32:
+                    pushInput<int32_t>(input.first, input.second);
+                    break;
+                case InferenceEngine::Precision::I8:
+                    pushInput<int8_t>(input.first, input.second);
+                    break;
+                case InferenceEngine::Precision::U16:
+                    // U16 is unsupported by mkldnn, so here we convert the blob and send FP32
                     iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
                             InferenceEngine::Precision::FP32,
                             input.second->getTensorDesc().getLayout(), input.second->dims());
                     convertedInputs.push_back(iconv);
                     iconv->allocate();
                     in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
-                    InferenceEngine::copyToFloat<uint8_t>(in_f->data(), input.second.get());
+                    InferenceEngine::copyToFloat<uint16_t>(in_f->data(), input.second.get());
                     pushInput<float>(input.first, iconv);
-                } else {
-                    // Instead we can send I8 directly
-                    pushInput<uint8_t>(input.first, input.second);
-                }
-                break;
-            default:
-                THROW_IE_EXCEPTION << "Unsupported input precision " << input.second->precision();
+                    break;
+                case InferenceEngine::Precision::I16:
+                    if (graph->hasMeanImageFor(input.first)) {
+                        // If a mean image exists, we convert the blob and send FP32
+                        iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
+                                InferenceEngine::Precision::FP32,
+                                input.second->getTensorDesc().getLayout(), input.second->dims());
+                        convertedInputs.push_back(iconv);
+                        iconv->allocate();
+                        in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+                        InferenceEngine::copyToFloat<int16_t>(in_f->data(), input.second.get());
+                        pushInput<float>(input.first, iconv);
+                    } else {
+                        // Instead we can send I16 directly
+                        pushInput<int16_t>(input.first, input.second);
+                    }
+                    break;
+                case InferenceEngine::Precision::U8:
+                    if (graph->hasMeanImageFor(input.first)) {
+                        // If a mean image exists, we convert the blob and send FP32
+                        iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
+                                InferenceEngine::Precision::FP32,
+                                input.second->getTensorDesc().getLayout(), input.second->dims());
+                        convertedInputs.push_back(iconv);
+                        iconv->allocate();
+                        in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+                        InferenceEngine::copyToFloat<uint8_t>(in_f->data(), input.second.get());
+                        pushInput<float>(input.first, iconv);
+                    } else {
+                        // Instead we can send I8 directly
+                        pushInput<uint8_t>(input.first, input.second);
+                    }
+                    break;
+                default:
+                    THROW_IE_EXCEPTION << "Unsupported input precision " << input.second->precision();
+            }
         }
-    }
-    graph->Infer(m_curBatch);
-    graph->PullOutputData(_outputs);
+        graph->Infer(m_curBatch);
+        graph->PullOutputData(_outputs);
+    };
+#if IE_THREAD == IE_THREAD_TBB
+    auto_scope_observing observer(graph->ptrObserver);
+    // the infer call is executed inside the graph's TBB arena so its parallelism stays within this stream's threads
+    graph->ptrArena->execute([&] { infer(); });
+#else
+    infer();
+#endif
 }
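The `IE_THREAD_TBB` branch above runs the whole inference lambda inside the graph's `tbb::task_arena`, so any TBB parallelism spawned by primitives is confined to the stream's thread budget. A minimal standalone sketch of that pattern:

```cpp
#include <tbb/task_arena.h>

void run_bounded() {
    tbb::task_arena arena(4);        // e.g. threads_per_stream == 4
    arena.execute([] {
        // any tbb::parallel_for launched here uses at most 4 worker threads
    });
}
```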
 
 void MKLDNNPlugin::MKLDNNInferRequest::GetPerformanceCounts(
index ebbd864..1821b88 100644
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -169,8 +168,22 @@ bool MKLDNNMemory::isConsistant(memory::dims dims, memory::format format) {
         case f::OhIw16o4i:
         case f::OIhw4i16o4i:
             ndims = 4; break;
-        case f::goihw:
+        // DHW
+        case f::ncdhw:
+        case f::ndhwc:
+        case f::nCdhw8c:
+        case f::nCdhw16c:
+        case f::oidhw:
+        case f::OIdhw8i8o:
+        case f::OIdhw16i16o:
+        case f::OIdhw8o8i:
+        case f::OIdhw16o16i:
+        case f::OIdhw8i16o2i:
+        case f::Odhwi8o:
+        case f::Odhwi16o:
+        // Group HW
         case f::hwigo:
+        case f::goihw:
         case f::gOIhw8i8o:
         case f::gOIhw16i16o:
         case f::gOIhw8i16o2i:
@@ -183,6 +196,15 @@ bool MKLDNNMemory::isConsistant(memory::dims dims, memory::format format) {
         case f::Goihw8g:
         case f::Goihw16g:
             ndims = 5; break;
+        case f::goidhw:
+        case f::gOIdhw8i8o:
+        case f::gOIdhw16i16o:
+        case f::gOIdhw8i16o2i:
+        case f::gOdhwi8o:
+        case f::gOdhwi16o:
+        case f::gOIdhw8o8i:
+        case f::gOIdhw16o16i:
+            ndims = 6; break;
         case f::format_undef:
             ndims = 0; break;
         case f::any:
@@ -197,8 +219,8 @@ bool MKLDNNMemory::isConsistant(memory::dims dims, memory::format format) {
 }
 
 bool MKLDNNMemory::IsPlainFormat(memory::format format) {
-    std::vector<memory::format> plains = {memory::nc, memory::nchw, memory::nhwc, memory::chwn,
-        memory::oi, memory::io, memory::oihw, memory::ihwo,
+    std::vector<memory::format> plains = {memory::nc, memory::nchw, memory::ncdhw, memory::nhwc, memory::ndhwc, memory::chwn,
+        memory::oi, memory::io, memory::oihw, memory::oidhw, memory::ihwo,
         memory::goihw,
         memory::blocked};
 
@@ -217,13 +239,28 @@ memory::format MKLDNNMemory::GetPlainFormat(memory::dims dims) {
             return memory::x;
         case 2:
             return memory::nc;
+        case 3:
+            return memory::tnc;
         case 4:
             return memory::nchw;
+        case 5:
+            return memory::ncdhw;
         default:
             return memory::blocked;
     }
 }
 
+InferenceEngine::Layout MKLDNNMemory::GetPlainLayout(memory::dims dims) {
+    switch (dims.size()) {
+        case 1: return Layout::C;
+        case 2: return Layout::NC;
+        case 3: return Layout::CHW;
+        case 4: return Layout::NCHW;
+        default:
+            return Layout::BLOCKED;
+    }
+}
+
 void MKLDNNMemory::CreateBlockingDesc(memory::desc &desc) {
     auto dims = desc.data.dims;
     int ndims = desc.data.ndims;
@@ -262,6 +299,10 @@ memory::format MKLDNNMemory::Convert(const InferenceEngine::Layout layout) {
             return memory::nchw;
         case NHWC:
             return memory::nhwc;
+        case NCDHW:
+            return memory::ncdhw;
+        case NDHWC:
+            return memory::ndhwc;
         case CHW:
             return memory::tnc;
         case NC:
@@ -294,6 +335,11 @@ std::string MKLDNNMemory::formatToString(memory::format fmt) {
         case memory::nChw8c: return "nChw8c";
         case memory::nChw16c: return "nChw16c";
 
+        case memory::ncdhw: return "ncdhw";
+        case memory::ndhwc: return "ndhwc";
+        case memory::nCdhw8c: return "nCdhw8c";
+        case memory::nCdhw16c: return "nCdhw16c";
+
         case memory::oihw: return "oihw";
         case memory::ihwo: return "ihwo";
         case memory::OIhw8i8o: return "OIhw8i8o";
@@ -306,8 +352,18 @@ std::string MKLDNNMemory::formatToString(memory::format fmt) {
         case memory::Ohwi16o: return "Ohwi16o";
         case memory::OhIw16o4i: return "OhIw16o4i";
 
+        case memory::oidhw: return "oidhw";
+        case memory::OIdhw8i8o: return "OIdhw8i8o";
+        case memory::OIdhw16i16o: return "OIdhw16i16o";
+        case memory::OIdhw8o8i: return "OIdhw8o8i";
+        case memory::OIdhw16o16i: return "OIdhw16o16i";
+        case memory::OIdhw8i16o2i: return "OIdhw8i16o2i";
+        case memory::Odhwi8o: return "Odhwi8o";
+        case memory::Odhwi16o: return "Odhwi16o";
+
         case memory::goihw: return "goihw";
         case memory::hwigo: return "hwigo";
+        case memory::hwio: return "hwio";
         case memory::gOIhw8i8o: return "gOIhw8i8o";
         case memory::gOIhw16i16o: return "gOIhw16i16o";
         case memory::gOIhw8i16o2i: return "gOIhw8i16o2i";
@@ -317,6 +373,16 @@ std::string MKLDNNMemory::formatToString(memory::format fmt) {
         case memory::gOIhw8o8i: return "gOIhw8o8i";
         case memory::gOIhw16o16i: return "gOIhw16o16i";
         case memory::gOhIw16o4i: return "gOhIw16o4i";
+
+        case memory::goidhw: return "goidhw";
+        case memory::gOIdhw8i8o: return "gOIdhw8i8o";
+        case memory::gOIdhw16i16o: return "gOIdhw16i16o";
+        case memory::gOIdhw8i16o2i: return "gOIdhw8i16o2i";
+        case memory::gOdhwi8o: return "gOdhwi8o";
+        case memory::gOdhwi16o: return "gOdhwi16o";
+        case memory::gOIdhw8o8i: return "gOIdhw8o8i";
+        case memory::gOIdhw16o16i: return "gOIdhw16o16i";
+
         default: {
             THROW_IE_EXCEPTION << "Unknown data format.";
         }
@@ -400,66 +466,96 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const {
     auto blkInfo = desc.data.layout_desc.blocking;
     auto offset = static_cast<size_t>(blkInfo.offset_padding);
     SizeVector offsetsForDims;
+    SizeVector dims = getDims().ToSizeVector();
     switch (getFormat()) {
         case memory::format_undef:
             THROW_IE_EXCEPTION << "Cannot cast to tensor desc. Format is undefined!";
         case memory::any:
             layout = Layout::ANY;
-            return TensorDesc(precision, getDims().ToSizeVector(), layout);
+            return TensorDesc(precision, dims, layout);
         case memory::x:
             layout = Layout::C;
             order = {0};
-            blkDims = getDims().ToSizeVector();
+            blkDims = dims;
             break;
         case memory::oi:
         case memory::nc:
             layout = Layout::NC;
             order = {0, 1};
-            blkDims = getDims().ToSizeVector();
+            blkDims = dims;
             break;
         case memory::tnc:
             layout = Layout::CHW;
             order = {0, 1, 2};
-            blkDims = getDims().ToSizeVector();
+            blkDims = dims;
             break;
         case memory::ntc:
             layout = Layout::CHW;
             order = {1, 0, 2};
-            blkDims = {static_cast<size_t>(getDims()[1]),
-                       static_cast<size_t>(getDims()[0]),
-                       static_cast<size_t>(getDims()[2])};
+            blkDims = {static_cast<size_t>(dims[1]),
+                       static_cast<size_t>(dims[0]),
+                       static_cast<size_t>(dims[2])};
             break;
         case memory::oihw:
         case memory::nchw:
             layout = Layout::NCHW;
             order = {0, 1, 2, 3};
-            blkDims = getDims().ToSizeVector();
+            blkDims = dims;
+            break;
+        case memory::ncdhw:
+            layout = Layout::NCDHW;
+            order = {0, 1, 2, 3, 4};
+            blkDims = dims;
             break;
         case memory::nhwc:
             layout = Layout::NHWC;
             order = {0, 2, 3, 1};
-            blkDims = {static_cast<size_t>(getDims()[0]),
-                       static_cast<size_t>(getDims()[2]),
-                       static_cast<size_t>(getDims()[3]),
-                       static_cast<size_t>(getDims()[1])};
+            blkDims = {static_cast<size_t>(dims[0]),
+                       static_cast<size_t>(dims[2]),
+                       static_cast<size_t>(dims[3]),
+                       static_cast<size_t>(dims[1])};
             break;
+        case memory::ndhwc:
+            layout = Layout::NDHWC;
+            order = {0, 2, 3, 4, 1};
+            blkDims = {static_cast<size_t>(dims[0]),
+                       static_cast<size_t>(dims[2]),
+                       static_cast<size_t>(dims[3]),
+                       static_cast<size_t>(dims[4]),
+                       static_cast<size_t>(dims[1])};
+            break;
+        case memory::oIhw8i:
         case memory::nChw8c:
             order = {0, 1, 2, 3, 1};
-            blkDims = getDims().ToSizeVector();
+            blkDims = dims;
+            blkDims[1] = blkDims[1] / 8 + (blkDims[1] % 8 ? 1 : 0);
+            blkDims.push_back(8);
+            layout = Layout::BLOCKED;
+            break;
+        case memory::nCdhw8c:
+            order = {0, 1, 2, 3, 4, 1};
+            blkDims = dims;
             blkDims[1] = blkDims[1] / 8 + (blkDims[1] % 8 ? 1 : 0);
             blkDims.push_back(8);
             layout = Layout::BLOCKED;
             break;
         case memory::nChw16c:
             order = {0, 1, 2, 3, 1};
-            blkDims = getDims().ToSizeVector();
+            blkDims = dims;
+            blkDims[1] = blkDims[1] / 16 + (blkDims[1] % 16 ? 1 : 0);
+            blkDims.push_back(16);
+            layout = Layout::BLOCKED;
+            break;
+        case memory::nCdhw16c:
+            order = {0, 1, 2, 3, 4, 1};
+            blkDims = dims;
             blkDims[1] = blkDims[1] / 16 + (blkDims[1] % 16 ? 1 : 0);
             blkDims.push_back(16);
             layout = Layout::BLOCKED;
             break;
         case memory::blocked:
             order.clear();
-            blkDims = getDims().ToSizeVector();
+            blkDims = dims;
             for (size_t i = 0; i < blkDims.size(); i++) {
                 order.push_back(i);
                 if ((i && blkInfo.strides[0][i - 1] < blkInfo.strides[0][i]) || blkInfo.block_dims[i] != 1) {
@@ -478,14 +574,14 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const {
 
     SizeVector strides(blkDims.size());
 
-    if (layout == Layout::NHWC || layout == Layout::CHW) {
+    if (layout == Layout::NHWC || layout == Layout::NDHWC || layout == Layout::CHW) {
         for (size_t i = 0; i < order.size(); i++) {
             strides[i] = static_cast<size_t>(blkInfo.strides[0][order[i]]);
         }
     } else {
         strides[blkDims.size() - 1] = 1;
         for (size_t i = 2; i <= order.size(); i++) {
-            if (blkDims.size() - i < getDims().ndims()) {
+            if (blkDims.size() - i < dims.size()) {
                 strides[blkDims.size() - i] = static_cast<size_t>(blkInfo.strides[0][order[blkDims.size() - i]]);
             } else {
                 strides[blkDims.size() - i] = strides[blkDims.size() - i + 1] * blkDims[blkDims.size() - i + 1];
@@ -494,13 +590,13 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const {
     }
 
     for (size_t i = 0; i < blkDims.size() && i < TENSOR_MAX_DIMS; i++) {
-        if (i < getDims().ndims())
+        if (i < dims.size())
             offsetsForDims.push_back(blkInfo.offset_padding_to_data[i]);
         else
             offsetsForDims.push_back(0);
     }
 
-    TensorDesc tensorDesc(precision, getDims().ToSizeVector(), {blkDims, order, offset, offsetsForDims, strides});
+    TensorDesc tensorDesc(precision, dims, {blkDims, order, offset, offsetsForDims, strides});
 
     tensorDesc.setLayout(layout);
     return tensorDesc;
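A worked example of the blocked-channel arithmetic in the conversion above (values are illustrative): for `nChw16c` with NCHW dims {1, 3, 224, 224}, the channel dim is split into ceil(3/16) = 1 outer chunk plus an inner block of 16, giving `blkDims = {1, 1, 224, 224, 16}` with `order = {0, 1, 2, 3, 1}`:

```cpp
#include <cstddef>

// Mirrors blkDims[1] = C/16 + (C%16 ? 1 : 0) from the switch above.
std::size_t outer_chunks(std::size_t C, std::size_t block) {
    return C / block + (C % block ? 1 : 0);   // outer_chunks(3, 16) == 1
}
```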
@@ -543,9 +639,15 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc):
         case NCHW:
             mkldnnFormat = memory::format::nchw;
             break;
+        case NCDHW:
+            mkldnnFormat = memory::format::ncdhw;
+            break;
         case NHWC:
             mkldnnFormat = memory::format::nhwc;
             break;
+        case NDHWC:
+            mkldnnFormat = memory::format::ndhwc;
+            break;
         case OIHW:
             mkldnnFormat = memory::format::oihw;
             break;
@@ -553,6 +655,11 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc):
             mkldnnFormat = memory::format::x;
             break;
         case CHW:
+            if (order == SizeVector{0, 1, 2})
+                mkldnnFormat = memory::format::tnc;
+            else if (order == SizeVector{1, 0, 2})
+                mkldnnFormat = memory::format::ntc;
+            else
                 mkldnnFormat = memory::format::blocked;
             break;
         case HW:
@@ -560,32 +667,41 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc):
             mkldnnFormat = memory::format::nc;
             break;
         case BLOCKED:
+            mkldnnFormat = memory::format::blocked;
             if (realDims.ndims() == 1) {
                 mkldnnFormat = memory::format::x;
-                break;
             } else if (realDims.ndims() == 2) {
                 mkldnnFormat = memory::format::nc;
-                break;
             } else if (realDims.ndims() == 4) {
                 if (order.size() == 5 && order[0] == 0 && order[1] == 1 && order[2] == 2 && order[3] == 3 && order[4] == 1) {
                     if (blkdDims[4] == 8) {
                         mkldnnFormat = memory::format::nChw8c;
-                        break;
                     } else if (blkdDims[4] == 16) {
                         mkldnnFormat = memory::format::nChw16c;
-                        break;
                     }
                 } else if (order.size() == 4) {
                     if (order[0] == 0 && order[1] == 1 && order[2] == 2 && order[3] == 3) {
                         mkldnnFormat = memory::format::nchw;
-                        break;
                     } else if (order[0] == 0 && order[1] == 2 && order[2] == 3 && order[3] == 1) {
                         mkldnnFormat = memory::format::nhwc;
-                        break;
+                    }
+                }
+            } else if (realDims.ndims() == 5) {
+                if (order.size() == 6 &&
+                        order[0] == 0 && order[1] == 1 && order[2] == 2 && order[3] == 3 && order[4] == 4 && order[5] == 1) {
+                    if (blkdDims[5] == 8) {
+                        mkldnnFormat = memory::format::nCdhw8c;
+                    } else if (blkdDims[5] == 16) {
+                        mkldnnFormat = memory::format::nCdhw16c;
+                    }
+                } else if (order.size() == 5) {
+                    if (order[0] == 0 && order[1] == 1 && order[2] == 2 && order[3] == 3 && order[4] == 4) {
+                        mkldnnFormat = memory::format::ncdhw;
+                    } else if (order[0] == 0 && order[1] == 2 && order[2] == 3 && order[3] == 4 && order[4] == 1) {
+                        mkldnnFormat = memory::format::ndhwc;
                     }
                 }
             }
-            mkldnnFormat = memory::format::blocked;
             break;
         case CN:
             mkldnnFormat = memory::format::blocked;
index a5329ee..37578e5 100644
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -108,6 +107,7 @@ public:
 
     static bool IsPlainFormat(mkldnn::memory::format format);
     static mkldnn::memory::format GetPlainFormat(mkldnn::memory::dims dims);
+    static InferenceEngine::Layout GetPlainLayout(mkldnn::memory::dims dims);
     static bool isConsistant(mkldnn::memory::dims dims, mkldnn::memory::format format);
     static mkldnn::memory::format Convert(const InferenceEngine::Layout layout);
 
index 7bda59d..73975b7 100644
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -10,6 +9,8 @@
 #include <vector>
 #include <string>
 #include <limits>
+#include <cstdint>
+#include <unordered_map>
 
 #include <nodes/mkldnn_batchnorm_node.h>
 #include <nodes/mkldnn_concat_node.h>
@@ -17,6 +18,7 @@
 #include <nodes/mkldnn_crop_node.h>
 #include <nodes/mkldnn_deconv_node.h>
 #include <nodes/mkldnn_eltwise_node.h>
+#include <nodes/mkldnn_gemm_node.h>
 #include <nodes/mkldnn_fullyconnected_node.h>
 #include <nodes/mkldnn_generic_node.h>
 #include <nodes/mkldnn_input_node.h>
@@ -35,8 +37,9 @@
 #include <nodes/mkldnn_memory_node.hpp>
 #include <nodes/mkldnn_rnn.h>
 #include <mkldnn_types.h>
-
 #include "mkldnn_extension_utils.h"
+#include "mkldnn_plugin.h"
+#include "ie_memcpy.h"
 
 using namespace mkldnn;
 using namespace MKLDNNPlugin;
@@ -52,6 +55,7 @@ MKLDNNNode::Register<MKLDNNConvolutionNode> MKLDNNConvolutionNode::reg;
 MKLDNNNode::Register<MKLDNNCropNode> MKLDNNCropNode::reg;
 MKLDNNNode::Register<MKLDNNDeconvolutionNode> MKLDNNDeconvolutionNode::reg;
 MKLDNNNode::Register<MKLDNNEltwiseNode> MKLDNNEltwiseNode::reg;
+MKLDNNNode::Register<MKLDNNGemmNode> MKLDNNGemmNode::reg;
 MKLDNNNode::Register<MKLDNNFullyConnectedNode> MKLDNNFullyConnectedNode::reg;
 MKLDNNNode::Register<MKLDNNInputNode> MKLDNNInputNode::reg;
 MKLDNNNode::Register<MKLDNNLrnNode> MKLDNNLrnNode::reg;
@@ -358,6 +362,8 @@ std::vector<memory::format> MKLDNNNode::getAvailableFormatsForDims(const MKLDNND
         return {memory::format::nc};
     else if (dims.ndims() == 4)
         return {memory::format::nchw, memory::format::nChw8c, memory::format::nChw16c};
+    else if (dims.ndims() == 5)
+        return {memory::format::ncdhw, memory::format::nCdhw8c, memory::format::nCdhw16c};
     return {memory::format::any};
 }
 
@@ -506,7 +512,7 @@ InferenceEngine::Blob::Ptr MKLDNNNode::createInternalBlob(InferenceEngine::SizeV
 
     size_t offset = blb->byteSize();
     checkSize(intBuffSize, offset);
-    memcpy(data, blb->buffer(), blb->byteSize());
+    ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), blb->byteSize());
     data += blb->byteSize();
     for (const auto &merged : getMergeWith()) {
         wLayer = dynamic_cast<InferenceEngine::WeightableLayer*>(merged->getCnnLayer().get());
@@ -519,7 +525,7 @@ InferenceEngine::Blob::Ptr MKLDNNNode::createInternalBlob(InferenceEngine::SizeV
             THROW_IE_EXCEPTION << "Cannot get internal blob layer for node " << getName() << ".";
         offset += blb->byteSize();
         checkSize(intBuffSize, offset);
-        memcpy(data, blb->buffer(), blb->byteSize());
+        ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), blb->byteSize());
         data += blb->byteSize();
     }
 
@@ -545,13 +551,32 @@ void MKLDNNNode::prepareMemory(const PrimitiveDescInfo *selected_pd, mkldnn::pri
 
     internalBlobMemory.clear();
     for (size_t i = 0; i < internalBlobs.size(); i++) {
-        auto& internalBlob = internalBlobs[i];
-        internalBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(engine)));
-
-        internalBlobMemory[i]->Create(intDescs[i]);
-        MKLDNNMemory memory(engine);
-        memory.Create(MKLDNNMemoryDesc(internalBlob->getTensorDesc()), internalBlob->buffer());
-        internalBlobMemory[i]->SetData(memory);
+        const auto &internalBlob = internalBlobs[i];
+
+        const uint64_t data_hash = Engine::GetWeightsSharing().GetHashFunc().hash(internalBlob->buffer(), internalBlob->byteSize());
+        const std::string string_hash = name + "_" + std::to_string(i)
+                                     + "_" + std::to_string(internalBlob->byteSize())
+                                     + "_" + std::to_string(data_hash);
+        MKLDNNMemoryPtr ptr =
+                Engine::GetWeightsSharing().findOrCreate(string_hash, [&] () {
+                    MKLDNNMemoryPtr _ptr = MKLDNNMemoryPtr(new MKLDNNMemory(engine));
+                    _ptr->Create(intDescs[i]);
+                    MKLDNNMemory memory(engine);
+
+                    auto newDesc = MKLDNNMemoryDesc(internalBlob->getTensorDesc());
+                    auto newFormat = newDesc.getFormat();
+                    if (newFormat == mkldnn::memory::ncdhw) {
+                        newFormat = mkldnn::memory::goihw;
+                    }
+                    if (newFormat == mkldnn::memory::nchw) {
+                        newFormat = mkldnn::memory::oihw;
+                    }
+                    memory.Create(MKLDNNMemoryDesc(newDesc.getDims(), newDesc.getDataType(), newFormat), internalBlob->buffer());
+                    auto aformat = memory.GetFormat();
+                    _ptr->SetData(memory);
+                    return _ptr;
+                });
+        internalBlobMemory.push_back(ptr);
     }
 }
 
@@ -648,6 +673,8 @@ std::string MKLDNNNode::typeToStr(Type type) {
             return "Pooling";
         case FullyConnected:
             return "FullyConnected";
+        case Gemm:
+            return "Gemm";
         case SoftMax:
             return "SoftMax";
         case Split:
@@ -682,6 +709,9 @@ std::string MKLDNNNode::typeToStr(Type type) {
             return "MemoryInput";
         case RNN:
             return "RNN";
+        case LSTMCell:
+            return "LSTMCell";
+
         default:
             return "Unknown";
     }
@@ -838,17 +868,18 @@ InferenceEngine::TensorDesc MKLDNNNode::getConfiguredOutputDesc(const InferenceE
 
 void MKLDNNNode::initOptimalPrimitiveDescriptor() {
     auto config = getSelectedPrimitiveDescriptor()->getConfig();
-    if (isInitConfig(config))
-        return;
-
-    for (size_t i = 0; i < config.inConfs.size(); i++) {
-        config.inConfs[i].desc = getConfiguredInputDesc(config, i);
-    }
+    if (!isInitConfig(config)) {
+        for (size_t i = 0; i < config.inConfs.size(); i++) {
+            config.inConfs[i].desc = getConfiguredInputDesc(config, i);
+        }
 
-    for (size_t i = 0; i < config.outConfs.size(); i++) {
-        config.outConfs[i].desc = getConfiguredOutputDesc(config, i);
+        for (size_t i = 0; i < config.outConfs.size(); i++) {
+            config.outConfs[i].desc = getConfiguredOutputDesc(config, i);
+        }
+        initDescriptor(config);
+    } else if (getType() != RNN && getType() != LSTMCell) {
+        initDescriptor(config);
     }
-    initDescriptor(config);
 }
 
 bool MKLDNNNode::isInitConfig(const InferenceEngine::LayerConfig& config) const {
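
The plain memcpy calls above are replaced with ie_memcpy, which also receives the destination capacity. A minimal sketch of that contract (the real implementation lives in ie_memcpy.h; the name and return convention here are only an assumption for illustration):

    #include <cstddef>
    #include <cstring>

    // Copy src_size bytes into dst, refusing to overflow the dst_capacity bytes
    // available at the destination. Returns 0 on success, -1 on error.
    inline int checked_memcpy(void* dst, size_t dst_capacity, const void* src, size_t src_size) {
        if (dst == nullptr || src == nullptr || src_size > dst_capacity)
            return -1;
        std::memcpy(dst, src, src_size);
        return 0;
    }
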
index acfe8e1..fe71c66 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -49,6 +48,7 @@ enum Type {
     Concatenation,
     Power,
     Eltwise,
+    Gemm,
     Crop,
     Reshape,
     Tile,
@@ -60,6 +60,7 @@ enum Type {
     Copy,
     MemoryOutput,
     MemoryInput,
+    LSTMCell,
     RNN
 };
 
@@ -86,6 +87,7 @@ static Type TypeFromName(const std::string type) {
             { "Pooling", Pooling },
             { "FullyConnected", FullyConnected },
             { "InnerProduct", FullyConnected },
+            { "Gemm", Gemm },
             { "Softmax", SoftMax },
             { "SoftMax", SoftMax },
             { "Split", Split },
@@ -103,6 +105,7 @@ static Type TypeFromName(const std::string type) {
             { "Flatten", Flatten },
             { "Permute", Permute },
             { "Copy", Copy },
+            { "LSTMCell", LSTMCell },
             { "RNN", RNN },
             { "MemoryInput", MemoryInput},  // for construction from name ctor, arbitrary name is used
             { "Memory", MemoryOutput },  // for construction from layer ctor
@@ -191,6 +194,10 @@ public:
         return mergedWith;
     }
 
+    const std::vector <MKLDNNNodePtr> &getFusedWith() {
+        return fusedWith;
+    }
+
     const std::string getName() const {
         return name;
     }
@@ -317,7 +324,7 @@ protected:
         this->type = type;
     }
 
-    int getMaxBatch();
+    virtual int getMaxBatch();
 
     virtual InferenceEngine::TensorDesc getConfiguredInputDesc(const InferenceEngine::LayerConfig& config, size_t idx) const;
     virtual InferenceEngine::TensorDesc getConfiguredOutputDesc(const InferenceEngine::LayerConfig& config, size_t idx) const;
@@ -350,6 +357,8 @@ protected:
     MKLDNNPrimitive prim;
     std::vector<MKLDNNDescriptor> descs;
 
+    InferenceEngine::Blob::Ptr ext_scales;
+
     friend class MKLDNNEdge;
     friend class MKLDNNGraph;
     friend class MKLDNNGraphOptimizer;
@@ -371,8 +380,9 @@ protected:
     public:
         Register() {
             Registry::RegisterNode(
-                Registry::CreatorByLayerFunction([](const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) -> MKLDNNNode * {
-                    return new To(layer, eng); } ) );
+                Registry::CreatorByLayerFunction(
+                        [](const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng)
+                        -> MKLDNNNode* { return new To(layer, eng); } ) );
         }
     };
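
The Register<To> helper above is a classic self-registering factory: a static member per node type runs at program start-up and stores a creator lambda in a shared registry. A condensed standalone sketch of the pattern, with simplified illustrative types in place of CNNLayerPtr/engine:

    #include <functional>
    #include <map>
    #include <memory>
    #include <string>

    struct Node { virtual ~Node() = default; };

    class Registry {
    public:
        using Creator = std::function<Node*(const std::string& layerName)>;
        static void RegisterNode(const std::string& type, Creator c) { creators()[type] = std::move(c); }
        static std::unique_ptr<Node> Create(const std::string& type, const std::string& layerName) {
            auto it = creators().find(type);
            return std::unique_ptr<Node>(it == creators().end() ? nullptr : it->second(layerName));
        }
    private:
        static std::map<std::string, Creator>& creators() {
            static std::map<std::string, Creator> m;  // constructed on first use
            return m;
        }
    };

    template <typename To>
    struct Register {
        explicit Register(const std::string& type) {
            Registry::RegisterNode(type, [](const std::string& name) -> Node* { return new To(name); });
        }
    };

    // Each node type then declares a static member: static Register<MyNode> reg{"MyNode"};
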
 
index 3b51c97..35a965a 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -11,6 +10,9 @@
 using namespace MKLDNNPlugin;
 using namespace InferenceEngine;
 
+MKLDNNWeightsSharing Engine::weightsSharing;
+const SimpleDataHash MKLDNNWeightsSharing::simpleCRC;
+
 InferenceEngine::ExecutableNetworkInternal::Ptr
 Engine::LoadExeNetworkImpl(InferenceEngine::ICNNNetwork &network, const std::map<std::string, std::string> &config) {
     auto specifiedDevice = network.getTargetDevice();
@@ -25,8 +27,12 @@ Engine::LoadExeNetworkImpl(InferenceEngine::ICNNNetwork &network, const std::map
     network.getInputsInfo(_networkInputs);
     for (auto ii : _networkInputs) {
         auto input_precision = ii.second->getInputPrecision();
-        if (input_precision != InferenceEngine::Precision::U16 && input_precision != InferenceEngine::Precision::I16
-            && input_precision != InferenceEngine::Precision::FP32 && input_precision != InferenceEngine::Precision::U8) {
+        if (input_precision != InferenceEngine::Precision::FP32 &&
+            input_precision != InferenceEngine::Precision::I32 &&
+            input_precision != InferenceEngine::Precision::U16 &&
+            input_precision != InferenceEngine::Precision::I16 &&
+            input_precision != InferenceEngine::Precision::I8 &&
+            input_precision != InferenceEngine::Precision::U8) {
             THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str
                                << "Input image format " << input_precision << " is not supported yet...";
         }
@@ -86,7 +92,7 @@ void Engine::QueryNetwork(const ICNNNetwork& network, const std::map<std::string
 INFERENCE_PLUGIN_API(StatusCode) CreatePluginEngine(IInferencePlugin*& plugin, ResponseDesc *resp) noexcept {
     try {
         plugin = make_ie_compatible_plugin(
-                {{1, 4},
+                {{1, 5},
 #ifdef MKL_VERSION
                  MKL_VERSION,
 #else
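
The input-precision check in LoadExeNetworkImpl now accepts six precisions; expressed as a whitelist lookup, the intent is clearer and the list is easier to extend. A sketch with an illustrative Precision enum standing in for InferenceEngine::Precision:

    #include <algorithm>
    #include <array>

    enum class Precision { FP32, I32, U16, I16, I8, U8, FP16 };  // illustrative subset

    bool isSupportedInputPrecision(Precision p) {
        // Precisions accepted by the CPU plugin as of this change.
        const std::array<Precision, 6> supported = {
            Precision::FP32, Precision::I32, Precision::U16,
            Precision::I16,  Precision::I8,  Precision::U8};
        return std::find(supported.begin(), supported.end(), p) != supported.end();
    }
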
index 482405a..383feaa 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -8,11 +7,59 @@
 #include "mkldnn_graph.h"
 #include <string>
 #include <map>
+#include <unordered_map>
 #include <memory>
+#include <functional>
 #include <cpp_interfaces/impl/ie_plugin_internal.hpp>
 
 namespace MKLDNNPlugin {
 
+class SimpleDataHash {
+public:
+    SimpleDataHash() {
+        for (int i = 0; i < kTableSize; i++) {
+            uint64_t c = i;
+            for (int j = 0; j < 8; j++)
+                c = ((c & 1) ? 0xc96c5795d7870f42 : 0) ^ (c >> 1);
+            table[i] = c;
+        }
+    }
+    // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
+    uint64_t hash(const unsigned char* data, size_t size) const {
+        uint64_t crc = 0;
+        for (size_t idx = 0; idx < size; idx++)
+            crc = table[(unsigned char)crc ^ data[idx]] ^ (crc >> 8);
+
+        return ~crc;
+    }
+
+protected:
+    static const int kTableSize = 256;
+    uint64_t table[kTableSize];
+};
+
+class MKLDNNWeightsSharing {
+public:
+    MKLDNNMemoryPtr findOrCreate(const std::string& name_hash,
+                             std::function<MKLDNNMemoryPtr(void)> create) {
+        std::unique_lock<std::mutex> lock(guard);
+        auto found = sharedWeights.find(name_hash);
+
+        MKLDNNMemoryPtr ptr;
+        if (found == sharedWeights.end() || !(ptr = found->second.lock())) {
+            ptr = create();
+            sharedWeights[name_hash] = ptr;
+        }
+        return ptr;
+    }
+    static const SimpleDataHash& GetHashFunc() { return simpleCRC; }
+
+protected:
+    std::unordered_map<std::string, std::weak_ptr<MKLDNNMemory>> sharedWeights;
+    std::mutex guard;
+    static const SimpleDataHash simpleCRC;
+};
+
 class Engine : public InferenceEngine::InferencePluginInternal {
 public:
     Engine() = default;
@@ -30,16 +77,20 @@ public:
     void SetConfig(const std::map<std::string, std::string> &config) override;
 
     /**
-     * @depricated Use the version with config parameter
+     * @deprecated Use the version with config parameter
      */
     void QueryNetwork(const InferenceEngine::ICNNNetwork& network, InferenceEngine::QueryNetworkResult& res) const override;
     void QueryNetwork(const InferenceEngine::ICNNNetwork& network,
                       const std::map<std::string, std::string>& config, InferenceEngine::QueryNetworkResult& res) const override;
 
+    static MKLDNNWeightsSharing& GetWeightsSharing() { return weightsSharing; }
 
 private:
     Config engConfig;
     MKLDNNExtensionManager::Ptr extensionManager = std::make_shared<MKLDNNExtensionManager>();
+
+protected:
+    static MKLDNNWeightsSharing weightsSharing;
 };
 
 }  // namespace MKLDNNPlugin
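
Taken together, SimpleDataHash and MKLDNNWeightsSharing let nodes that carry identical weight blobs share a single MKLDNNMemory allocation. A compilable sketch of the hashing and keying side (the cache itself is omitted; the key format mirrors prepareMemory above, and the example name is hypothetical):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <string>

    // Same table-driven CRC-64 (ECMA-182 polynomial, reflected) as SimpleDataHash above.
    struct Crc64 {
        uint64_t table[256];
        Crc64() {
            for (int i = 0; i < 256; i++) {
                uint64_t c = i;
                for (int j = 0; j < 8; j++)
                    c = ((c & 1) ? 0xc96c5795d7870f42ULL : 0) ^ (c >> 1);
                table[i] = c;
            }
        }
        uint64_t hash(const unsigned char* data, size_t size) const {
            uint64_t crc = 0;
            for (size_t i = 0; i < size; i++)
                crc = table[(unsigned char)crc ^ data[i]] ^ (crc >> 8);
            return ~crc;
        }
    };

    int main() {
        static const Crc64 crc;
        const unsigned char blob[] = {1, 2, 3, 4};
        // The plugin keys the shared-weights cache on name + blob index + byte size + this hash,
        // so two nodes with identical weight data resolve to one MKLDNNMemory allocation.
        std::string key = "conv1_w_0_4_" + std::to_string(crc.hash(blob, sizeof(blob)));
        std::printf("%s\n", key.c_str());
    }
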
index 5bf9834..075afff 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp b/inference-engine/src/mkldnn_plugin/mkldnn_streams.cpp
new file mode 100644 (file)
index 0000000..a519837
--- /dev/null
@@ -0,0 +1,372 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <string>
+#include <map>
+#include <vector>
+#include <limits>
+#include <chrono>
+#include <climits>
+#include <memory>
+
+#include "mkldnn_graph.h"
+#include "ie_parallel.hpp"
+#include "mkldnn_streams.h"
+
+using namespace mkldnn;
+using namespace MKLDNNPlugin;
+using namespace InferenceEngine;
+using namespace InferenceEngine::details;
+
+namespace MKLDNNPlugin {
+
+thread_local MultiWorkerTaskContext MultiWorkerTaskExecutor::ptrContext;
+
+bool check_env_variables() {
+#if IE_THREAD == IE_THREAD_OMP
+    return MKLDNNPlugin::cpu::checkOpenMpEnvVars(false);
+#else
+    return false;
+#endif
+}
+
+#if !(defined(__APPLE__) || defined(_WIN32))
+/* Get the cores affinity mask for the current process */
+bool get_process_mask(int& ncpus, cpu_set_t*& mask) {
+    for (ncpus = sizeof(cpu_set_t) / CHAR_BIT; ncpus < 1024 /* reasonable limit of #cores */; ncpus <<= 1) {
+        mask = CPU_ALLOC(ncpus);
+        if (!mask) return false;
+
+        const size_t size = CPU_ALLOC_SIZE(ncpus);
+        CPU_ZERO_S(size, mask);
+        const int err = sched_getaffinity(getpid(), size, mask);
+        // the result fits the mask
+        if (!err) break;
+        // mask size is not enough
+        CPU_FREE(mask);
+        mask = NULL;
+        // other error
+        if (errno != EINVAL) break;
+    }
+    if (!mask) {
+        return false;
+    }
+    return true;
+}
+/* Pin current thread to a set of cores determined by the mask. */
+bool pin_current_thread_by_mask(int ncores, const cpu_set_t* proc_mask) {
+    return 0 == sched_setaffinity(0, ncores, proc_mask);
+}
+/* Pin thread to a spare core in the round-robin scheme, while respecting the given process mask.
+ * The function can also handle the hyper-threading (by populating the physical cores first) */
+bool pin_thread_to_vacant_core(int thr_idx, int hyperthreads, int ncores, const cpu_set_t* proc_mask) {
+    const size_t size = CPU_ALLOC_SIZE(ncores);
+    const int num_cpus = CPU_COUNT_S(size, proc_mask);
+    thr_idx %= num_cpus;  // Limit the index to the [0; num_cpus-1] range
+
+    // Place threads with specified step
+    int cpu_idx = 0;
+    for (int i = 0, offset = 0; i < thr_idx; ++i) {
+        cpu_idx += hyperthreads;
+        if (cpu_idx >= num_cpus)
+            cpu_idx = ++offset;
+    }
+
+    // Find index of 'cpu_idx'-th bit that equals to 1
+    int mapped_idx = -1;
+    while (cpu_idx >= 0) {
+        if (CPU_ISSET_S(++mapped_idx, size, proc_mask))
+            --cpu_idx;
+    }
+
+    cpu_set_t *target_mask = CPU_ALLOC(ncores);
+    CPU_ZERO_S(size, target_mask);
+    CPU_SET_S(mapped_idx, size, target_mask);
+    bool res = pin_current_thread_by_mask(size, target_mask);
+    CPU_FREE(target_mask);
+    return res;
+}
+#else   // no threads pinning/binding on Win/MacOS
+bool get_process_mask(int& ncpus, cpu_set_t*& mask) {
+    ncpus = 0;
+    mask =  nullptr;
+    return false;
+}
+bool pin_thread_to_vacant_core(int thr_idx, int hyperthreads, int ncores, const cpu_set_t* proc_mask) {
+    return false;
+}
+bool pin_current_thread_by_mask(int ncores, const cpu_set_t* proc_mask) {
+    return false;
+}
+#endif  // !(defined(__APPLE__) || defined(_WIN32))
+
+MultiWorkerTaskExecutor::MultiWorkerTaskExecutor(const std::vector<Task::Ptr>& init_tasks, std::string name) :
+        _isStopped(false), _name(name), _initCount(0) {
+    for (auto t : init_tasks) {
+        _threads.push_back(std::thread([&, t] {
+            // initialization (no contention, every worker thread is doing its own task)
+            t->runNoThrowNoBusyCheck();
+            _initCount++;
+
+            while (!_isStopped) {
+                bool isQueueEmpty;
+                Task::Ptr currentTask = nullptr;
+                {  // waiting for the new task or for stop signal
+                    std::unique_lock<std::mutex> lock(_queueMutex);
+                    _queueCondVar.wait(lock, [&]() { return !_taskQueue.empty() || _isStopped; });
+                    isQueueEmpty = _taskQueue.empty();
+                    if (!isQueueEmpty) {
+                        currentTask = _taskQueue.front();
+                        _taskQueue.pop();
+                        isQueueEmpty = _taskQueue.empty();
+                    }
+                }
+                if (currentTask)
+                    currentTask->runNoThrowNoBusyCheck();
+                if (_isStopped)
+                    break;
+                if (isQueueEmpty)  // notify the dtor that all tasks were completed
+                    _queueCondVar.notify_all();
+            }
+        }));
+    }
+    while (_initCount != init_tasks.size()) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+    }
+}
+
+MultiWorkerTaskExecutor::~MultiWorkerTaskExecutor() {
+    {
+        std::unique_lock<std::mutex> lock(_queueMutex);
+        if (!_taskQueue.empty()) {
+            _queueCondVar.wait(lock, [this]() { return _taskQueue.empty(); });
+        }
+        _isStopped = true;
+        _queueCondVar.notify_all();
+    }
+    for (auto& thread : _threads) {
+        if (thread.joinable()) {
+            thread.join();
+        }
+    }
+}
+
+bool MultiWorkerTaskExecutor::startTask(Task::Ptr task) {
+    if (!task->occupy()) return false;
+    std::unique_lock<std::mutex> lock(_queueMutex);
+    _taskQueue.push(task);
+    _queueCondVar.notify_one();
+    return true;
+}
+
+MKLDNNPlugin::MKLDNNGraphlessInferRequest::MKLDNNGraphlessInferRequest(InferenceEngine::InputsDataMap networkInputs,
+                                                                       InferenceEngine::OutputsDataMap networkOutputs)
+        : InferRequestInternal(networkInputs, networkOutputs), m_curBatch(-1) {
+    // Allocate all input blobs
+    for (const auto& it : networkInputs) {
+        InferenceEngine::Blob::Ptr blob;
+        GetBlob(it.first.c_str(), blob);
+    }
+    // Allocate all output blobs
+    for (const auto& it : networkOutputs) {
+        InferenceEngine::Blob::Ptr blob;
+        GetBlob(it.first.c_str(), blob);
+    }
+}
+
+
+void MKLDNNPlugin::MKLDNNGraphlessInferRequest::InferImpl() {
+    IE_PROFILING_AUTO_SCOPE(MKLDNN_INFER)
+
+    auto infer = [this] {
+        IE_ASSERT(MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph != nullptr);
+        MKLDNNGraph::Ptr graph = MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph;
+        if (!graph->IsReady())
+            THROW_IE_EXCEPTION << "Network not loaded.";
+        if (m_curBatch > 0 && !graph->getProperty().enableDynamicBatch)
+            THROW_IE_EXCEPTION << "Dynamic batch is not enabled.";
+
+        if (m_curBatch > graph->getProperty().batchLimit)
+            THROW_IE_EXCEPTION << "Invalid dynamic batch size " << m_curBatch <<
+                               " for this request.";
+
+        // execute input pre-processing.
+        execDataPreprocessing(_inputs);
+
+        // need to retain converted blobs until infer finish
+        std::vector<InferenceEngine::Blob::Ptr> convertedInputs;
+        for (auto input : _inputs) {
+            if (!_networkInputs[input.first]) {
+                THROW_IE_EXCEPTION <<
+                                   "input blobs map contains not registered during IInferencePlugin::LoadNetwork blob with name "
+                                   << input.first;
+            }
+            InferenceEngine::Blob::Ptr iconv;
+            InferenceEngine::TBlob<float> *in_f = nullptr;
+            switch (input.second->precision()) {
+                case InferenceEngine::Precision::FP32:
+                    graph->PushInputData(input.first, input.second);
+                    break;
+                case InferenceEngine::Precision::U16:
+                    // U16 is unsupported by mkldnn, so here we convert the blob and send FP32
+                    iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
+                            InferenceEngine::Precision::FP32,
+                            input.second->getTensorDesc().getLayout(), input.second->dims());
+                    convertedInputs.push_back(iconv);
+                    iconv->allocate();
+                    in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+                    InferenceEngine::copyToFloat<uint16_t>(in_f->data(), input.second.get());
+                    graph->PushInputData(input.first, iconv);
+                    break;
+                case InferenceEngine::Precision::I16:
+                    if (graph->hasMeanImageFor(input.first)) {
+                        // If a mean image exists, we convert the blob and send FP32
+                        iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
+                                InferenceEngine::Precision::FP32,
+                                input.second->getTensorDesc().getLayout(), input.second->dims());
+                        convertedInputs.push_back(iconv);
+                        iconv->allocate();
+                        in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+                        InferenceEngine::copyToFloat<int16_t>(in_f->data(), input.second.get());
+                        graph->PushInputData(input.first, iconv);
+                    } else {
+                        // Instead we can send I16 directly
+                        graph->PushInputData(input.first, input.second);
+                    }
+                    break;
+                case InferenceEngine::Precision::U8:
+                    if (graph->hasMeanImageFor(input.first)) {
+                        // If a mean image exists, we convert the blob and send FP32
+                        iconv = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(
+                                InferenceEngine::Precision::FP32,
+                                input.second->getTensorDesc().getLayout(), input.second->dims());
+                        convertedInputs.push_back(iconv);
+                        iconv->allocate();
+                        in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(iconv.get());
+                        InferenceEngine::copyToFloat<uint8_t>(in_f->data(), input.second.get());
+                        graph->PushInputData(input.first, iconv);
+                    } else {
+                        // Instead we can send U8 directly
+                        graph->PushInputData(input.first, input.second);
+                    }
+                    break;
+                default:
+                    THROW_IE_EXCEPTION << "Unsupported input precision " << input.second->precision();
+            }
+        }
+        graph->Infer(m_curBatch);
+        graph->PullOutputData(_outputs);
+        if (graph->getProperty().collectPerfCounters) {
+            m_perfMap.clear();
+            graph->GetPerfData(m_perfMap);
+        }
+    };
+#if IE_THREAD == IE_THREAD_TBB
+    auto_scope_observing observer(MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph->ptrObserver);
+    // a TBB arena is made "this" for Infer call via executing lambda for the arena
+    MKLDNNPlugin::MultiWorkerTaskExecutor::ptrContext.ptrGraph->ptrArena->execute([&] { infer(); });
+#else
+    infer();
+#endif
+}
+
+void MKLDNNPlugin::MKLDNNGraphlessInferRequest::GetPerformanceCounts(
+        std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const {
+    perfMap = m_perfMap;
+}
+
+void MKLDNNPlugin::MKLDNNGraphlessInferRequest::GetBlob(const char *name, InferenceEngine::Blob::Ptr &data) {
+    // ROI blob is returned only if it was set previously.
+    auto it = _preProcData.find(name);
+    if (it != _preProcData.end()) {
+        data = it->second.getRoiBlob();
+        return;
+    }
+
+    if (_inputs.find(name) != _inputs.end()) {
+        data = _inputs[name];
+        checkBlob(data, name, true);
+        return;
+    } else if (_networkInputs.find(name) != _networkInputs.end()) {
+        InferenceEngine::Layout l = _networkInputs[name]->getLayout();
+        InferenceEngine::Precision p = _networkInputs[name]->getPrecision();
+        InferenceEngine::SizeVector dims = _networkInputs[name]->getTensorDesc().getDims();
+
+        InferenceEngine::TensorDesc desc = InferenceEngine::TensorDesc(p, dims, l);
+        _inputs[name] = data = make_blob_with_precision(desc);
+        _inputs[name]->allocate();
+        checkBlob(data, name, true);
+        return;
+    }
+
+    if (_outputs.find(name) != _outputs.end()) {
+        data = _outputs[name];
+        checkBlob(data, name, false);
+        return;
+    } else if (_networkOutputs.find(name) != _networkOutputs.end()) {
+        InferenceEngine::Layout l = _networkOutputs[name]->getLayout();
+        InferenceEngine::Precision p = _networkOutputs[name]->getPrecision();
+        InferenceEngine::SizeVector dims = _networkOutputs[name]->getTensorDesc().getDims();
+
+        InferenceEngine::TensorDesc desc = InferenceEngine::TensorDesc(p, dims, l);
+        _outputs[name] = data = make_blob_with_precision(desc);
+        _outputs[name]->allocate();
+        checkBlob(data, name, false);
+        return;
+    }
+
+    THROW_IE_EXCEPTION << "Cannot find blob with name: " << name;
+}
+
+void MKLDNNPlugin::MKLDNNGraphlessInferRequest::SetBlob(const char *name, const InferenceEngine::Blob::Ptr &data) {
+    if (!data)
+        THROW_IE_EXCEPTION << NOT_ALLOCATED_str << "Failed to set empty blob with name: \'" << name << "\'";
+    if (data->buffer() == nullptr)
+        THROW_IE_EXCEPTION << "Input data was not allocated. Input name: \'" << name << "\'";
+    if (name == nullptr) {
+        THROW_IE_EXCEPTION << NOT_FOUND_str + "Failed to set blob with empty name";
+    }
+    InferenceEngine::InputInfo::Ptr foundInput;
+    InferenceEngine::DataPtr foundOutput;
+    size_t dataSize = data->size();
+    if (findInputAndOutputBlobByName(name, foundInput, foundOutput)) {
+        if (foundInput->getInputPrecision() != data->precision()) {
+            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str << "Failed to set Blob with precision "
+                               << data->precision();
+        }
+
+        if (foundInput->getPreProcess().getResizeAlgorithm() != InferenceEngine::ResizeAlgorithm::NO_RESIZE) {
+            // Stores the given blob as ROI blob. It will be used to fill in network input during pre-processing.
+            _preProcData[name].setRoiBlob(data);
+        } else {
+            size_t inputSize = InferenceEngine::details::product(foundInput->getDims());
+            if (dataSize != inputSize) {
+                THROW_IE_EXCEPTION << "Input blob size is not equal network input size ("
+                                   << dataSize << "!=" << inputSize << ").";
+            }
+            _inputs[name] = data;
+        }
+    } else {
+        size_t outputSize = InferenceEngine::details::product(foundOutput->getDims());
+        if (dataSize != outputSize) {
+            THROW_IE_EXCEPTION << "Output blob size is not equal network output size ("
+                               << dataSize << "!=" << outputSize << ").";
+        }
+        if (foundOutput->getPrecision() != data->precision()) {
+            THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str
+                               << "Failed to set Blob with precision not corresponding to user output precision";
+        }
+        _outputs[name] = data;
+    }
+}
+
+void MKLDNNPlugin::MKLDNNGraphlessInferRequest::SetBatch(int new_batch) {
+    if (new_batch < 1) {
+        THROW_IE_EXCEPTION << "Invalid dynamic batch size " << new_batch <<
+                           " for this request.";
+    }
+    m_curBatch = new_batch;
+}
+
+}  // namespace MKLDNNPlugin
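
The slot arithmetic in pin_thread_to_vacant_core is easier to follow in isolation. Below is a pure, Linux-free sketch that uses std::vector<bool> in place of cpu_set_t (illustrative only; the real code walks a cpu_set_t and the mask must contain at least num_cpus set bits by construction):

    #include <cstdio>
    #include <vector>

    // Given the set of CPUs the process may use, thread index thr_idx, and a step
    // (2 to skip hyper-thread siblings, 1 otherwise), return the absolute CPU id
    // the thread should be pinned to.
    int vacant_core(int thr_idx, int step, const std::vector<bool>& allowed) {
        int num_cpus = 0;
        for (bool a : allowed) num_cpus += a;
        thr_idx %= num_cpus;                              // wrap once all slots are taken
        int cpu_idx = 0;
        for (int i = 0, offset = 0; i < thr_idx; ++i) {
            cpu_idx += step;                              // populate physical cores first
            if (cpu_idx >= num_cpus) cpu_idx = ++offset;  // then fill the siblings
        }
        int mapped = -1;                                  // find the cpu_idx-th set bit
        while (cpu_idx >= 0)
            if (allowed[++mapped]) --cpu_idx;
        return mapped;
    }

    int main() {
        std::vector<bool> mask(8, true);                  // 8 allowed CPUs, ids 0..7
        for (int t = 0; t < 8; ++t)
            std::printf("thread %d -> cpu %d\n", t, vacant_core(t, /*step=*/2, mask));
        // prints 0 2 4 6 1 3 5 7: physical cores first, then hyper-thread siblings
    }
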
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_streams.h b/inference-engine/src/mkldnn_plugin/mkldnn_streams.h
new file mode 100644 (file)
index 0000000..31558fe
--- /dev/null
@@ -0,0 +1,177 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <atomic>
+#include <map>
+#include <queue>
+#include <memory>
+#include <climits>
+#include <cpp_interfaces/impl/ie_infer_request_internal.hpp>
+#include <cpp_interfaces/ie_task_executor.hpp>
+#include "ie_parallel.hpp"
+#include "mkldnn/omp_manager.h"
+
+/* CPU "streams" implement a feature that allows multiple Infer Requests to be efficiently run simultaneously.
+ * To avoid potential oversubscription the CPU execution resources are divided accordingly.
+ * The feature enables much better performance for the networks that originally do not scale well with #threads
+ * even for a large batches. Examples are lightweight topologies or topologies with many sequential/mem-bound/etc or
+ * otherwise non-scalable layers. This is especially pronounced for many-core (e.g. server) machines.
+ * This is rather throughput-oriented feature,because running multiple requests in parallel might increase the latency
+ * of each request.
+ * Additionally, the streams help to relax the need for the large batch to improve the throughput and simplify the
+ * application logic, helping to saturate the CPU by multiple requests instead.
+ * Implementation-wise, the "streams" constitute the following:
+ *  - Pure "graph-less" Infer Requests that are not connected to the specific MKLDNNGraph (which is regular/legacy approach)
+ *  - Just like regular requests, the graph-less go to the common (per ExecutableNetwork) queue
+ *  - But unlike conventional case, there are multiple threads that grab the requests (see MultiWorkerTaskExecutor)
+ *  - So every stream is in fact is independent "worker" thread that monitors the queue.
+ *  - Every worker thread (stream) has it's own copy of the graph (which handles intermediate data required for execution)
+ *  - While the Infer Requests just keep only input/output data
+*/
+namespace MKLDNNPlugin {
+
+using namespace InferenceEngine;
+class MKLDNNGraph;
+class pinning_observer;
+
+/* This structure handles an "execution context" - data required to execute an Infer Request.
+ * This includes graph (which handles the intermediate data) and arena/observer for the TBB */
+struct MultiWorkerTaskContext {
+    std::shared_ptr<MKLDNNGraph> ptrGraph;
+};
+
+#if defined(__APPLE__) || defined(_WIN32)
+typedef void cpu_set_t;
+#define CPU_FREE(cpuset)
+// notice that functions below are just stubs for OSs other than Linux
+#endif
+/* Check whether any affinity-related env variables are set (relevant for the OpenMP) */
+bool check_env_variables();
+/* Get the cores affinity mask for the current process */
+bool get_process_mask(int& ncpus, cpu_set_t*& mask);
+/* Pin current thread to a set of cores determined by the mask. */
+bool pin_current_thread_by_mask(int ncores, const cpu_set_t* proc_mask);
+/* Pin thread to a spare core in the round-robin scheme, while respecting the given process mask.
+ * The function can also handle the hyper-threading (by populating the physical cores first) */
+bool pin_thread_to_vacant_core(int thr_idx, int hyperthreads, int ncores, const cpu_set_t* proc_mask);
+
+#if IE_THREAD == IE_THREAD_TBB
+/* Simple observer that handles pinning threads to the cores; it serves as a callback for threads entering the arena. */
+class pinning_observer: public tbb::task_scheduler_observer {
+    cpu_set_t *mask;
+    int ncpus;
+    int stream_id, threads_per_stream;
+    const int pinning_step;
+
+public:
+    pinning_observer(tbb::task_arena& _arena, int _stream_id, int _threads_per_stream, int _pinning_step = 1) :
+            tbb::task_scheduler_observer(_arena),
+            stream_id(_stream_id), threads_per_stream(_threads_per_stream), pinning_step(_pinning_step) {
+        get_process_mask(ncpus, mask);
+    }
+
+    void on_scheduler_entry(bool) override {
+        if (!mask) return;
+        int thread_idx = tbb::task_arena::current_thread_index();
+        int thr_idx = stream_id * threads_per_stream + thread_idx;
+        // pin thread to the vacant slot
+        pin_thread_to_vacant_core(thr_idx, pinning_step, ncpus, mask);
+    }
+
+    void on_scheduler_exit(bool) override {
+        if (!mask) return;
+        // reset the thread's mask (to the original process mask)
+        pin_current_thread_by_mask(ncpus, mask);
+    }
+
+    ~pinning_observer() {
+        if (mask)
+            CPU_FREE(mask);
+    }
+};
+
+class auto_scope_observing {
+public:
+     explicit auto_scope_observing(std::unique_ptr<tbb::task_scheduler_observer>&  _p) : p(_p) {
+         if (p)
+             p->observe(true);
+     }
+     ~auto_scope_observing() {
+         if (p)
+            p->observe(false);
+     }
+
+protected:
+    std::unique_ptr<tbb::task_scheduler_observer>&  p;
+};
+#endif  // IE_THREAD == IE_THREAD_TBB
+
+/* Class wrapping multiple worker threads that monitors the same queue with Infer Requests. */
+class MultiWorkerTaskExecutor : public ITaskExecutor {
+public:
+    typedef std::shared_ptr<MultiWorkerTaskExecutor> Ptr;
+
+    explicit MultiWorkerTaskExecutor(const std::vector<Task::Ptr>&, std::string name = "Default");
+
+    ~MultiWorkerTaskExecutor();
+
+    /**
+    * @brief Adds a task for execution and notifies one of the worker threads about it.
+    * @note Can be called from multiple threads - tasks are added to the queue and executed one-by-one in FIFO mode.
+    * @param task - shared pointer to the task
+    * @return true if the task was added successfully, false otherwise
+    */
+    bool startTask(Task::Ptr task) override;
+
+    static thread_local MultiWorkerTaskContext ptrContext;
+
+private:
+    std::vector<std::thread> _threads;
+    std::mutex _queueMutex;
+    std::condition_variable _queueCondVar;
+    std::queue<Task::Ptr> _taskQueue;
+    std::atomic<bool> _isStopped;
+    std::string _name;
+    std::atomic<int> _initCount;
+};
+
+/* Pure Infer Requests - just input and output data. */
+class MKLDNNGraphlessInferRequest : public InferenceEngine::InferRequestInternal {
+public:
+    typedef std::shared_ptr<MKLDNNGraphlessInferRequest> Ptr;
+    explicit MKLDNNGraphlessInferRequest(InferenceEngine::InputsDataMap networkInputs,
+                                         InferenceEngine::OutputsDataMap networkOutputs);
+
+    void InferImpl() override;
+
+    void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const override;
+
+    /**
+     * @brief Default implementation of setting a blob, provided so a plugin does not have to implement it
+     * @param name - a name of input or output blob.
+     * @param data - a reference to input or output blob. The type of Blob must correspond to the network input precision and size.
+     */
+    void SetBlob(const char *name, const InferenceEngine::Blob::Ptr &data) override;
+
+    /**
+     * @brief Default implementation of getting a blob, provided so a plugin does not have to implement it
+     * @param name - a name of input or output blob.
+     * @param data - a reference to input or output blob. The type of Blob must correspond to the network input precision and size.
+     */
+    void GetBlob(const char *name, InferenceEngine::Blob::Ptr &data) override;
+
+
+    void SetBatch(int batch = -1) override;
+
+private:
+    int m_curBatch;
+    std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> m_perfMap;
+};
+
+
+}  // namespace MKLDNNPlugin
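
For readers new to this worker/queue shape, here is a self-contained condition-variable executor with the same skeleton as MultiWorkerTaskExecutor, minus the per-thread init tasks and graph context the real class adds (names are illustrative):

    #include <condition_variable>
    #include <functional>
    #include <mutex>
    #include <queue>
    #include <thread>
    #include <vector>

    // Minimal shape of the executor: N workers draining one FIFO queue.
    class TinyExecutor {
    public:
        explicit TinyExecutor(size_t nthreads) {
            for (size_t i = 0; i < nthreads; ++i)
                workers_.emplace_back([this] {
                    for (;;) {
                        std::function<void()> task;
                        {
                            std::unique_lock<std::mutex> lock(mutex_);
                            cv_.wait(lock, [this] { return stopped_ || !queue_.empty(); });
                            if (stopped_ && queue_.empty()) return;
                            task = std::move(queue_.front());
                            queue_.pop();
                        }
                        task();  // run outside the lock
                    }
                });
        }
        void post(std::function<void()> task) {
            { std::lock_guard<std::mutex> lock(mutex_); queue_.push(std::move(task)); }
            cv_.notify_one();
        }
        ~TinyExecutor() {
            { std::lock_guard<std::mutex> lock(mutex_); stopped_ = true; }
            cv_.notify_all();
            for (auto& t : workers_) t.join();
        }
    private:
        std::vector<std::thread> workers_;
        std::queue<std::function<void()>> queue_;
        std::mutex mutex_;
        std::condition_variable cv_;
        bool stopped_ = false;
    };
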
index f48ada4..d23b12e 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -16,6 +15,7 @@ using namespace MKLDNNPlugin;
 using namespace InferenceEngine;
 using namespace InferenceEngine::details;
 
+// TODO: (ichuraev) I am not fully sure that the names of types and parameters are correct for square, abs, sqrt, linear, bounded_relu and soft_relu
 caseless_map<std::string, std::function<void(GenericLayer*, mkldnn::algorithm&, float&, float&)>> MKLDNNActivationNode::initializers = {
         {"relu", [](GenericLayer* activationLayer, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
             alpha = activationLayer->GetParamAsFloat("negative_slope", 0.0f);
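
The initializers table above maps activation names to lambdas that extract alpha/beta from the layer. A stripped-down sketch of the same table-driven setup (std::map instead of the IE caseless_map, with illustrative default values):

    #include <functional>
    #include <map>
    #include <string>

    struct Params { float alpha = 0.f, beta = 0.f; };

    // Each activation name maps to a lambda that fills alpha/beta; adding a new
    // activation means adding one table entry rather than growing an if/else chain.
    static const std::map<std::string, std::function<void(Params&)>> initializers = {
        {"relu",  [](Params& p) { p.alpha = 0.0f; /* negative_slope */ }},
        {"elu",   [](Params& p) { p.alpha = 1.0f; }},
        {"clamp", [](Params& p) { p.alpha = -1.0f; p.beta = 1.0f; }},
    };
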
index 502a804..173df1c 100644 (file)
@@ -1,11 +1,11 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "mkldnn_batchnorm_node.h"
 #include "mkldnn_depthwise_node.h"
 #include <mkldnn_extension_utils.h>
+#include "ie_memcpy.h"
 
 using namespace mkldnn;
 using namespace MKLDNNPlugin;
@@ -77,7 +77,7 @@ void MKLDNNBatchNormalizationNode::getSupportedDescriptors() {
             THROW_IE_EXCEPTION << "Cannot get weights blob for node " << getName() << ".";
 
         size_t weightsByteSize = blb->byteSize();
-        memcpy(data, blb->buffer(), weightsByteSize);
+        ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), weightsByteSize);
         data += blb->size();
         blb = scshLayer->_biases;
 
@@ -86,7 +86,7 @@ void MKLDNNBatchNormalizationNode::getSupportedDescriptors() {
         } else {
             if (weightsByteSize != blb->byteSize())
                 THROW_IE_EXCEPTION << "ScaleShift has incorrect weights!";
-            memcpy(data, blb->buffer(), weightsByteSize);
+            ie_memcpy(data, internalBlob->byteSize(), blb->buffer(), weightsByteSize);
         }
         internalBlobs.push_back(internalBlob);
     }
index 1da5d57..fd2893e 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -59,16 +58,11 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
     if (!supportedPrimitiveDescriptors.empty())
         return;
 
-    InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
-    auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
-    precision = getCnnLayer()->outData[0]->getPrecision();
+    InferenceEngine::Precision iIEPrecision = getCnnLayer()->insData[0].lock()->getPrecision();
+    auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(iIEPrecision);
+    InferenceEngine::Precision precision = getCnnLayer()->outData[0]->getPrecision();
     auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
 
-    if (getCnnLayer()->precision == Precision::I8) {
-        inputDataType = memory::data_type::u8;
-        outputDataType = memory::data_type::u8;
-    }
-
     MKLDNNDims dstDims = getChildEdgeAt(0)->getDims();
     InferenceEngine::LayerConfig config;
     config.dynBatchSupport = true;
@@ -103,6 +97,16 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
                 supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
             }
         }
+    } else if (dims.ndims() == 5) {
+        if (dims[1] % 8 == 0) {
+            config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw8c));
+            supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
+
+            if (dims[1] % 16 == 0) {
+                config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw16c));
+                supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
+            }
+        }
     }
 
     if (axis != 1 || hasEltwise)
@@ -110,12 +114,11 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
 
     auto numOfDim = static_cast<size_t>(dstDims.ndims());
 
-    SizeVector order;
-    SizeVector offsets;
+    SizeVector order(numOfDim);
+    SizeVector offsets(numOfDim, 0lu);
     size_t offset = std::numeric_limits<size_t>::max();
     for (size_t i = 0; i < numOfDim; i++) {
-        order.push_back(i);
-        offsets.push_back(0);
+        order[i] = i;
     }
 
     if (this->getCnnLayer()->precision == Precision::I8) {
@@ -135,7 +138,9 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
                 strides[i] = std::numeric_limits<size_t>::max();
             }
 
-            config.outConfs[0].desc = TensorDesc(Precision::U8, dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides});
+            config.outConfs[0].desc = TensorDesc(this->getCnnLayer()->outData[0]->getPrecision(),
+                                                 dstDims.ToSizeVector(),
+                                                 { blkDims, order, offset, offsets, strides });
             for (size_t i = 0; i < getParentEdges().size(); i++) {
                 auto parentEdge = getParentEdgeAt(i);
 
@@ -144,7 +149,7 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
 
                 config.inConfs[i].inPlace = -1;     // Change to 0 here if inplace concat is supported for NHWC in mkldnn
 
-                config.inConfs[i].desc = TensorDesc(Precision::U8, parentEdge->getDims().ToSizeVector(),
+                config.inConfs[i].desc = TensorDesc(iIEPrecision, parentEdge->getDims().ToSizeVector(),
                                                     {blkDims, order, offset, offsets, strides});
             }
 
@@ -174,26 +179,30 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
 
         supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
 
-        if (numOfDim == 4) {
-            order = {0, 1, 2, 3, 1};
-            offsets = {0, 0, 0, 0, 0};
-            numOfDim = 5;
+        if (numOfDim == 4lu || numOfDim == 5lu) {
+            size_t blkDimsLen = numOfDim + 1;
+            order.resize(blkDimsLen);
+            for (size_t i = 0; i < numOfDim; i++) {
+                order[i] = i;
+            }
+            order[numOfDim] = 1lu;
+            offsets = SizeVector(blkDimsLen, 0lu);
 
-            // nChw8c and nChw16c
-            for (int sizeS : {8, 16}) {
+            // nChw8c, nChw16c, nCdhw8c, nCdhw16c
+            for (size_t sizeS : {8lu, 16lu}) {
                 SizeVector blkDims = dstDims.ToSizeVector();
                 if (blkDims[1] % sizeS)
                     continue;
-                blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0);
+                blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
                 blkDims.push_back(sizeS);
 
-                strides.resize(numOfDim);
-                strides[numOfDim - 1] = 1;
-                for (size_t i = 2; i <= numOfDim; i++) {
-                    if (numOfDim - i < axis) {
-                        strides[numOfDim - i] = std::numeric_limits<size_t>::max();
+                strides.resize(blkDimsLen);
+                strides[blkDimsLen - 1] = 1;
+                for (size_t i = 2lu; i <= blkDimsLen; i++) {
+                    if (blkDimsLen - i < axis) {
+                        strides[blkDimsLen - i] = std::numeric_limits<size_t>::max();
                     } else {
-                        strides[numOfDim - i] = strides[numOfDim - i + 1] * blkDims[numOfDim - i + 1];
+                        strides[blkDimsLen - i] = strides[blkDimsLen - i + 1] * blkDims[blkDimsLen - i + 1];
                     }
                 }
                 config.outConfs[0].desc = TensorDesc(
@@ -201,13 +210,13 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
                         dstDims.ToSizeVector(), {blkDims, order, offset, offsets, strides});
 
                 bool canInplace = true;
-                for (size_t i = 0; canInplace && i < getParentEdges().size(); i++) {
+                for (size_t i = 0lu; canInplace && i < getParentEdges().size(); i++) {
                     auto parentEdge = getParentEdgeAt(i);
                     blkDims = parentEdge->getDims().ToSizeVector();
                     if (blkDims[1] % sizeS)
                         canInplace = false;
 
-                    blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0);
+                    blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
                     blkDims.push_back(sizeS);
                     config.inConfs[i].desc =  TensorDesc(MKLDNNExtensionUtils::DataTypeToIEPrecision(inputDataType), parentEdge->getDims().ToSizeVector(),
                                                          {blkDims, order, offset, offsets, strides});
@@ -225,11 +234,6 @@ void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
     precision = getCnnLayer()->outData[0]->getPrecision();
     auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
 
-    if (getCnnLayer()->precision == Precision::I8) {
-        inputDataType = memory::data_type::u8;
-        outputDataType = memory::data_type::u8;
-    }
-
     bool hasUnknown = false;
     std::vector<size_t> canSelectPrimitive;
     for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
index 109a87f..ea1aee8 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -37,18 +36,18 @@ MKLDNNConvolutionNode::MKLDNNConvolutionNode(const InferenceEngine::CNNLayerPtr&
         wScale = ws->second;
     }
 
-
     // Trying to find oi-scale
-    lastInInt8Chain = true;
-    auto ois = layer->blobs.find("oi-scale");
-    if (ois != layer->blobs.end()) {
-        // If we can find an o-scale, then the next layer has to be an INT8.
-        lastInInt8Chain = false;
-        oScale = ois->second;
-    } else {
-        // If we can't find an oi-scale then the next layer has to be
-        // an FP32, so we are the last layer in the INT8-chain
-        lastInInt8Chain = true;
+    if (getCnnLayer()->type == "Convolution" && getCnnLayer()->precision == Precision::I8) {
+        auto ois = layer->blobs.find("oi-scale");
+        if ((getCnnLayer()->outData[0]->getPrecision() == Precision::I8 || getCnnLayer()->outData[0]->getPrecision() == Precision::U8)
+            && ois == layer->blobs.end()) {
+            THROW_IE_EXCEPTION << "Internal error of graph quantization - missmatch of intermediate scales and next layer type for convolution "
+                << getCnnLayer()->name;
+        }
+        if (ois != layer->blobs.end()) {
+            // If we can find an oi-scale, then the next layer has to be an INT8.
+            oScale = ois->second;
+        }
     }
 }
 
@@ -99,6 +98,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
         groupOC /= groupNum;
     }
 
+    weightDims.clear();
     weightDims.push_back(groupOC);
     weightDims.push_back(groupIC);
     for (int i = 1; i <= convLayer->_kernel.size(); i++) {
@@ -141,13 +141,13 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
         dilation.push_back(static_cast<int>(convLayer->_dilation[convLayer->_dilation.size() - i]) - 1);
     }
 
-    auto allPads = getConvPaddings(*convLayer);
+    auto allPads = getPaddings(*convLayer);
     invertVectorCopyUtoI(allPads.begin, paddingL);
     invertVectorCopyUtoI(allPads.end, paddingR);
 
     MKLDNNDims weightsDims = MKLDNNDims(weightDims);
 
-    for (int i = 0; i < 2; i++) {
+    for (int i = 0; i < paddingR.size(); i++) {
         int with_group = (isGrouped || isMerged) ? 1 : 0;
         int krn = weightsDims[with_group + 2 + i];
         int src = getParentEdgeAt(0)->getDims()[2 + i];
@@ -176,26 +176,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
         }
     }
 
-    if (weights->precision() == Precision::I8) {
-        inputDataType = memory::u8;
-        if (lastInInt8Chain) {
-            outputDataType = memory::f32;
-        } else {
-            // Searching for the last fused node and taking the precision from there
-            Precision p = getCnnLayer()->precision;
-            if (fusedWith.size() > 0 && fusedWith[fusedWith.size() - 1]->getCnnLayer()->type == "ReLU") {
-                p = fusedWith[fusedWith.size() - 1]->getCnnLayer()->precision;
-            }
-
-            if (p == Precision::I8) {
-                outputDataType = memory::s8;
-            } else if (p == Precision::U8) {
-                outputDataType = memory::u8;
-            } else {
-                 THROW_IE_EXCEPTION << "Invalid layer precision for " << getName();
-            }
-        }
-
+    if (this->getCnnLayer()->precision == Precision::I8) {
         MKLDNNMemoryDesc in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nhwc);
         MKLDNNMemoryDesc out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nhwc);
         createDescriptor({in_candidate}, {out_candidate});
@@ -204,22 +185,48 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
         inputDataType = memory::f32;
         outputDataType = memory::f32;
 
-        MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType, memory::nchw);
-        MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType, memory::nchw);
-        createDescriptor({in_candidate}, {out_candidate});
+        Layout layout = convLayer->input()->getLayout();
 
-        if (IC == 3 || IC == 1) {
-            out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c);
-            createDescriptor({in_candidate}, {out_candidate});
-            out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c);
+        if (layout == NCHW || layout == NHWC) {
+            MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType,
+                    layout == NCHW ? memory::nchw : memory::nhwc);
+            MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType,
+                    layout == NCHW ? memory::nchw : memory::nhwc);
             createDescriptor({in_candidate}, {out_candidate});
-        } else {
-            in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw16c);
-            out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c);
-            createDescriptor({in_candidate}, {out_candidate});
-            in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw8c);
-            out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c);
+
+            if (IC == 3 || IC == 1) {
+                out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c);
+                createDescriptor({in_candidate}, {out_candidate});
+                out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c);
+                createDescriptor({in_candidate}, {out_candidate});
+            } else {
+                in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw16c);
+                out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c);
+                createDescriptor({in_candidate}, {out_candidate});
+                in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw8c);
+                out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c);
+                createDescriptor({in_candidate}, {out_candidate});
+            }
+        } else if (layout == NCDHW || layout == NDHWC) {
+            MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType,
+                    layout == NCDHW ? memory::ncdhw : memory::ndhwc);
+            MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType,
+                    layout == NCDHW ? memory::ncdhw : memory::ndhwc);
             createDescriptor({in_candidate}, {out_candidate});
+
+            if (IC == 3 || IC == 1) {
+                out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw16c);
+                createDescriptor({in_candidate}, {out_candidate});
+                out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw8c);
+                createDescriptor({in_candidate}, {out_candidate});
+            } else {
+                in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw16c);
+                out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw16c);
+                createDescriptor({in_candidate}, {out_candidate});
+                in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw8c);
+                out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw8c);
+                createDescriptor({in_candidate}, {out_candidate});
+            }
         }
     }
 }
@@ -231,7 +238,15 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
     for (auto &node : fusedWith) {
         auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
         if (eltwiseNode) {
-            ops.append_sum(1.0);
+            if (eltwiseNode->getCnnLayer()->precision == Precision::I8) {
+                auto it = eltwiseNode->getCnnLayer()->blobs.find("eltwise-sum-scale");
+                if (it != eltwiseNode->getCnnLayer()->blobs.end()) {
+                    // currently there is only one scale, while we need a per-channel scale :(
+                    ops.append_sum(it->second->buffer().as<float*>()[0]);
+                }
+            } else {
+                ops.append_sum(1.0);
+            }
             continue;
         }
 
@@ -252,11 +267,10 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
                 PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
                 PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
 
-                int bufferSize = depthwiseNode->isBroadcast() ? 1 : depthwiseDims[0];
                 PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
                                                              depthwiseLayer->_weights->buffer(),
-                                                             bufferSize * MKLDNNExtensionUtils::sizeOfDataType(
-                                                                     memory::data_type::f32));
+                                                             depthwiseLayer->_weights->size() *
+                                                             MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
 
                 if (depthwiseNode->isBroadcast()) {
                     float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[0];
@@ -271,9 +285,8 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
                                                                 memory::format::x);
                     PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
                                                                  depthwiseLayer->_biases->buffer(),
-                                                                 bufferSize *
-                                                                 MKLDNNExtensionUtils::sizeOfDataType(
-                                                                         memory::data_type::f32));
+                                                                 depthwiseLayer->_biases->size() *
+                                                                 MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
 
                     if (depthwiseNode->isBroadcast()) {
                         float broadcastValue = static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[0];
@@ -450,14 +463,15 @@ void MKLDNNConvolutionNode::createDescriptor(const std::vector<InferenceEngine::
         bdt = memory::s32;
 
         Precision outPrec;
-        if (lastInInt8Chain) {
+        if (getCnnLayer()->outData[0]->getPrecision() == Precision::FP32) {
             outPrec = Precision::FP32;
         } else {
             // define the precision according to the normalizer
+            // TODO(amalyshe) do we need a separate flow for the last node in an int8 chain or not?
             outPrec = outDesc.getPrecision();
         }
 
-        inDesc = TensorDesc(Precision::U8, inputDesc[0].getDims(), inputDesc[0].getBlockingDesc());
+        inDesc = TensorDesc(inDesc.getPrecision(), inputDesc[0].getDims(), inputDesc[0].getBlockingDesc());
         outDesc = TensorDesc(outPrec, outputDesc[0].getDims(), outputDesc[0].getBlockingDesc());
     }
 
@@ -502,8 +516,8 @@ void MKLDNNConvolutionNode::addScaleToPrimitiveAttr(mkldnn::primitive_attr attr)
        float* wScaleData = static_cast<float*>(wScale->buffer());
 
        std::vector<float> oScaleDataVector;
-       if (!lastInInt8Chain) {
-           float* oScaleData = static_cast<float*>(oScale->buffer());
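+       // Weight scales are divided by output scales only while the conv output
+       // stays quantized (an int8 layer whose output precision is not FP32).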
+       if (getCnnLayer()->precision == Precision::I8 && getCnnLayer()->outData[0]->getPrecision() != Precision::FP32) {
+           float *oScaleData = static_cast<float *>(oScale->buffer());
 
            for (size_t c = 0; c < wScale->size(); c++) {
                oScaleDataVector.push_back(wScaleData[c] / oScaleData[c]);
index aa24241..19191ee 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -57,8 +56,6 @@ private:
 
     InferenceEngine::ConvolutionLayer* convLayer;
     InferenceEngine::Blob::Ptr wScale, oScale;
-
-    bool lastInInt8Chain;
 };
 
 }  // namespace MKLDNNPlugin
index 1295e05..38ca06c 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -11,6 +10,7 @@
 #include <vector>
 #include <mkldnn_types.h>
 #include <mkldnn_extension_utils.h>
+#include <ie_layers_internal.hpp>
 #include "ie_parallel.hpp"
 
 using namespace mkldnn;
@@ -67,18 +67,17 @@ void MKLDNNDeconvolutionNode::getSupportedDescriptors() {
                 deconvLayer->_group,
                 deconvLayer->input()->getTensorDesc().getDims()[1] / deconvLayer->_group,
                 deconvLayer->_out_depth / deconvLayer->_group,
-                deconvLayer->_kernel[Y_AXIS],
-                deconvLayer->_kernel[X_AXIS]
         };
         groupNum = deconvLayer->_group;
     } else {
         weightDims = {
                 deconvLayer->input()->getTensorDesc().getDims()[1],
-                deconvLayer->_out_depth,
-                deconvLayer->_kernel[Y_AXIS],
-                deconvLayer->_kernel[X_AXIS]
+                deconvLayer->_out_depth
         };
     }
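+    // Append the kernel spatial dims in reverse IE order (..., Y, X) so the
+    // weight shape matches the mkldnn layout for both 2D and 3D cases.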
+    for (int i = 1; i <= deconvLayer->_kernel.size(); i++) {
+        weightDims.push_back(deconvLayer->_kernel[deconvLayer->_kernel.size() - i]);
+    }
 
     internalBlobs.push_back(createInternalBlob(weightDims, true));
 
@@ -86,12 +85,13 @@ void MKLDNNDeconvolutionNode::getSupportedDescriptors() {
     for (int i = 1; i <= deconvLayer->_dilation.size(); i++) {
         dilation.push_back(static_cast<int>(deconvLayer->_dilation[deconvLayer->_dilation.size() - i]) - 1);
     }
-    invertVectorCopyUtoI(deconvLayer->_padding, paddingL);
-    invertVectorCopyUtoI(deconvLayer->_pads_end, paddingR);
+    auto allPads = getPaddings(*deconvLayer);
+    invertVectorCopyUtoI(allPads.begin, paddingL);
+    invertVectorCopyUtoI(allPads.end, paddingR);
 
     weightsDims = MKLDNNDims(weightDims);
 
-    for (int i = 0; i < 2; i++) {
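+    // Recompute the right padding for every spatial axis (not just two) so the
+    // primitive produces exactly the expected output size.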
+    for (int i = 0; i < paddingR.size(); i++) {
         int with_group = (withGroups) ? 1 : 0;
         int krn = weightsDims[with_group + 2 + i];
         int src = getChildEdgeAt(0)->getDims()[2 + i];
@@ -115,28 +115,46 @@ void MKLDNNDeconvolutionNode::execute(mkldnn::stream strm) {
     }
     if (withBiases) {
         const auto *bias = biases->buffer().as<const float*>();
+        auto biasSize = biases->size();
 
         auto dst = getChildEdgeAt(0)->getBlob();
 
         float *output = dst->buffer().as<float *>() + dst->getTensorDesc().getBlockingDesc().getOffsetPadding();
+        auto dims_size = dst->getTensorDesc().getDims().size();
+        auto layout = dst->layout();
 
         const size_t N = dst->getTensorDesc().getDims()[0];
-        const size_t C = dst->getTensorDesc().getBlockingDesc().getBlockDims()[1] / groupNum;
-        const size_t H = dst->getTensorDesc().getDims()[2];
-        const size_t W = dst->getTensorDesc().getDims()[3];
-        const size_t blkC =
-                dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 4 ?
-                dst->getTensorDesc().getBlockingDesc().getBlockDims()[4] :
-                1;
+        size_t C = dst->getTensorDesc().getBlockingDesc().getBlockDims()[1] / groupNum;
+        if (C < 1) C = 1;
+        const size_t D = dims_size > 4 ? dst->getTensorDesc().getDims()[dims_size - 3] : 1lu;
+        const size_t H = dst->getTensorDesc().getDims()[dims_size - 2];
+        const size_t W = dst->getTensorDesc().getDims()[dims_size - 1];
+        size_t blkC = 1lu;
+        if (layout == BLOCKED && dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 5) {
+            blkC = dst->getTensorDesc().getBlockingDesc().getBlockDims()[5];
+        } else if (layout == BLOCKED && dst->getTensorDesc().getBlockingDesc().getBlockDims().size() > 4) {
+            blkC = dst->getTensorDesc().getBlockingDesc().getBlockDims()[4];
+        }
 
         auto strides = dst->getTensorDesc().getBlockingDesc().getStrides();
+        int output_size = strides[0] * N - dst->getTensorDesc().getBlockingDesc().getOffsetPadding();
 
-        parallel_for4d(N, C, H, W, [&](size_t n, size_t c, size_t h, size_t w) {
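+        // Add the bias per (group, channel, block) element, guarding against
+        // reads past the bias blob and writes past the padded output buffer.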
+        parallel_for5d(N, C, D, H, W, [&](size_t n, size_t c, size_t d, size_t h, size_t w) {
             for (size_t g = 0; g < groupNum; g++) {
-                const size_t off = n * strides[0] + (g * C + c) * strides[1] + h * strides[2] + w * strides[3];
+                const size_t off = n * strides[0]
+                                 + (g * C + c) * strides[1]
+                                 + d * strides[dims_size - 3]
+                                 + h * strides[dims_size - 2]
+                                 + w * strides[dims_size - 1];
+                if (off >= output_size) continue;
                 auto o = &output[off];
+                int gcb = g * C * blkC + c * blkC;
                 for (int bc = 0; bc < blkC; ++bc) {
-                    o[bc] += bias[c * blkC + bc];
+                    int index = gcb + bc;
+                    if (index < biasSize)
+                        o[bc] += bias[index];
                 }
             }
         });
index 8eadcf8..6b1097a 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -39,9 +38,20 @@ void MKLDNNDepthwiseNode::getSupportedDescriptors() {
     SizeVector weightDims = { (long unsigned int)parentOutDims[1] };
     MKLDNNDims blocked_weightDims(weightDims);
 
+    auto * wLayer = dynamic_cast<InferenceEngine::WeightableLayer*>(getCnnLayer().get());
+    if (wLayer == nullptr)
+        THROW_IE_EXCEPTION << "Cannot get weightable layer for node " << getName() << ".";
+
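+    // Remember the real (pre-broadcast) weight/bias sizes so createPrimitive()
+    // can skip broadcasting blobs that are already full-sized.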
+    InferenceEngine::Blob::Ptr blb = wLayer->_weights;
+    if (blb)
+        realWeightSize = blb->size();
     internalBlobs.push_back(createInternalBlob(weightDims, true));
-    if (isWithBiases())
+    if (isWithBiases()) {
+        InferenceEngine::Blob::Ptr blb = wLayer->_biases;
+        if (blb)
+            realBiasSize = blb->size();
         internalBlobs.push_back(createInternalBlob(weightDims, false));
+    }
 
     for (auto format : getAvailableFormatsForDims(parentOutDims)) {
         MKLDNNMemoryDesc in_candidate{parentOutDims, inputDataType, format};
@@ -66,13 +76,15 @@ void MKLDNNDepthwiseNode::createPrimitive() {
 
     if (isBroadcast()) {
         float broadcastValue = static_cast<float*>(internalBlobMemory[0]->GetData())[0];
-        for (int i = 1; i < internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
+        int blbSize = internalBlobMemory[0]->GetPrimitiveDescriptor().desc().data.dims[0];
+        for (int i = 1; i < blbSize && realWeightSize != blbSize; i++) {
             static_cast<float*>(internalBlobMemory[0]->GetData())[i] = broadcastValue;
         }
 
         if (isWithBiases()) {
+            blbSize = internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0];
             broadcastValue = static_cast<float*>(internalBlobMemory[1]->GetData())[0];
-            for (int i = 1; i < internalBlobMemory[1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
+            for (int i = 1; i < blbSize && realBiasSize != blbSize; i++) {
                 static_cast<float*>(internalBlobMemory[1]->GetData())[i] = broadcastValue;
             }
         }
index 78ef529..16bd3a5 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -50,6 +49,8 @@ private:
     static Register<MKLDNNDepthwiseNode> reg;
 
     mkldnn::algorithm algorithm;
+    size_t realWeightSize = 0;
+    size_t realBiasSize = 0;
     bool withBiases;
     bool broadcast;
 };
index 0a051dc..1111968 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -99,15 +98,9 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
             mkldnn::memory::data_type outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::FP32);
             supportedPrimitiveDescriptors.push_back(same(inputDT, outputDT, format));
         } else {
-            THROW_IE_EXCEPTION << "Invalid Eltwise layer precision";
+            THROW_IE_EXCEPTION << "Invalid Eltwise layer precision: " << getCnnLayer()->name;
         }
     }
-
-    if (getCnnLayer()->precision == Precision::I8) {
-        mkldnn::memory::data_type inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::U8);
-        mkldnn::memory::data_type outputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::U8);
-        supportedPrimitiveDescriptors.push_back(same(inputDT, outputDT, mkldnn::memory::format::nhwc));
-    }
 }
 
 void MKLDNNEltwiseNode::createPrimitive() {
index 20b60c6..75b814e 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -60,8 +59,11 @@ void MKLDNNFullyConnectedNode::getSupportedDescriptors() {
     } else if (inDims.ndims() == 4) {
         weightsDims = {fcLayer->_out_num, static_cast<size_t>(inDims[1]), static_cast<size_t>(inDims[2]),
                        static_cast<size_t>(inDims[3])};
+    } else if (inDims.ndims() == 5) {
+        weightsDims = {fcLayer->_out_num, static_cast<size_t>(inDims[1]), static_cast<size_t>(inDims[2]),
+                       static_cast<size_t>(inDims[3]), static_cast<size_t>(inDims[4])};
     } else {
-        THROW_IE_EXCEPTION << "Unsupported source format for FC layer. Expected 4 or 2, got: "
+        THROW_IE_EXCEPTION << "Unsupported source format for FC layer. Expected 5, 4 or 2, got: "
                            << inDims.ndims() << " dims.";
     }
 
@@ -113,10 +115,16 @@ memory::format MKLDNNFullyConnectedNode::weightsFormatForSrcFormat(memory::forma
             return memory::format::oi;
         case memory::format::nchw:
             return memory::format::oihw;
+        case memory::format::ncdhw:
+            return memory::format::oidhw;
         case memory::format::nChw8c:
             return memory::format::oIhw8i;
+        case memory::format::nCdhw8c:
+            return memory::format::oIdhw8i;
         case memory::format::nChw16c:
             return memory::format::oIhw16i;
+        case memory::format::nCdhw16c:
+            return memory::format::oIdhw16i;
         default:
             THROW_IE_EXCEPTION << "Unsupported source format for node " << getName();
     }
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.cpp
new file mode 100644 (file)
index 0000000..2874d9d
--- /dev/null
@@ -0,0 +1,234 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "mkldnn_gemm_node.h"
+#include <ie_layers.h>
+#include <string>
+#include <vector>
+#include <memory>
+#include <algorithm>
+#include <cmath>
+#include <mkldnn_types.h>
+#include <mkldnn_extension_utils.h>
+
+using namespace mkldnn;
+using namespace MKLDNNPlugin;
+using namespace InferenceEngine;
+
+MKLDNNGemmNode::MKLDNNGemmNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {}
+
+void MKLDNNGemmNode::getSupportedDescriptors() {
+    auto* gemmLayer = dynamic_cast<GemmLayer*>(getCnnLayer().get());
+
+    if (gemmLayer == nullptr)
+        THROW_IE_EXCEPTION << "Cannot convert gemm layer.";
+
+    if (getParentEdges().size() != 2 && getParentEdges().size() != 3)
+        THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
+    if (getChildEdges().size() != 1)
+        THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
+
+    auto inDims0 = getParentEdgeAt(0)->getDims();
+    auto inDims1 = getParentEdgeAt(1)->getDims();
+    auto outDims = getChildEdgeAt(0)->getDims();
+
+    alpha = gemmLayer->alpha;
+    beta = gemmLayer->beta;
+    transposeA = gemmLayer->transpose_a;
+    transposeB = gemmLayer->transpose_b;
+
+    if ((inDims0.ndims() < 2 || inDims0.ndims() > 4) ||
+        (inDims1.ndims() < 2 || inDims1.ndims() > 4))
+        THROW_IE_EXCEPTION << "Unsupported input dims count for layer " << getName();
+
+    if (outDims.ndims() < 2 || outDims.ndims() > 4)
+        THROW_IE_EXCEPTION << "Unsupported output dims count for layer " << getName();
+
+    if (inDims0.ndims() != inDims1.ndims() || inDims0.ndims() != outDims.ndims())
+        THROW_IE_EXCEPTION << "Invalid dims count for layer " << getName();
+
+    int nDims = inDims0.ndims();
+    xAxis = nDims - 1;
+    yAxis = nDims - 2;
+
+    if (inDims0[xAxis] != inDims1[yAxis] || inDims0[yAxis] != outDims[yAxis] || inDims1[xAxis] != outDims[xAxis])
+        THROW_IE_EXCEPTION << "Spatial input and output dimensions are incorrect for layer " << getName();
+
+    isThreeInputs = getParentEdges().size() == 3;
+
+    if (isThreeInputs) {
+        auto inDims2 = getParentEdgeAt(2)->getDims();
+
+        if (inDims2.ndims() < 2 || inDims2.ndims() > 4)
+            THROW_IE_EXCEPTION << "Unsupported output dims count for layer " << getName();
+
+        if (inDims2.ndims() != outDims.ndims())
+            THROW_IE_EXCEPTION << "Invalid dims count for layer " << getName();
+
+        if (inDims2[yAxis] != outDims[yAxis] || inDims2[xAxis] != outDims[xAxis])
+            THROW_IE_EXCEPTION << "Spatial input and output dimensions are incorrect for layer " << getName();
+    }
+
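+    // For every batch dimension compute the per-iteration stride of A, B (and
+    // C); a stride of 0 implements broadcasting when that input's dim equals 1.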
+    for (int dim_idx = nDims - 3; dim_idx >= 0; dim_idx--) {
+        if (isThreeInputs) {
+            auto inDims2 = getParentEdgeAt(2)->getDims();
+
+            if (inDims2[dim_idx] != outDims[dim_idx] && inDims2[dim_idx] != 1)
+                THROW_IE_EXCEPTION << "Input batch dimensions are incorrect for layer " << getName();
+
+            int cOffset = 1;
+            for (int i = dim_idx + 1; i < nDims; i++)
+                cOffset *= inDims2[i];
+            cOffsets.push_back(inDims2[dim_idx] == outDims[dim_idx] ? cOffset : 0);
+        }
+
+        if ((inDims0[dim_idx] != outDims[dim_idx] && inDims0[dim_idx] != 1) ||
+            (inDims1[dim_idx] != outDims[dim_idx] && inDims1[dim_idx] != 1)) {
+            THROW_IE_EXCEPTION << "Input batch dimensions are incorrect for layer " << getName();
+        }
+
+        int aOffset = 1;
+        for (int i = dim_idx + 1; i < nDims; i++)
+            aOffset *= inDims0[i];
+        aOffsets.push_back(inDims0[dim_idx] == outDims[dim_idx] ? aOffset : 0);
+
+        int bOffset = 1;
+        for (int i = dim_idx + 1; i < nDims; i++)
+            bOffset *= inDims1[i];
+        bOffsets.push_back(inDims1[dim_idx] == outDims[dim_idx] ? bOffset : 0);
+    }
+
+    for (unsigned long dim_idx = aOffsets.size(); dim_idx < 2; dim_idx++)
+        aOffsets.push_back(0);
+    for (unsigned long dim_idx = bOffsets.size(); dim_idx < 2; dim_idx++)
+        bOffsets.push_back(0);
+    for (unsigned long dim_idx = cOffsets.size(); dim_idx < 2; dim_idx++)
+        cOffsets.push_back(0);
+}
+
+void MKLDNNGemmNode::initSupportedPrimitiveDescriptors() {
+    if (!supportedPrimitiveDescriptors.empty())
+        return;
+
+    auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32);
+    auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision::FP32);
+
+    auto same = [&] (memory::format fmt) -> PrimitiveDescInfo {
+        InferenceEngine::LayerConfig config;
+        config.dynBatchSupport = true;
+        for (size_t i = 0; i < getParentEdges().size(); i++) {
+            InferenceEngine::DataConfig dataConfig;
+            dataConfig.inPlace = -1;
+            dataConfig.constant = false;
+            dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType, fmt);
+            config.inConfs.push_back(dataConfig);
+        }
+
+        InferenceEngine::DataConfig dataConfig;
+        dataConfig.inPlace = -1;
+        dataConfig.constant = false;
+        dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, fmt);
+        config.outConfs.push_back(dataConfig);
+        return {config, impl_desc_type::gemm_any};
+    };
+
+    supportedPrimitiveDescriptors.push_back(same(memory::any));
+}
+
+void MKLDNNGemmNode::createPrimitive() {
+    auto& dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
+    auto& src0MemPtr = getParentEdgeAt(0)->getMemoryPtr();
+    auto& src1MemPtr = getParentEdgeAt(1)->getMemoryPtr();
+    if (!dstMemPtr || !dstMemPtr->GetPrimitivePtr())
+        THROW_IE_EXCEPTION << "Destination memory isn't allocated.";
+    if (!src0MemPtr || !src0MemPtr->GetPrimitivePtr() || !src1MemPtr || !src1MemPtr->GetPrimitivePtr())
+        THROW_IE_EXCEPTION << "Input memory isn't allocated.";
+    if (getSelectedPrimitiveDescriptor() == nullptr)
+        THROW_IE_EXCEPTION << "Preferable primitive descriptor isn't set.";
+
+    if (isThreeInputs) {
+        auto& src2MemPtr = getParentEdgeAt(2)->getMemoryPtr();
+        if (!src2MemPtr || !src2MemPtr->GetPrimitivePtr())
+            THROW_IE_EXCEPTION << "Input memory isn't allocated.";
+    }
+}
+
+void MKLDNNGemmNode::execute(mkldnn::stream strm) {
+    auto inDims0 = getParentEdgeAt(0)->getDims();
+    auto inDims1 = getParentEdgeAt(1)->getDims();
+    auto outDims = getChildEdgeAt(0)->getDims();
+
+    auto& srcMemory0 = getParentEdgeAt(0)->getMemory();
+    auto& srcMemory1 = getParentEdgeAt(1)->getMemory();
+    const float *src0_ptr = reinterpret_cast<const float*>(srcMemory0.GetData()) +
+                            srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding;
+    const float *src1_ptr = reinterpret_cast<const float*>(srcMemory1.GetData()) +
+                            srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding;
+    float *dst_ptr = reinterpret_cast<float*>(getChildEdgeAt(0)->getMemory().GetData()) +
+                     getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
+
+    int MB1 = outDims.ndims() == 4 ? batchToProcess() : 1;
+    int MB2 = outDims.ndims() == 3 ? batchToProcess() : (outDims.ndims() > 3 ? outDims[outDims.ndims() - 3] : 1);
+    int M = inDims0[yAxis];
+    int N = inDims1[xAxis];
+    int K = inDims0[xAxis];
+
+    const char transa = transposeA ? 'T' : 'N';
+    const char transb = transposeB ? 'T' : 'N';
+
+    int lda = transposeA ? M : K;
+    int ldb = transposeB ? K : N;
+    int ldc = N;
+
+    const float *src2_ptr;
+    if (isThreeInputs) {
+        auto& srcMemory2 = getParentEdgeAt(2)->getMemory();
+        src2_ptr = reinterpret_cast<const float *>(srcMemory2.GetData()) +
+                                srcMemory2.GetDescriptor().data.layout_desc.blocking.offset_padding;
+    } else {
+        src2_ptr = dst_ptr;
+    }
+
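+    // Without a third input there is no C matrix to accumulate, so beta must be
+    // zero to keep mkldnn_sgemm from reading uninitialized dst memory.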
+    if (!isThreeInputs) {
+        beta = 0.f;
+    }
+
+    for (int b1 = 0; b1 < MB1; b1++) {
+        const float *a_ptr = src0_ptr;
+        const float *b_ptr = src1_ptr;
+        const float *c_ptr = src2_ptr;
+        float *d_ptr = dst_ptr;
+
+        for (int b2 = 0; b2 < MB2; b2++) {
+            if (isThreeInputs) {
+                memcpy(d_ptr, c_ptr, M * N * sizeof(float));
+                c_ptr += cOffsets[0];
+            }
+
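+            // mkldnn_sgemm is column-major, so A/B and M/N are swapped to
+            // compute C^T = B^T * A^T, which is C in row-major layout.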
+            mkldnn_sgemm(&transb, &transa, &N, &M, &K, &alpha, b_ptr, &ldb, a_ptr, &lda, &beta, d_ptr, &ldc);
+
+            a_ptr += aOffsets[0];
+            b_ptr += bOffsets[0];
+            d_ptr += M * N;
+        }
+
+        src0_ptr += aOffsets[1];
+        src1_ptr += bOffsets[1];
+        dst_ptr += MB2 * M * N;
+
+        if (isThreeInputs) {
+            src2_ptr += cOffsets[1];
+        }
+    }
+}
+
+bool MKLDNNGemmNode::created() const {
+    return getType() == Gemm;
+}
+
+int MKLDNNGemmNode::getMaxBatch() {
+    if (!outDims.empty())
+        return outDims[0][0];
+    return 0;
+}
diff --git a/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h b/inference-engine/src/mkldnn_plugin/nodes/mkldnn_gemm_node.h
new file mode 100644 (file)
index 0000000..da171a0
--- /dev/null
@@ -0,0 +1,44 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ie_common.h>
+#include <mkldnn_node.h>
+#include <string>
+#include <vector>
+
+namespace MKLDNNPlugin {
+
+class MKLDNNGemmNode : public MKLDNNNode {
+public:
+    MKLDNNGemmNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng);
+    ~MKLDNNGemmNode() override = default;
+
+    void getSupportedDescriptors() override;
+    void initSupportedPrimitiveDescriptors() override;
+    void createPrimitive() override;
+    void execute(mkldnn::stream strm) override;
+    bool created() const override;
+    int getMaxBatch() override;
+
+private:
+    static Register<MKLDNNGemmNode> reg;
+    float alpha;
+    float beta;
+    bool transposeA;
+    bool transposeB;
+
+    int xAxis;
+    int yAxis;
+
+    bool isThreeInputs;
+
+    std::vector<int> aOffsets;
+    std::vector<int> bOffsets;
+    std::vector<int> cOffsets;
+};
+
+}  // namespace MKLDNNPlugin
+
index 04cb400..b31b491 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -83,8 +82,7 @@ bool MKLDNNGenericNode::created(const MKLDNNExtensionManager::Ptr &extMgr) {
     if (getCnnLayer() && extMgr) {
         // We should save extension manager in order to avoid a situation when
         // it will be destroyed before the extensibility primitives
-        extensionManager = extMgr;
-        extFactory.reset(extensionManager->CreateExtensionFactory(getCnnLayer()));
+        extFactory.reset(extMgr->CreateExtensionFactory(getCnnLayer()));
 
         if (extFactory)
             setType(Generic);
@@ -147,11 +145,6 @@ void MKLDNNGenericNode::execLayer() {
     }
 }
 
-MKLDNNGenericNode::~MKLDNNGenericNode() {
-    extFactory.reset();
-    extensionManager.reset();
-}
-
 void MKLDNNGenericNode::initDescriptor(const InferenceEngine::LayerConfig &config) {
     InferenceEngine::LayerConfig rightConfig = config;
     InferenceEngine::StatusCode rc;
@@ -206,11 +199,3 @@ void MKLDNNGenericNode::initDescriptor(const InferenceEngine::LayerConfig &confi
         constant = ConstantType::Const;
     }
 }
-
-void MKLDNNGenericNode::initOptimalPrimitiveDescriptor() {
-    auto descriptor = getSelectedPrimitiveDescriptor();
-    if (descriptor != nullptr) {
-        auto config = descriptor->getConfig();
-        initDescriptor(config);
-    }
-}
index 5cc8b00..7bdd4a0 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -17,7 +16,7 @@ namespace MKLDNNPlugin {
 class MKLDNNGenericNode : public MKLDNNNode {
 public:
     MKLDNNGenericNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng);
-    ~MKLDNNGenericNode() override;
+    ~MKLDNNGenericNode() = default;
 
     void getSupportedDescriptors() override;
     void initSupportedPrimitiveDescriptors() override;
@@ -30,7 +29,6 @@ public:
     }
 
     void initDescriptor(const InferenceEngine::LayerConfig& config) override;
-    void initOptimalPrimitiveDescriptor() override;
 
     void execLayer();
     void cleanup() override;
@@ -42,7 +40,6 @@ protected:
 
 private:
     static Register<MKLDNNGenericNode> reg;
-    MKLDNNExtensionManager::Ptr extensionManager;
 };
 
 }  // namespace MKLDNNPlugin
index aa395a1..c23ce6e 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -69,6 +68,21 @@ void MKLDNNPermuteNode::initSupportedPrimitiveDescriptors() {
             config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw16c);
             supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown});
         }
+    } else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
+        config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::ncdhw);
+        config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::ncdhw);
+        supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown});
+
+        auto srcDims = getParentEdgeAt(0)->getDims();
+        if (srcDims[1] % 8 == 0) {
+            config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw8c);
+            supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown});
+        }
+
+        if (srcDims[1] % 16 == 0) {
+            config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw16c);
+            supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown});
+        }
     } else {
         config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::any);
         config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType,
@@ -221,6 +235,70 @@ static void permute_to_3012(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
     }
 }
 
+static void permute_to_021(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
+    auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
+    auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
+    src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
+    dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
+
+    const int C  = srcMemPtr->GetDims()[1];
+    const int S  = srcMemPtr->GetDims()[2];
+
+    parallel_for2d(MB, S, [&](int n, int s) {
+        int src_off = 0;
+        int dst_off = 0;
+
+        for (int c = 0; c < C; c++) {
+            src_off = n * C * S +
+                      c * S +
+                      s;
+            dst_off = n * S * C +
+                      s * C +
+                      c;
+
+            dst_data[dst_off] = src_data[src_off];
+        }
+    });
+}
+
+static void permute_to_034152(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
+    auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
+    auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
+    src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
+    dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
+
+    const int DIM1 = srcMemPtr->GetDims()[1];
+    const int DIM2 = srcMemPtr->GetDims()[2];
+    const int DIM3 = srcMemPtr->GetDims()[3];
+    const int DIM4 = srcMemPtr->GetDims()[4];
+    const int DIM5 = srcMemPtr->GetDims()[5];
+
+    int src_off = 0;
+    int dst_off = 0;
+
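+    // Iterate in destination order {0, 3, 4, 1, 5, 2}: dst is written
+    // sequentially while src is gathered through the computed offset.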
+    for (int n = 0; n < MB; n++) {
+        for (int dim3 = 0; dim3 < DIM3; dim3++) {
+            for (int dim4 = 0; dim4 < DIM4; dim4++) {
+                for (int dim1 = 0; dim1 < DIM1; dim1++) {
+                    for (int dim5 = 0; dim5 < DIM5; dim5++) {
+                        for (int dim2 = 0; dim2 < DIM2; dim2++) {
+                            src_off = n * DIM1 * DIM2 * DIM3 * DIM4 * DIM5 +
+                                      dim1 * DIM2 * DIM3 * DIM4 * DIM5 +
+                                      dim2 * DIM3 * DIM4 * DIM5 +
+                                      dim3 * DIM4 * DIM5 +
+                                      dim4 * DIM5 +
+                                      dim5;
+
+                            dst_data[dst_off] = src_data[src_off];
+                            dst_off++;
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
 std::map<InferenceEngine::SizeVector, MKLDNNPermuteNode::PermuteImpl> MKLDNNPermuteNode::OptimizedCases = {
         {{0, 2, 3, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_0231, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
             return true;
@@ -237,6 +315,12 @@ std::map<InferenceEngine::SizeVector, MKLDNNPermuteNode::PermuteImpl> MKLDNNPerm
         {{0, 2, 1, 3}, MKLDNNPermuteNode::PermuteImpl(permute_to_0213, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
             return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
         })},  // shufflenet
+        {{0, 2, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_021, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
+            return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
+        })},  // self attention block
+        {{0, 3, 4, 1, 5, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_034152, [](MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
+            return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
+        })},  // learning-to-see-in-the-dark-sony
 };
 
 void MKLDNNPermuteNode::execute(mkldnn::stream strm) {
index 0ec7c0a..82e3eac 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -11,6 +10,7 @@
 #include <vector>
 #include <mkldnn_types.h>
 #include <mkldnn_extension_utils.h>
+#include <ie_layers_internal.hpp>
 
 using namespace mkldnn;
 using namespace MKLDNNPlugin;
@@ -23,12 +23,8 @@ void MKLDNNPoolingNode::getSupportedDescriptors() {
         return;
 
     InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
-    if (precision != InferenceEngine::Precision::FP32)
-        precision = InferenceEngine::Precision::FP32;
     auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
     precision = getCnnLayer()->outData[0]->getPrecision();
-    if (precision != InferenceEngine::Precision::FP32)
-        precision = InferenceEngine::Precision::FP32;
     auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
 
     auto * poolingLayer = dynamic_cast<PoolingLayer*>(getCnnLayer().get());
@@ -45,15 +41,16 @@ void MKLDNNPoolingNode::getSupportedDescriptors() {
 
     invertVectorCopyUtoI(poolingLayer->_stride, stride);
     invertVectorCopyUtoI(poolingLayer->_kernel, kernel);
-    invertVectorCopyUtoI(poolingLayer->_padding, paddingL);
-    invertVectorCopyUtoI(poolingLayer->_pads_end, paddingR);
+    auto allPads = getPaddings(*poolingLayer);
+    invertVectorCopyUtoI(allPads.begin, paddingL);
+    invertVectorCopyUtoI(allPads.end, paddingR);
 
     auto parentDims = getParentEdgeAt(0)->getDims();
     auto childDims = getChildEdgeAt(0)->getDims();
     if ((parentDims.ndims() < 4) || (parentDims.ndims() > 5))
         THROW_IE_EXCEPTION << "Pooling layer. Unsupported mode. Only 4D and 5D blobs are supported as input.";
 
-    for (int i = 0; i < 2; i++) {
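+    // Recompute the right padding for every spatial axis (not just two) so the
+    // pooling primitive yields exactly the expected output size.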
+    for (int i = 0; i < paddingR.size(); i++) {
         int krn = kernel[i];
         int src = getParentEdgeAt(0)->getDims()[2 + i];
         int dst = getChildEdgeAt(0)->getDims()[2 + i];
@@ -61,11 +58,11 @@ void MKLDNNPoolingNode::getSupportedDescriptors() {
         int calc_dst = (src - krn + paddingL[i]) / stride[i] + 1;
         paddingR[i] = (dst - calc_dst) * stride[i];
     }
-
     if (this->getCnnLayer()->precision == Precision::I8) {
-        MKLDNNMemoryDesc in_candidate{parentDims, memory::data_type::u8, memory::format::nhwc};
-        MKLDNNMemoryDesc out_candidate{childDims, memory::data_type::u8, memory::format::nhwc};
-        createDescriptor({in_candidate}, {out_candidate});
+        // i8 layers support only the nhwc layout
+        MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, memory::format::nhwc};
+        MKLDNNMemoryDesc out_candidate{childDims, outputDataType, memory::format::nhwc};
+        createDescriptor({in_candidate}, {out_candidate});
     } else {
         // It doesn't support any format
         for (auto format : getAvailableFormatsForDims(parentDims)) {
@@ -97,7 +94,14 @@ void MKLDNNPoolingNode::createDescriptor(const std::vector<InferenceEngine::Tens
 
     algorithm alg;
     if (type == PoolingLayer::PoolType::AVG) {
-        if (!exclude_pad && (paddingL[0] != 0 || paddingL[1] != 0))
+        bool not_zero_l = false;
+        for (auto lr : paddingL) {
+            if (lr) {
+                not_zero_l = true;
+                break;
+            }
+        }
+        if (!exclude_pad && not_zero_l)
             alg = pooling_avg_include_padding;
         else
             alg = pooling_avg_exclude_padding;
@@ -114,7 +118,14 @@ void MKLDNNPoolingNode::createDescriptor(const std::vector<InferenceEngine::Tens
                                       stride, kernel, paddingL, paddingR,
                                       mkldnn::padding_kind::zero));
 
-    if (alg == pooling_avg_include_padding && (paddingR[0] || paddingR[1])) {
+    bool not_zero_r = false;
+    for (auto pr : paddingR) {
+        if (pr) {
+            not_zero_r = true;
+            break;
+        }
+    }
+    if (alg == pooling_avg_include_padding && not_zero_r) {
         // In case of AVG including paddings the norm coeff should be calculated
        // taking the original pads into account. So we need to restore
         // original values (R_padding = L_padding).
index 3b16780..345b215 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -71,6 +70,17 @@ void MKLDNNReorderNode::createPrimitive() {
     if (getSelectedPrimitiveDescriptor() == nullptr)
         THROW_IE_EXCEPTION << "Preferable primitive descriptor does not set.";
 
+    createReorderPrimitive(srcMemPtr->GetDescriptor(), srcMemPtr->GetPrimitive().get_data_handle(),
+            dstMemPtr->GetDescriptor(), dstMemPtr->GetPrimitive().get_data_handle());
+}
+
+void MKLDNNReorderNode::createReorderPrimitive(mkldnn::memory::desc srcDesc, void* srcPtr, mkldnn::memory::desc dstDesc, void* dstPtr) {
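+    // Wrap the raw descriptors and data handles into MKLDNNMemory objects so
+    // the same creation path serves both createPrimitive() and dynamic batch.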
+    src_blocked = std::make_shared<MKLDNNMemory>(getEngine());
+    src_blocked->Create(srcDesc, srcPtr);
+
+    dst_blocked = std::make_shared<MKLDNNMemory>(getEngine());
+    dst_blocked->Create(dstDesc, dstPtr);
+
     mkldnn::primitive_attr attr;
 
     if (_scales) {
@@ -90,52 +100,12 @@ void MKLDNNReorderNode::createPrimitive() {
         attr.set_int_output_round_mode(round_nearest);
     }
 
-    if (srcMemPtr->GetSize() == dstMemPtr->GetSize()) {
-        InferenceEngine::Precision dstPrec = getChildEdgeAt(0)->getDesc().getPrecision();
-        InferenceEngine::Precision srcPrec = getParentEdgeAt(0)->getDesc().getPrecision();
-
-        if ((srcPrec == InferenceEngine::Precision::I8 && dstPrec == InferenceEngine::Precision::U8)) {
-            // This reorder actually does nothing so we declare it in-place.
-            dstMemPtr->GetPrimitive().set_data_handle(srcMemPtr->GetPrimitive().get_data_handle());
-        } else {
-            try {
-                // No autoblocking. Reorder can be applied as is
-
-                reorder::primitive_desc pd = reorder::primitive_desc(srcMemPtr->GetPrimitiveDescriptor(), dstMemPtr->GetPrimitiveDescriptor(), attr);
-                prim.reset(new mkldnn::reorder(srcMemPtr->GetPrimitive(), dstMemPtr->GetPrimitive()));
-            } catch (...) {}
-        }
-    } else {
-        // Autoblocking case. nchw<=>nChw8c are only supported, but memory descriptor
-        // should be with strides. Prepare it from enlarged blob
-        memory::dims dims = srcMemPtr->GetDims();
-        memory::dims dims_dst = dstMemPtr->GetDims();
-
-        for (int i = 0; i < dims.size(); i++)  // min dims is a logical dims
-            dims[i] = std::min(dims[i], dims_dst[i]);
-
-        memory::desc src_d = srcMemPtr->GetDescriptor();
-        void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle();
-
-        memory::desc dst_d = dstMemPtr->GetDescriptor();
-        void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle();
-
-        for (int i = 0; i < dims.size(); i++)
-            src_d.data.dims[i] = dst_d.data.dims[i] = dims[i];
-
-        src_blocked = std::make_shared<MKLDNNMemory>(getEngine());
-        src_blocked->Create(src_d, src_data_hdl);
-
-        dst_blocked = std::make_shared<MKLDNNMemory>(getEngine());
-        dst_blocked->Create(dst_d, dst_data_hdl);
-
-        // output blob should be zeroed. NaN value can occur in untouched place.
-        dstMemPtr->FillZero();
-
+    try {
+        // No autoblocking. Reorder can be applied as is
         reorder::primitive_desc pd = reorder::primitive_desc(src_blocked->GetPrimitiveDescriptor(), dst_blocked->GetPrimitiveDescriptor(), attr);
 
         prim.reset(new mkldnn::reorder(pd, src_blocked->GetPrimitive(), dst_blocked->GetPrimitive()));
-    }
+    } catch (...) {}
 }
 
 const std::vector<impl_desc_type>& MKLDNNReorderNode::getPrimitivesPriority() {
@@ -148,32 +118,9 @@ bool MKLDNNReorderNode::created() const {
 }
 
 void MKLDNNReorderNode::execute(mkldnn::stream strm) {
-    if (prim) {
-        if (src_blocked)
-            src_blocked->GetPrimitivePtr()->set_data_handle(getParentEdgeAt(0)->getMemory().GetPrimitive().get_data_handle());
-        if (dst_blocked)
-            dst_blocked->GetPrimitivePtr()->set_data_handle(getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle());
-        MKLDNNNode::execute(strm);
-    } else {
-        InferenceEngine::Precision dstPrec = getChildEdgeAt(0)->getDesc().getPrecision();
-        InferenceEngine::Precision srcPrec = getParentEdgeAt(0)->getDesc().getPrecision();
-        if ((srcPrec == InferenceEngine::Precision::I8 && dstPrec == InferenceEngine::Precision::U8)) {
-            // Do nothing here
-        } else {
-            auto srcBlbPtr = getParentEdgeAt(0)->getBlob();
-            auto dstBlbPtr = getChildEdgeAt(0)->getBlob();
-
-            assert(srcBlbPtr->size() == dstBlbPtr->size());
-            int data_size = srcBlbPtr->size();
-
-            const auto* src_data = srcBlbPtr->cbuffer().as<const float *>();
-            auto* dst_data = dstBlbPtr->buffer().as<float *>();
-
-            InferenceEngine::parallel_for(data_size, [&](int i) {
-                dst_data[dstBlbPtr->getTensorDesc().offset(i)] = src_data[srcBlbPtr->getTensorDesc().offset(i)];
-            });
-        }
-    }
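+    // src/dst blocked memories are always created now: just rebind the current
+    // data handles and run the stored reorder primitive.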
+    src_blocked->GetPrimitivePtr()->set_data_handle(getParentEdgeAt(0)->getMemory().GetPrimitive().get_data_handle());
+    dst_blocked->GetPrimitivePtr()->set_data_handle(getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle());
+    MKLDNNNode::execute(strm);
 }
 
 void MKLDNNReorderNode::setDynamicBatchLim(int lim) {
@@ -186,21 +133,12 @@ void MKLDNNReorderNode::setDynamicBatchLim(int lim) {
         void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle();
         void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle();
 
-        if (src_blocked && dst_blocked) {
-            src_d = src_blocked->GetDescriptor();
-            dst_d = dst_blocked->GetDescriptor();
-            src_data_hdl = src_blocked->GetPrimitive().get_data_handle();
-            dst_data_hdl = dst_blocked->GetPrimitive().get_data_handle();
-        }
-        src_blocked = std::make_shared<MKLDNNMemory>(getEngine());
         src_d.data.dims[0] = batchToProcess();
         src_d.data.layout_desc.blocking.padding_dims[0] = batchToProcess();
-        src_blocked->Create(src_d, src_data_hdl);
 
-        dst_blocked = std::make_shared<MKLDNNMemory>(getEngine());
         dst_d.data.dims[0] = batchToProcess();
         dst_d.data.layout_desc.blocking.padding_dims[0] = batchToProcess();
-        dst_blocked->Create(dst_d, dst_data_hdl);
-        prim.reset(new mkldnn::reorder(src_blocked->GetPrimitive(), dst_blocked->GetPrimitive()));
+
+        createReorderPrimitive(src_d, src_data_hdl, dst_d, dst_data_hdl);
     }
 }
index 3d74c20..7a228ec 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -51,6 +50,8 @@ private:
 
     MKLDNNMemoryPtr dst_blocked;
     MKLDNNMemoryPtr src_blocked;
+
+    void createReorderPrimitive(mkldnn::memory::desc srcDesc, void* srcPtr, mkldnn::memory::desc dstDesc, void* dstPtr);
 };
 
 }  // namespace MKLDNNPlugin
index cfd51bf..d959aa5 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -49,15 +48,6 @@ void MKLDNNReshapeNode::initSupportedPrimitiveDescriptors() {
     config.outConfs[0].constant = false;
     config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, outFormat);
     supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
-    if (inDims.ndims() == 4 && inDims[1] % 8 == 0 && outDims.ndims() == 4 &&outDims[1] % 8 == 0) {
-        outFormat = memory::format::any;
-    }
-    config.inConfs[0].inPlace = -1;
-    config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format::any);
-    config.outConfs[0].inPlace = -1;
-    config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, outFormat);
-
-    supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
 }
 
 void MKLDNNReshapeNode::createPrimitive() {
@@ -69,107 +59,6 @@ void MKLDNNReshapeNode::createPrimitive() {
         THROW_IE_EXCEPTION << "Input memory didn't allocate.";
     if (getSelectedPrimitiveDescriptor() == nullptr)
         THROW_IE_EXCEPTION << "Preferable primitive descriptor does not set.";
-
-    if (srcMemPtr->GetData() != dstMemPtr->GetData()) {
-        InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
-        if (precision != InferenceEngine::Precision::FP32)
-            precision = InferenceEngine::Precision::FP32;
-        auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
-        precision = getCnnLayer()->outData[0]->getPrecision();
-        if (precision != InferenceEngine::Precision::FP32)
-            precision = InferenceEngine::Precision::FP32;
-        auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
-
-        auto dims = getParentEdgeAt(0)->getDims();
-
-        srcMem.reset(new MKLDNNMemory(getEngine()));
-        srcMem->Create(dims, inputDataType, MKLDNNMemory::GetPlainFormat(dims));
-
-        dstMem.reset(new MKLDNNMemory(getEngine()));
-        dstMem->Create(getChildEdgeAt(0)->getDims(), outputDataType,
-                       MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims()), srcMem->GetData());
-
-        if (srcMemPtr->GetSize() == srcMem->GetSize()) {
-            srcPrim.reset(new mkldnn::reorder(srcMemPtr->GetPrimitive(), srcMem->GetPrimitive()));
-        } else {
-            // Autoblocking mode
-            memory::dims dims = srcMem->GetDims();  // contains logical dims
-
-            memory::desc src_d = srcMemPtr->GetDescriptor();
-            void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle();
-
-            for (int i = 0; i < dims.size(); i++)
-                src_d.data.dims[i] =  dims[i];
-
-            memory::primitive_desc tmp_src_pd(src_d, getEngine());
-            src_blocked = std::make_shared<MKLDNNMemory>(getEngine());
-            src_blocked->Create(src_d, src_data_hdl);
-
-            srcPrim.reset(new mkldnn::reorder(src_blocked->GetPrimitive(), srcMem->GetPrimitive()));
-        }
-
-        if (dstMemPtr->GetSize() == dstMem->GetSize()) {
-            dstPrim.reset(new mkldnn::reorder(dstMem->GetPrimitive(), dstMemPtr->GetPrimitive()));
-        } else {
-            // Autoblocking mode
-            memory::dims dims = srcMem->GetDims();
-
-            memory::desc dst_d = dstMemPtr->GetDescriptor();
-            void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle();
-
-            for (int i = 0; i < dims.size(); i++)
-                dst_d.data.dims[i] =  dims[i];
-
-            dst_blocked = std::make_shared<MKLDNNMemory>(getEngine());
-            dst_blocked->Create(dst_d, dst_data_hdl);
-
-            dstPrim.reset(new mkldnn::reorder(dst_blocked->GetPrimitive(), dstMemPtr->GetPrimitive()));
-        }
-    }
-}
-
-void MKLDNNReshapeNode::setDynamicBatchLim(int lim) {
-    dynBatchLim = lim;
-    if (srcPrim && dstPrim) {
-        auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
-        auto &srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
-        memory::desc src_d = srcMemPtr->GetDescriptor();
-        memory::desc dst_d = dstMemPtr->GetDescriptor();
-        void *src_data_hdl = srcMemPtr->GetPrimitive().get_data_handle();
-        void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle();
-        srcMem = std::make_shared<MKLDNNMemory>(getEngine());
-        src_d.data.dims[0] = batchToProcess();
-        srcMem->Create(src_d, src_data_hdl);
-        dstMemPtr = std::make_shared<MKLDNNMemory>(getEngine());
-        src_d.data.dims[0] = batchToProcess();
-        dstMemPtr->Create(src_d, src_data_hdl);
-
-        if (src_blocked && dst_blocked) {
-            src_d = src_blocked->GetDescriptor();
-            dst_d = dst_blocked->GetDescriptor();
-            src_data_hdl = src_blocked->GetPrimitive().get_data_handle();
-            dst_data_hdl = dst_blocked->GetPrimitive().get_data_handle();
-        }
-        src_blocked = std::make_shared<MKLDNNMemory>(getEngine());
-        src_d.data.dims[0] = batchToProcess();
-        src_blocked->Create(src_d, src_data_hdl);
-
-        dst_blocked = std::make_shared<MKLDNNMemory>(getEngine());
-        dst_d.data.dims[0] = batchToProcess();
-        dst_blocked->Create(dst_d, dst_data_hdl);
-        srcPrim = std::make_shared<mkldnn::reorder>(src_blocked->GetPrimitive(), srcMem->GetPrimitive());
-        dstPrim = std::make_shared<mkldnn::reorder>(dst_blocked->GetPrimitive(), dstMemPtr->GetPrimitive());
-    }
-}
-
-void MKLDNNReshapeNode::execute(mkldnn::stream strm) {
-    if (srcPrim && dstPrim) {
-        if (src_blocked)
-            src_blocked->GetPrimitive().set_data_handle(getParentEdgeAt(0)->getMemory().GetPrimitive().get_data_handle());
-        if (dst_blocked)
-            dst_blocked->GetPrimitive().set_data_handle(getChildEdgeAt(0)->getMemory().GetPrimitive().get_data_handle());
-        strm.submit({*srcPrim, *dstPrim});
-    }
 }
 
 bool MKLDNNReshapeNode::created() const {
index eeb6660..bb30099 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -21,19 +20,10 @@ public:
     void getSupportedDescriptors() override;
     void initSupportedPrimitiveDescriptors() override;
     void createPrimitive() override;
-    void execute(mkldnn::stream strm) override;
     bool created() const override;
-    void setDynamicBatchLim(int lim) override;
 
 private:
     static Register<MKLDNNReshapeNode> reg;
-    std::shared_ptr<mkldnn::primitive> srcPrim;
-    std::shared_ptr<mkldnn::primitive> dstPrim;
-    MKLDNNMemoryPtr srcMem;
-    MKLDNNMemoryPtr dstMem;
-
-    MKLDNNMemoryPtr dst_blocked;
-    MKLDNNMemoryPtr src_blocked;
 };
 
 }  // namespace MKLDNNPlugin
index a474ca9..ba32285 100644 (file)
@@ -1,12 +1,11 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "mkldnn_rnn.h"
 #include "mkldnn_extension_utils.h"
 #include "desc_iterator.hpp"
-#include <ie_layers.h>
+#include <ie_layers_prv.h>
 
 #include <string>
 #include <utility>
@@ -16,39 +15,143 @@ using namespace InferenceEngine;
 
 namespace MKLDNNPlugin {
 
-MKLDNNRNN::MKLDNNRNN(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {}
+template <typename T, typename P>
+inline bool one_of(T val, P item) { return val == item; }
+template <typename T, typename P, typename... Args>
+inline bool one_of(T val, P item, Args... item_others) {
+    return val == item || one_of(val, item_others...);
+}
+
+rnn_direction ie2mkl(RNNLayer::Direction &direction) {
+    return direction == RNNLayer::RNN_FWD ? unidirectional_left2right
+         : direction == RNNLayer::RNN_BWD ? unidirectional_right2left
+         : direction == RNNLayer::RNN_BDR ? bidirectional_concat
+                                          : unidirectional;
+}
+
+MKLDNNRNN::MKLDNNRNN(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng) : MKLDNNNode(layer, eng) {
+    is_cell = layer->type == "LSTMCell";
+}
 
 bool MKLDNNRNN::created() const {
-    return getType() == RNN;
+    return getType() == (is_cell ? LSTMCell : RNN);
 }
 
 void MKLDNNRNN::getSupportedDescriptors() {
+    if (is_cell)
+        fillCellDesc();
+    else
+        fillSeqDesc();
+}
+
+void MKLDNNRNN::fillCellDesc() {
+    if (!descs.empty()) return;
+    auto cellLayer = std::dynamic_pointer_cast<InferenceEngine::LSTMCell>(getCnnLayer());
+
+    if (!cellLayer)
+        THROW_IE_EXCEPTION << "Wrong RNN layer representation. Cannot cast to RNNLayer.";
+
+    auto &ins = cellLayer->insData;
+    auto &outs = cellLayer->outData;
+
+    if (ins.size() != 3)
+        THROW_IE_EXCEPTION << "Incorrect number of input ports for layer " << getName();
+    if (outs.size() != 2)
+        THROW_IE_EXCEPTION << "Incorrect number of output ports for layer " << getName();
+
+    auto in_data_dims = getParentEdgeAt(0)->getDims();
+    auto in_h_state_dims = getParentEdgeAt(1)->getDims();
+    auto in_c_state_dims = getParentEdgeAt(2)->getDims();
+
+    auto out_h_state_dims = getChildEdgeAt(0)->getDims();
+    auto out_c_state_dims = getChildEdgeAt(1)->getDims();
+
+    if (in_data_dims.ndims() != 2
+        || in_h_state_dims.ndims() != 2
+        || in_c_state_dims.ndims() != 2
+        || out_h_state_dims.ndims() != 2
+        || out_c_state_dims.ndims() != 2)
+        THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName();
+
+    T = 1;
+    N  = in_data_dims[0];
+    DC = in_data_dims[1];
+    SC = in_h_state_dims[1];
+
+    // Expected shapes
+    MKLDNNDims D_shape {N, DC}, S_shape {N, SC};
+
+    if (in_data_dims != D_shape
+        || in_h_state_dims != S_shape
+        || in_c_state_dims != S_shape
+        || out_h_state_dims != S_shape
+        || out_c_state_dims != S_shape)
+        THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName();
+
+    auto blobs = cellLayer->blobs;
+    Blob::Ptr weights, bias;
+    if (blobs.find("weights") != blobs.end()) weights = blobs["weights"];
+    if (blobs.find("biases") != blobs.end()) bias = blobs["biases"];
+
+    if (!weights)
+        THROW_IE_EXCEPTION << "RNN Layer. Weights do not present.";
+
+    if (weights->size() != G*SC*(SC+DC))
+        THROW_IE_EXCEPTION << "RNN Layer. Weights size is not correct. Expected size:" << G*SC*(SC+DC);
+
+    if (bias && bias->size() != G*SC)
+        THROW_IE_EXCEPTION << "RNN Layer. Biases size is not correct. Expected size:" << G*SC;
+
+    // Shapes and Attributes are correct. Can start internal stuff initialization.
+
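+    // mkldnn RNN dims: L = layers (1), D = directions (1), S = states
+    // (2 for LSTM: h and c), G = gates (4 for LSTM), T = seq len, N = batch.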
+    in_state_d  = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
+    out_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
+
+    in_data_d  = {{T, N, DC}, memory::f32, memory::tnc};
+    out_data_d = {{T, N, SC}, memory::f32, memory::tnc};
+
+    w_data_d   = {{L, D, DC, G, SC}, memory::f32, memory::ldigo};
+    w_state_d  = {{L, D, SC, G, SC}, memory::f32, memory::ldigo};
+
+    if (bias)
+        w_bias_d = {{L, D, G, SC}, memory::f32, memory::ldgo};
+
+    std::vector<TensorDesc> in_candidate;
+    in_candidate.emplace_back(MKLDNNMemoryDesc {D_shape, memory::f32, memory::nc});
+    in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
+    in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
+
+    std::vector<TensorDesc> out_candidate;
+    out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
+    out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
+
+    createDescriptor(in_candidate, out_candidate);
+}
+
+void MKLDNNRNN::fillSeqDesc() {
     if (!descs.empty()) return;
     auto rnnLayer = std::dynamic_pointer_cast<RNNLayer>(getCnnLayer());
 
     if (!rnnLayer)
         THROW_IE_EXCEPTION << "Wrong RNN layer representation. Cannot cast to RNNLayer.";
 
-    if (rnnLayer->cellType == LSTM)
-        cellr_type = LSTM;
-    else
+    if (!one_of(rnnLayer->cellType, "LSTM"))
         THROW_IE_EXCEPTION << "RNN layer supports only LSTM like cell";
 
-    swap_state = rnnLayer->params["swap_state"] == "YES";
+    if (!one_of(rnnLayer->axis, 0, 1))
+        THROW_IE_EXCEPTION << "RNN layer supports only sequence axis 0 or 1";
+    nativeOrder = rnnLayer->axis == 0;
 
-    if (rnnLayer->_axis == 0)
-        nativeOrder = true;
-    else if (rnnLayer->_axis == 1)
-        nativeOrder = false;
-    else
-        THROW_IE_EXCEPTION << "RNN layer supports only sequence axis == 1";
+    if (!one_of(rnnLayer->direction, RNNLayer::RNN_FWD, RNNLayer::RNN_BWD))
+        THROW_IE_EXCEPTION << "RNN layer supports only unidirectional RNN layer";
+    direction = ie2mkl(rnnLayer->direction);
 
     auto &ins = rnnLayer->insData;
     auto &outs = rnnLayer->outData;
 
-    if (ins.size() != 3 && ins.size() != 1)
+    if (!one_of(ins.size(), 3, 1))
         THROW_IE_EXCEPTION << "Incorrect number of input ports for layer " << getName();
-    if (outs.size() != 3 && outs.size() !=1)
+    if (!one_of(outs.size(), 3, 1))
         THROW_IE_EXCEPTION << "Incorrect number of output ports for layer " << getName();
 
     auto in_data_dims = getParentEdgeAt(0)->getDims();
@@ -62,31 +165,21 @@ void MKLDNNRNN::getSupportedDescriptors() {
         std::swap(out_data_dims[0], out_data_dims[1]);
     }
 
-    // IE specific order
-    seq       = in_data_dims[0];
-    batch     = in_data_dims[1];
-    data_len  = in_data_dims[2];
-    state_len = out_data_dims[2];
-
-    const int N = batch;
-    const int T = seq;
-    const int G = num_gates;
-    const int DC = data_len;
-    const int SC = state_len;
-    const int L = 1;  // What is a L ??
-    const int D = 1;
-    const int S = 2;
-
-    if (out_data_dims != MKLDNNDims {T, N, SC})
-        THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName();
+    T = in_data_dims[0];
+    N = in_data_dims[1];
+    DC = in_data_dims[2];
+    SC = out_data_dims[2];
 
-    MKLDNNDims state_dims {batch, state_len};
+    MKLDNNDims ID_shape {T, N, DC}, OD_shape {T, N, SC}, S_shape {N, SC};
+
+    if (out_data_dims != OD_shape)
+        THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName();
 
     if (ins.size() == 3) {
         auto state_dims1 = getParentEdgeAt(1)->getDims();
         auto stats_dims2 = getParentEdgeAt(2)->getDims();
 
-        if (state_dims1 != state_dims || stats_dims2 != state_dims)
+        if (state_dims1 != S_shape || stats_dims2 != S_shape)
             THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName();
 
         in_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
@@ -96,7 +189,7 @@ void MKLDNNRNN::getSupportedDescriptors() {
         auto state_dims1 = getChildEdgeAt(1)->getDims();
         auto stats_dims2 = getChildEdgeAt(2)->getDims();
 
-        if (state_dims1 != state_dims || stats_dims2 != state_dims)
+        if (state_dims1 != S_shape || stats_dims2 != S_shape)
             THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName();
 
         out_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
@@ -133,8 +226,8 @@ void MKLDNNRNN::getSupportedDescriptors() {
         in_candidate.push_back(MKLDNNMemoryDesc{{N, T, DC}, memory::f32, memory::ntc});
 
     if (ins.size() == 3) {
-        in_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc});
-        in_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc});
+        in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
+        in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
     }
 
     std::vector<TensorDesc> out_candidate;
@@ -144,8 +237,8 @@ void MKLDNNRNN::getSupportedDescriptors() {
         out_candidate.push_back(MKLDNNMemoryDesc{{N, T, SC}, memory::f32, memory::ntc});
 
     if (outs.size() == 3) {
-        out_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc});
-        out_candidate.emplace_back(MKLDNNMemoryDesc {state_dims, memory::f32, memory::nc});
+        out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
+        out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
     }
 
     createDescriptor(in_candidate, out_candidate);
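
Both branches funnel into the same descriptor creation; the only layout difference is where the sequence axis sits. A short sketch of the axis-to-format mapping used above (assumption: axis is the position of the sequence dimension in the input blob):

    // Sketch: sequence axis -> mkldnn data format.
    mkldnn::memory::format seqFormat(int axis) {
        return axis == 0 ? mkldnn::memory::tnc   // native:  [seq, batch, data]
                         : mkldnn::memory::ntc;  // swapped: [batch, seq, data]
    }
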
@@ -156,7 +249,7 @@ void MKLDNNRNN::createDescriptor(const std::vector<TensorDesc> &inputDesc,
     MKLDNNDescriptor desc(std::shared_ptr<rnn_forward::desc>(
             new rnn_forward::desc(forward_scoring,
                     {algorithm::vanilla_lstm, algorithm::eltwise_tanh },
-                    unidirectional,
+                    direction,
                     /* In Data       */ in_data_d,
                     /* In State      */ in_state_d,
                     /* Weights data  */ w_data_d,
@@ -194,13 +287,8 @@ void MKLDNNRNN::createPrimitive() {
     std::shared_ptr<rnn_forward::desc> d = descs[0];
     rnn_forward::primitive_desc pd(*d, getEngine());
 
-    auto src_data_mem = std::make_shared<MKLDNNMemory>(getEngine());
-    src_data_mem->Create(in_data_d, getParentEdgeAt(0)->getMemoryPtr()->GetData());
-    internalBlobMemory.push_back(src_data_mem);
-
-    auto dst_data_mem = std::make_shared<MKLDNNMemory>(getEngine());
-    dst_data_mem->Create(out_data_d, getChildEdgeAt(0)->getMemoryPtr()->GetData());
-    internalBlobMemory.push_back(dst_data_mem);
+    auto src_data_mem = getParentEdgeAt(0)->getMemoryPtr();
+    auto dst_data_mem = getChildEdgeAt(0)->getMemoryPtr();
 
     // create weight blobs (data and state part)
     auto w_data_mem = std::make_shared<MKLDNNMemory>(getEngine());
@@ -229,28 +317,27 @@ void MKLDNNRNN::createPrimitive() {
          *
          *   Gate order
          *   Caffe - IFOC, ONNX   - IOFC
-         *   IE    - FICO, mkldnn - FIOC
-         *
+         *   IE    - FICO, mkldnn - IFCO
          */
-        // FICO -> FIOC
-        const int gate_map[] = {0, 1, 3, 2};
+        // FICO -> IFCO
+        const int gate_map[] = {1, 0, 2, 3};
 
         auto ie_w_ptr = getCnnLayer()->blobs["weights"]->buffer().as<const float*>();
         auto w_ptr = static_cast<float*>(w_data_mem->GetData());
         auto r_ptr = static_cast<float*>(w_state_mem->GetData());
-        const int step = state_len * num_gates;
+        const int step = SC * G;
 
-        for (int g = 0; g < num_gates; g++) {
-            for (int out_i = 0; out_i < state_len; out_i++) {
-                float *l_w_ptr = w_ptr + gate_map[g]*state_len + out_i;
-                float *l_r_ptr = r_ptr + gate_map[g]*state_len + out_i;
-                for (int in_i = 0; in_i < data_len; in_i++) {
+        for (int g = 0; g < G; g++) {
+            for (int out_i = 0; out_i < SC; out_i++) {
+                float *l_w_ptr = w_ptr + gate_map[g]*SC + out_i;
+                float *l_r_ptr = r_ptr + gate_map[g]*SC + out_i;
+                for (int in_i = 0; in_i < DC; in_i++) {
                     *l_w_ptr = *ie_w_ptr;
                     ie_w_ptr++;
                     l_w_ptr += step;
                 }
 
-                for (int in_i = 0; in_i < state_len; in_i++) {
+                for (int in_i = 0; in_i < SC; in_i++) {
                     *l_r_ptr = *ie_w_ptr;
                     ie_w_ptr++;
                     l_r_ptr += step;
@@ -261,9 +348,9 @@ void MKLDNNRNN::createPrimitive() {
         if (w_bias_d) {
             auto ie_b_ptr = getCnnLayer()->blobs["biases"]->buffer().as<const float*>();
             auto b_ptr = static_cast<float*>(w_bias_mem->GetData());
-            for (int g = 0; g < num_gates; g++) {
-                float *l_b_ptr = b_ptr + gate_map[g]*state_len;
-                for (int out_i = 0; out_i < state_len; out_i++) {
+            for (int g = 0; g < G; g++) {
+                float *l_b_ptr = b_ptr + gate_map[g]*SC;
+                for (int out_i = 0; out_i < SC; out_i++) {
                     *l_b_ptr = *ie_b_ptr;
                     ie_b_ptr++;
                     l_b_ptr++;
@@ -293,37 +380,35 @@ void MKLDNNRNN::createPrimitive() {
                 src_stat_1.get_primitive_desc().get_size());
         internalBlobMemory.push_back(high_half_state_mem);
 
-        if (!swap_state) {
-            exec_before.emplace_back(src_stat_1, low_half_state_mem->GetPrimitive());
-            exec_before.emplace_back(src_stat_2, high_half_state_mem->GetPrimitive());
-        } else {
-            exec_before.emplace_back(src_stat_2, low_half_state_mem->GetPrimitive());
-            exec_before.emplace_back(src_stat_1, high_half_state_mem->GetPrimitive());
-        }
+        exec_before.emplace_back(src_stat_1, low_half_state_mem->GetPrimitive());
+        exec_before.emplace_back(src_stat_2, high_half_state_mem->GetPrimitive());
     }
 
     auto dst_state_mem = std::make_shared<MKLDNNMemory>(getEngine());
     dst_state_mem->Create(out_state_d);
     internalBlobMemory.push_back(dst_state_mem);
     if (out_state_d) {
+        int idx_H = is_cell ? 0 : 1;
+        int idx_C = is_cell ? 1 : 2;
         /* create copy/split primitive */
-        auto dst_stat_1 = getChildEdgeAt(1)->getMemory().GetPrimitive();
-        auto dst_stat_2 = getChildEdgeAt(2)->getMemory().GetPrimitive();
+        auto dst_stat_1 = getChildEdgeAt(idx_H)->getMemory().GetPrimitive();
+        auto dst_stat_2 = getChildEdgeAt(idx_C)->getMemory().GetPrimitive();
 
         auto low_half_state_mem = std::make_shared<MKLDNNMemory>(getEngine());
         low_half_state_mem->Create(
                 dst_stat_1.get_primitive_desc().desc(),
-                src_state_mem->GetPrimitive().get_data_handle());
+                dst_state_mem->GetPrimitive().get_data_handle());
         internalBlobMemory.push_back(low_half_state_mem);
 
         auto high_half_state_mem = std::make_shared<MKLDNNMemory>(getEngine());
         high_half_state_mem->Create(
                 dst_stat_2.get_primitive_desc().desc(),
-                static_cast<uint8_t*>(src_state_mem->GetPrimitive().get_data_handle()) +
+                static_cast<uint8_t*>(dst_state_mem->GetPrimitive().get_data_handle()) +
                         dst_stat_1.get_primitive_desc().get_size());
         internalBlobMemory.push_back(high_half_state_mem);
 
-        exec_after.emplace_back(low_half_state_mem->GetPrimitive(),  dst_stat_1);
+
+        if (!is_cell) exec_after.emplace_back(low_half_state_mem->GetPrimitive(),  dst_stat_1);
         exec_after.emplace_back(high_half_state_mem->GetPrimitive(), dst_stat_2);
     }
 
index a47fdf4..4399c30 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -28,18 +27,30 @@ public:
     void execute(mkldnn::stream strm) override;
 
 private:
+    void fillCellDesc();
+    void fillSeqDesc();
+
+private:
     static Register<MKLDNNRNN> reg;
 
-    InferenceEngine::CellType cellr_type = InferenceEngine::CellType::LSTM;
+    /** Specifies the mode: Cell or Seq. true - Cell, false - Seq */
+    bool is_cell = false;
+
     /** Native order is [seq, batch, data]; otherwise it is [batch, seq, data] */
     bool nativeOrder = true;
-    bool swap_state = false;
 
-    int batch = 0;
-    int seq = 0;
-    int data_len = 0;
-    int state_len = 0;
-    const size_t num_gates = 4;
+    /** Direction of iteration through sequence dimension */
+    mkldnn::rnn_direction direction = mkldnn::unidirectional;
+
+    // Internal attributes
+    int N = 0;   /**< Batch value */
+    int T = 0;   /**< Sequence value */
+    int DC = 0;  /**< Input data channel size */
+    int SC = 0;  /**< State channel size value */
+    const int G = 4;   /**< Number of gates. 4 for LSTM */
+    const int L = 1;   /**< Number of layers. Constant 1 for this mkldnn impl */
+    const int D = 1;   /**< Number of directions. 1 or 2 */
+    const int S = 2;   /**< Number of states. 2 for LSTM (hidden and cell state). */
 
     MKLDNNMemoryDesc in_data_d;
     MKLDNNMemoryDesc out_data_d;
@@ -51,6 +62,7 @@ private:
     MKLDNNMemoryDesc w_state_d;
     MKLDNNMemoryDesc w_bias_d;
 
+    // List of in/out reorders if required
     std::vector<mkldnn::reorder> exec_before;
     std::vector<mkldnn::reorder> exec_after;
 };
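
ie2mkl(...) used in fillSeqDesc() is not part of this diff; a plausible sketch of the direction mapping, assuming the mkldnn 0.x rnn_direction enumerators (names are assumptions, not confirmed by this patch):

    // Sketch (assumption): map the IE direction attribute to mkldnn.
    mkldnn::rnn_direction ie2mkl(RNNLayer::Direction dir) {
        return dir == RNNLayer::RNN_FWD ? mkldnn::unidirectional_left2right
                                        : mkldnn::unidirectional_right2left;
    }
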
index 618479c..90cf4f4 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -24,16 +23,15 @@ void MKLDNNSplitNode::getSupportedDescriptors() {
     if (splitLayer == nullptr)
         THROW_IE_EXCEPTION << "Cannot convert split layer.";
 
-    axis = splitLayer->_axis;
-
-    if (axis != 1)
-        THROW_IE_EXCEPTION << "Split support only axis 1.";
-
     if (getParentEdges().size() != 1)
         THROW_IE_EXCEPTION << "Incorrect number of input nodes.";
     if (getChildEdges().empty())
         THROW_IE_EXCEPTION << "Incorrect number of output nodes.";
 
+    axis = splitLayer->_axis;
+    if (axis >= getParentEdgeAt(0)->getDims().ndims())
+        THROW_IE_EXCEPTION << "Invalid value of axis parameter in split layer";
+
     // WA. Check applicability and limitations
     for (size_t i = 1; i < getCnnLayer()->outData.size(); i++) {
         int num_port_connection = getCnnLayer()->outData[i]->inputTo.size();
@@ -72,7 +70,7 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
     if (srcDims.ndims() < 2)
         THROW_IE_EXCEPTION << "Split " << getName() << " isn't supported 1d blobs";
 
-    auto num_chanels = 0;
+    auto axis_size = 0;
     auto dstFirstDims = getChildEdgeAt(0)->getDims();
     for (size_t i = 0; i < outDims.size(); i++) {
         auto o_Dims = outDims[i];
@@ -83,15 +81,15 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
         config.outConfs[i].inPlace = -1;
         config.outConfs[i].constant = false;
         config.outConfs[i].desc = MKLDNNMemoryDesc(o_Dims, outputDataType, memory::format::any);
-        num_chanels += o_Dims[1];
+        axis_size += o_Dims[axis];
         for (size_t j = 0; j < dstFirstDims.ndims(); j++) {
             if (j == axis)
                 continue;
             if (o_Dims[j] != dstFirstDims[j])
-                THROW_IE_EXCEPTION << "Split " << getName() << "has incorrect output dimensions";
+                THROW_IE_EXCEPTION << "Split " << getName() << " has incorrect output dimensions";
         }
     }
-    dstFirstDims[1] = num_chanels;
+    dstFirstDims[axis] = axis_size;
     if (dstFirstDims.size() != srcDims.size())
         THROW_IE_EXCEPTION << "The sizes of input blob and sum of output blobs are not equal.";
     supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref);
@@ -99,11 +97,10 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
     auto numOfDim = static_cast<size_t>(srcDims.ndims());
 
     SizeVector order;
-    SizeVector offsets;
+    SizeVector offsets(numOfDim, 0lu);
     size_t offset = std::numeric_limits<size_t>::max();
     for (size_t i = 0; i < numOfDim; i++) {
         order.push_back(i);
-        offsets.push_back(0);
     }
 
     SizeVector strides(numOfDim);
@@ -125,23 +122,23 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
     }
     supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
 
-    if (numOfDim != 4)
+    if ((numOfDim != 4 && numOfDim != 5) || axis != 1)
         return;
 
-    order = {0, 1, 2, 3, 1};
-    offsets = {0, 0, 0, 0, 0};
-    numOfDim = 5;
+    order.push_back(1);
+    numOfDim = order.size();
+    offsets = SizeVector(numOfDim, 0lu);
 
     // nChw8c and nChw16c
-    for (int sizeS : {8, 16}) {
+    for (size_t sizeS : {8lu, 16lu}) {
         SizeVector blkDims = srcDims.ToSizeVector();
         if (blkDims[1] % sizeS)
             continue;
-        blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0);
+        blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
         blkDims.push_back(sizeS);
 
         strides.resize(numOfDim);
-        strides[numOfDim - 1] = 1;
+        strides[numOfDim - 1] = 1lu;
         for (size_t i = 2; i <= numOfDim; i++) {
             if (numOfDim - i < axis) {
                 strides[numOfDim - i] = std::numeric_limits<size_t>::max();
@@ -160,9 +157,9 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
                 canInplace = false;
                 break;
             }
-            blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1 : 0);
+            blkDims[1] = blkDims[1] / sizeS + (blkDims[1] % sizeS ? 1lu : 0lu);
             blkDims.push_back(sizeS);
-            config.outConfs[i].desc =  TensorDesc(Precision::FP32, outDims, {blkDims, order, offset, offsets, strides});
+            config.outConfs[i].desc = TensorDesc(Precision::FP32, outDims, {blkDims, order, offset, offsets, strides});
         }
         if (canInplace)
             supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
@@ -190,18 +187,32 @@ void MKLDNNSplitNode::execute(mkldnn::stream strm) {
     int MB = batchToProcess();
     auto srcBlob = getParentEdgeAt(0)->getBlob();
     const auto *srcData = srcBlob->cbuffer().as<const float *>();
+
+    size_t outerSize = 1;
+    for (int i = 0; i < axis; i++) {
+        if (i == 0)
+            outerSize *= MB;
+        else
+            outerSize *= srcBlob->dims()[srcBlob->dims().size() - i - 1];
+    }
+
     size_t srcSize = getParentEdgeAt(0)->getMemory().GetSize();
-    size_t src_batch_off = srcBlob->getTensorDesc().offset(srcBlob->size() / srcBlob->getTensorDesc().getDims()[0])
+    size_t src_batch_off = srcBlob->getTensorDesc().offset(srcBlob->size() / outerSize)
             - srcBlob->getTensorDesc().offset(0);
 
     for (size_t i = 0, sIdx = 0; i < getChildEdges().size(); i++) {
         auto dstBlob = getChildEdgeAt(i)->getBlob();
         auto *dstData = dstBlob->buffer().as<float *>();
-        size_t dst_slice_size = dstBlob->size() / dstBlob->getTensorDesc().getDims()[0];
-        size_t dst_batch_off = dstBlob->getTensorDesc().offset(dst_slice_size) - dstBlob->getTensorDesc().offset(0);
 
-        for (size_t dIdx = 0; dIdx < dst_slice_size; dIdx++, sIdx++) {
-            for (unsigned b = 0; b < MB; b++) {
+        size_t innerSize = 1;
+        for (size_t j = axis; j < dstBlob->dims().size(); j++) {
+            innerSize *= dstBlob->dims()[dstBlob->dims().size() - j - 1];
+        }
+
+        size_t dst_batch_off = dstBlob->getTensorDesc().offset(innerSize) - dstBlob->getTensorDesc().offset(0);
+
+        for (size_t dIdx = 0; dIdx < innerSize; dIdx++, sIdx++) {
+            for (unsigned b = 0; b < outerSize; b++) {
                 if (sIdx + b*src_batch_off >= srcSize)
                     THROW_IE_EXCEPTION << "Incorrect configuration of split layer " << getName() << "!";
                 dstData[b * dst_batch_off + dstBlob->getTensorDesc().offset(dIdx)] =
@@ -436,3 +447,13 @@ void MKLDNNSplitNode::initOptimalPrimitiveDescriptor() {
     }
     initDescriptor(config);
 }
+
+void MKLDNNSplitNode::setDynamicBatchLim(int lim) {
+    if (axis == 0)
+        THROW_IE_EXCEPTION << "Dynamic batch is not supported by split layer with axis == 0 parameter";
+
+    dynBatchLim = lim;
+    if (prim) {
+        prim.setBatchLimit(batchToProcess(), getParentEdges().size(), getChildEdges().size());
+    }
+}
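
The rewritten execute() reduces the copy to outerSize x innerSize strided moves around the split axis. A worked example under assumed shapes:

    // Worked example (assumed shapes): split {2, 6, 4, 4} along axis = 1
    // into two outputs of {2, 3, 4, 4} each.
    //   outerSize = 2          // product of dims before the axis (batch)
    //   innerSize = 3 * 4 * 4  // per-output product of dims from the axis on
    // Each output receives outerSize * innerSize = 96 elements, gathered with
    // the source batch stride src_batch_off.
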
index 7d41577..905f806 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -26,6 +25,8 @@ public:
     bool isOptimized();
     void initOptimalPrimitiveDescriptor() override;
 
+    void setDynamicBatchLim(int lim) override;
+
 private:
     static Register<MKLDNNSplitNode> reg;
     size_t axis = 1;
index 204ea86..1226716 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -49,9 +48,11 @@ void MKLDNNTileNode::initSupportedPrimitiveDescriptors() {
         fmt = memory::format::nc;
     } else if (inDims.ndims() == 4) {
         fmt = memory::format::nchw;
+    } else if (inDims.ndims() == 5) {
+        fmt = memory::format::ncdhw;
     }
     if (fmt == memory::format::any) {
-        THROW_IE_EXCEPTION << "Tile " << getName() << " supports only 2d and 4d dimensions!";
+        THROW_IE_EXCEPTION << "Tile " << getName() << " supports only 2D, 4D and 5D dimensions!";
     }
 
     InferenceEngine::LayerConfig config;
@@ -101,14 +102,16 @@ void MKLDNNTileNode::execute(mkldnn::stream strm) {
         m_inner_dim *= batchToProcess();
     }
 
-    if (m_inner_dim == 1 && inDims.size() == 4 && m_outer_dim%8 == 0 && srcMemory.GetFormat() == memory::nChw8c) {
+    if (m_inner_dim == 1 && m_outer_dim % 8 == 0 && ((inDims.size() == 4 && srcMemory.GetFormat() == memory::nChw8c) ||
+            (inDims.size() == 5 && srcMemory.GetFormat() == memory::nCdhw8c))) {
         /*
          * We may enable tile processing directly to appropriate output format (nChw8c)
          */
         m_inner_dim *= 8;
         m_outer_dim /= 8;
-    } else if (m_inner_dim == 1 && inDims.size() == 4 && m_outer_dim%16 == 0
-               && srcMemory.GetFormat() == memory::nChw16c) {
+    } else if (m_inner_dim == 1 && m_outer_dim % 16 == 0 &&
+            ((inDims.size() == 4 && srcMemory.GetFormat() == memory::nChw16c) ||
+            (inDims.size() == 5 && srcMemory.GetFormat() == memory::nCdhw16c))) {
         /*
          * We may enable tile processing directly to appropriate output format (nChw16c)
          */
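
The blocked-format fast path folds the channel block into the copy bounds instead of tiling channel by channel. A worked example with assumed sizes:

    // Worked example (assumed sizes): tile along C for an nChw8c source with
    // m_inner_dim == 1 and m_outer_dim == 64 (64 channel entries = 8 blocks of 8).
    //   m_inner_dim *= 8;  // -> 8: one full block of 8 channels copied as a unit
    //   m_outer_dim /= 8;  // -> 8: the outer loop now runs per block
    // The total element count 64 is preserved (8 * 8).
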
index 87f0c5f..3770a24 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
diff --git a/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp b/inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp
new file mode 100644 (file)
index 0000000..24d2931
--- /dev/null
@@ -0,0 +1,370 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "blob_dump.h"
+#include "blob_factory.hpp"
+#include "mkldnn_memory.h"
+
+// It's so bad to include by relative path :-(
+#include "../../thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp"
+
+#include <fstream>
+
+using namespace InferenceEngine;
+
+namespace MKLDNNPlugin {
+
+// IEB file format routine
+static unsigned char IEB_MAGIC[4] = {'I', 'E', 'B', '0'};
+static unsigned char NO_SCALES = 0xFF;
+
+struct IEB_HEADER {
+    unsigned char magic[4];
+    unsigned char ver[2];
+
+    unsigned char precision;  // 0-8
+    unsigned char ndims;
+    unsigned int  dims[7];  // max is 7-D blob
+
+    unsigned char scaling_axis;  // FF - no scaling
+    unsigned char reserved[3];
+
+    unsigned long data_offset;
+    unsigned long data_size;
+    unsigned long scaling_data_offset;
+    unsigned long scaling_data_size;
+};
+
+static IEB_HEADER prepare_header(const TensorDesc& desc) {
+    IEB_HEADER header;
+
+    header.magic[0] = IEB_MAGIC[0];
+    header.magic[1] = IEB_MAGIC[1];
+    header.magic[2] = IEB_MAGIC[2];
+    header.magic[3] = IEB_MAGIC[3];
+
+    // IEB file format version 0.1
+    header.ver[0] = 0;
+    header.ver[1] = 1;
+
+    header.precision = desc.getPrecision();
+
+    if (desc.getDims().size() > 7)
+        THROW_IE_EXCEPTION << "Dumper support max 7D blobs";
+
+    header.ndims = desc.getDims().size();
+    for (int i = 0; i < header.ndims; i++)
+        header.dims[i] = desc.getDims()[i];
+
+    header.scaling_axis = NO_SCALES;
+
+    return header;
+}
+
+static TensorDesc parse_header(IEB_HEADER &header) {
+    if (header.magic[0] != IEB_MAGIC[0] ||
+        header.magic[1] != IEB_MAGIC[1] ||
+        header.magic[2] != IEB_MAGIC[2] ||
+        header.magic[3] != IEB_MAGIC[3])
+        THROW_IE_EXCEPTION << "Dumper cannot parse file. Wrong format.";
+
+    if (header.ver[0] != 0 ||
+        header.ver[1] != 1)
+        THROW_IE_EXCEPTION << "Dumper cannot parse file. Unsupported IEB format version.";
+
+    Precision prc = Precision(static_cast<Precision::ePrecision>(header.precision));
+    SizeVector dims(header.ndims);
+    for (int i = 0; i < header.ndims; i++)
+        dims[i] = header.dims[i];
+
+    return TensorDesc {prc, dims, plain_layout(dims)};
+}
+
+
+bool is_plain(Blob::Ptr blob) {
+    bool res = true;
+
+    auto orig_strides = blob->getTensorDesc().getBlockingDesc().getStrides();
+    auto orig_order = blob->getTensorDesc().getBlockingDesc().getOrder();
+    auto dims = blob->getTensorDesc().getDims();
+
+    for (int stride = 1, i = dims.size()-1; i >= 0; --i) {
+        if (stride != orig_strides[i] || i != orig_order[i]) res = false;
+        stride *= dims[i];
+    }
+
+    return res;
+}
+
+static Blob::Ptr prepare_plain_data(Blob::Ptr blob) {
+    // check if it is already plain
+    if (is_plain(blob)) return blob;
+
+    Blob::Ptr pln_blob = make_plain_blob(blob->precision(), blob->getTensorDesc().getDims());
+    pln_blob->allocate();
+
+    // Copy to plain
+    MKLDNNMemoryDesc mdesc(blob->getTensorDesc());
+    mkldnn::memory::desc desc = mdesc;
+    mkldnn::impl::memory_desc_wrapper blob_wrp(desc.data);
+
+    int data_size = blob->size();
+
+    // TODO: make it with blob_copy utility
+    switch (blob->precision()) {
+        case Precision::FP32:
+        case Precision::I32: {
+            int32_t *pln_blob_ptr = pln_blob->buffer().as<int32_t*>();
+            int32_t *blob_ptr = blob->buffer().as<int32_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
+            break;
+        }
+        case Precision::I16:
+        case Precision::U16: {
+            int16_t *pln_blob_ptr = pln_blob->buffer().as<int16_t*>();
+            int16_t *blob_ptr = blob->buffer().as<int16_t *>();
+            for (size_t i = 0; i < data_size; i++)
+                pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
+            break;
+        }
+        case Precision::I8:
+        case Precision::U8: {
+            int8_t *pln_blob_ptr = pln_blob->buffer().as<int8_t*>();
+            int8_t *blob_ptr = blob->buffer().as<int8_t *>();
+            for (size_t i = 0; i < data_size; i++)
+                pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
+            break;
+        }
+        default:
+            THROW_IE_EXCEPTION << "Dumper. Unsupported precision";
+    }
+
+    return pln_blob;
+}
+
+void BlobDumper::dump(std::ostream &stream) {
+    if (!_blob)
+        THROW_IE_EXCEPTION << "Dumper cannot dump empty Blob";
+
+    if (_blob->buffer().as<float*>() == nullptr)
+        THROW_IE_EXCEPTION << "Dumper cannot dump. Blob is not allocated.";
+
+    IEB_HEADER header = prepare_header(_blob->getTensorDesc());
+    Blob::Ptr pln_blob = prepare_plain_data(_blob);
+
+    header.data_offset = sizeof(header);
+    header.data_size = pln_blob->byteSize();
+    header.scaling_data_offset = 0;
+    header.scaling_data_size = 0;
+
+    if (_scales) {
+        header.scaling_axis = 1;
+        header.scaling_data_offset = header.data_offset + header.data_size;
+        header.scaling_data_size = _scales->byteSize();
+    }
+
+    stream.write(reinterpret_cast<char*>(&header), sizeof(header));
+    stream.write(pln_blob->buffer().as<char*>(), pln_blob->byteSize());
+
+    if (_scales) {
+        stream.write(_scales->buffer().as<char*>(), _scales->byteSize());
+    }
+}
+
+void BlobDumper::dumpAsTxt(std::ostream &stream) {
+    if (!_blob)
+        THROW_IE_EXCEPTION << "Dumper cannot dump empty Blob";
+
+    if (_blob->buffer().as<float*>() == nullptr)
+        THROW_IE_EXCEPTION << "Dumper cannot dump. Blob is not allocated.";
+
+    SizeVector dims = _blob->getTensorDesc().getDims();
+
+    // Header line like: "U8 4D shape: 2 3 224 224 (301056)"
+    stream << _blob->precision().name() << " "
+           << dims.size() << "D "
+           << "shape: ";
+    for (size_t d : dims) stream << d << " ";
+    stream << "(" << _blob->size() << ")" <<std::endl;
+
+    // Dump data
+    MKLDNNMemoryDesc mdesc(_blob->getTensorDesc());
+    mkldnn::memory::desc desc = mdesc;
+    mkldnn::impl::memory_desc_wrapper blob_wrp(desc.data);
+
+    int data_size = _blob->size();
+    switch (_blob->precision()) {
+        case Precision::FP32: {
+            auto *blob_ptr = _blob->buffer().as<float*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << blob_ptr[blob_wrp.off_l(i)] << std::endl;
+            break;
+        }
+        case Precision::I32: {
+            auto *blob_ptr = _blob->buffer().as<int32_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << blob_ptr[blob_wrp.off_l(i)] << std::endl;
+            break;
+        }
+        case Precision::I16: {
+            auto *blob_ptr = _blob->buffer().as<int16_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+            break;
+        }
+        case Precision::U16: {
+            auto *blob_ptr = _blob->buffer().as<uint16_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+            break;
+        }
+        case Precision::I8: {
+            auto *blob_ptr = _blob->buffer().as<int8_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+            break;
+        }
+        case Precision::U8: {
+            auto *blob_ptr = _blob->buffer().as<uint8_t*>();
+            for (size_t i = 0; i < data_size; i++)
+                stream << static_cast<int>(blob_ptr[blob_wrp.off_l(i)]) << std::endl;
+            break;
+        }
+        default:
+            THROW_IE_EXCEPTION << "Dumper. Unsupported precision";
+    }
+}
+
+BlobDumper BlobDumper::read(std::istream &stream) {
+    IEB_HEADER header;
+    stream.read(reinterpret_cast<char*>(&header), sizeof(header));
+
+    TensorDesc desc = parse_header(header);
+    Blob::Ptr blob = make_blob_with_precision(desc);
+    blob->allocate();
+
+    stream.seekg(header.data_offset, stream.beg);
+    stream.read(blob->buffer().as<char*>(), header.data_size);
+
+    BlobDumper res(blob);
+
+    // Parse scales fields.
+    if (header.scaling_axis != NO_SCALES) {
+        if (header.scaling_axis != 1)
+            THROW_IE_EXCEPTION << "Dumper support scaling only for channel dims.";
+
+        size_t scl_size = header.scaling_data_size / sizeof(float);
+        auto scl = make_blob_with_precision({Precision::FP32, {scl_size}, C});
+        scl->allocate();
+
+        stream.seekg(header.scaling_data_offset, stream.beg);
+        stream.read(scl->buffer().as<char*>(), header.scaling_data_size);
+
+        res._scales = scl;
+    }
+    return res;
+}
+
+BlobDumper BlobDumper::read(const std::string &file_path) {
+    std::ifstream file;
+    file.open(file_path);
+    if (!file.is_open())
+        THROW_IE_EXCEPTION << "Dumper cannot open file " << file_path;
+
+    auto res = read(file);
+    file.close();
+    return res;
+}
+
+void BlobDumper::dump(const std::string &dump_path) {
+    std::ofstream dump_file;
+    dump_file.open(dump_path);
+    if (!dump_file.is_open())
+        THROW_IE_EXCEPTION << "Dumper cannot create dump file";
+
+    dump(dump_file);
+    dump_file.close();
+}
+
+void BlobDumper::dumpAsTxt(const std::string &dump_path) {
+    std::ofstream dump_file;
+    dump_file.open(dump_path);
+    if (!dump_file.is_open())
+        THROW_IE_EXCEPTION << "Dumper cannot create dump file";
+
+    dumpAsTxt(dump_file);
+    dump_file.close();
+}
+
+Blob::Ptr BlobDumper::get() {
+    return _blob;
+}
+
+template <typename data_t>
+static void plain_copy(const Blob::Ptr &from, const Blob::Ptr &scls, Blob::Ptr &to) {
+    auto dims = from->getTensorDesc().getDims();
+
+    size_t data_size = from->size();
+    size_t outer_size = dims[0];
+    size_t c_size = dims.size() > 1 ? dims[1] : 1;
+    size_t inner_size = dims.size() == 4 ? dims[2]*dims[3] :
+                        dims.size() == 3 ? dims[2] : 1;
+
+    auto to_data  = to->buffer().as<float*>();
+    auto from_data = from->buffer().as<data_t*>();
+
+    if (scls) {
+        auto scls_data = scls->buffer().as<float*>();
+
+        for (size_t o=0; o < outer_size; o++)
+        for (size_t c=0; c < c_size; c++)
+        for (size_t i=0; i < inner_size; i++)
+            *to_data++ = static_cast<float>(*from_data++) * scls_data[c];
+    } else {
+        for (size_t i=0; i < data_size; i++)
+            *to_data++ = static_cast<float>(*from_data++);
+    }
+}
+
+Blob::Ptr BlobDumper::getRealValue() {
+    if (_blob->precision() == Precision::FP32 && !_scales)
+        return _blob;
+
+    auto res = make_plain_blob(Precision::FP32, _blob->getTensorDesc().getDims());
+    res->allocate();
+
+    switch (_blob->precision()) {
+        case Precision::U8: plain_copy<uint8_t>(_blob, _scales, res); break;
+        case Precision::FP32: plain_copy<float>(_blob, _scales, res); break;
+        case Precision::I8: plain_copy<int8_t >(_blob, _scales, res); break;
+        default: THROW_IE_EXCEPTION << "Unsupported precesion for getRealValue method.";
+    }
+
+    return res;
+}
+
+
+BlobDumper& BlobDumper::withScales(InferenceEngine::Blob::Ptr scales) {
+    if ( _blob->getTensorDesc().getDims().size() < 2  ||
+        scales->getTensorDesc().getDims().size() != 1 ||
+        scales->getTensorDesc().getDims()[0] != _blob->getTensorDesc().getDims()[1] ||
+        scales->getTensorDesc().getPrecision() != Precision::FP32)
+        THROW_IE_EXCEPTION << "Dumper cannot use passed scales. Blob has incompatible shape.";
+
+    _scales = scales;
+    return *this;
+}
+
+BlobDumper& BlobDumper::withoutScales() {
+    _scales.reset();
+    return *this;
+}
+
+
+const InferenceEngine::Blob::Ptr& BlobDumper::getScales() const {
+    return _scales;
+}
+
+}  // namespace MKLDNNPlugin
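
For reference, dumpAsTxt() emits a single header line followed by one value per line; for an assumed FP32 blob of shape 2x3x224x224 the header would read:

    FP32 4D shape: 2 3 224 224 (301056)
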
diff --git a/inference-engine/src/mkldnn_plugin/utils/blob_dump.h b/inference-engine/src/mkldnn_plugin/utils/blob_dump.h
new file mode 100644 (file)
index 0000000..4130d53
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ie_blob.h"
+
+#include <string>
+
+namespace MKLDNNPlugin {
+
+/**
+ * Utility class to dump blob content in plain format.
+ * All layout information is lost.
+ *
+ * For low-precision blobs it allows storing
+ * per-channel scaling factors.
+ * NB! Channel is the second dimension for all blob types.
+ */
+class BlobDumper {
+    InferenceEngine::Blob::Ptr _blob;
+    InferenceEngine::Blob::Ptr _scales;
+
+public:
+    BlobDumper() = default;
+    BlobDumper(const BlobDumper&) = default;
+    BlobDumper& operator = (BlobDumper&&) = default;
+
+    explicit BlobDumper(const InferenceEngine::Blob::Ptr blob):_blob(blob) {}
+
+    static BlobDumper read(const std::string &file_path);
+    static BlobDumper read(std::istream &stream);
+
+    void dump(const std::string &file_path);
+    void dump(std::ostream &stream);
+
+    void dumpAsTxt(const std::string &file_path);
+    void dumpAsTxt(std::ostream &stream);
+
+    BlobDumper& withScales(InferenceEngine::Blob::Ptr scales);
+    BlobDumper& withoutScales();
+
+    const InferenceEngine::Blob::Ptr& getScales() const;
+
+    InferenceEngine::Blob::Ptr get();
+    InferenceEngine::Blob::Ptr getRealValue();
+};
+
+}  // namespace MKLDNNPlugin
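
A minimal usage sketch of the API above (blob creation and file paths are assumptions), round-tripping a blob through the IEB format:

    #include "blob_dump.h"

    // Sketch: dump an allocated blob and restore it as plain FP32 data.
    void dumpAndRestore(const InferenceEngine::Blob::Ptr& blob) {
        using MKLDNNPlugin::BlobDumper;

        BlobDumper(blob).dump("activation.ieb");       // binary IEB file
        BlobDumper(blob).dumpAsTxt("activation.txt");  // human-readable text

        BlobDumper restored = BlobDumper::read("activation.ieb");
        InferenceEngine::Blob::Ptr fp32 = restored.getRealValue();  // descaled FP32 copy
    }
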
index bfc6537..4fa0b44 100644 (file)
@@ -1,6 +1,7 @@
 # Copyright (C) 2018 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
+
 ####################################
 ## All next project will use C++11
 set (CMAKE_CXX_STANDARD 11)
index 684c2b6..4ab1278 100644 (file)
@@ -1,6 +1,7 @@
 # Copyright (C) 2018 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
+
 cmake_minimum_required(VERSION 2.8)
 set(TARGET_NAME helpers)
 
@@ -23,7 +24,7 @@ add_library(${TARGET_NAME} STATIC
             ${HELPERS_HEADERS})
 
 target_include_directories(${TARGET_NAME} PUBLIC ${PROJECT_BINARY_DIR})
-target_compile_definitions(${TARGET_NAME} PUBLIC -DMODELS_PATH="${MODELS_PATH}")
+target_compile_definitions(${TARGET_NAME} PUBLIC -DMODELS_PATH=\"${MODELS_PATH}\")
 
 set_property(TARGET ${TARGET_NAME} PROPERTY CXX_STANDARD 11)
 set_property(TARGET ${TARGET_NAME} PROPERTY CXX_STANDARD_REQUIRED ON)
index 9cc5a82..d0f0949 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
diff --git a/inference-engine/tests/helpers/ir_gen_helper.cpp b/inference-engine/tests/helpers/ir_gen_helper.cpp
new file mode 100644 (file)
index 0000000..40a05c4
--- /dev/null
@@ -0,0 +1,48 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ir_gen_helper.hpp"
+
+namespace single_layer_tests {
+
+    std::string IRTemplateGenerator::getIRTemplate(const std::string& name,
+                                  const std::vector<size_t>& input_shape,
+                                  const std::string& precision,
+                                  const std::string& layers, 
+                                  const std::string& edges,
+                                  const unsigned ir_version) {
+        std::string model = model_t;
+        REPLACE_WITH_STR(model, "_NAME_", name);
+        REPLACE_WITH_NUM(model, "_IRv_", ir_version);
+        REPLACE_WITH_STR(model, "_PR_", precision);
+
+        std::string s_dims;
+        for (auto& dim : input_shape) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+        REPLACE_WITH_STR(model, "__SRC_DIMS__", s_dims);
+        REPLACE_WITH_STR(model, "_LAYERS_", layers);
+        REPLACE_WITH_STR(model, "_EDGES_", edges);
+
+        return model;
+    }
+
+    std::string IRTemplateGenerator::model_t = R"V0G0N(
+        <net name="_NAME_" version="_IRv_" precision="_PR_" batch="1">
+            <layers>
+                <layer name="in1" type="Input" precision="_PR_" id="0">
+                    <output>
+                        <port id="0">__SRC_DIMS__
+                        </port>
+                    </output>
+                </layer>
+                _LAYERS_
+            </layers>
+            <edges>
+                _EDGES_
+            </edges>
+        </net>
+        )V0G0N";
+}
\ No newline at end of file
diff --git a/inference-engine/tests/helpers/ir_gen_helper.hpp b/inference-engine/tests/helpers/ir_gen_helper.hpp
new file mode 100644 (file)
index 0000000..db8bff5
--- /dev/null
@@ -0,0 +1,27 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifndef IR_GEN_HELPER_HPP
+#define IR_GEN_HELPER_HPP
+
+#include "single_layer_common.hpp"
+
+namespace single_layer_tests {
+
+    class IRTemplateGenerator {
+        IRTemplateGenerator() = default;
+    public:
+        static std::string model_t;
+
+        static std::string getIRTemplate(const std::string& name,
+                                  const std::vector<size_t>& input_shape,
+                                  const std::string& precision,
+                                  const std::string& layers, 
+                                  const std::string& edges,
+                                  const unsigned ir_version = 4u);
+    };
+
+} // namespace single_layer_tests
+#endif /* IR_GEN_HELPER_HPP */
+
diff --git a/inference-engine/tests/helpers/single_layer_common.cpp b/inference-engine/tests/helpers/single_layer_common.cpp
new file mode 100644 (file)
index 0000000..434d3f2
--- /dev/null
@@ -0,0 +1,196 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <cmath>
+#include <ie_blob.h>
+#include <ie_layers_property.hpp>
+#include <ie_precision.hpp>
+#include <inference_engine/precision_utils.h>
+#include <gtest/gtest.h>
+#include "single_layer_common.hpp"
+#include <math.h>
+
+using namespace InferenceEngine;
+
+void get_common_dims(const Blob &blob,
+                     int32_t &dimx,
+                     int32_t &dimy,
+                     int32_t &dimz) {
+    if (blob.dims().size() == 2) {
+        dimz = 1;
+        dimy = blob.dims()[1];
+        dimx = blob.dims()[0];
+    } else if (blob.dims().size() == 3 || (blob.dims().size() == 4 && blob.dims()[3] == 1)) {
+        dimx = blob.dims()[0];
+        dimy = blob.dims()[1];
+        dimz = blob.dims()[2];
+    }
+}
+
+void get_common_dims(const Blob &blob,
+                     int32_t &dimx,
+                     int32_t &dimy,
+                     int32_t &dimz,
+                     int32_t &dimn) {
+    dimn = 1;
+    if (blob.dims().size() == 2) {
+        dimz = 1;
+        dimy = blob.dims()[1];
+        dimx = blob.dims()[0];
+    } else if (blob.dims().size() == 3 || (blob.dims().size() == 4 && blob.dims()[3] == 1)) {
+        dimx = blob.dims()[0];
+        dimy = blob.dims()[1];
+        dimz = blob.dims()[2];
+    } else {
+        if (blob.dims().size() == 4 && blob.dims()[3] != 1) {
+            dimx = blob.dims()[0];
+            dimy = blob.dims()[1];
+            dimz = blob.dims()[2];
+            dimn = blob.dims()[3];
+        }
+    }
+}
+
+void GenRandomDataCommon(Blob::Ptr blob) {
+    if (blob->precision() == Precision::U8) {
+        auto * blobRawDataU8 = blob->buffer().as<uint8_t*>();
+        size_t count = blob->size();
+        for (size_t i = 0; i < count; i++) {
+            auto val = static_cast<uint8_t>(rand() % 256);
+            blobRawDataU8[i] = val;
+        }
+    } else if (blob->precision() == Precision::FP16) {
+        float scale = 2.0f / RAND_MAX;
+        /* fill with random data in the range (-1, 1) */
+        auto * blobRawDataFp16 = blob->buffer().as<ie_fp16 *>();
+        size_t count = blob->size();
+        for (size_t indx = 0; indx < count; ++indx) {
+            float val = rand();
+            val = val * scale - 1.0f;
+            blobRawDataFp16[indx] = PrecisionUtils::f32tof16(val);
+        }
+    } else if (blob->precision() == Precision::FP32) {
+        float scale = 2.0f / RAND_MAX;
+        /* fill with random data in the range (-1, 1) */
+        auto * blobRawDataFp32 = blob->buffer().as<float*>();
+        size_t count = blob->size();
+        for (size_t i = 0; i < count; i++) {
+            float val = rand();
+            val = val * scale - 1.0f;
+            blobRawDataFp32[i] = val;
+        }
+    }
+}
+
+BufferWrapper::BufferWrapper(const Blob::Ptr& blob) : BufferWrapper(blob, blob->precision()) {}
+
+BufferWrapper::BufferWrapper(const Blob::Ptr& blob, Precision _precision) : precision(_precision) {
+    if (precision == Precision::FP16) {
+        fp16_ptr = blob->buffer().as<ie_fp16*>();
+    } else if (precision == Precision::FP32) {
+        fp32_ptr = blob->buffer().as<float*>();
+    } else {
+        THROW_IE_EXCEPTION << "Unsupported precision for compare: " << precision;
+    }
+}
+
+float BufferWrapper::operator[](size_t index) {
+    if (precision == Precision::FP16) return PrecisionUtils::f16tof32(fp16_ptr[index]);
+    return fp32_ptr[index];
+}
+
+void BufferWrapper::insert(size_t index, float value) {
+    if (precision == Precision::FP16) {
+        fp16_ptr[index] = PrecisionUtils::f32tof16(value);
+    } else {
+        fp32_ptr[index] = value;
+    }
+}
+
+void CompareCommon(const Blob::Ptr& actual, const Blob::Ptr& expected, float tolerance) {
+    ASSERT_NE(actual, nullptr);
+    ASSERT_NE(expected, nullptr);
+
+    Layout res_layout = actual->layout();
+    Layout ref_layout = expected->layout();
+    SizeVector res_dims = actual->getTensorDesc().getDims();
+
+    BufferWrapper res_ptr(actual);
+    BufferWrapper ref_ptr(expected);
+
+    size_t res_size = actual->size();
+    size_t ref_size = expected->size();
+    ASSERT_EQ(res_size, ref_size);
+
+    float max_error = 0;
+    size_t actualMaxErrId = 0;
+    size_t expectedMaxErrId = 0;
+
+    if (res_layout == NCHW || res_layout == NHWC) {
+        size_t N = res_dims[0];
+        size_t C = res_dims[1];
+        size_t H = res_dims[2];
+        size_t W = res_dims[3];
+
+        for (size_t n = 0; n < N; n++) {
+            for (size_t c = 0; c < C; c++) {
+                for (size_t h = 0; h < H; h++) {
+                    for (size_t w = 0; w < W; w++) {
+                        size_t actualIdx = res_layout == NCHW ?
+                                           w + h * W + c * W * H + n * W * H * C : c + w * C + h * C * W +
+                                                                                   n * W * H * C;
+                        size_t expectedIdx = ref_layout == NCHW ?
+                                             w + h * W + c * W * H + n * W * H * C : c + w * C + h * C * W +
+                                                                                     n * C * W * H;
+                        float cur_diff = fabs(res_ptr[actualIdx] - ref_ptr[expectedIdx]);
+                        if (cur_diff > max_error) {
+                            max_error = cur_diff;
+                            actualMaxErrId = actualIdx;
+                            expectedMaxErrId = expectedIdx;
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        if (res_layout == NC) {
+            size_t N = res_dims[0];
+            size_t C = res_dims[1];
+            for (size_t n = 0; n < N; n++) {
+                for (size_t c = 0; c < C; c++) {
+                    size_t actualIdx =   c +  n * C;
+                    float cur_diff = fabs(res_ptr[actualIdx] - ref_ptr[actualIdx]);
+                    if (cur_diff > max_error) {
+                        max_error = cur_diff;
+                        actualMaxErrId = actualIdx;
+                        expectedMaxErrId = actualIdx;
+                    }
+                }
+            }
+        } else {
+            for (size_t i = 0; i < ref_size; i++) {
+                float cur_diff = fabs(res_ptr[i] - ref_ptr[i]);
+                if (cur_diff > max_error) {
+                    max_error = cur_diff;
+                    actualMaxErrId = expectedMaxErrId = i;
+                }
+            }
+        }
+    }
+
+    ASSERT_NEAR(ref_ptr[expectedMaxErrId], res_ptr[actualMaxErrId], tolerance)
+                                << "expectedMaxErrId = " << expectedMaxErrId
+                                << " actualMaxErrId = " << actualMaxErrId;
+}
+
+void fill_data_common(BufferWrapper& data, size_t size, size_t duty_ratio) {
+    for (size_t i = 0; i < size; i++) {
+        if ((i / duty_ratio) % 2 == 1) {
+            data.insert(i, 0.0);
+        } else {
+            data.insert(i, sin((float) i));
+        }
+    }
+}
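
A hedged sketch of how a single-layer test might combine these helpers (blob construction via make_shared_blob is an assumption about the IE API of this release):

    // Sketch: random input vs. reference output comparison in a test body.
    InferenceEngine::TensorDesc desc(InferenceEngine::Precision::FP32,
                                     {1, 3, 16, 16}, InferenceEngine::NCHW);
    auto actual = InferenceEngine::make_shared_blob<float>(desc);
    auto expected = InferenceEngine::make_shared_blob<float>(desc);
    actual->allocate();
    expected->allocate();

    GenRandomDataCommon(actual);
    // ... run the layer under test, producing 'expected' ...
    CompareCommon(actual, expected, 0.01f);  // tolerance on the worst element
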
index 7b852c9..1354129 100644 (file)
@@ -1,12 +1,20 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
 
+#include <ie_blob.h>
+#include <ie_layers_property.hpp>
+#include <inference_engine/precision_utils.h>
+#include <inference_engine/parsers.h>
+#include <xml_net_builder.hpp>
+#include <xml_helper.hpp>
+
 #ifndef USE_BOOST_RE
+
 #include <regex>
+
 #define REPLACE_WITH_STR(SRC, PATTERN, STR) SRC = std::regex_replace(SRC, std::regex(PATTERN), STR)
 #define FIND_STR(SRC, PATTERN) std::regex_search(SRC, std::regex(PATTERN))
 #else
 #endif
 
 #define REPLACE_WITH_NUM(SRC, PATTERN, NUM) REPLACE_WITH_STR(SRC, PATTERN, std::to_string(NUM))
+#define REPLACE_WITH_NUM_VECTOR(SRC, PATTERN, NUMS) \
+       { std::string result; \
+        if (NUMS.size() > 0) { \
+            result += std::to_string(NUMS[0]); \
+            for (int i = 1; i < NUMS.size(); i++) { \
+                    result += "," + std::to_string(NUMS[i]); \
+            } \
+        } \
+       REPLACE_WITH_STR(SRC, PATTERN, result); }
+#define REPLACE_WITH_NUM_VECTOR_REVERSE(SRC, PATTERN, NUMS) \
+       { std::string result; \
+        auto nums_size = NUMS.size(); \
+        if (nums_size > 0) { \
+            result += std::to_string(NUMS[nums_size - 1]); \
+            for (int i = 2; i <= nums_size; i++) { \
+                    result += "," + std::to_string(NUMS[nums_size - i]); \
+            } \
+        } \
+       REPLACE_WITH_STR(SRC, PATTERN, result); }
 #define REMOVE_LINE(SRC, PATTERN) REPLACE_WITH_STR(SRC, PATTERN, "")
+
+struct conv_common_params {
+    InferenceEngine::PropertyVector<unsigned int> stride;
+    InferenceEngine::PropertyVector<unsigned int> kernel;
+    InferenceEngine::PropertyVector<unsigned int> pads_begin;
+    InferenceEngine::PropertyVector<unsigned int> pads_end;
+    InferenceEngine::PropertyVector<unsigned int> dilation;
+    std::string auto_pad;
+    size_t group;
+    size_t out_c;
+};
+
+struct pool_common_params {
+    InferenceEngine::PropertyVector<unsigned int> stride;
+    InferenceEngine::PropertyVector<unsigned int> kernel;
+    InferenceEngine::PropertyVector<unsigned int> pads_begin;
+    InferenceEngine::PropertyVector<unsigned int> pads_end;
+    std::string auto_pad;
+    bool avg;
+    bool exclude_pad;
+};
+
+#define PRETTY_PARAM(name, type)                                                            \
+    class name                                                                              \
+    {                                                                                       \
+    public:                                                                                 \
+        typedef type param_type;                                                            \
+        name ( param_type arg = param_type ()) : val_(arg) {}                      \
+        operator param_type () const {return val_;}                                         \
+    private:                                                                                \
+        param_type val_;                                                                    \
+    };                                                                                      \
+    static inline void PrintTo(name param, ::std::ostream* os)                              \
+    {                                                                                       \
+        *os << #name ": " << ::testing::PrintToString((name::param_type)(param));           \
+    }
+
+struct MapStrStr {
+    std::map<std::string, std::string> data{};
+
+    explicit MapStrStr(std::map<std::string, std::string> _data) : data(std::move(_data)) {}
+
+    MapStrStr() = default;
+};
+
+void get_common_dims(const InferenceEngine::Blob &blob,
+                     int32_t &dimx,
+                     int32_t &dimy,
+                     int32_t &dimz);
+
+void get_common_dims(const InferenceEngine::Blob &blob,
+                     int32_t &dimx,
+                     int32_t &dimy,
+                     int32_t &dimz,
+                     int32_t &dimn);
+
+template<int Version = 3>
+inline InferenceEngine::details::CNNNetworkImplPtr
+buildSingleLayerNetworkCommon(InferenceEngine::details::IFormatParser *parser,
+                              const std::string &layerType,
+                              const testing::InOutData &inOutShapes,
+                              std::map<std::string, std::string> *params,
+                              const std::string &layerDataName = "data",
+                              const InferenceEngine::Precision &precision = InferenceEngine::Precision::FP32,
+                              size_t weightsSize = 0,
+                              size_t biasesSize = 0,
+                              const InferenceEngine::TBlob<uint8_t>::Ptr &weights = nullptr) {
+    IE_ASSERT(parser);
+    testing::XMLHelper xmlHelper(parser);
+    std::string precisionStr = precision.name();
+    auto netBuilder = testing::XmlNetBuilder<Version>::buildNetworkWithOneInput("Mock", inOutShapes.inDims[0],
+                                                                                precisionStr);
+    size_t inputsNumber = inOutShapes.inDims.size();
+    for (int i = 1; i < inputsNumber; i++) {
+        netBuilder.addInputLayer(precisionStr, inOutShapes.inDims[i]);
+    }
+    netBuilder.addLayer(layerType, precisionStr, params, inOutShapes, weightsSize, biasesSize, layerDataName);
+    std::string testContent;
+    if (inputsNumber > 1) {
+        auto edgeBuilder = netBuilder.havingEdges();
+        for (size_t i = 0; i < inputsNumber; i++) {
+            edgeBuilder.connect(i, inputsNumber);
+        }
+        testContent = edgeBuilder.finish();
+    } else {
+        testContent = netBuilder.finish();
+    }
+    xmlHelper.loadContent(testContent);
+    auto result = xmlHelper.parseWithReturningNetwork();
+    if (weights) xmlHelper.setWeights(weights);
+    return result;
+}
+
+void GenRandomDataCommon(InferenceEngine::Blob::Ptr blob);
+
+class BufferWrapper {
+    InferenceEngine::Precision precision;
+    InferenceEngine::ie_fp16 *fp16_ptr;
+    float *fp32_ptr;
+public:
+    explicit BufferWrapper(const InferenceEngine::Blob::Ptr &blob);
+
+    BufferWrapper(const InferenceEngine::Blob::Ptr &blob, InferenceEngine::Precision precision);
+
+    float operator[](size_t index);
+
+    void insert(size_t index, float value);
+};
+
+void
+CompareCommon(const InferenceEngine::Blob::Ptr &actual, const InferenceEngine::Blob::Ptr &expected, float tolerance);
+
+void fill_data_common(BufferWrapper &data, size_t size, size_t duty_ratio = 10);
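
The vector macros above serialize dimension lists into comma-separated IR attributes; a small worked example (template strings are hypothetical):

    std::vector<unsigned int> kernel = {3, 3};
    std::string layer = R"(<data kernel="_K_"/>)";
    REPLACE_WITH_NUM_VECTOR(layer, "_K_", kernel);
    // layer == <data kernel="3,3"/>

    std::vector<unsigned int> dims = {1, 2, 3};
    std::string s = "_D_";
    REPLACE_WITH_NUM_VECTOR_REVERSE(s, "_D_", dims);
    // s == "3,2,1"
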
index a8ae366..5e2ee36 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index ecce409..73f4fc6 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 6c3f3be..5356f98 100644 (file)
@@ -1,17 +1,5 @@
-//
-// Copyright 2017-2018 Intel Corporation.
-//
-// This software and the related documents are Intel copyrighted materials,
-// and your use of them is governed by the express license under which they
-// were provided to you (End User License Agreement for the Intel(R) Software
-// Development Products (Version May 2017)). Unless the License provides
-// otherwise, you may not use, modify, copy, publish, distribute, disclose or
-// transmit this software or the related documents without Intel's prior
-// written permission.
-//
-// This software and the related documents are provided as is, with no
-// express or implied warranties, other than those that are expressly
-// stated in the License.
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
index de5cc2b..69d97b8 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 9c61004..d9698ae 100644 (file)
@@ -1,9 +1,9 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
+
 #include <cctype>
 #include <gtest/gtest.h>
 #include <gmock/gmock.h>
@@ -114,15 +114,21 @@ public:
     static std::string make_so_name(const std::string & input) {
 #ifdef _WIN32
     #ifdef __MINGW32__
-        return "lib" + input + ".dll";
+        std::string pre = "lib";
+        std::string ext = ".dll";
     #else
-        return input + ".dll";
+        std::string pre = "";
+        std::string ext = ".dll";
     #endif
 #elif __APPLE__
-        return "lib" + input + ".dylib";
+        std::string pre = "lib";
+        std::string ext = ".dylib";
 #else
-        return "lib" + input + ".so";
+        std::string pre = "lib";
+        std::string ext = ".so";
 #endif
+        return pre + input + IE_BUILD_POSTFIX + ext;
     }
 
     static std::string make_plugin_name(const std::string & input) {
@@ -161,7 +167,7 @@ public:
         }
     }
 
-    void compare(InferenceEngine::Blob &res, InferenceEngine::Blob &ref, float max_diff = 0.01f) {
+    static void compare(InferenceEngine::Blob &res, InferenceEngine::Blob &ref, float max_diff = 0.01f) {
 
         float *res_ptr = res.buffer().as<float*>();
         size_t res_size = res.size();
@@ -176,7 +182,7 @@ public:
         }
     }
 
-    void compare_NRMSD(InferenceEngine::Blob &res, InferenceEngine::Blob &ref, float max_nrmsd = 0.01f) {
+    static void compare_NRMSD(InferenceEngine::Blob &res, InferenceEngine::Blob &ref, float max_nrmsd = 0.01f) {
 
         float *res_ptr = res.buffer().as<float*>();
         size_t res_size = res.size();
@@ -195,8 +201,8 @@ public:
             sqr *= sqr;
             sum += sqr;
 
-            mmin = std::min(mmin, ref_ptr[i]);
-            mmax = std::max(mmax, ref_ptr[i]);
+            mmin = (std::min)(mmin, ref_ptr[i]);
+            mmax = (std::max)(mmax, ref_ptr[i]);
 
             if (i % 10007 == 0) {
                 std::cout << i << ": " << res_ptr[i] << "\t" << ref_ptr[i] << "\t" << "\tdiv: " << ref_ptr[i] / res_ptr[i] << std::endl;
@@ -212,7 +218,7 @@ public:
         ASSERT_LE(sum, max_nrmsd);
     }
 
-    void compare(float* res, float* ref, size_t size, float max_diff = 0.01f) {
+    static void compare(float* res, float* ref, size_t size, float max_diff = 0.01f) {
         for (size_t i = 0; i < size; i++) {
             ASSERT_NEAR(res[i], ref[i], max_diff);
         }
index 1c1c8a6..387d5a6 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 2225f12..b23e726 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 14c883f..dbfa50c 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 6d98af7..3a44889 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 12f7a88..7448c99 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 3e349ef..90b7d73 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -88,6 +87,10 @@ namespace testing {
             return _content;
         }
 
+        void add_content (std::string content) {
+            _content += content;
+        }
+
         std::string attr () const {
             return _attr;
         }
index 94ac84d..75cc131 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -15,6 +14,7 @@
 #include <fstream>
 #include <stdio.h>
 #include "cpp/ie_cnn_network.h"
+#include <gtest/gtest.h>
 #include "ie_icnn_network_stats.hpp"
 
 namespace testing {
@@ -45,6 +45,10 @@ namespace testing {
             return parser->Parse(*_root);
         }
 
+        void setWeights(const InferenceEngine::TBlob<uint8_t>::Ptr &weights) {
+            parser->SetWeights(weights);
+        }
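This pass-through lets tests inject synthetic weights into the parser before `Parse()` runs. The fragment below mirrors the pattern the network-builder tests further down use with CNNNetReader (the blob size is illustrative, and the helper-object name is assumed):

```cpp
// Fragment, not a full test: allocate a raw U8 weights blob, fill it, and
// hand it to the parser through the new hook.
InferenceEngine::TBlob<uint8_t>::Ptr weights(
        new InferenceEngine::TBlob<uint8_t>(InferenceEngine::Precision::U8,
                                            InferenceEngine::C, {9728}));
weights->allocate();
fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
reader_helper.setWeights(weights);  // "reader_helper" is this test class; name assumed
```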
+
         std::string readFileContent(const std::string & filePath) {
             const auto openFlags = std::ios_base::ate | std::ios_base::binary;
             std::ifstream fp (getXmlPath(filePath), openFlags);
index 892881c..45f9672 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -10,17 +9,6 @@
 
 using namespace ::testing;
 
-IDManager* IDManager::_instance = nullptr;
-size_t  IDManager::portID = 0;
-size_t  IDManager::layerID = 0;
-
-IDManager* IDManager::getInstance() {
-    if (!_instance) {
-        _instance = new IDManager();
-    }
-    return _instance;
-}
-
 size_t  IDManager::getNextLayerID() {
     return layerID++;
 }
@@ -33,16 +21,15 @@ void IDManager::reset() {
     portID = layerID = 0;
 }
 
-LayerDesc::LayerDesc(std::string type, InOutData& shapes) : _type(std::move(type)) {
-    auto idManager = IDManager::getInstance();
-    _layerID = idManager->getNextLayerID();
+LayerDesc::LayerDesc(std::string type, InOutData& shapes, IDManager &id_manager) : _type(std::move(type)) {
+    _layerID = id_manager.getNextLayerID();
     auto inDims = shapes.inDims;
     auto outDims = shapes.outDims;
     for (const auto& inDim : inDims) {
-        _inPortsID.emplace_back(idManager->getNextPortID(), inDim);
+        _inPortsID.emplace_back(id_manager.getNextPortID(), inDim);
     }
     for (const auto& outDim : outDims) {
-        _outPortsID.emplace_back(idManager->getNextPortID(), outDim);
+        _outPortsID.emplace_back(id_manager.getNextPortID(), outDim);
     }
 }
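Taken together with the header changes below, this removes the process-wide IDManager singleton: every XmlNetBuilder now owns its own counter object and threads it into each LayerDesc, so two builders can no longer interleave (or leak) IDs through shared statics. A minimal sketch of the new shape, assuming only the counters matter:

```cpp
// Minimal sketch of the refactor: IDs come from an injected per-builder
// manager rather than a static singleton.
#include <cassert>
#include <cstddef>

class IDManager {
public:
    size_t getNextLayerID() { return layerID++; }
    size_t getNextPortID()  { return portID++; }
    void reset()            { portID = layerID = 0; }
private:
    size_t layerID = 0;
    size_t portID = 0;
};

struct LayerDesc {
    size_t id;
    explicit LayerDesc(IDManager &ids) : id(ids.getNextLayerID()) {}
};

int main() {
    IDManager a, b;                 // two builders, two independent sequences
    LayerDesc a0(a), a1(a), b0(b);
    assert(a0.id == 0 && a1.id == 1 && b0.id == 0);  // no cross-builder coupling
}
```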
 
index 56ede0f..81fa21d 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -81,37 +80,28 @@ struct TokenType<0> {
  */
 class IDManager {
 public:
-    /**
-     * @brief Returns single instanse of the class
-     */
-    static IDManager* getInstance();
-
-    IDManager(IDManager const&) = delete;
-
+    IDManager() = default;
+//    IDManager(IDManager const&) = delete;
     void operator=(IDManager const&)  = delete;
 
     /**
      * @brief Returns new unique number for layer to be used in IR
      */
-    static size_t getNextLayerID();
+    size_t getNextLayerID();
 
     /**
      * @brief Returns new unique number for port to be used in IR
      */
-    static size_t getNextPortID();
+    size_t getNextPortID();
 
     /**
      * @brief Resets layer and port numbering. It's convenient for every new network to start numbering from zero.
      */
-    static void reset();
+    void reset();
 
 private:
-    IDManager() = default;
-
-private:
-    static size_t layerID;
-    static size_t portID;
-    static IDManager* _instance;
+    size_t layerID = 0;
+    size_t portID = 0;
 };
 
 /**
@@ -147,7 +137,7 @@ public:
      * @param type - string with type of the layer
      * @param shapes - reference to the structure with input and output shapes
      */
-    explicit LayerDesc(std::string type, InOutData& shapes);
+    explicit LayerDesc(std::string type, InOutData& shapes, IDManager &id_manager);
 
     /**
      * @brief Resets current input and output ports to iterate over all input and output ports
@@ -227,15 +217,31 @@ class XmlNetBuilder {
     std::vector<LayerDesc::Ptr> layersDesc;
     std::shared_ptr<XMLFather> root;
     testing::Token<testing::Token<XMLFather>>& xml;
+    IDManager id_manager;
 
     XmlNetBuilder(std::shared_ptr<XMLFather> _root,
-                  typename testing::Token<testing::Token<XMLFather>>& _xml) : xml(_xml), root(_root) {
-        IDManager::reset();
-    };
+                  typename testing::Token<testing::Token<XMLFather>>& _xml) : xml(_xml), root(_root) {};
 
 public:
     static XmlNetBuilder buildNetworkWithOneInput(
-            std::string name = "AlexNet", std::vector<size_t> dims = {1, 3, 227, 227}, std::string precision = "Q78");
+            std::string name = "AlexNet", std::vector<size_t> dims = {1, 3, 227, 227}, std::string precision = "Q78") {
+        std::shared_ptr<XMLFather> root = std::make_shared<XMLFather>();
+        auto &exp = root->node("net").attr("name", name).attr("precision", precision).attr("version", Version);
+        if (Version == 1) {
+            auto &expFinal = exp.node("input").attr("name", "data");
+            addDims(expFinal, dims);
+            return XmlNetBuilder(root, expFinal.close().node("layers"));
+        } else {
+            auto &expFinal = exp.attr("batch", 1);
+            return XmlNetBuilder(root, expFinal.node("layers")).addInputLayer(precision, dims);
+        }
+    }
+
+    static XmlNetBuilder buildBody() {
+        auto root = std::make_shared<XMLFather>(XMLFather::make_without_schema());
+        auto &exp = root->node("body");
+        return XmlNetBuilder(root, exp.node("layers"));
+    }
 
     XmlNetBuilder& havingLayers() {
         return *this;
@@ -281,15 +287,55 @@ public:
         return addLayer("Pooling", "", &params, inout, 0, 0, "pooling_data");
     }
 
+    struct TIPortMap { int from_l, from_p, to_l, to_p, axis, stride, start, end; };
+
+    XmlNetBuilder& TILayer(InOutData inout,
+                           std::string body,
+                           std::vector<TIPortMap> inMap,
+                           std::vector<TIPortMap> outMap,
+                           std::vector<TIPortMap> backMap) {
+        auto builder = XMLFather::make_without_schema();
+        // Port map section
+        auto &ports = builder.node("port_map");
+        auto fill_port_map_info = [&] (std::string name, TIPortMap m) {
+            auto & exp =  ports.node(name)
+                    .attr("external_port_id", m.from_p)
+                    .attr("internal_layer_id", m.to_l)
+                    .attr("internal_port_id", m.to_p);
+            if (m.axis != -1)
+                exp.attr("axis", m.axis).attr("stride", m.stride).attr("start", m.start).attr("end", m.end);
+            exp.close();
+        };
+        for (auto &m : inMap)  fill_port_map_info("input", m);
+        for (auto &m : outMap) fill_port_map_info("output", m);
+        ports.close();
+        // BackEdge map section
+        auto &backedges = builder.node("back_edges");
+        for (auto &m : backMap) {
+            backedges.node("edge")
+                    .attr("from-layer", m.from_l)
+                    .attr("from-port", m.from_p)
+                    .attr("to-layer", m.to_l)
+                    .attr("to-port", m.to_p).close();
+        }
+        backedges.close();
+        // Serialize all TI info
+        std::string content = builder;
+        content += body;
+
+        return addLayer("TensorIterator", "FP32", nullptr, inout, 0,0, "data", content);
+    }
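`TILayer` assembles the three XML sections a TensorIterator needs: a `port_map` routing external ports to layers inside the pre-built body (with optional axis/stride/start/end slicing), a `back_edges` section for the recurrent connections, and the serialized body itself, spliced in via the new `add_content`. How the `TIPortMap` entries read (values are illustrative; `axis == -1` suppresses the slicing attributes, as the lambda above shows):

```cpp
// Illustrative TIPortMap values only — field order is
// {from_l, from_p, to_l, to_p, axis, stride, start, end}.
std::vector<TIPortMap> inMap   = {{0, 0, 0, 0, /*axis*/1, /*stride*/1, /*start*/0, /*end*/-1}};
std::vector<TIPortMap> outMap  = {{0, 1, 1, 1, -1, 0, 0, 0}};  // no slicing attributes
std::vector<TIPortMap> backMap = {{1, 2, 0, 1, -1, 0, 0, 0}};
```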
+
     XmlNetBuilder& addLayer(const std::string& type,
                             const std::string& precision,
                             std::map<std::string, std::string>* params,
                             InOutData inout,
                             int weightsSize = 0,
                             int biasesSize = 0,
-                            std::string layerDataName = "data") {
+                            std::string layerDataName = "data",
+                            std::string content = "") {
         layersNum++;
-        auto layerDesc = std::make_shared<LayerDesc>(type, inout);
+        auto layerDesc = std::make_shared<LayerDesc>(type, inout, id_manager);
         layersDesc.push_back(layerDesc);
 
         auto& layer = xml.node("layer").attr("name", layerDesc->getLayerName()).attr("precision", precision)
@@ -308,6 +354,8 @@ public:
                 layer = layer.node("biases").attr("offset", weightsSize).attr("size", biasesSize).close();
             }
         }
+        if (!content.empty())
+            layer.add_content(content);
         layer.close();
         return *this;
     }
@@ -384,7 +432,7 @@ private:
 
     template<class T>
     void addEdges(T& mainContent) {
-        size_t firstLayerNum = Version == 2 ? 0 : 1;
+        size_t firstLayerNum = Version >= 2 ? 0 : 1;
         if (layersNum <= firstLayerNum) {
             return;
         }
@@ -405,33 +453,13 @@ private:
     template<class T>
     void addPreProcess(T& mainContent) {
         auto& preProcess = mainContent.node("pre-process");
-        if (Version == 2) {
+        if (Version >= 2) {
             preProcess.attr("reference-layer-name", layersDesc[0]->getLayerName());
         }
         preProcess.close();
     }
 };
 
-template<>
-inline XmlNetBuilder<1> XmlNetBuilder<1>::buildNetworkWithOneInput(
-        std::string name, std::vector<size_t> dims, std::string precision) {
-    std::shared_ptr<XMLFather> root = std::make_shared<XMLFather>();
-
-    auto& exp = root->node("net").attr("name", name).attr("precision", precision).attr("version", 1)
-            .node("input").attr("name", "data");
-    addDims(exp, dims);
-    return XmlNetBuilder(root, exp.close().node("layers"));
-}
-
-template<>
-inline XmlNetBuilder<2> XmlNetBuilder<2>::buildNetworkWithOneInput(
-        std::string name, std::vector<size_t> dims, std::string precision) {
-    std::shared_ptr<XMLFather> root = std::make_shared<XMLFather>();
-
-    auto& exp = root->node("net").attr("name", name).attr("precision", precision).attr("version", 2).attr("batch", 1);
-    return XmlNetBuilder(root, exp.node("layers")).addInputLayer(precision, dims);
-}
-
 typedef XmlNetBuilder<1> V1NetBuilder;
 typedef XmlNetBuilder<2> V2NetBuilder;
 
index f991c70..dc1edfb 100644 (file)
@@ -1,6 +1,7 @@
 # Copyright (C) 2018 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
+
 set (TARGET_NAME "mock_engine")
 
 file (GLOB LIBRARY_SRC
@@ -15,9 +16,8 @@ file (GLOB LIBRARY_HEADERS
 
 if(UNIX)
     list(REMOVE_ITEM LIBRARY_SRC ${CMAKE_CURRENT_SOURCE_DIR}/dllmain.cpp)
-else()
-    add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_API)
 endif()
+add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_API)
 
 # Create named folders for the sources within the .vcproj
 # Empty name lists them directly under the .vcproj
@@ -38,4 +38,4 @@ add_library(${TARGET_NAME} SHARED
 
 set_property(TARGET ${TARGET_NAME} PROPERTY CXX_STANDARD 11)
 set_property(TARGET ${TARGET_NAME} PROPERTY CXX_STANDARD_REQUIRED ON)
-set_property(TARGET ${TARGET_NAME} PROPERTY COMPILE_PDB_NAME ${TARGET_NAME})
\ No newline at end of file
+set_property(TARGET ${TARGET_NAME} PROPERTY COMPILE_PDB_NAME ${TARGET_NAME})
index 5d750ae..a9dd58a 100644 (file)
@@ -1,8 +1,7 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
-
+// dllmain.cpp : Defines the entry point for the DLL application.
 #ifdef _WIN32
 #define _WINSOCKAPI_
 #include <windows.h>
index e7b95ab..0d344c8 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index b67666f..9706381 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 9545650..fa2d9de 100644 (file)
@@ -1,6 +1,7 @@
 // Copyright (C) 2018 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
+
 #include <random>
 #include <algorithm>
 
index 0c6b15f..4761e83 100644 (file)
@@ -1,6 +1,7 @@
 # Copyright (C) 2018 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
+
 cmake_minimum_required(VERSION 2.8)
 cmake_policy(SET CMP0054 NEW)
 
@@ -39,15 +40,24 @@ file(GLOB
         stress_tests/*.cpp
         )
 
-if (ENABLE_MKL_DNN)
-    if (THREADING STREQUAL "OMP")
-        find_package(OpenMP)
-        if (OPENMP_FOUND)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-        endif ()
-    endif ()
+if (ENABLE_GNA)
+    file(GLOB
+            GNA_TESTS
+            engines/gna/*cpp
+            )
+    list(APPEND TEST_SRC ${GNA_TESTS})
+    source_group("gna" FILES ${GNA_TESTS})
+
+    find_package(libGNA)
+    include_directories(${libGNA_INCLUDE_DIRS})
 
+    set (GNA_TEST_ENGINE GNAPlugin_test_static)
+endif()
+
+if (ENABLE_MKL_DNN)
+    if (GEMM STREQUAL "MKL")
+        add_definitions(-DUSE_MKL)
+    endif ()
     file(GLOB
             MKLDNN_TESTS
             engines/mkldnn/*.cpp
@@ -87,10 +97,14 @@ include_directories(
         ${IE_MAIN_SOURCE_DIR}/include
         ${IE_MAIN_SOURCE_DIR}/src/inference_engine
         ${IE_MAIN_SOURCE_DIR}/src/mkldnn_plugin
+        ${IE_MAIN_SOURCE_DIR}/src/gna_plugin
         ${IE_MAIN_SOURCE_DIR}/src/extension
+        ${IE_MAIN_SOURCE_DIR}/src/extension/common
         ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}/gflags/include
         mocks)
 add_executable(${TARGET_NAME} ${TEST_SRC} ${TEST_INCLUDE} ${MKLDNN_TESTS} ${MKLDNN_TESTS_INCLUDE} ${DLAI_TESTS})
+set_ie_threading_interface_for(${TARGET_NAME})
+
 set_target_properties(${TARGET_NAME} PROPERTIES "CMAKE_CXX_FLAGS" "${CMAKE_CXX_FLAGS} -fPIE"
 COMPILE_PDB_NAME ${TARGET_NAME})
 
@@ -111,11 +125,9 @@ else ()
     set(PUGI pugixml)
 endif ()
 
-add_definitions(-DMODELS_PATH="${MODELS_PATH}" -DDATA_PATH="${IE_MAIN_SOURCE_DIR}/tests/data")
+add_definitions(-DMODELS_PATH=\"${MODELS_PATH}\" -DDATA_PATH=\"${IE_MAIN_SOURCE_DIR}/tests/data\")
 
-target_compile_definitions(${TARGET_NAME} PUBLIC -DUSE_STATIC_IE)
-
-target_link_libraries(${TARGET_NAME}
+target_link_libraries(${TARGET_NAME} PRIVATE
         gtest
         gmock
         gtest_main
@@ -128,10 +140,13 @@ target_link_libraries(${TARGET_NAME}
         ${INTEL_ITT_LIBS}
         ${Boost_REGEX_LIBRARY}
         ${TBB_LIBRARY}
-        ${TBBMALLOC_LIBRARY})
+        ${TBBMALLOC_LIBRARY}
+        ${GNA_TEST_ENGINE})
+
+add_dependencies(${TARGET_NAME} ie_cpu_extension)
 
 if (ENABLE_MKL_DNN)
-    target_link_libraries(${TARGET_NAME}
+    target_link_libraries(${TARGET_NAME} PRIVATE
             test_MKLDNNPlugin
             mkldnn)
 endif ()
@@ -140,3 +155,6 @@ add_test(NAME ${TARGET_NAME}
         COMMAND ${TARGET_NAME})
 
 add_dependencies(${TARGET_NAME} mock_engine)
+
+# GAPI unit tests
+add_subdirectory(opencv_test_gapi)
diff --git a/inference-engine/tests/unit/builders/batch_normalization_layer_test.cpp b/inference-engine/tests/unit/builders/batch_normalization_layer_test.cpp
new file mode 100644 (file)
index 0000000..5d55c17
--- /dev/null
@@ -0,0 +1,36 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gtest/gtest.h>
+#include <string.h>
+#include <ie_builders.hpp>
+#include <builders/ie_batch_normalization_layer.hpp>
+
+#include "builder_test.hpp"
+
+using namespace testing;
+using namespace InferenceEngine;
+
+class BatchNormalizationLayerBuilderTest : public BuilderTestCommon {};
+
+TEST_F(BatchNormalizationLayerBuilderTest, cannotCreateBatchNormalizationWithoutWeightOrBiases) {
+    ASSERT_THROW(((Builder::Layer)Builder::BatchNormalizationLayer("in1")), InferenceEngine::details::InferenceEngineException);
+    ASSERT_THROW(((Builder::Layer)Builder::BatchNormalizationLayer("in1")
+            .setWeights(generateBlob(Precision::FP32, {3}, Layout::C))), InferenceEngine::details::InferenceEngineException);
+    ASSERT_THROW(((Builder::Layer)Builder::BatchNormalizationLayer("in1")
+            .setBiases(generateBlob(Precision::FP32, {3}, Layout::C))), InferenceEngine::details::InferenceEngineException);
+}
+
+TEST_F(BatchNormalizationLayerBuilderTest, getExistsLayerFromNetworkBuilder) {
+    Builder::Network network("Test");
+    Builder::BatchNormalizationLayer bnBuilder("bn");
+    bnBuilder.setWeights(generateBlob(Precision::FP32, {3}, Layout::C));
+    bnBuilder.setBiases(generateBlob(Precision::FP32, {3}, Layout::C));
+    size_t bnId = network.addLayer(bnBuilder);
+    Builder::BatchNormalizationLayer bnBuilderFromNetwork(network.getLayer(bnId));
+    ASSERT_EQ(bnBuilderFromNetwork.getEpsilon(), bnBuilder.getEpsilon());
+    bnBuilderFromNetwork.setEpsilon(2);
+    ASSERT_NE(bnBuilderFromNetwork.getEpsilon(), bnBuilder.getEpsilon());
+    ASSERT_EQ(bnBuilderFromNetwork.getEpsilon(), network.getLayer(bnId).getParameters()["epsilon"].asFloat());
+}
\ No newline at end of file
diff --git a/inference-engine/tests/unit/builders/builder_test.hpp b/inference-engine/tests/unit/builders/builder_test.hpp
new file mode 100644 (file)
index 0000000..28ef342
--- /dev/null
@@ -0,0 +1,33 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <string.h>
+#include <ie_builders.hpp>
+#include <blob_factory.hpp>
+
+#include "tests_common.hpp"
+
+
+class BuilderTestCommon : public TestsCommon {
+public:
+    InferenceEngine::Blob::Ptr generateBlob(InferenceEngine::Precision precision,
+                                            InferenceEngine::SizeVector dims, InferenceEngine::Layout layout) {
+        InferenceEngine::Blob::Ptr blob = make_blob_with_precision(InferenceEngine::TensorDesc(precision, dims, layout));
+        blob->allocate();
+        fill_data(blob);
+        return blob;
+    }
+
+    template<class T>
+    InferenceEngine::Blob::Ptr generateBlob(InferenceEngine::Precision precision,
+                                            InferenceEngine::SizeVector dims, InferenceEngine::Layout layout,
+                                            std::vector<T> data) {
+        auto blob = generateBlob(precision, dims, layout);
+        auto *blbData = blob->buffer().as<T *>();
+        for (size_t i = 0; i < data.size(); i++) {
+            blbData[i] = data[i];
+        }
+        return blob;
+    }
+};
\ No newline at end of file
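The two `generateBlob` overloads cover the common cases in the tests that follow: a random-filled blob given precision/dims/layout, and one primed with explicit values. Typical calls, matching the AlexNet fixture below:

```cpp
// Usage as in the tests below: random weights, plus an explicit-value variant
// (the second call exercises the templated overload).
auto weights = generateBlob(InferenceEngine::Precision::FP32,
                            {96, 3, 11, 11}, InferenceEngine::Layout::OIHW);
auto scales  = generateBlob<float>(InferenceEngine::Precision::FP32,
                                   {3}, InferenceEngine::Layout::C, {1.f, 2.f, 3.f});
```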
diff --git a/inference-engine/tests/unit/builders/input_layer_test.cpp b/inference-engine/tests/unit/builders/input_layer_test.cpp
new file mode 100644 (file)
index 0000000..6a30fdb
--- /dev/null
@@ -0,0 +1,32 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gtest/gtest.h>
+#include <string.h>
+#include <ie_builders.hpp>
+
+#include "builder_test.hpp"
+
+using namespace testing;
+using namespace InferenceEngine;
+
+class InputLayerBuilderTest : public BuilderTestCommon {};
+
+TEST_F(InputLayerBuilderTest, cannotCreateInputWithoutPort) {
+    ASSERT_THROW(((Builder::Layer)Builder::InputLayer("in1")).build(), InferenceEngine::details::InferenceEngineException);
+}
+
+TEST_F(InputLayerBuilderTest, getExistsLayerFromNetworkBuilder) {
+    Builder::Network network("Test");
+    Builder::InputLayer inBuilder("in1");
+    inBuilder.setPort(Port({1, 3, 3, 3}));
+    size_t inId = network.addLayer(inBuilder);
+    ASSERT_EQ(inBuilder.getPort().shape(), Port({1, 3, 3, 3}).shape());
+    Builder::InputLayer inBuilderFromNetwork(network.getLayer(inId));
+    ASSERT_EQ(inBuilderFromNetwork.getPort().shape(), Port({1, 3, 3, 3}).shape());
+    inBuilderFromNetwork.setPort(Port({1, 3, 4, 4}));
+    ASSERT_EQ(inBuilderFromNetwork.getPort().shape(), Port({1, 3, 4, 4}).shape());
+    ASSERT_EQ(network.getLayer(inId).getOutputPorts()[0].shape(), Port({1, 3, 4, 4}).shape());
+    ASSERT_EQ(inBuilder.getPort().shape(), Port({1, 3, 3, 3}).shape());
+}
\ No newline at end of file
diff --git a/inference-engine/tests/unit/builders/network_builder_test.cpp b/inference-engine/tests/unit/builders/network_builder_test.cpp
new file mode 100644 (file)
index 0000000..3b53f12
--- /dev/null
@@ -0,0 +1,927 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gtest/gtest.h>
+#include <string.h>
+#include <ie_builders.hpp>
+
+
+#include "builder_test.hpp"
+
+using namespace testing;
+using namespace InferenceEngine;
+
+class NetworkBuilderTest : public BuilderTestCommon {
+protected:
+    std::vector<std::string> alexNetNames = {
+            "in1",
+            "mean",
+            "conv1",
+            "relu1",
+            "norm1",
+            "pool1",
+            "conv2",
+            "relu2",
+            "norm2",
+            "pool2",
+            "conv3",
+            "relu3",
+            "conv4",
+            "relu4",
+            "conv5",
+            "relu5",
+            "pool5",
+            "fc6",
+            "relu6",
+            "fc7",
+            "relu7",
+            "fc8",
+            "prob",
+            "sf_out"
+    };
+
+public:
+
+    Builder::Network prepateAlexnetBuilder() {
+        Context ctx;
+        Builder::Network builder(ctx, "AlexNet");
+        idx_t layerId = builder.addLayer(Builder::InputLayer(alexNetNames[0]).setPort(Port({1,3, 227, 227})));
+        layerId = builder.addLayer({{layerId}}, Builder::ScaleShiftLayer(alexNetNames[1]).setBiases(generateBlob(Precision::FP32, {3}, Layout::C)));
+        layerId = builder.addLayer({{layerId}}, Builder::ConvolutionLayer(alexNetNames[2]).setKernel({11, 11}).setStrides({4, 4}).setOutDepth(96)
+                .setWeights(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))
+                .setBiases(generateBlob(Precision::FP32, {96}, Layout::C)));
+        layerId = builder.addLayer({{layerId}}, Builder::ReLULayer(alexNetNames[3]));
+        layerId = builder.addLayer({{layerId}}, Builder::NormLayer(alexNetNames[4]).setAlpha(9.999999747378752e-05f).setBeta(0.75f).setSize(5).setAcrossMaps(true));
+        layerId = builder.addLayer({{layerId}}, Builder::PoolingLayer(alexNetNames[5]).setExcludePad(false).setKernel({3, 3}).setPaddingsBegin({0, 0})
+                .setPaddingsEnd({0, 0}).setPoolingType(Builder::PoolingLayer::PoolingType::MAX).setStrides({2, 2}));
+        layerId = builder.addLayer({{layerId}}, Builder::ConvolutionLayer(alexNetNames[6]).setKernel({5, 5}).setStrides({1, 1}).setOutDepth(256)
+                .setPaddingsBegin({2, 2}).setPaddingsEnd({2, 2}).setGroup(2).setDilation({1, 1})
+                .setWeights(generateBlob(Precision::FP32, {96, 256, 5, 5}, Layout::OIHW))
+                .setBiases(generateBlob(Precision::FP32, {256}, Layout::C)));
+        layerId = builder.addLayer({{layerId}}, Builder::ReLULayer(alexNetNames[7]));
+        layerId = builder.addLayer({{layerId}}, Builder::NormLayer(alexNetNames[8]).setAlpha(9.999999747378752e-05f).setBeta(0.75f).setSize(5).setAcrossMaps(true));
+        layerId = builder.addLayer({{layerId}}, Builder::PoolingLayer(alexNetNames[9]).setExcludePad(false).setKernel({3, 3}).setPaddingsBegin({0, 0})
+                .setPaddingsEnd({0, 0}).setPoolingType(Builder::PoolingLayer::PoolingType::MAX).setStrides({2, 2}));
+        layerId = builder.addLayer({{layerId}}, Builder::ConvolutionLayer(alexNetNames[10]).setKernel({3, 3}).setStrides({1, 1}).setOutDepth(384)
+                .setPaddingsBegin({1, 1}).setPaddingsEnd({1, 1}).setGroup(1).setDilation({1, 1})
+                .setWeights(generateBlob(Precision::FP32, {256, 384, 3, 3}, Layout::OIHW))
+                .setBiases(generateBlob(Precision::FP32, {384}, Layout::C)));
+        layerId = builder.addLayer({{layerId}}, Builder::ReLULayer(alexNetNames[11]));
+        layerId = builder.addLayer({{layerId}}, Builder::ConvolutionLayer(alexNetNames[12]).setKernel({3, 3}).setStrides({1, 1}).setOutDepth(384)
+                .setPaddingsBegin({1, 1}).setPaddingsEnd({1, 1}).setGroup(2).setDilation({1, 1})
+                .setWeights(generateBlob(Precision::FP32, {384, 384, 3, 3}, Layout::OIHW))
+                .setBiases(generateBlob(Precision::FP32, {384}, Layout::C)));
+        layerId = builder.addLayer({{layerId}}, Builder::ReLULayer(alexNetNames[13]));
+        layerId = builder.addLayer({{layerId}}, Builder::ConvolutionLayer(alexNetNames[14]).setKernel({3, 3}).setStrides({1, 1}).setOutDepth(256)
+                .setPaddingsBegin({1, 1}).setPaddingsEnd({1, 1}).setGroup(2).setDilation({1, 1})
+                .setWeights(generateBlob(Precision::FP32, {256, 384, 3, 3}, Layout::OIHW))
+                .setBiases(generateBlob(Precision::FP32, {384}, Layout::C)));
+        layerId = builder.addLayer({{layerId}}, Builder::ReLULayer(alexNetNames[15]));
+        layerId = builder.addLayer({{layerId}}, Builder::PoolingLayer(alexNetNames[16]).setExcludePad(false).setKernel({3, 3}).setPaddingsBegin({0, 0})
+                .setPaddingsEnd({0, 0}).setPoolingType(Builder::PoolingLayer::PoolingType::MAX).setStrides({2, 2}));
+        layerId = builder.addLayer({{layerId}}, Builder::FullyConnectedLayer(alexNetNames[17]).setOutputNum(4096)
+                .setWeights(generateBlob(Precision::FP32, {4096, 256, 6, 6}, Layout::OIHW))
+                .setBiases(generateBlob(Precision::FP32, {4096}, Layout::C)));
+        layerId = builder.addLayer({{layerId}}, Builder::ReLULayer(alexNetNames[18]));
+        layerId = builder.addLayer({{layerId}}, Builder::FullyConnectedLayer(alexNetNames[19]).setOutputNum(4096)
+                .setWeights(generateBlob(Precision::FP32, {4096, 4096}, Layout::NC))
+                .setBiases(generateBlob(Precision::FP32, {4096}, Layout::C)));
+        layerId = builder.addLayer({{layerId}}, Builder::ReLULayer(alexNetNames[20]));
+        layerId = builder.addLayer({{layerId}}, Builder::FullyConnectedLayer(alexNetNames[21]).setOutputNum(1000)
+                .setWeights(generateBlob(Precision::FP32, {1000, 4096}, Layout::NC))
+                .setBiases(generateBlob(Precision::FP32, {1000}, Layout::C)));
+        layerId = builder.addLayer({{layerId}}, Builder::SoftMaxLayer(alexNetNames[22]).setAxis(1));
+
+        idx_t outputId = builder.addLayer({PortInfo(layerId)}, Builder::OutputLayer(alexNetNames[23]));
+        return builder;
+    }
+
+    const INetwork::Ptr createAlexnet() {
+        return prepateAlexnetBuilder().build();
+    }
+
+    void compareWithICNNNetwork(const INetwork& network, const ICNNNetwork& cnnNetwork) {
+        for (const auto& layer : network) {
+            auto connections = network.getLayerConnections(layer->getId());
+            CNNLayerPtr cnnLayer;
+            StatusCode sts = cnnNetwork.getLayerByName(layer->getName().c_str(), cnnLayer, nullptr);
+            if (sts != OK && layer->getType() == "Output")
+                continue;
+            else if (sts != OK)
+                THROW_IE_EXCEPTION << "Cannot find CNNLayer by name: " << layer->getName();
+
+
+            // Output connections
+            for (size_t i = 0; i < cnnLayer->outData.size(); i++) {
+                for (const auto& it : cnnLayer->outData[i]->inputTo) {
+                    size_t j = 0;
+                    for (; j < it.second->insData.size(); j++) {
+                        auto lockedData = it.second->insData[j].lock();
+                        if (lockedData && lockedData.get() == cnnLayer->outData[i].get()) {
+                            break;
+                        }
+                    }
+
+                    for (auto conIt = connections.begin(); conIt != connections.end(); conIt++) {
+                        if (conIt->from().layerId() == layer->getId() && conIt->from().portId() == i &&
+                            network.getLayer(conIt->to().layerId())->getName() == it.second->name &&
+                            conIt->to().portId() == j) {
+                            connections.erase(conIt);
+                            break;
+                        }
+                    }
+                }
+            }
+
+            // Input connections
+            for (size_t i = 0; i < cnnLayer->insData.size(); i++) {
+                auto inData = cnnLayer->insData[i].lock();
+                if (!inData)
+                    continue;
+                auto creatorLayer = inData->creatorLayer.lock();
+                if (!creatorLayer)
+                    continue;
+                size_t j = 0;
+                for (; j < creatorLayer->outData.size(); j++) {
+                    if (creatorLayer->outData[j] && creatorLayer->outData[j].get() == inData.get()) {
+                        break;
+                    }
+                }
+
+                for (auto conIt = connections.begin(); conIt != connections.end(); conIt++) {
+                    if (conIt->to().layerId() == layer->getId() && conIt->from().portId() == j &&
+                        network.getLayer(conIt->from().layerId())->getName() == creatorLayer->name &&
+                        conIt->to().portId() == i) {
+                        connections.erase(conIt);
+                        break;
+                    }
+                }
+            }
+
+            if (connections.size() == 1 && network.getLayer(connections[0].to().layerId())->getType() == "Output")
+                connections.erase(connections.begin());
+
+            if (!connections.empty())
+                THROW_IE_EXCEPTION << "Not all connections were matched.";
+        }
+    }
+
+    void compareICNNNetworks(const ICNNNetwork& newNetwork, const ICNNNetwork& oldNetwork) {
+        CNNNetwork network((ICNNNetwork*)&newNetwork);
+
+        if (newNetwork.layerCount() != oldNetwork.layerCount())
+            THROW_IE_EXCEPTION << "ICNNNetworks have different numbers of layers!";
+        for (const auto& layer : network) {
+            CNNLayerPtr oldLayer;
+            StatusCode sts = oldNetwork.getLayerByName(layer->name.c_str(), oldLayer, nullptr);
+            bool success = sts == OK && layer->name == oldLayer->name &&
+                    layer->type == oldLayer->type &&
+                    layer->insData.size() == oldLayer->insData.size() &&
+                    layer->outData.size() == oldLayer->outData.size() &&
+                    layer->precision == oldLayer->precision;
+
+            for (size_t i = 0; i < layer->insData.size() && success; i++) {
+                auto lockedOldData = oldLayer->insData[i].lock();
+                auto lockedData = layer->insData[i].lock();
+                success = success && lockedOldData->name == lockedData->name &&
+                          lockedOldData->getTensorDesc() == lockedData->getTensorDesc();
+            }
+            for (size_t i = 0; i < layer->outData.size() && success; i++) {
+                success = success && oldLayer->outData[i]->name == layer->outData[i]->name &&
+                        oldLayer->outData[i]->getTensorDesc() == layer->outData[i]->getTensorDesc();
+            }
+
+            if (!success)
+                THROW_IE_EXCEPTION << "ICNNNetworks have different layers!";
+        }
+
+        InputsDataMap newInput;
+        OutputsDataMap newOutput;
+        newNetwork.getInputsInfo(newInput);
+        newNetwork.getOutputsInfo(newOutput);
+        InputsDataMap oldInput;
+        OutputsDataMap oldOutput;
+        oldNetwork.getInputsInfo(oldInput);
+        oldNetwork.getOutputsInfo(oldOutput);
+
+        bool success = newInput.size() == oldInput.size();
+        for (const auto& it : newInput) {
+            if (!success)
+                break;
+            success = success && oldInput.find(it.first) != oldInput.end();
+        }
+        if (!success)
+            THROW_IE_EXCEPTION << "ICNNNetworks have different inputs!";
+
+        success = newOutput.size() == oldOutput.size();
+        for (const auto& it : newOutput) {
+            if (!success)
+                break;
+            success = success && oldOutput.find(it.first) != oldOutput.end();
+        }
+        if (!success)
+            THROW_IE_EXCEPTION << "ICNNNetworks have different outputs!";
+    }
+};
+
+TEST_F(NetworkBuilderTest, checkReshapeAlexNet) {
+    std::map<std::string, std::vector<SizeVector>> inPorts = {
+            {alexNetNames[0], {}},
+            {alexNetNames[1], {{1, 3, 227, 227}}},
+            {alexNetNames[2], {{1, 3, 227, 227}}},
+            {alexNetNames[3], {{1, 96, 55, 55}}},
+            {alexNetNames[4], {{1, 96, 55, 55}}},
+            {alexNetNames[5], {{1, 96, 55, 55}}},
+            {alexNetNames[6], {{1, 96, 27, 27}}},
+            {alexNetNames[7], {{1, 256, 27, 27}}},
+            {alexNetNames[8], {{1, 256, 27, 27}}},
+            {alexNetNames[9], {{1, 256, 27, 27}}},
+            {alexNetNames[10], {{1, 256, 13, 13}}},
+            {alexNetNames[11], {{1, 384, 13, 13}}},
+            {alexNetNames[12], {{1, 384, 13, 13}}},
+            {alexNetNames[13], {{1, 384, 13, 13}}},
+            {alexNetNames[14], {{1, 384, 13, 13}}},
+            {alexNetNames[15], {{1, 256, 13, 13}}},
+            {alexNetNames[16], {{1, 256, 13, 13}}},
+            {alexNetNames[17], {{1, 256, 6, 6}}},
+            {alexNetNames[18], {{1, 4096}}},
+            {alexNetNames[19], {{1, 4096}}},
+            {alexNetNames[20], {{1, 4096}}},
+            {alexNetNames[21], {{1, 4096}}},
+            {alexNetNames[22], {{1, 1000}}},
+            {alexNetNames[23], {{1, 1000}}}
+    };
+
+    std::map<std::string, std::vector<SizeVector>> outPorts = {
+            {alexNetNames[0], {{1, 3, 227, 227}}},
+            {alexNetNames[1], {{1, 3, 227, 227}}},
+            {alexNetNames[2], {{1, 96, 55, 55}}},
+            {alexNetNames[3], {{1, 96, 55, 55}}},
+            {alexNetNames[4], {{1, 96, 55, 55}}},
+            {alexNetNames[5], {{1, 96, 27, 27}}},
+            {alexNetNames[6], {{1, 256, 27, 27}}},
+            {alexNetNames[7], {{1, 256, 27, 27}}},
+            {alexNetNames[8], {{1, 256, 27, 27}}},
+            {alexNetNames[9], {{1, 256, 13, 13}}},
+            {alexNetNames[10], {{1, 384, 13, 13}}},
+            {alexNetNames[11], {{1, 384, 13, 13}}},
+            {alexNetNames[12], {{1, 384, 13, 13}}},
+            {alexNetNames[13], {{1, 384, 13, 13}}},
+            {alexNetNames[14], {{1, 256, 13, 13}}},
+            {alexNetNames[15], {{1, 256, 13, 13}}},
+            {alexNetNames[16], {{1, 256, 6, 6}}},
+            {alexNetNames[17], {{1, 4096}}},
+            {alexNetNames[18], {{1, 4096}}},
+            {alexNetNames[19], {{1, 4096}}},
+            {alexNetNames[20], {{1, 4096}}},
+            {alexNetNames[21], {{1, 1000}}},
+            {alexNetNames[22], {{1, 1000}}},
+            {alexNetNames[23], {}}
+    };
+
+    Builder::Network builder = prepateAlexnetBuilder();
+    for (const auto &layer : builder.getLayers()) {
+        if (layer.getType() == "Input") {
+            ASSERT_EQ(outPorts[layer.getName()][0], layer.getOutputPorts()[0].shape());
+        } else {
+            for (size_t j = 0; j < layer.getOutputPorts().size(); j++) {
+                ASSERT_TRUE(layer.getOutputPorts()[j].shape().empty());
+            }
+        }
+    }
+    INetwork::Ptr graph;
+    ASSERT_NO_THROW(graph = builder.build());
+    for (const auto &layer : *graph) {
+        for (size_t i = 0; i < layer->getInputPorts().size(); i++) {
+            ASSERT_EQ(inPorts[layer->getName()][i], layer->getInputPorts()[i].shape());
+        }
+        for (size_t i = 0; i < layer->getOutputPorts().size(); i++) {
+            ASSERT_EQ(outPorts[layer->getName()][i], layer->getOutputPorts()[i].shape());
+        }
+    }
+}
+
+TEST_F(NetworkBuilderTest, checkNoImplWithCorrectPorts) {
+    Context ctx;
+    Builder::Network builder(ctx, "TestAlexNet");
+    idx_t inId = builder.addLayer(Builder::InputLayer(alexNetNames[0]).setPort(Port({1,3, 227, 227})));
+    idx_t convId = builder.addLayer({{inId}}, Builder::ConvolutionLayer(alexNetNames[2]).setKernel({11, 11}).setStrides({4, 4}).setOutDepth(96)
+            .setInputPort(Port({1,3, 227, 227})).setOutputPort(Port({1, 96, 55, 55}))
+            .setWeights(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))
+            .setBiases(generateBlob(Precision::FP32, {96}, Layout::C)));
+    idx_t testLayerId = builder.addLayer({PortInfo(convId)}, Builder::Layer("TestLayer", "testPort")
+            .setInputPorts({Port({1, 96, 55, 55})}).setOutputPorts({Port({1, 96, 55, 55})}));
+    idx_t outputId = builder.addLayer({PortInfo(testLayerId)}, Builder::OutputLayer("out").setPort({Port({1, 96, 55, 55})}));
+
+    ASSERT_NO_THROW(builder.build());
+}
+
+TEST_F(NetworkBuilderTest, checkNoImplWithIncorrectPorts) {
+    Context ctx;
+    Builder::Network builder(ctx, "TestAlexNet");
+    idx_t inId = builder.addLayer(Builder::InputLayer(alexNetNames[0]).setPort(Port({1,3, 227, 227})));
+    idx_t convId = builder.addLayer({{inId}}, Builder::ConvolutionLayer(alexNetNames[2]).setKernel({11, 11}).setStrides({4, 4}).setOutDepth(96)
+            .setInputPort(Port({1,3, 227, 227})).setOutputPort(Port({1, 96, 55, 55}))
+            .setWeights(generateBlob(Precision::FP32, {96, 3, 11, 11}, Layout::OIHW))
+            .setBiases(generateBlob(Precision::FP32, {96}, Layout::C)));
+    idx_t testLayerId = builder.addLayer({PortInfo(convId)}, Builder::Layer("TestLayer", "testPort")
+            .setInputPorts({Port({1, 3, 55, 55})}).setOutputPorts({Port({1, 96, 55, 55})}));
+
+    ASSERT_THROW(builder.build(), InferenceEngine::details::InferenceEngineException);
+}
+
+TEST_F(NetworkBuilderTest, createNetworkIterator) {
+    const INetwork::Ptr graph = createAlexnet();
+
+    ASSERT_NO_THROW(graph->begin());
+}
+
+TEST_F(NetworkBuilderTest, checkNetworkSize) {
+    const INetwork::Ptr graph = createAlexnet();
+
+    ASSERT_EQ(24, graph->size());
+}
+
+TEST_F(NetworkBuilderTest, iterateNetworkForeach) {
+    const INetwork::Ptr graph = createAlexnet();
+
+    size_t idx = 0;
+    for (const auto& layer : *graph) {
+        ASSERT_NE(idx, alexNetNames.size());
+        ASSERT_EQ(alexNetNames[idx], layer->getName());
+        idx++;
+    }
+}
+
+TEST_F(NetworkBuilderTest, iterateNetworkFor) {
+    const INetwork::Ptr graph = createAlexnet();
+
+    size_t idx = 0;
+    for (auto it = graph->begin(); it != graph->end(); it++) {
+        ASSERT_EQ(alexNetNames[idx], (*it)->getName());
+        idx++;
+    }
+}
+
+TEST_F(NetworkBuilderTest, convertFromICNNNetwork) {
+    std::string model = R"V0G0N(
+<net name="PVANET" version="2" batch="1">
+    <layers>
+        <layer name="data" type="Input" precision="FP32" id="0">
+            <output>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>3</dim>
+                    <dim>544</dim>
+                    <dim>992</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="conv1_1_conv" type="Convolution" precision="FP32" id="2">
+            <convolution_data stride-x="2" stride-y="2" pad-x="3" pad-y="3" kernel-x="7" kernel-y="7" output="16" group="1"/>
+            <input>
+                <port id="2">
+                    <dim>1</dim>
+                    <dim>3</dim>
+                    <dim>544</dim>
+                    <dim>992</dim>
+                </port>
+            </input>
+            <output>
+                <port id="3">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </output>
+            <weights offset="0" size="9408"/>
+            <biases offset="9408" size="64"/>
+        </layer>
+        <layer name="conv1_1_neg" type="Power" precision="FP32" id="3">
+            <power_data power="1" scale="-1" shift="0"/>
+            <input>
+                <port id="4">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </input>
+            <output>
+                <port id="5">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="conv1_1_concat" type="Concat" precision="FP32" id="4">
+            <concat_data axis="1"/>
+            <input>
+                <port id="6">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+                <port id="7">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </input>
+            <output>
+                <port id="8">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="conv1_1_scale" type="ScaleShift" precision="FP32" id="5">
+            <input>
+                <port id="9">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </input>
+            <output>
+                <port id="10">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </output>
+            <weights offset="9472" size="128"/>
+            <biases offset="9600" size="128"/>
+        </layer>
+        <layer name="conv1_1_relu" type="ReLU" precision="FP32" id="6">
+            <data negative_slope="0" engine="caffe.ReLUParameter.DEFAULT"/>
+            <input>
+                <port id="11">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </input>
+            <output>
+                <port id="12">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="pool1" type="Pooling" precision="FP32" id="7">
+            <pooling_data kernel-x="3" kernel-y="3" pad-x="0" pad-y="0" stride-x="2" stride-y="2" rounding-type="ceil" pool-method="max"/>
+            <input>
+                <port id="13">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </input>
+            <output>
+                <port id="14">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>136</dim>
+                    <dim>248</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="0" from-port="0" to-layer="2" to-port="2"/>
+        <edge from-layer="2" from-port="3" to-layer="3" to-port="4"/>
+        <edge from-layer="2" from-port="3" to-layer="4" to-port="6"/>
+        <edge from-layer="3" from-port="5" to-layer="4" to-port="7"/>
+        <edge from-layer="4" from-port="8" to-layer="5" to-port="9"/>
+        <edge from-layer="5" from-port="10" to-layer="6" to-port="11"/>
+        <edge from-layer="6" from-port="12" to-layer="7" to-port="13"/>
+    </edges>
+</net>)V0G0N";
+
+    InferenceEngine::CNNNetReader net_reader;
+    ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
+
+    InferenceEngine::TBlob<uint8_t> *weights = new InferenceEngine::TBlob<uint8_t>(InferenceEngine::Precision::U8, InferenceEngine::C, {9728});
+    weights->allocate();
+    fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
+    InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
+
+    net_reader.SetWeights(weights_ptr);
+    INetwork::Ptr network = Builder::Network(net_reader.getNetwork()).build();
+
+    try {
+        compareWithICNNNetwork(*network, net_reader.getNetwork());
+    } catch (InferenceEngine::details::InferenceEngineException &ex) {
+        FAIL() << ex.what();
+    }
+}
+
+TEST_F(NetworkBuilderTest, convertFromICNNNetworkToICNNNetwork) {
+    std::string model = R"V0G0N(
+<net name="PVANET" version="2" batch="1">
+    <layers>
+        <layer name="data" type="Input" precision="FP32" id="0">
+            <output>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>3</dim>
+                    <dim>544</dim>
+                    <dim>992</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="conv1_1_conv" type="Convolution" precision="FP32" id="2">
+            <convolution_data stride-x="2" stride-y="2" pad-x="3" pad-y="3" kernel-x="7" kernel-y="7" output="16" group="1"/>
+            <input>
+                <port id="2">
+                    <dim>1</dim>
+                    <dim>3</dim>
+                    <dim>544</dim>
+                    <dim>992</dim>
+                </port>
+            </input>
+            <output>
+                <port id="3">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </output>
+            <weights offset="0" size="9408"/>
+            <biases offset="9408" size="64"/>
+        </layer>
+        <layer name="conv1_1_neg" type="Power" precision="FP32" id="3">
+            <power_data power="1" scale="-1" shift="0"/>
+            <input>
+                <port id="4">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </input>
+            <output>
+                <port id="5">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="conv1_1_concat" type="Concat" precision="FP32" id="4">
+            <concat_data axis="1"/>
+            <input>
+                <port id="6">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+                <port id="7">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </input>
+            <output>
+                <port id="8">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="conv1_1_scale" type="ScaleShift" precision="FP32" id="5">
+            <input>
+                <port id="9">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </input>
+            <output>
+                <port id="10">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </output>
+            <weights offset="9472" size="128"/>
+            <biases offset="9600" size="128"/>
+        </layer>
+        <layer name="conv1_1_relu" type="ReLU" precision="FP32" id="6">
+            <data negative_slope="0" engine="caffe.ReLUParameter.DEFAULT"/>
+            <input>
+                <port id="11">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </input>
+            <output>
+                <port id="12">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="pool1" type="Pooling" precision="FP32" id="7">
+            <pooling_data kernel-x="3" kernel-y="3" pad-x="0" pad-y="0" stride-x="2" stride-y="2" rounding-type="ceil" pool-method="max"/>
+            <input>
+                <port id="13">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </input>
+            <output>
+                <port id="14">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>136</dim>
+                    <dim>248</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="0" from-port="0" to-layer="2" to-port="2"/>
+        <edge from-layer="2" from-port="3" to-layer="3" to-port="4"/>
+        <edge from-layer="2" from-port="3" to-layer="4" to-port="6"/>
+        <edge from-layer="3" from-port="5" to-layer="4" to-port="7"/>
+        <edge from-layer="4" from-port="8" to-layer="5" to-port="9"/>
+        <edge from-layer="5" from-port="10" to-layer="6" to-port="11"/>
+        <edge from-layer="6" from-port="12" to-layer="7" to-port="13"/>
+    </edges>
+</net>)V0G0N";
+
+    InferenceEngine::CNNNetReader net_reader;
+    ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
+
+    InferenceEngine::TBlob<uint8_t> *weights = new InferenceEngine::TBlob<uint8_t>(InferenceEngine::Precision::U8, InferenceEngine::C, {9728});
+    weights->allocate();
+    fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
+    InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
+
+    net_reader.SetWeights(weights_ptr);
+    std::shared_ptr<ICNNNetwork> network = Builder::convertToICNNNetwork(Builder::Network(net_reader.getNetwork()).build());
+
+    try {
+        compareICNNNetworks(*network, net_reader.getNetwork());
+    } catch (InferenceEngine::details::InferenceEngineException &ex) {
+        FAIL() << ex.what();
+    }
+}
+
+TEST_F(NetworkBuilderTest, connectTwoNetworks) {
+    std::string model = R"V0G0N(
+<net name="PVANET" version="2" batch="1">
+    <layers>
+        <layer name="data" type="Input" precision="FP32" id="0">
+            <output>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>3</dim>
+                    <dim>544</dim>
+                    <dim>992</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="conv1_1_conv" type="Convolution" precision="FP32" id="2">
+            <convolution_data stride-x="2" stride-y="2" pad-x="3" pad-y="3" pad-r="3" pad-b="3" kernel-x="7" kernel-y="7" output="16" group="1"/>
+            <input>
+                <port id="2">
+                    <dim>1</dim>
+                    <dim>3</dim>
+                    <dim>544</dim>
+                    <dim>992</dim>
+                </port>
+            </input>
+            <output>
+                <port id="3">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </output>
+            <weights offset="0" size="9408"/>
+            <biases offset="9408" size="64"/>
+        </layer>
+        <layer name="conv1_1_neg" type="Power" precision="FP32" id="3">
+            <power_data power="1" scale="-1" shift="0"/>
+            <input>
+                <port id="4">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </input>
+            <output>
+                <port id="5">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="conv1_1_concat" type="Concat" precision="FP32" id="4">
+            <concat_data axis="1"/>
+            <input>
+                <port id="6">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+                <port id="7">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </input>
+            <output>
+                <port id="8">
+                    <dim>1</dim>
+                    <dim>32</dim>
+                    <dim>272</dim>
+                    <dim>496</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="0" from-port="0" to-layer="2" to-port="2"/>
+        <edge from-layer="2" from-port="3" to-layer="3" to-port="4"/>
+        <edge from-layer="2" from-port="3" to-layer="4" to-port="6"/>
+        <edge from-layer="3" from-port="5" to-layer="4" to-port="7"/>
+    </edges>
+</net>)V0G0N";
+
+    InferenceEngine::CNNNetReader net_reader;
+    ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
+
+    InferenceEngine::TBlob<uint8_t> *weights = new InferenceEngine::TBlob<uint8_t>(InferenceEngine::Precision::U8, InferenceEngine::C, {9472});
+    weights->allocate();
+    fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
+    InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
+
+    net_reader.SetWeights(weights_ptr);
+    Builder::Network originalNetwork(net_reader.getNetwork());
+    Builder::Network addNetwork(net_reader.getNetwork());
+
+    // Find output
+    idx_t lastLayerId(0);
+    for (const auto& layer : originalNetwork.getLayers()) {
+        if (layer.getType() != "Output")
+            continue;
+        const auto connections = originalNetwork.getLayerConnections(layer.getId());
+        ASSERT_EQ(1, connections.size());
+        ASSERT_EQ(layer.getId(), connections[0].to().layerId());
+        ASSERT_EQ(0, connections[0].from().portId());
+        lastLayerId = connections[0].from().layerId();
+        originalNetwork.disconnect(connections[0]);
+        originalNetwork.removeLayer(layer.getId());
+        break;
+    }
+
+    std::map<idx_t, idx_t> oldNewId;
+    for (const auto& layer : addNetwork.getLayers()) {
+        if (layer.getType() == "Input") {
+            oldNewId[layer.getId()] = lastLayerId;
+            continue;
+        }
+        oldNewId[layer.getId()] = originalNetwork.addLayer(layer);
+        const auto connections = addNetwork.getLayerConnections(layer.getId());
+        for (const auto& connection : connections) {
+            if (oldNewId.find(connection.from().layerId()) == oldNewId.end() ||
+                    oldNewId.find(connection.to().layerId()) == oldNewId.end())
+                continue;
+            originalNetwork.connect({oldNewId[connection.from().layerId()], connection.from().portId()},
+                    {oldNewId[connection.to().layerId()], connection.to().portId()});
+        }
+
+        if (layer.getType() == "Convolution") {
+            Builder::ConvolutionLayer(originalNetwork.getLayer(oldNewId[layer.getId()])).setWeights(generateBlob(Precision::FP32, {16, 32, 7, 7}, Layout::OIHW));
+        }
+    }
+    ASSERT_NO_THROW(originalNetwork.build());
+}
+
+TEST_F(NetworkBuilderTest, createLayersWithTheSameNames) {
+    InferenceEngine::Builder::Network netBuilder("");
+
+    // Connect the convolutional layer with its inputs and outputs.
+    InferenceEngine::Builder::InputLayer inpLayer("data");
+    inpLayer.setPort(InferenceEngine::Port({1, 1, 10, 10}));
+    auto inpLayerId = netBuilder.addLayer(inpLayer);
+
+    // Create convolutional layer
+    const size_t outCn = 1, inpCn = 1, kernelH = 3, kernelW = 3;
+    InferenceEngine::Builder::ConvolutionLayer ieLayer("conv1");
+
+    ieLayer.setKernel({outCn, inpCn, kernelH, kernelW});
+    ieLayer.setStrides({1, 1, 1, 1});
+    ieLayer.setDilation({1, 1, 1, 1});
+    ieLayer.setPaddingsBegin({0, 0, 0, 0});
+    ieLayer.setPaddingsEnd({0, 0, 0, 0});
+    ieLayer.setGroup(1);
+    ieLayer.setOutDepth(outCn);
+    auto convLayerId = netBuilder.addLayer({inpLayerId}, ieLayer);
+
+    // Connect the convolution layer with its output
+    InferenceEngine::Builder::OutputLayer outLayer("conv1");
+    auto convOutLayerId = netBuilder.addLayer({convLayerId}, outLayer);
+    ASSERT_NE(netBuilder.getLayer(convLayerId).getName(), netBuilder.getLayer(convOutLayerId).getName());
+    InferenceEngine::Builder::ReLULayer reLULayer("relu1");
+    reLULayer.setNegativeSlope(0);
+    auto reluLayerId = netBuilder.addLayer({convLayerId}, reLULayer);
+    InferenceEngine::Builder::OutputLayer outReLULayer("relu1");
+    auto reluOutLayerId = netBuilder.addLayer({reluLayerId}, outReLULayer);
+    ASSERT_NE(netBuilder.getLayer(reluLayerId).getName(), netBuilder.getLayer(reluOutLayerId).getName());
+
+    ASSERT_NO_THROW(netBuilder.build());
+}
+
+TEST_F(NetworkBuilderTest, RemoveLayerAndBuild) {
+    auto builder = prepateAlexnetBuilder();
+    builder.removeLayer(builder.getLayers()[2].getId());
+
+    ASSERT_THROW(builder.build(), InferenceEngine::details::InferenceEngineException);
+}
+
+TEST_F(NetworkBuilderTest, DocumentationExample) {
+    // Create graph with name
+    InferenceEngine::Builder::Network graph("Example1");
+
+    // Create network
+    // In-place add input layer
+    idx_t inputLayerId = graph.addLayer(Builder::InputLayer("in").setPort(Port({1, 3, 22, 22})));
+
+    // In-place add a ReLU layer builder with negative slope 0.1, connected to output port 0 of the Input layer builder
+    // Here layerId is the new Input layer builder's ID; the port index is omitted because 0 is the default ({layerId} == {layerId, 0})
+    idx_t relu1Id = graph.addLayer({{inputLayerId}}, Builder::ReLULayer("relu1").setNegativeSlope(0.1f));
+
+    // In-place add ScaleShift layer builder
+    InferenceEngine::Blob::Ptr blobWithScaleShiftBiases = make_shared_blob<float>(TensorDesc(Precision::FP32, {3}, Layout::C));
+    blobWithScaleShiftBiases->allocate();
+    auto *data = blobWithScaleShiftBiases->buffer().as<float *>();
+    data[0] = 1;
+    data[1] = 2;
+    data[2] = 3;
+    idx_t scaleShiftId = graph.addLayer(Builder::ScaleShiftLayer("scaleShift1").setBiases(blobWithScaleShiftBiases));
+
+    // Connect ScaleShift layer with relu1
+    graph.connect({relu1Id}, {scaleShiftId}); // Port indexes can also be set explicitly (0 is the default): graph.connect({layerId, outPortIdx}, {scaleShiftId, inPortIdx});
+
+    // Create a ReLU layer with negative slope 0.2 using the generic layer builder and connect it to scaleShift1
+    idx_t relu2Id = graph.addLayer({{scaleShiftId}}, Builder::Layer("ReLU", "relu2").setParameters({{"negative_slope", 0.2f}}).setOutputPorts({Port()}).setInputPorts({Port()}));
+
+    // Every branch in the graph must end with an Output layer, so create one
+    idx_t outId = graph.addLayer({{relu2Id, 0}}, Builder::OutputLayer("out"));
+
+    // Build original network
+    InferenceEngine::INetwork::Ptr finalNetwork = graph.build();
+    std::shared_ptr<InferenceEngine::ICNNNetwork> cnnNetwork = InferenceEngine::Builder::convertToICNNNetwork(finalNetwork);
+
+    // Modify network
+    // Remove relu2 layer from the topology
+    std::vector<InferenceEngine::Connection> connections = graph.getLayerConnections(relu2Id);
+    for (const auto& connection : connections) {
+        graph.disconnect(connection);
+    }
+    graph.removeLayer(relu2Id);
+
+    // Connect scaleShift1 and out
+    graph.connect({scaleShiftId}, {outId});
+    // Build network without relu2
+    InferenceEngine::INetwork::Ptr changedNetwork = graph.build();
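+    // The rebuilt graph can be converted for execution the same way as the original,
+    // e.g. via InferenceEngine::Builder::convertToICNNNetwork(changedNetwork).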
+}
index 09cbb50..e33362f 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -1832,4 +1831,265 @@ TEST_F(CNNNetReaderImplTest, canRead3DPooling) {
     ASSERT_EQ(pool->_pads_end[X_AXIS], 5);
     ASSERT_EQ(pool->_pads_end[Y_AXIS], 3);
     ASSERT_EQ(pool->_pads_end[Z_AXIS], 1);
-}
\ No newline at end of file
+}
+
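+// A first layer with an unconnected input port yields an implicit network input named after
+// the layer ("Boo"); each dangling output port becomes an output named "<layer>.<port>".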
+TEST_F(CNNNetReaderImplTest, canParseWithoutInput_1to2) {
+    std::string model = R"V0G0N(
+<net batch="1" name="SimpleNet" version="2">
+    <layers>
+        <layer id="1" name="Boo" precision="FP32" type="Split">
+            <data operation="sum"/>
+            <input>
+                <port id="0">
+                    <dim>2</dim>
+                    <dim>16</dim>
+                </port>
+            </input>
+            <output>
+                <port id="1">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                </port>
+                <port id="2">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+</net>
+    )V0G0N";
+
+    CNNNetReaderImpl reader(make_shared<V2FormatParserCreator>());
+    sts = reader.ReadNetwork(model.data(), model.length(), &resp);
+    ASSERT_EQ(OK, sts) << resp.msg;
+
+    auto net = reader.getNetwork(&resp);
+    ASSERT_NE(nullptr, net ) << resp.msg;
+
+    InputsDataMap in_map;
+    OutputsDataMap out_map;
+    net->getInputsInfo(in_map);
+    net->getOutputsInfo(out_map);
+
+    ASSERT_EQ(in_map.size(), 1); auto i = in_map.begin();
+    ASSERT_EQ(i++->second->name(), "Boo");
+
+    ASSERT_EQ(out_map.size(), 2); auto o = out_map.begin();
+    ASSERT_EQ(o++->second->getName(), "Boo.0");
+    ASSERT_EQ(o++->second->getName(), "Boo.1");
+}
+
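+// The inverse case: two dangling input ports become implicit inputs "Foo.0" and "Foo.1",
+// while the single dangling output port becomes the output "Foo".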
+TEST_F(CNNNetReaderImplTest, canParseWithoutInput_2to1) {
+    std::string model = R"V0G0N(
+<net batch="1" name="SimpleNet" version="2">
+    <layers>
+        <layer id="1" name="Foo" precision="FP32" type="Eltwise">
+            <data operation="sum"/>
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                </port>
+                <port id="1">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                </port>
+            </input>
+            <output>
+                <port id="2">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+</net>
+    )V0G0N";
+
+    CNNNetReaderImpl reader(make_shared<V2FormatParserCreator>());
+    sts = reader.ReadNetwork(model.data(), model.length(), &resp);
+    ASSERT_EQ(OK, sts) << resp.msg;
+
+    auto net = reader.getNetwork(&resp);
+    ASSERT_NE(nullptr, net ) << resp.msg;
+
+    InputsDataMap in_map;
+    OutputsDataMap out_map;
+    net->getInputsInfo(in_map);
+    net->getOutputsInfo(out_map);
+
+    ASSERT_EQ(in_map.size(), 2); auto i = in_map.begin();
+    ASSERT_EQ(i++->second->name(), "Foo.0");
+    ASSERT_EQ(i++->second->name(), "Foo.1");
+
+    ASSERT_EQ(out_map.size(), 1); auto o = out_map.begin();
+    ASSERT_EQ(o++->second->getName(), "Foo");
+}
+
+TEST_F(CNNNetReaderImplTest, canParseSimpleTI) {
+        std::string model = R"V0G0N(
+<net batch="1" name="Simple_TI" version="4">
+    <layers>
+        <layer id="0" name="input" precision="FP32" type="Input">
+            <output>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>5</dim>
+                    <dim>16</dim>
+                </port>
+            </output>
+        </layer>
+        <layer id="1" name="Bias" precision="FP32" type="Const">
+            <output>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                </port>
+            </output>
+            <blobs>
+                <custom offset="0" size="64"/>
+            </blobs>
+        </layer>
+        <layer id="2" name="SomeTI" precision="FP32" type="TensorIterator">
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>5</dim>
+                    <dim>16</dim>
+                </port>
+                <port id="1">
+                    <dim>1</dim>
+                    <dim>16</dim>
+                </port>
+            </input>
+            <output>
+                <port id="3">
+                    <dim>1</dim>
+                    <dim>5</dim>
+                    <dim>16</dim>
+                </port>
+            </output>
+            <port_map>
+                <input  external_port_id="0" internal_layer_id="0" internal_port_id="0" axis="1" />
+                <input  external_port_id="1" internal_layer_id="1" internal_port_id="1"/>
+                <output external_port_id="3" internal_layer_id="2" internal_port_id="1" axis="1" />
+            </port_map>
+            <back_edges>
+                <edge from-layer="1" from-port="2" to-layer="1" to-port="1"/>
+            </back_edges>
+            <body>
+                <layers>
+                    <layer id="0" name="TI_reshape_in" precision="FP32" type="Reshape">
+                        <data axis="0" dim="1,512" num_axes="-1"/>
+                        <input>
+                            <port id="0">
+                                <dim>1</dim>
+                                <dim>1</dim>
+                                <dim>16</dim>
+                            </port>
+                        </input>
+                        <output>
+                            <port id="1">
+                                <dim>1</dim>
+                                <dim>16</dim>
+                            </port>
+                        </output>
+                    </layer>
+                    <layer id="1" name="TI_sum" precision="FP32" type="Eltwise">
+                        <data operation="sum"/>
+                        <input>
+                            <port id="0">
+                                <dim>1</dim>
+                                <dim>16</dim>
+                            </port>
+                            <port id="1">
+                                <dim>1</dim>
+                                <dim>16</dim>
+                            </port>
+                        </input>
+                        <output>
+                            <port id="2">
+                                <dim>1</dim>
+                                <dim>16</dim>
+                            </port>
+                        </output>
+                    </layer>
+                    <layer id="2" name="TI_reshape_out" precision="FP32" type="Reshape">
+                        <data axis="0" dim="1,1,256" num_axes="-1"/>
+                        <input>
+                            <port id="0">
+                                <dim>1</dim>
+                                <dim>16</dim>
+                            </port>
+                        </input>
+                        <output>
+                            <port id="1">
+                                <dim>1</dim>
+                                <dim>1</dim>
+                                <dim>16</dim>
+                            </port>
+                        </output>
+                    </layer>
+                </layers>
+                <edges>
+                    <edge from-layer="0" from-port="1" to-layer="1" to-port="0"/>
+                    <edge from-layer="1" from-port="2" to-layer="2" to-port="0"/>
+                </edges>
+            </body>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="0" from-port="0" to-layer="2" to-port="0"/>
+        <edge from-layer="1" from-port="0" to-layer="2" to-port="1"/>
+    </edges>
+</net>
+    )V0G0N";
+
+        CNNNetReaderImpl reader(make_shared<V2FormatParserCreator>());
+        sts = reader.ReadNetwork(model.data(), model.length(), &resp);
+        ASSERT_EQ(OK, sts) << resp.msg;
+
+        auto network = reader.getNetwork(&resp);
+        ASSERT_NE(nullptr, network ) << resp.msg;
+
+        CNNLayerPtr layer;
+        sts = network->getLayerByName("SomeTI", layer, &resp);
+        ASSERT_EQ(OK, sts) << resp.msg;
+
+        auto *ti = dynamic_cast<TensorIterator*>(layer.get());
+        ASSERT_NE(nullptr, ti);
+        ASSERT_EQ(ti->type, "TensorIterator");
+
+        //  Check Input port mapping
+        ASSERT_EQ(ti->input_port_map.size(), 2);
+        int i = ti->input_port_map[0].axis == 1 ? 0 : 1;
+        ASSERT_EQ(ti->input_port_map[i].axis, 1);
+        ASSERT_EQ(ti->input_port_map[i].stride, 1);
+        ASSERT_EQ(ti->input_port_map[i].start, 0);
+        ASSERT_EQ(ti->input_port_map[i].end, -1);
+        ASSERT_EQ(ti->input_port_map[i].part_size, 1);
+        ASSERT_EQ(ti->input_port_map[1-i].axis, -1);
+        ASSERT_EQ(ti->input_port_map[1-i].stride, 1);
+        ASSERT_EQ(ti->input_port_map[1-i].start, 0);
+        ASSERT_EQ(ti->input_port_map[1-i].end, -1);
+        ASSERT_EQ(ti->input_port_map[1-i].part_size, 1);
+
+        //  Check Output port mapping
+        ASSERT_EQ(ti->output_port_map.size(), 1);
+        ASSERT_EQ(ti->output_port_map[0].axis, 1);
+        ASSERT_EQ(ti->output_port_map[0].stride, 1);
+        ASSERT_EQ(ti->output_port_map[0].start, 0);
+        ASSERT_EQ(ti->output_port_map[0].end, -1);
+        ASSERT_EQ(ti->output_port_map[0].part_size, 1);
+
+        //  Check back edges: the body carries a single back edge
+        ASSERT_EQ(ti->back_edges.size(), 1);
+        ASSERT_EQ(ti->back_edges[0].from, 0);
+        ASSERT_EQ(ti->back_edges[0].to, 1);
+        ASSERT_EQ(ti->back_edges[0].axis, -1);
+        ASSERT_EQ(ti->back_edges[0].stride, 1);
+        ASSERT_EQ(ti->back_edges[0].start, 0);
+        ASSERT_EQ(ti->back_edges[0].end, -1);
+        ASSERT_EQ(ti->back_edges[0].part_size, 1);
+}
index a6a481a..3b4ffce 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 3ec99f1..28c4646 100644 (file)
@@ -1,21 +1,20 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
 
+#include <fstream>
 #include <gtest/gtest.h>
 #include "xml_father.hpp"
 #include "cnn_network_impl.hpp"
 #include  <tests_common.hpp>
-#include "v2_format_parser.h"
+#include "ie_format_parser.h"
 #include <string>
 #include "pugixml.hpp"
 #include "xml_parse_utils.h"
 #include "mean_image.h"
 #include "ie_blob_proxy.hpp"
-#include <fstream>
 
 class FormatParserTest : public TestsCommon {
  public:
@@ -93,7 +92,7 @@ class FormatParserTest : public TestsCommon {
         int version = XMLParseUtils::GetIntAttr(root, "version", 2);
         if (version < 2) THROW_IE_EXCEPTION << "Deprecated IR's versions: " << version;
         if (version > 3) THROW_IE_EXCEPTION << "cannot parse future versions: " << version;
-        parser.reset(new InferenceEngine::details::V2FormatParser(version));
+        parser.reset(new InferenceEngine::details::FormatParser(version));
 
         net = parser->Parse(root);
     }
@@ -331,7 +330,7 @@ xml().node("net").attr("name", "AlexNet").attr("version", x)\
         return testing::XMLFather();
     }
 
-    std::shared_ptr<InferenceEngine::details::V2FormatParser> parser;
+    std::shared_ptr<InferenceEngine::details::FormatParser> parser;
 
  public:
 
@@ -380,4 +379,4 @@ xml().node("net").attr("name", "AlexNet").attr("version", x)\
                    std::istreambuf_iterator<char>());
         return str;
     }
-};
\ No newline at end of file
+};
index f5713db..1b9cdc0 100644 (file)
@@ -1,12 +1,11 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <cpp/ie_cnn_network.h>
 #include <gtest/gtest.h>
 #include "xml_father.hpp"
-#include "inference_engine/v2_format_parser.h"
+#include "inference_engine/ie_format_parser.h"
 #include <string>
 #include <pugixml.hpp>
 #include <fstream>
index a8c6e70..b80d2cb 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -95,3 +94,97 @@ TEST_F(V3FormatParserTest, DISABLE_conv3DInvalidKernel) {
 
     ASSERT_NO_FATAL_FAILURE(assertParseFail(content));
 }
+
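+// Widens access to FormatParser::layersParseInfo so tests can inspect parsed blob
+// offsets and sizes directly.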
+class V2ParserPublicSegments: public InferenceEngine::details::FormatParser {
+public:
+    const std::map<std::string, LayerParseParameters>& getLayerParseParameters() {
+        return layersParseInfo;
+    }
+};
+
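+// The weights blob below exceeds 2^31 bytes and the bias offset exceeds 2^31 as well;
+// the parser must store offsets and sizes in 64-bit fields for these values to survive.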
+TEST_F(V3FormatParserTest, LargeWeights) {
+    std::string model = R"V0G0N(
+<net name="PVANET" version="3" batch="1">
+    <layers>
+        <layer name="data" type="Input" precision="FP32" id="0">
+            <output>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>1024</dim>
+                </port>
+            </output>
+        </layer>
+        <layer id="1" name="MatMul" precision="FP32" type="FullyConnected">
+            <data out-size="800000"/>
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>1024</dim>
+                </port>
+            </input>
+            <output>
+                <port id="3">
+                    <dim>1</dim>
+                    <dim>800000</dim>
+                </port>
+            </output>
+            <blobs>
+                <weights offset="891492352" size="3276800000"/>
+                <biases offset="4168292352" size="3200000"/>
+            </blobs>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="0" from-port="0" to-layer="1" to-port="0"/>
+    </edges>
+</net>)V0G0N";
+
+    parse(model);
+
+    auto params = ((V2ParserPublicSegments *)parser.get())->getLayerParseParameters();
+    ASSERT_NE(params.end(), params.find("MatMul"));
+    ASSERT_EQ(891492352, params["MatMul"].blobs["weights"].start);
+    ASSERT_EQ(3276800000, params["MatMul"].blobs["weights"].size);
+    ASSERT_EQ(4168292352, params["MatMul"].blobs["biases"].start);
+    ASSERT_EQ(3200000, params["MatMul"].blobs["biases"].size);
+}
+
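+// A negative blob size is malformed, so the parser is expected to reject the model.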
+TEST_F(V3FormatParserTest, IncorrectWeights) {
+    std::string model = R"V0G0N(
+<net name="PVANET" version="3" batch="1">
+    <layers>
+        <layer name="data" type="Input" precision="FP32" id="0">
+            <output>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>1024</dim>
+                </port>
+            </output>
+        </layer>
+        <layer id="1" name="MatMul" precision="FP32" type="FullyConnected">
+            <data out-size="800000"/>
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>1024</dim>
+                </port>
+            </input>
+            <output>
+                <port id="3">
+                    <dim>1</dim>
+                    <dim>800000</dim>
+                </port>
+            </output>
+            <blobs>
+                <weights offset="891492352" size="-64"/>
+                <biases offset="4168292352" size="3200000"/>
+            </blobs>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="0" from-port="0" to-layer="1" to-port="0"/>
+    </edges>
+</net>)V0G0N";
+
+    assertParseFail(model);
+}
\ No newline at end of file
diff --git a/inference-engine/tests/unit/engines/gna/I8_quantisation_test.cpp b/inference-engine/tests/unit/engines/gna/I8_quantisation_test.cpp
new file mode 100644 (file)
index 0000000..8e69a3b
--- /dev/null
@@ -0,0 +1,117 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include <gtest/gtest.h>
+#include <inference_engine/layer_transform.hpp>
+#include <gna_plugin/quantization/model_quantizer.hpp>
+#include <cpp/ie_cnn_net_reader.h>
+#include "gna_plugin/quantization/layer_quantizer.hpp"
+#include "gna_matcher.hpp"
+
+using namespace InferenceEngine;
+using namespace GNAPluginNS;
+using namespace GNATestIRs;
+
+class I8QuantisationTest : public GNATest {
+ protected:
+    LayersQuantizer<QuantI8> lc = LayersQuantizer<QuantI8> (1.0f);
+
+    InferenceEngine::CNNLayerPtr  quantize (InferenceEngine::CNNLayerPtr lp) {
+        auto newLayer = InferenceEngine::injectData<QuantizedLayerParams>(lp);
+        transformLayer(newLayer, lc);
+        return newLayer;
+    };
+
+    void SetUp() override  {
+    }
+
+};
+
+// TODO: add test for FC weights after quantization
+TEST_F(I8QuantisationTest, canQuantizeFCLayer){
+
+    auto fc = std::make_shared<FullyConnectedLayer>(LayerParams{"name", "type", Precision::FP32});
+    fc->_out_num = 9;
+    auto weights = make_shared_blob<float>(Precision::FP32, {1, 1});
+    fc->_weights = weights;
+    fc->_biases = make_shared_blob<float>(Precision::FP32, {1, 1});
+    fc->_weights->allocate();
+    fc->_biases->allocate();
+    std::shared_ptr<Data> outData = std::make_shared<Data>("data", SizeVector({1, 1}), Precision::FP32, Layout::NC);
+    fc->outData.push_back(outData);
+    fc->insData.push_back(outData);
+
+    // use weight values large enough to engage the actual quantisation algorithm
+    for (auto && w : *weights) {
+        w =  MAX_OUT_MULTIPLIER * MAX_VAL_1B_WEIGHT;
+    }
+
+    fillWeights(fc->_biases);
+
+    ASSERT_NO_THROW(quantize(fc));
+}
+
+TEST_F(I8QuantisationTest, canQuantizeActivation){
+
+    auto sigmoid = std::make_shared<GenericLayer >(LayerParams{"name", "type", Precision::FP32});
+    sigmoid->params["value"] = 2;
+    sigmoid->type = "Activation";
+
+    ASSERT_NO_THROW(quantize(sigmoid));
+}
+
+TEST_F(I8QuantisationTest, inputPrecisionIs16Bits){
+
+    ModelQuantizer<QuantI8> q;
+
+    CNNNetReader net_reader;
+    ASSERT_NO_THROW(net_reader.ReadNetwork(Fc2DOutputModel().data(), Fc2DOutputModel().length()));
+
+    auto weights = make_shared_blob<uint8_t >(Precision::U8, C, {440});
+    weights->allocate();
+    fillWeights(weights);
+    net_reader.SetWeights(weights);
+    auto newNet = q.quantize(net_reader.getNetwork(), 1000);
+    InputsDataMap inputs;
+    newNet->getInputsInfo(inputs);
+    auto inputLayer = inputs.begin()->second->getInputData()->inputTo.begin()->second->insData.front().lock()->creatorLayer.lock();
+
+    ASSERT_EQ(inputLayer->precision, Precision::I16);
+}
+
+TEST_F(I8QuantisationTest, failIfFCDimensionIs1){
+
+    ModelQuantizer<QuantI8> q;
+
+    CNNNetReader net_reader;
+    ASSERT_NO_THROW(net_reader.ReadNetwork(FCOnlyModel().data(), FCOnlyModel().length()));
+
+    auto weights = make_shared_blob<uint8_t >(Precision::U8, C, {440});
+    weights->allocate();
+    fillWeights(weights);
+    net_reader.SetWeights(weights);
+
+    ASSERT_ANY_THROW(q.quantize(net_reader.getNetwork(), 1000));
+}
+
+TEST_F(I8QuantisationTest, outputAffinePrecisionIs32Bits){
+
+    ModelQuantizer<QuantI8> q;
+
+    CNNNetReader net_reader;
+    ASSERT_NO_THROW(net_reader.ReadNetwork(Fc2DOutputModel().data(), Fc2DOutputModel().length()));
+
+    auto weights = make_shared_blob<uint8_t >(Precision::U8, C, {440});
+    weights->allocate();
+    fillWeights(weights);
+    net_reader.SetWeights(weights);
+
+    auto newNet = q.quantize(net_reader.getNetwork(), 1000);
+    InputsDataMap inputs;
+    newNet->getInputsInfo(inputs);
+    auto affineDataPtr = inputs.begin()->second->getInputData()->inputTo.begin()->second->outData.front();
+
+    ASSERT_EQ(affineDataPtr->precision, Precision::I32);
+}
diff --git a/inference-engine/tests/unit/engines/gna/configuration_test.cpp b/inference-engine/tests/unit/engines/gna/configuration_test.cpp
new file mode 100644 (file)
index 0000000..e17e6db
--- /dev/null
@@ -0,0 +1,136 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <mock_icnn_network.hpp>
+#include "gna_plugin/gna_plugin_config.hpp"
+#include "gna_matcher.hpp"
+#include "test_irs.hpp"
+
+using namespace InferenceEngine;
+using namespace GNAPluginNS;
+using namespace ::testing;
+
+class GNAConfigTest : public GNATest {
+
+ protected:
+    MockICNNNetwork net;
+
+    void SetUp() override  {
+    }
+};
+
+TEST_F(GNAConfigTest, reportAnErrorIfConfigNotFound) {
+
+    Config c ({{TargetDevice :: eGNA, Precision::I16},
+               {TargetDevice :: eCPU, Precision::FP32}});
+
+    EXPECT_CALL(net, getPrecision()).WillRepeatedly(Return(Precision::FP32));
+    EXPECT_CALL(net, getTargetDevice()).WillRepeatedly(Return(TargetDevice::eGNA));
+
+    ASSERT_ANY_THROW(c.find_configuration(net));
+}
+
+TEST_F(GNAConfigTest, canFindConfiguration) {
+
+    Config c ({{TargetDevice :: eGNA, Precision::I16},
+               {TargetDevice :: eCPU, Precision::FP32}});
+
+    EXPECT_CALL(net, getPrecision()).WillRepeatedly(Return(Precision::FP32));
+    EXPECT_CALL(net, getTargetDevice()).WillRepeatedly(Return(TargetDevice::eCPU));
+
+    auto match = c.find_configuration(net);
+
+    EXPECT_EQ(match.device, TargetDevice::eCPU);
+    EXPECT_EQ(match.networkPrec, Precision::FP32);
+}
+
+TEST_F(GNAConfigTest, canPassThroughNetworkAfterFindConfiguration) {
+
+    Config c ({{TargetDevice :: eGNA, Precision::I16},
+               {TargetDevice :: eCPU, Precision::FP32}});
+
+    EXPECT_CALL(net, getPrecision()).WillRepeatedly(Return(Precision::FP32));
+    EXPECT_CALL(net, getTargetDevice()).WillRepeatedly(Return(TargetDevice::eCPU));
+
+    auto match = c.find_configuration(net);
+
+    auto net2 = match.convert(net);
+
+    EXPECT_EQ(net2->getTargetDevice(), TargetDevice::eCPU);
+    EXPECT_EQ(net2->getPrecision(), Precision::FP32);
+}
+
+TEST_F(GNAConfigTest, canNotMatchWithDefaultDevice) {
+
+    Config c ({{TargetDevice :: eGNA, Precision::I16},
+               {TargetDevice :: eCPU, Precision::FP32}});
+
+    c.setDefaultDevice(TargetDevice::eGNA);
+
+    EXPECT_CALL(net, getPrecision()).WillRepeatedly(Return(Precision::FP32));
+    EXPECT_CALL(net, getTargetDevice()).WillRepeatedly(Return(TargetDevice::eDefault));
+
+    EXPECT_ANY_THROW(c.find_configuration(net).convert(net));
+}
+
+TEST_F(GNAConfigTest, canMatchWithDefaultDevice) {
+
+    Config c ({{TargetDevice :: eGNA, Precision::I16},
+               {TargetDevice :: eCPU, Precision::FP32}});
+
+    c.setDefaultDevice(TargetDevice::eGNA);
+
+    EXPECT_CALL(net, getPrecision()).WillRepeatedly(Return(Precision::I16));
+    EXPECT_CALL(net, getTargetDevice()).WillRepeatedly(Return(TargetDevice::eDefault));
+
+    auto net2 = c.find_configuration(net).convert(net);
+
+    EXPECT_EQ(net2->getTargetDevice(), TargetDevice::eDefault);
+    EXPECT_EQ(net2->getPrecision(), Precision::I16);
+}
+
+TEST_F(GNAConfigTest, canMatchWith1AsyncThread) {
+    assert_that()
+        .onInferModel(GNATestIRs::Fc2DOutputModel())
+        .inNotCompactMode()
+        .withAcceleratorThreadsNumber("1")
+        .gna().propagate_forward().called_without().pwl_inserted_into_nnet();
+}
+
+TEST_F(GNAConfigTest, canMatchWith4AsyncThreads) {
+    assert_that()
+        .onInferModel(GNATestIRs::Fc2DOutputModel())
+        .inNotCompactMode()
+        .withAcceleratorThreadsNumber("4")
+        .gna().propagate_forward().called_without().pwl_inserted_into_nnet();
+}
+
+TEST_F(GNAConfigTest, canNOTMatchWith0AsyncThreads) {
+    assert_that()
+        .onInferModel(GNATestIRs::Fc2DOutputModel())
+        .inNotCompactMode()
+        .withAcceleratorThreadsNumber("0")
+        .gna().propagate_forward().called_without().pwl_inserted_into_nnet()
+        .throws();
+}
+
+TEST_F(GNAConfigTest, canNOTMatchWith128AsyncThreads) {
+    assert_that()
+        .onInferModel(GNATestIRs::Fc2DOutputModel())
+        .inNotCompactMode()
+        .withAcceleratorThreadsNumber("128")
+        .gna().propagate_forward().called_without().pwl_inserted_into_nnet()
+        .throws();
+}
+
+TEST_F(GNAConfigTest, canMatchWithSingleMultipleOMPThreads) {
+    assert_that()
+        .onInferModel(GNATestIRs::Fc2DOutputModel())
+        .inNotCompactMode()
+        .enable_omp_multithreading()
+        .gna().propagate_forward().called_without().pwl_inserted_into_nnet();
+}
\ No newline at end of file
diff --git a/inference-engine/tests/unit/engines/gna/gna_allocator_test.cpp b/inference-engine/tests/unit/engines/gna/gna_allocator_test.cpp
new file mode 100644 (file)
index 0000000..35ddc77
--- /dev/null
@@ -0,0 +1,78 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "gna_plugin/gna_allocator.hpp"
+
+#include <vector>
+#include <thread>
+
+#include <gtest/gtest.h>
+#include "gna_plugin/gna_device.hpp"
+// dummy definitions to work around an issue with the Linux userspace library
+typedef unsigned long long time_tsc;
+typedef struct
+{
+    time_tsc            start;      // time value on profiler start
+    time_tsc            stop;       // time value on profiler stop
+    time_tsc            passed;     // time passed between start and stop
+} intel_gna_profiler_tsc;
+
+void profilerTscStop(intel_gna_profiler_tsc* p) {
+    if (NULL == p) return;
+    p->passed = 0;
+    p->stop = 0;
+    p->start = 0;
+}
+void profilerTscStartAccumulate(intel_gna_profiler_tsc* p)
+{
+    if (NULL == p) return;
+    p->stop = 0;
+    p->start = 0;
+}
+void profilerTscStopAccumulate(intel_gna_profiler_tsc* p)
+{
+    if (NULL == p) return;
+    p->stop = 0;
+    p->passed += p->stop - p->start;
+}
+
+class GNAAllocatorTest : public ::testing::Test {
+
+ protected:
+    std::unique_ptr<GNADeviceHelper> gnadevice;
+    void SetUp() override  {
+       // gnadevice.reset(new GNADeviceHelper());
+    }
+};
+
+TEST_F(GNAAllocatorTest, canAllocateStdMemory) {
+    auto sp = make_polymorph<std::allocator<uint8_t>>();
+    uint8_t *x = nullptr;
+    ASSERT_NO_THROW(x = sp.allocate(100));
+    ASSERT_NE(x, nullptr);
+    ASSERT_NO_THROW(sp.deallocate(x, 100));
+}
+
+TEST_F(GNAAllocatorTest, canAllocateGNAMemory) {
+    // GNA device can be opened only once per process for now
+    gnadevice.reset(new GNADeviceHelper());
+    auto sp = make_polymorph<GNAAllocator>(*gnadevice.get());
+    uint8_t *x = nullptr;
+    ASSERT_NO_THROW(x = sp.allocate(100));
+    ASSERT_NE(x, nullptr);
+    ASSERT_NO_THROW(sp.deallocate(x, 100));
+}
+
+TEST_F(GNAAllocatorTest, DISABLED_canOpenDevice) {
+    std::thread th([]()
+    {
+        GNADeviceHelper h1;
+    });
+    th.join();
+    std::thread th2([]()
+    {
+        GNADeviceHelper h1;
+    });
+    th2.join();
+}
diff --git a/inference-engine/tests/unit/engines/gna/gna_api_stub.cpp b/inference-engine/tests/unit/engines/gna/gna_api_stub.cpp
new file mode 100644 (file)
index 0000000..5417e52
--- /dev/null
@@ -0,0 +1,218 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#define INTEL_GNA_DLLEXPORT 1
+#include <gna-api.h>
+#include <gna-api-dumper.h>
+#include <gna-api-instrumentation.h>
+#include "gna_mock_api.hpp"
+
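+// Every stubbed GNA entry point delegates to the currently registered mock, if any;
+// without a mock it returns a benign default so tests can run with no real device.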
+static GNACppApi * current = nullptr;
+
+GNACppApi :: GNACppApi() {
+    current = this;
+}
+
+GNACppApi :: ~GNACppApi() {
+    current = nullptr;
+}
+
+#ifdef __cplusplus
+extern "C" {  // API uses C linkage so that it can be used by C and C++ applications
+#endif
+
+
+/**
+ * intel_gna_status_t members printable descriptions
+ *   Size: NUMGNASTATUS + 1
+ */
+DLLDECL const char *GNAStatusName[] = {"status"};
+
+/**
+ * intel_gmm_mode_t members printable descriptions
+ *   Size: NUMGMMMODES + 1
+ */
+DLLDECL const char *GMMModeName[] = {"model"};
+
+/**
+ * // TODO: fill
+ */
+DLLDECL intel_gna_status_t GNAScoreGaussians(
+    intel_gna_handle_t          handle,
+    const intel_feature_type_t* pFeatureType,
+    const intel_feature_t*      pFeatureData,
+    const intel_gmm_type_t*     pModelType,
+    const intel_gmm_t*          pModelData,
+    const uint32_t*             pActiveGMMIndices,
+    uint32_t                    nActiveGMMIndices,
+    uint32_t                    uMaximumScore,
+    intel_gmm_mode_t            nGMMMode,
+    uint32_t*                   pScores,
+    uint32_t*                   pReqId,
+    intel_gna_proc_t            nAccelerationType
+) {
+    if (current != nullptr) {
+        return current->GNAScoreGaussians(
+            //handle,
+            //pFeatureType,
+            pFeatureData,
+            pModelType,
+            pModelData,
+            pActiveGMMIndices,
+            nActiveGMMIndices,
+            uMaximumScore,
+            nGMMMode,
+            pScores,
+            pReqId,
+            nAccelerationType);
+    }
+    return GNA_NOERROR;
+}
+
+DLLDECL intel_gna_status_t GNAPropagateForward(
+    intel_gna_handle_t          handle,
+    const intel_nnet_type_t*    pNeuralNetwork,
+    const uint32_t*             pActiveIndices,
+    uint32_t                    nActiveIndices,
+    uint32_t*                   pReqId,
+    intel_gna_proc_t            nAccelerationType
+) {
+    if (current != nullptr) {
+        return current->GNAPropagateForward(
+            handle,
+            pNeuralNetwork,
+            pActiveIndices,
+            nActiveIndices,
+            pReqId,
+            nAccelerationType);
+    }
+    return GNA_NOERROR;
+}
+
+// TODO: add output status
+/**
+ * // TODO: fill
+ */
+DLLDECL void *GNAAlloc(
+    intel_gna_handle_t nGNADevice,   // handle to GNA accelerator
+    uint32_t           sizeRequested,
+    uint32_t*          sizeGranted
+) {
+    if (current != nullptr) {
+        return current->GNAAlloc(nGNADevice, sizeRequested, sizeGranted);
+    }
+    if (sizeGranted != nullptr) {
+        *sizeGranted = sizeRequested;
+    }
+    return (void*)1;
+}
+
+/**
+ * // TODO: fill
+ */
+DLLDECL intel_gna_status_t GNAFree(
+    intel_gna_handle_t nGNADevice   // handle to GNA accelerator
+) {
+    if (current != nullptr) {
+        return current->GNAFree(nGNADevice);
+    }
+    return GNA_NOERROR;
+}
+
+/**
+ * // TODO: fill
+ */
+DLLDECL intel_gna_handle_t GNADeviceOpen(
+    intel_gna_status_t* status         // Status of the call
+) {
+    if (current != nullptr) {
+        return current->GNADeviceOpen(status);
+    }
+    return 0;
+
+}
+
+/**
+* // TODO: fill
+*/
+DLLDECL intel_gna_handle_t GNADeviceOpenSetThreads(
+    intel_gna_status_t* status,                // Status of the call
+    uint8_t n_threads                          // Number of worker threads
+) {
+    if (current != nullptr) {
+        return current->GNADeviceOpenSetThreads(status, n_threads);
+    }
+    return GNA_NOERROR;
+
+}
+
+/**
+ * // TODO: fill
+ */
+DLLDECL intel_gna_status_t GNADeviceClose(
+    intel_gna_handle_t nGNADevice // handle to GNA accelerator
+) {
+    if (current != nullptr) {
+        return current->GNADeviceClose(nGNADevice);
+    }
+    return GNA_NOERROR;
+
+}
+
+/**
+ * // TODO: fill
+ */
+DLLDECL intel_gna_status_t GNAWait(
+    intel_gna_handle_t nGNADevice,            // handle to GNA accelerator
+    uint32_t           nTimeoutMilliseconds,
+    uint32_t           reqId                  // IN score request ID
+) {
+    if (current != nullptr) {
+        return current->GNAWait(nGNADevice, nTimeoutMilliseconds, reqId);
+    }
+    return GNA_NOERROR;
+}
+
+DLLDECL intel_gna_status_t GNAWaitPerfRes(
+    intel_gna_handle_t nGNADevice,            // handle to GNA accelerator
+    uint32_t           nTimeoutMilliseconds,
+    uint32_t           reqId,                 // IN score request ID
+    intel_gna_perf_t*  nGNAPerfResults
+) {
+    if (current != nullptr) {
+        return current->GNAWaitPerfRes(nGNADevice,
+                                       nTimeoutMilliseconds,
+                                       reqId,
+                                       nGNAPerfResults);
+    }
+    return GNA_NOERROR;
+}
+
+DLLDECL void* GNADumpXnn(
+    const intel_nnet_type_t*    neuralNetwork,
+    const uint32_t*             activeIndices,
+    uint32_t                    activeIndicesCount,
+    intel_gna_model_header*     modelHeader,
+    intel_gna_status_t*         status,
+    intel_gna_alloc_cb          customAlloc) {
+        if (current != nullptr) {
+            return current->GNADumpXnn(neuralNetwork,
+                                        activeIndices,
+                                        activeIndicesCount,
+                                        modelHeader,
+                                        status,
+                                        customAlloc);
+        }
+        return nullptr;
+}
+
+DLLDECL void gmmSetThreads(
+    int num
+) {
+    if (current != nullptr) {
+        current->gmmSetThreads((num != 0) ? num : 1);
+    }
+}
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/inference-engine/tests/unit/engines/gna/gna_graph_aot_test.cpp b/inference-engine/tests/unit/engines/gna/gna_graph_aot_test.cpp
new file mode 100644 (file)
index 0000000..45385be
--- /dev/null
@@ -0,0 +1,85 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include <gtest/gtest.h>
+#include <inference_engine/layer_transform.hpp>
+#include <gna_plugin/quantization/model_quantizer.hpp>
+#include "gna_plugin/quantization/layer_quantizer.hpp"
+#include "gna_matcher.hpp"
+
+using namespace InferenceEngine;
+using namespace GNAPluginNS;
+using namespace GNATestIRs;
+
+class GNAAOTTests : public GNATest {
+ protected:
+    std::list<std::string> files_to_remove;
+    std::string registerFileForRemove(std::string file_to_remove) {
+        files_to_remove.push_back(file_to_remove);
+        return file_to_remove;
+    }
+    void TearDown() override {
+        for (auto & file : files_to_remove) {
+            std::remove(file.c_str());
+        }
+    }
+
+    void SetUp() override  {
+    }
+};
+
+TEST_F(GNAAOTTests, AffineWith2AffineOutputs_canbe_export_imported) {
+
+    const std::string X = registerFileForRemove("unit_tests.bin");
+
+    // running export to a file
+    export_network(AffineWith2AffineOutputsModel())
+        .inNotCompactMode().as().gna().model().to(X);
+
+    // running infer using imported model instead of IR
+    assert_that().onInferModel().importedFrom(X)
+        .inNotCompactMode().gna().propagate_forward().called().once();
+}
+
+
+TEST_F(GNAAOTTests, AffineWith2AffineOutputs_canbe_imported_verify_structure) {
+
+    auto & nnet_type = storage<intel_nnet_type_t>();
+
+    // saving pointer to nnet - todo probably deep copy required
+    save_args().onInferModel(AffineWith2AffineOutputsModel())
+        .inNotCompactMode().from().gna().propagate_forward().to(&nnet_type);
+
+    const std::string X = registerFileForRemove("unit_tests.bin");
+
+    // running export to a file
+    export_network(AffineWith2AffineOutputsModel())
+        .inNotCompactMode().as().gna().model().to(X);
+
+    // running infer using imported model instead of IR
+    assert_that().onInferModel().importedFrom(X)
+        .inNotCompactMode().gna().propagate_forward().called_with().exact_nnet_structure(&nnet_type);
+
+}
+
+TEST_F(GNAAOTTests, CanConvertFromAOTtoSueModel) {
+
+    auto & nnet_type = storage<intel_nnet_type_t>();
+
+    // saving pointer to nnet - todo probably deep copy required
+    save_args().onInferModel(AffineWith2AffineOutputsModel())
+        .inNotCompactMode().from().gna().propagate_forward().to(&nnet_type);
+
+    const std::string X = registerFileForRemove("unit_tests.bin");
+
+    // running export to a file
+    export_network(AffineWith2AffineOutputsModel())
+        .inNotCompactMode().as().gna().model().to(X);
+
+    // running infer using imported model instead of IR
+    assert_that().onInferModel().importedFrom(X)
+        .inNotCompactMode().withGNAConfig(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), "sue.dump").gna().dumpXNN().called();
+}
+
diff --git a/inference-engine/tests/unit/engines/gna/gna_hardware_precision_test.cpp b/inference-engine/tests/unit/engines/gna/gna_hardware_precision_test.cpp
new file mode 100644 (file)
index 0000000..b7dba21
--- /dev/null
@@ -0,0 +1,49 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <mock_icnn_network.hpp>
+#include <cpp/ie_cnn_net_reader.h>
+#include <gmock/gmock-generated-actions.h>
+#include "gna_matcher.hpp"
+
+using namespace std;
+using namespace InferenceEngine;
+using namespace ::testing;
+
+class GNAHWPrecisionTest : public GNATest {
+
+};
+
+TEST_F(GNAHWPrecisionTest, defaultPrecisionIsInt16) {
+    assert_that().onInfer1AFModel().gna().propagate_forward().called_with().
+        nnet_input_precision(Precision::I16).
+        nnet_ouput_precision(Precision::I32).
+        nnet_weights_precision(Precision::I16).
+        nnet_biases_precision(Precision::I32);
+}
+
+TEST_F(GNAHWPrecisionTest, canPassInt8Precision) {
+    assert_that().onInfer1AFModel().withConfig(PRECISION, Precision::I8).
+        gna().propagate_forward().called_with().
+            nnet_input_precision(Precision::I16).
+            nnet_ouput_precision(Precision::I32).
+            nnet_weights_precision(Precision::I8).
+            nnet_biases_precision(Precision::fromType<intel_compound_bias_t>());
+}
+
+TEST_F(GNAHWPrecisionTest, canPassInt16Precision) {
+    assert_that().onInfer1AFModel().withConfig(PRECISION, Precision::I16).
+        gna().propagate_forward().called_with().
+        nnet_input_precision(Precision::I16).
+        nnet_ouput_precision(Precision::I32).
+        nnet_weights_precision(Precision::I16).
+        nnet_biases_precision(Precision::I32);
+}
+
+TEST_F(GNAHWPrecisionTest, failToCreatePluginWithUnsupportedPrecision) {
+    assert_that().creating().gna_plugin().withConfig(PRECISION, Precision::FP32).throws();
+}
\ No newline at end of file
diff --git a/inference-engine/tests/unit/engines/gna/gna_matcher.cpp b/inference-engine/tests/unit/engines/gna/gna_matcher.cpp
new file mode 100644 (file)
index 0000000..c609e4e
--- /dev/null
@@ -0,0 +1,440 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <mock_icnn_network.hpp>
+#include "gna_matcher.hpp"
+#include <gna/gna_config.hpp>
+#include <gna-api-types-xnn.h>
+#include <gna_plugin/gna_executable_network.hpp>
+#include "gna_plugin.hpp"
+#include "gna_mock_api.hpp"
+#include "matchers/precision_matcher.hpp"
+#include "matchers/pwl_matcher.hpp"
+#include "matchers/copy_matcher.hpp"
+#include "matchers/diag_matcher.hpp"
+#include "matchers/pwl_quantization_metrics_matcher.hpp"
+#include "matchers/conv_matcher.hpp"
+#include "matchers/pool_matcher.hpp"
+
+#include <gmock/gmock-generated-actions.h>
+#include <gmock/gmock-more-actions.h>
+#include "gmock/gmock.h"
+
+using namespace std;
+using namespace InferenceEngine;
+using namespace GNAPluginNS;
+using namespace ::testing;
+
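+// Allocator that returns the same one-byte buffer for every request; it backs the
+// oversized fake weights blob without actually reserving that much memory.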
+class NullAllocator : public IAllocator {
+ void * ptr = nullptr;
+public:
+    NullAllocator() {
+        ptr = malloc(1);
+    }
+    ~NullAllocator() {
+        free(ptr);
+    }
+    void * lock(void * handle, LockOp = LOCK_FOR_WRITE)  noexcept override {
+        return ptr;
+    }
+    void  unlock(void * handle) noexcept override {
+
+    }
+    void * alloc(size_t size) noexcept override {
+        return ptr;
+    }
+    virtual bool   free(void* handle) noexcept {
+        return true;
+    }
+    virtual void Release() noexcept {
+        delete this;
+    }
+};
+
+void GNAPropagateMatcher :: match() {
+    try {
+        // matching gna propagate forward call.
+        GNAPlugin plugin(_env.config);
+        size_t inputSize = 10;
+        size_t outputSize = 10;
+
+        auto loadNetworkFromIR = [&] () {
+            CNNNetReader net_reader;
+            ASSERT_NO_THROW_IE_EXCEPTION(net_reader.ReadNetwork(_env.model.data(), _env.model.length()));
+
+            auto weights_fake = make_shared<TBlob<uint8_t>>(Precision::U8, C, SizeVector({std::numeric_limits<uint32_t>::max()}), make_shared<NullAllocator>());
+            net_reader.SetWeights(weights_fake);
+
+            auto net_original = net_reader.getNetwork();
+            auto input_dims = net_original.getInputsInfo().begin()->second->getTensorDesc().getDims();
+            auto output = net_original.getOutputsInfo();
+            // sometimes a network might be created without outputs, e.g. with memory outputs only
+            auto output_dims = !output.empty() ? output.begin()->second->getTensorDesc().getDims() : input_dims;
+
+            inputSize = details::product(std::begin(input_dims), std::end(input_dims));
+            outputSize = details::product(std::begin(output_dims), std::end(output_dims));
+
+            size_t weightsSize = 0;
+            for (auto &layer : net_original) {
+                auto w = layer->blobs["weights"];
+                auto b = layer->blobs["biases"];
+
+                if (w) {
+                    weightsSize += w->byteSize();
+                }
+                if (b) {
+                    weightsSize += b->byteSize();
+                }
+            }
+            auto weights = make_shared_blob<uint8_t >(Precision::U8, C, {weightsSize});
+
+            weights->allocate();
+            GNATest::fillWeights(weights);
+            net_reader.SetWeights(weights);
+
+            net_reader.getNetwork().setTargetDevice(_env.target_device);
+
+            if (_env.cb) {
+                auto network = net_reader.getNetwork();
+                _env.cb(network);
+            }
+
+            plugin.LoadNetwork(net_reader.getNetwork());
+        };
+
+        auto loadNetworkFromAOT = [&] () {
+            plugin.ImportNetwork(_env.importedModelFileName);
+        };
+
+        TBlob<float>::Ptr input, output;
+        size_t in_N = 1;
+        size_t out_N = in_N;
+        size_t in_C;
+        size_t out_C;
+
+
+        auto loadNetwork = [&]() {
+            if (!_env.importedModelFileName.empty()) {
+                ASSERT_NO_FATAL_FAILURE(loadNetworkFromAOT());
+            } else {
+                ASSERT_NO_FATAL_FAILURE(loadNetworkFromIR());
+            }
+            in_C = _env.matchOutput == true ? _env.input_init.size(): inputSize;
+            out_C = _env.matchOutput == true ? _env.expected_output.size(): outputSize;
+
+            input.reset(new TBlob<float>(Precision::FP32, NC, {in_C, in_N}));
+            input->allocate();
+
+            if(_env.matchOutput == true) {
+                std::copy_n(_env.input_init.cbegin(), in_N * in_C, input->buffer().as<float *>());
+            }
+
+            output.reset(new TBlob<float>(Precision::FP32, NC, {out_C, out_N}));
+            output->allocate();
+        };
+
+
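+        // Set up GNA API mock expectations only when targeting the GNA device
+        // and the scenario is not expected to throw.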
+        StrictMock<GNACppApi> mockApi;
+        std::vector<uint8_t> data;
+
+        if (_env.target_device == InferenceEngine::TargetDevice::eGNA &&
+                                                         !_env.matchThrows) {
+
+            EXPECT_CALL(mockApi, GNAAlloc(_,_,_)).WillOnce(Invoke([&data](
+                intel_gna_handle_t nGNADevice,   // handle to GNA accelerator
+                uint32_t           sizeRequested,
+                uint32_t*          sizeGranted
+            ) {
+                data.resize(sizeRequested);
+                *sizeGranted = sizeRequested;
+                return &data.front();
+            }));
+            EXPECT_CALL(mockApi, GNADeviceOpenSetThreads(_, _)).WillOnce(Return(1));
+
+            if(_env.is_profiling_enabled == false) {
+                EXPECT_CALL(mockApi, GNAWait(_, _, _)).WillOnce(Return(GNA_NOERROR));
+            } else {
+                EXPECT_CALL(mockApi, GNAWaitPerfRes(_, _, _, _)).WillOnce(Return(GNA_NOERROR));
+            }
+
+            if(_env.is_setup_of_omp_theads_expected == true) {
+                EXPECT_CALL(mockApi, gmmSetThreads(_)).Times(1);
+            } else {
+                EXPECT_CALL(mockApi, gmmSetThreads(_)).Times(0);
+            }
+
+            std::unique_ptr<NNetComponentMatcher> combined(new NNetComponentMatcher());
+
+            for (auto & matchWhat : _env.whatToMatch) {
+                switch(matchWhat) {
+                    case GnaPluginTestEnvironment::matchPrecision :
+                        combined->add(new NNetPrecisionMatcher(_env.nnet_precision, INTEL_AFFINE));
+                        break;
+                    case GnaPluginTestEnvironment::matchProcType :
+                        EXPECT_CALL(mockApi, GNAPropagateForward(_, _, _, _, _, Eq(_env.proc_type)))
+                            .WillOnce(Return(GNA_NOERROR));
+                        break;
+                    case GnaPluginTestEnvironment::matchPwlInserted :
+                        combined->add(new PWLMatcher(_env.matchInserted, _env.matchQuantity));
+                        break;
+                    case GnaPluginTestEnvironment::matchConvInserted:
+                        combined->add(new ConvoluionLayerMatcher(_env.matchInserted, _env.matchQuantity));
+                        break;
+                    case GnaPluginTestEnvironment::matchMaxPoolingInserted:
+                        combined->add(new PoolingLayerMatcher(_env.matchInserted, _env.matchQuantity, true));
+                        break;
+                    case GnaPluginTestEnvironment::matchPwlQuantizeMetrics :
+                        combined->add(new PWLQuantizationMetricsMatcher(_env.type,
+                                                                        _env.quantization_presicion_threshold,
+                                                                        _env.quantization_segments_threshold));
+                        break;
+                    case GnaPluginTestEnvironment::matchCopyInserted :
+                        combined->add(new CopyLayerMatcher(_env.matchInserted, _env.matchQuantity));
+                        break;
+                    case GnaPluginTestEnvironment::matchDiagonalInserted :
+                        combined->add(new DiagLayerMatcher(_env.matchInserted, _env.matchQuantity));
+                        break;
+                    case GnaPluginTestEnvironment::saveArgs :
+                        EXPECT_CALL(mockApi, GNAPropagateForward(_, _, _, _, _, _))
+                            .WillOnce(DoAll(SaveArgPointee<1>(savedNet), Return(GNA_NOERROR)));
+                        break;
+                    default:
+                        EXPECT_CALL(mockApi, GNAPropagateForward(_, _, _, _, _, _))
+                            .WillOnce(Return(GNA_NOERROR));
+                        break;
+                }
+            }
+            if (combined && !combined->empty()) {
+                EXPECT_CALL(mockApi, GNAPropagateForward(_, ::testing::MakeMatcher(combined.release()), _, _, _,_)).WillOnce(Return(GNA_NOERROR));
+            }
+        }
+
+        loadNetwork();
+        plugin.Infer(*input, *output);
+        if(_env.matchOutput == true) {
+            std::vector<float> actual_output(output->size());
+
+            std::copy_n(output->cbuffer().as<float *>(), out_C * out_N, actual_output.begin());
+
+            ASSERT_EQ(true,
+                    std::equal(_env.expected_output.begin(), _env.expected_output.end(), actual_output.begin())
+                  );
+        }
+
+        std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> perfMap;
+        plugin.GetPerformanceCounts(perfMap);
+
+        if (_env.is_profiling_enabled) {
+            ASSERT_FALSE(perfMap.empty());
+        } else {
+            ASSERT_TRUE(perfMap.empty());
+        }
+
+    }
+    catch(std::exception &ex) {
+        if (!_env.matchThrows) {
+            FAIL() << ex.what();
+        }
+    }
+    catch(...) {
+        if (!_env.matchThrows) {
+            FAIL() << "unknown exception thrown";
+        }
+    }
+
+}
+
+void GNAPluginCreationMatcher :: match() {
+    if (_env.matchThrows) {
+        ASSERT_ANY_THROW(GNAPlugin(_env.config));
+        return;
+    }
+    GNAPlugin(_env.config);
+}
+
+
+void GNAPluginAOTMatcher :: match() {
+    // matching gna_propagate forward call.
+    MockICNNNetwork net;
+    CNNNetReader net_reader;
+    ASSERT_NO_THROW_IE_EXCEPTION(net_reader.ReadNetwork(_env.model.data(), _env.model.length()));
+
+    size_t weightsSize = 440*3;
+
+    auto weights = make_shared_blob<uint8_t >(Precision::U8, C, {weightsSize});
+    weights->allocate();
+    GNATest::fillWeights(weights);
+    net_reader.SetWeights(weights);
+
+    GNAPlugin plugin(_env.config);
+
+    TBlob<float> input(Precision::FP32, NC, {10, 1});
+    input.allocate();
+
+
+    TBlob<float> output(Precision::FP32, NC, {10, 1});
+    output.allocate();
+
+    net_reader.getNetwork().setTargetDevice(TargetDevice::eGNA);
+
+    if (_env.cb) {
+        auto network = net_reader.getNetwork();
+        _env.cb(network);
+    }
+
+    GNACppApi mockApi;
+    std::vector<uint8_t> data(10000);
+    EXPECT_CALL(mockApi, GNAAlloc(_,_,_)).WillOnce(DoAll(SetArgPointee<2>(10000), Return(&data.front())));
+    EXPECT_CALL(mockApi, GNADeviceOpenSetThreads(_, _)).WillOnce(Return(1));
+
+    plugin.LoadNetwork(net_reader.getNetwork());
+    plugin.Export(_env.exportedModelFileName);
+}
+
+
+void GNADumpXNNMatcher::load(GNAPlugin & plugin) {
+
+    // matching gna DumpXNN forward call.
+    plugin = GNAPlugin(_env.config);
+
+    auto loadNetworkFromIR = [&]() {
+        MockICNNNetwork net;
+        CNNNetReader net_reader;
+        ASSERT_NO_THROW_IE_EXCEPTION(net_reader.ReadNetwork(_env.model.data(), _env.model.length()));
+
+        size_t weightsSize = 440 * 3;
+
+        auto weights = make_shared_blob<uint8_t>(Precision::U8, C, {weightsSize});
+        weights->allocate();
+        GNATest::fillWeights(weights);
+        net_reader.SetWeights(weights);
+
+        net_reader.getNetwork().setTargetDevice(TargetDevice::eGNA);
+
+        if (_env.cb) {
+            auto network = net_reader.getNetwork();
+            _env.cb(network);
+        }
+
+        plugin.LoadNetwork(net_reader.getNetwork());
+    };
+
+    auto loadNetworkFromAOT = [&]() {
+        plugin.ImportNetwork(_env.importedModelFileName);
+    };
+
+    auto loadNetwork = [&]() {
+        if (!_env.importedModelFileName.empty()) {
+            loadNetworkFromAOT();
+        } else {
+            loadNetworkFromIR();
+        }
+    };
+
+    loadNetwork();
+}
+
+void GNADumpXNNMatcher::match() {
+
+    GNACppApi mockApi;
+    std::vector<uint8_t> data(10000);
+    if (!_env.matchThrows) {
+
+        EXPECT_CALL(mockApi, GNAAlloc(_,_,_)).WillOnce(DoAll(SetArgPointee<2>(10000), Return(&data.front())));
+        EXPECT_CALL(mockApi, GNADeviceOpenSetThreads(_, _)).WillOnce(Return(1));
+        intel_gna_model_header header = {};
+        header.model_size = 1;
+        EXPECT_CALL(mockApi, GNADumpXnn(_, _, _, _, _,_)).WillOnce(DoAll(SetArgPointee<3>(header), Return((void*)::operator new(1))));
+        EXPECT_CALL(mockApi, GNAFree(_)).WillOnce(Return(GNA_NOERROR));
+        EXPECT_CALL(mockApi, GNADeviceClose(_)).WillOnce(Return(GNA_NOERROR));
+    }
+
+    try {
+        // matching gna DumpXNN forward call.
+        GNAPluginNS::GNAPlugin plugin;
+        load(plugin);
+    }
+    catch(std::exception &ex) {
+        if (!_env.matchThrows) {
+            FAIL() << ex.what();
+        }
+    }
+    catch(...) {
+        if (!_env.matchThrows) {
+            FAIL() << "unknown exception thrown";
+        }
+    }
+
+}
+
+void GNAQueryStateMatcher :: match() {
+
+   //  TODO : avoid copy pastes
+    GNACppApi mockApi;
+    std::vector<uint8_t> data(10000);
+
+    std::shared_ptr<IExecutableNetworkInternal> executer;
+    auto loadNetworkFromIR = [&]() {
+        MockICNNNetwork net;
+        CNNNetReader net_reader;
+        ASSERT_NO_THROW_IE_EXCEPTION(net_reader.ReadNetwork(_env.model.data(), _env.model.length()));
+
+        size_t weightsSize = 440 * 3;
+
+        auto weights = make_shared_blob<uint8_t>(Precision::U8, C, {weightsSize});
+        weights->allocate();
+        GNATest::fillWeights(weights);
+        net_reader.SetWeights(weights);
+
+        net_reader.getNetwork().setTargetDevice(TargetDevice::eGNA);
+
+        if (_env.cb) {
+            auto network = net_reader.getNetwork();
+            _env.cb(network);
+        }
+
+        executer.reset(new GNAExecutableNetwork(net_reader.getNetwork(), _env.config));
+    };
+
+    auto loadNetworkFromAOT = [&]() {
+        executer.reset(new GNAExecutableNetwork(_env.importedModelFileName, _env.config));
+    };
+
+    auto loadNetwork = [&]() {
+        if (!_env.importedModelFileName.empty()) {
+            return loadNetworkFromAOT();
+        } else {
+            return loadNetworkFromIR();
+        }
+    };
+
+
+    EXPECT_CALL(mockApi, GNAAlloc(_,_,_)).WillOnce(DoAll(SetArgPointee<2>(10000), Return(&data.front())));
+    EXPECT_CALL(mockApi, GNADeviceOpenSetThreads(_, _)).WillOnce(Return(1));
+    EXPECT_CALL(mockApi, GNAFree(_)).WillOnce(Return(GNA_NOERROR));
+    EXPECT_CALL(mockApi, GNADeviceClose(_)).WillOnce(Return(GNA_NOERROR));
+
+    try {
+        loadNetwork();
+        if (GnaPluginTestEnvironment::kAnyNotNull == _env.numberOfStates) {
+            auto states = executer->QueryState();
+            ASSERT_NE(states.size(), 0);
+            // each state should support Reset()
+            for (auto & state : states) {
+                state->Reset();
+            }
+        } else if (_env.numberOfStates >= 0) {
+            ASSERT_EQ(executer->QueryState().size(), _env.numberOfStates);
+        } else {
+            FAIL() << "number of memory states expectation not set";
+        }
+
+    }
+    catch(std::exception &ex) {
+        FAIL() << ex.what();
+    }
+    catch(...) {
+        FAIL() << "unknown exception thrown";
+    }
+}
\ No newline at end of file
diff --git a/inference-engine/tests/unit/engines/gna/gna_matcher.hpp b/inference-engine/tests/unit/engines/gna/gna_matcher.hpp
new file mode 100644 (file)
index 0000000..b249aa2
--- /dev/null
@@ -0,0 +1,490 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <limits>
+#include <inference_engine/graph_tools.hpp>
+#include "gtest/gtest.h"
+#include "inference_engine.hpp"
+#include "gna/gna_config.hpp"
+#include "gna_plugin.hpp"
+#include "gna-api.h"
+#include "test_irs.hpp"
+#include "dnn.h"
+
+
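+// shorthand: expands the key through GNA_CONFIG_KEY before delegating to withGNAConfig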
+#define withConfig(key, value) withGNAConfig(GNA_CONFIG_KEY(key), value)
+#define ASSERT_NO_THROW_IE_EXCEPTION(expr) \
+    try { \
+        expr; \
+    } catch(std::exception & e) { \
+        FAIL() << e.what(); \
+    } catch(...) { \
+        FAIL() << "unknown exception"; \
+    }
+
+/**
+ * GNA unit tests environment
+ */
+class GnaPluginTestEnvironment {
+ public:
+    struct NnetPrecision {
+        InferenceEngine::Precision input_precision;
+        InferenceEngine::Precision output_precision;
+        InferenceEngine::Precision weights_precision;
+        InferenceEngine::Precision biases_precision;
+    };
+    enum MatchWhat {
+        exactNNetStructure,
+        matchNone,
+        matchProcType,
+        matchPrecision,
+        matchPwlInserted,
+        matchConvInserted,
+        matchMaxPoolingInserted,
+        matchPwlQuantizeMetrics,
+        matchCopyInserted,
+        matchDiagonalInserted,
+        saveArgs
+    };
+    std::vector<MatchWhat> whatToMatch;
+    enum {
+        kUnset = -1,
+        kAnyNotNull = -2
+    };
+    InferenceEngine::TargetDevice target_device =
+                            InferenceEngine::TargetDevice::eGNA;
+    int matchQuantity = kUnset;
+    int numberOfStates = kUnset;
+    bool matchInserted = true;
+    NnetPrecision nnet_precision;
+    float quantization_presicion_threshold = 1.0f;
+    uint16_t quantization_segments_threshold = UINT16_MAX;
+    uint32_t type = 0;
+    std::string model;
+    std::string exportedModelFileName;
+    bool exportNetworkOnly = false;
+    std::function<void (InferenceEngine::CNNNetwork &)> cb;
+    std::map<std::string, std::string> config;
+    bool matchThrows = false;
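+    // GNA_SOFTWARE & GNA_HARDWARE is the combination the proc-type tests below treat as GNA_SW_EXACT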
+    uint32_t proc_type = static_cast<intel_gna_proc_t>(GNA_SOFTWARE & GNA_HARDWARE);
+    std::string importedModelFileName;
+    bool is_profiling_enabled = false;
+    bool matchOutput = false;
+    bool is_setup_of_omp_theads_expected = false;
+    std::vector<float> input_init;
+    std::vector<float> expected_output;
+};
+
+class GNATestBase {
+ public:
+    virtual ~GNATestBase() = default;
+};
+
+template <class T>
+class GNATestConfigurability : public GNATestBase{
+ protected:
+    bool needNextMatcher = true;
+    GnaPluginTestEnvironment _env;
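+    // returns the current matcher slot; And() arms needNextMatcher so the next call appends a fresh one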
+    GnaPluginTestEnvironment::MatchWhat & getMatcher() {
+        if (needNextMatcher) {
+            needNextMatcher = false;
+            _env.whatToMatch.push_back({});
+        }
+        return _env.whatToMatch.back();
+    }
+ public:
+    GNATestConfigurability(GnaPluginTestEnvironment env) : _env(env) {
+    }
+    T & And() {
+        needNextMatcher = true;
+        return *dynamic_cast<T*>(this);
+    }
+    template <class VType>
+    T & withGNAConfig(const std::string keyName, const VType &value) {
+        std::stringstream ss;
+        ss << value;
+        _env.config[keyName] = ss.str();
+        return *dynamic_cast<T*>(this);
+    }
+    T & withGNADeviceMode(std::string value) {
+        _env.config[GNA_CONFIG_KEY(DEVICE_MODE)] = value;
+        return *dynamic_cast<T*>(this);
+    }
+    T & withAcceleratorThreadsNumber(std::string value) {
+        _env.config[GNA_CONFIG_KEY(LIB_N_THREADS)] = value;
+        return *dynamic_cast<T*>(this);
+    }
+    T & throws() {
+        _env.matchThrows = true;
+        return *dynamic_cast<T*>(this);
+    }
+    T & profiling_counters() {
+        _env.is_profiling_enabled = true;
+        _env.config[CONFIG_KEY(PERF_COUNT)] = InferenceEngine::PluginConfigParams::YES;
+        return *dynamic_cast<T*>(this);
+    }
+
+    T & enable_omp_multithreading() {
+        _env.is_setup_of_omp_theads_expected = true;
+        _env.config[CONFIG_KEY(SINGLE_THREAD)] = InferenceEngine::PluginConfigParams::NO;
+        return *dynamic_cast<T*>(this);
+    }
+};
+
+/**
+ * @brief matches LoadNetwork + Infer + the resulting GNA API propagate-forward call
+ */
+class GNAPropagateMatcher : public GNATestConfigurability<GNAPropagateMatcher> {
+ public:
+    using base = GNATestConfigurability<GNAPropagateMatcher>;
+    using base::base;
+    using base::getMatcher;
+
+    ~GNAPropagateMatcher() {
+        match();
+    }
+
+    GNAPropagateMatcher & called() {
+        // inserting default matcher that matches any propagate_forward call
+        getMatcher();
+        return *this;
+    }
+
+    GNAPropagateMatcher & called_with() {
+        return *this;
+    }
+
+    GNAPropagateMatcher & called_without() {
+        _env.matchInserted = false;
+        return *this;
+    }
+
+    GNAPropagateMatcher & called_with_input_and_expected_output(std::vector<float>& input_data,
+                                                                std::vector<float>& expect) {
+        _env.matchOutput = true;
+        _env.input_init = input_data;
+        _env.expected_output = expect;
+        return *this;
+    }
+
+    GNAPropagateMatcher & once() {
+        _env.matchQuantity = 1;
+        return *this;
+    }
+
+    GNAPropagateMatcher & twice() {
+        _env.matchQuantity = 2;
+        return *this;
+    }
+
+    GNAPropagateMatcher & args(std::string args) {
+        return *this;
+    }
+
+    GNAPropagateMatcher & exact_nnet_structure(intel_nnet_type_t * pNet) {
+
+        getMatcher() = GnaPluginTestEnvironment::exactNNetStructure;
+        original_nnet = pNet;
+        return *this;
+    }
+
+    GNAPropagateMatcher & pwl_inserted_into_nnet() {
+        getMatcher() = GnaPluginTestEnvironment::matchPwlInserted;
+        return *this;
+    }
+
+    GNAPropagateMatcher & max_pooling_inserted_into_nnet() {
+        getMatcher() = GnaPluginTestEnvironment::matchMaxPoolingInserted;
+        return *this;
+    }
+
+    GNAPropagateMatcher & succeed() {
+        return *this;
+    }
+
+    GNAPropagateMatcher & convolution_inserted_into_nnet() {
+        getMatcher() = GnaPluginTestEnvironment::matchConvInserted;
+        return *this;
+    }
+
+
+    GNAPropagateMatcher & pwl_quantization_activation(uint32_t activation_type) {
+        getMatcher() = GnaPluginTestEnvironment::matchPwlQuantizeMetrics;
+        _env.type = activation_type;
+        return *this;
+    }
+
+    GNAPropagateMatcher & pwl_quantization_precision_threshold(float threshold) {
+        getMatcher() = GnaPluginTestEnvironment::matchPwlQuantizeMetrics;
+        _env.quantization_presicion_threshold = threshold;
+        return *this;
+    }
+
+    GNAPropagateMatcher & pwl_quantization_segments_threshold(uint16_t threshold) {
+        getMatcher() = GnaPluginTestEnvironment::matchPwlQuantizeMetrics;
+        _env.quantization_segments_threshold = threshold;
+        return *this;
+    }
+
+    GNAPropagateMatcher & diagonal_inserted_into_nnet() {
+        getMatcher() = GnaPluginTestEnvironment::matchDiagonalInserted;
+        return *this;
+    }
+
+    GNAPropagateMatcher & copy_inserted_into_nnet() {
+        getMatcher() = GnaPluginTestEnvironment::matchCopyInserted;
+        return *this;
+    }
+
+    GNAPropagateMatcher & nnet_input_precision(const InferenceEngine::Precision &precision) {
+        getMatcher() = GnaPluginTestEnvironment::matchPrecision;
+        _env.nnet_precision.input_precision = precision;
+        return *this;
+    }
+    GNAPropagateMatcher & nnet_ouput_precision(const InferenceEngine::Precision &precision) {
+        getMatcher() = GnaPluginTestEnvironment::matchPrecision;
+        _env.nnet_precision.output_precision = precision;
+        return *this;
+    }
+    GNAPropagateMatcher & nnet_weights_precision(const InferenceEngine::Precision &precision) {
+        getMatcher() = GnaPluginTestEnvironment::matchPrecision;
+        _env.nnet_precision.weights_precision = precision;
+        return *this;
+    }
+    GNAPropagateMatcher & nnet_biases_precision(const InferenceEngine::Precision &precision) {
+        getMatcher() = GnaPluginTestEnvironment::matchPrecision;
+        _env.nnet_precision.biases_precision = precision;
+        return *this;
+    }
+
+    GNAPropagateMatcher & proc_type(uint32_t proc_type) {
+        getMatcher() = GnaPluginTestEnvironment::matchProcType;
+        _env.proc_type = proc_type;
+        return *this;
+    }
+
+    GNAPropagateMatcher & to(intel_nnet_type_t *savedNet) {
+        this->savedNet = savedNet;
+        return *this;
+    }
+
+    GNAPropagateMatcher & onCPU() {
+        _env.target_device = InferenceEngine::TargetDevice::eCPU;
+        return *this;
+    }
+ protected:
+    void match();
+    intel_nnet_type_t * original_nnet = nullptr;
+    intel_nnet_type_t * savedNet = nullptr;
+};
+
+
+/**
+ * @brief matches the GNAPlugin creation-only case
+ */
+class GNAPluginCreationMatcher : public GNATestConfigurability<GNAPluginCreationMatcher> {
+ public:
+    using base = GNATestConfigurability<GNAPluginCreationMatcher>;
+    using base::base;
+
+    GNAPluginCreationMatcher & gna_plugin() {
+        return *this;
+    }
+    ~GNAPluginCreationMatcher() {
+        match();
+    }
+ protected:
+    void match();
+};
+
+/**
+ * @brief matches the ahead-of-time (AOT) model export case
+ */
+class GNAPluginAOTMatcher : public GNATestConfigurability<GNAPluginAOTMatcher> {
+ public:
+    using base = GNATestConfigurability<GNAPluginAOTMatcher>;
+    using base::base;
+
+    ~GNAPluginAOTMatcher() {
+        match();
+    }
+ protected:
+    void match();
+};
+
+/**
+ * @brief matches the DumpXnn API path
+ */
+class GNADumpXNNMatcher : public GNATestConfigurability<GNADumpXNNMatcher> {
+ public:
+    using base = GNATestConfigurability<GNADumpXNNMatcher>;
+    using base::base;
+
+    ~GNADumpXNNMatcher() {
+        if (match_in_dctor) {
+            match();
+        }
+    }
+    GNADumpXNNMatcher& called() {
+        return *this;
+    }
+ protected:
+
+    bool match_in_dctor = true;
+    void load(GNAPluginNS::GNAPlugin & plugin);
+    void match();
+};
+
+/**
+ * @brief matches QueryState API behaviour
+ */
+class GNAQueryStateMatcher : public GNADumpXNNMatcher {
+ public:
+    using base = GNADumpXNNMatcher;
+    using base::base;
+
+    ~GNAQueryStateMatcher() {
+        if (match_in_dctor) {
+            match();
+            match_in_dctor = false;
+        }
+    }
+    void isEmpty() {
+        _env.numberOfStates = 0;
+    }
+    void isNotEmpty() {
+        _env.numberOfStates = GnaPluginTestEnvironment::kAnyNotNull;
+    }
+
+ protected:
+    void match();
+};
+
+
+
+/**
+ * @brief base for test fixture
+ */
+class GNATest : public ::testing::Test, public GNATestConfigurability<GNATest>  {
+    using base = GNATestConfigurability<GNATest>;
+    using base::_env;
+    std::list<std::vector<uint8_t>> dataUsedInMatchers;
+    std::list<std::shared_ptr<GNATestBase>> returnedMatchers;
+
+ public:
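+    // buffers live in a std::list so references returned by earlier storage<T>() calls stay valid as new ones are added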
+    template <class T>
+    T & storage () {
+        dataUsedInMatchers.push_back(std::vector<uint8_t >(sizeof(T)));
+        return *reinterpret_cast<T*> (&dataUsedInMatchers.back().front());
+    }
+    GNATest()  : base(GnaPluginTestEnvironment()) {}
+    GNATest & as() {
+        return *this;
+    }
+    GNATest & model() {
+        return *this;
+    }
+    GNATest & assert_that() {
+        return *this;
+    }
+    GNATest & export_network(std::string modelName) {
+        _env.model = modelName;
+        _env.exportNetworkOnly = true;
+        return *this;
+    }
+    GNATest & save_args() {
+        getMatcher() = GnaPluginTestEnvironment::saveArgs;
+        return *this;
+    }
+
+    GNATest & onInfer1AFModel() {
+        _env.model = GNATestIRs::Fc2DOutputModel();
+        return *this;
+    }
+    GNATest & onLoad(std::string _model) {
+        _env.model = _model;
+        return *this;
+    }
+    GNATest & afterLoadingModel(std::string _model) {
+        _env.model = _model;
+        return *this;
+    }
+
+    GNAQueryStateMatcher & queryState() {
+        returnedMatchers.push_back(std::make_shared<GNAQueryStateMatcher>(_env));
+        // clearing env
+        _env = GnaPluginTestEnvironment();
+        return dynamic_cast<GNAQueryStateMatcher&>(*returnedMatchers.back());
+    }
+
+    /** importing indicates that no inference happened at all */
+    GNAPropagateMatcher & importingModelFrom(std::string fileName) {
+        _env.importedModelFileName = fileName;
+        returnedMatchers.push_back(std::make_shared<GNAPropagateMatcher>(_env));
+        // clearing env
+        _env = GnaPluginTestEnvironment();
+        return dynamic_cast<GNAPropagateMatcher&>(*returnedMatchers.back());
+    }
+    GNATest & importedFrom(std::string fileName) {
+        _env.importedModelFileName = fileName;
+        return *this;
+    }
+    GNATest & onInferModel(std::string _model = "",
+                           std::function<void (InferenceEngine::CNNNetwork &)> _cb = [](InferenceEngine::CNNNetwork & net){}) {
+        _env.model = _model;
+        _env.cb = _cb;
+        return *this;
+    }
+    GNATest & gna() {
+        return *this;
+    }
+    GNATest & from() {
+        return *this;
+    }
+    GNATest & inNotCompactMode() {
+        _env.config[GNA_CONFIG_KEY(COMPACT_MODE)] = CONFIG_VALUE(NO);
+        return *this;
+    }
+    GNATest & withUniformPWLAlgo() {
+        base::_env.config[GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN)] = CONFIG_VALUE(YES);
+        return *this;
+    }
+    GNAPropagateMatcher& propagate_forward() {
+        returnedMatchers.push_back(std::make_shared<GNAPropagateMatcher>(_env));
+        // clearing env
+        _env = GnaPluginTestEnvironment();
+        return dynamic_cast<GNAPropagateMatcher&>(*returnedMatchers.back());
+    }
+    GNADumpXNNMatcher& dumpXNN() {
+        returnedMatchers.push_back(std::make_shared<GNADumpXNNMatcher>(_env));
+        // clearing env
+        _env = GnaPluginTestEnvironment();
+        return dynamic_cast<GNADumpXNNMatcher&>(*returnedMatchers.back());
+    }
+    GNATest & withNanScaleFactor() {
+        base::_env.config[GNA_CONFIG_KEY(SCALE_FACTOR)] = std::to_string(std::numeric_limits<float>::quiet_NaN());
+        return *this;
+    }
+    GNATest & withInfScaleFactor() {
+        base::_env.config[GNA_CONFIG_KEY(SCALE_FACTOR)] = std::to_string(std::numeric_limits<float>::infinity());
+        return *this;
+    }
+    GNAPluginCreationMatcher creating() {
+        return _env;
+    }
+
+    GNAPluginAOTMatcher & to (std::string fileName) {
+        _env.exportedModelFileName = fileName;
+        returnedMatchers.push_back(std::make_shared<GNAPluginAOTMatcher>(_env));
+        // clearing env
+        _env = GnaPluginTestEnvironment();
+        return dynamic_cast<GNAPluginAOTMatcher&>(*returnedMatchers.back());
+    }
+
+    static void fillWeights(InferenceEngine::Blob::Ptr weights, float value = 1) {
+        std::fill_n(weights->buffer().as<float*>(), weights->byteSize()/sizeof(float), value);
+    }
+};
diff --git a/inference-engine/tests/unit/engines/gna/gna_memory_test.cpp b/inference-engine/tests/unit/engines/gna/gna_memory_test.cpp
new file mode 100644 (file)
index 0000000..aaf0f57
--- /dev/null
@@ -0,0 +1,440 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include <gtest/gtest.h>
+#include "gna_plugin/gna_memory.hpp"
+
+using namespace GNAPluginNS;
+
+class GNAMemoryTest : public ::testing::Test {
+
+ protected:
+    GNAMemory<std::allocator<uint8_t>> mem;
+
+    void SetUp() override  {
+    }
+};
+
+TEST_F(GNAMemoryTest, canStoreActualBlob){
+    float input [] = {1,2,3};
+    float* pFuture = nullptr;
+    size_t len = sizeof(input);
+
+    mem.push_ptr(&pFuture, input, len);
+    mem.commit();
+
+    ASSERT_NE(pFuture, nullptr);
+    ASSERT_NE(pFuture, input);
+    ASSERT_EQ(pFuture[0], 1);
+    ASSERT_EQ(pFuture[1], 2);
+    ASSERT_EQ(pFuture[2], 3);
+}
+
+TEST_F(GNAMemoryTest, canStore2Blobs) {
+    float input [] = {1,2,3,4};
+    float* pFuture = nullptr;
+    float* pFuture2 = nullptr;
+
+    mem.push_ptr(&pFuture, input, 3*4);
+    mem.push_ptr(&pFuture2, input+1, 3*4);
+    mem.commit();
+
+    ASSERT_NE(pFuture, input);
+    ASSERT_NE(pFuture2, input);
+    ASSERT_EQ(pFuture + 3, pFuture2);
+
+    ASSERT_EQ(pFuture[0], 1);
+    ASSERT_EQ(pFuture[1], 2);
+    ASSERT_EQ(pFuture[2], 3);
+    ASSERT_EQ(pFuture[3], 2);
+    ASSERT_EQ(pFuture[4], 3);
+    ASSERT_EQ(pFuture[5], 4);
+}
+
+TEST_F(GNAMemoryTest, canStoreBlobsALIGNED) {
+    float input [] = {1,2,3,4,5,6,7,8};
+    float* pFuture = nullptr;
+
+    mem.push_ptr(&pFuture, input, 3*4, 8);
+    mem.commit();
+
+    ASSERT_EQ(16, mem.getTotalBytes());
+
+    ASSERT_NE(pFuture, input);
+    ASSERT_NE(pFuture, nullptr);
+
+    ASSERT_EQ(pFuture[0], 1);
+    ASSERT_EQ(pFuture[1], 2);
+    ASSERT_EQ(pFuture[2], 3);
+    // the next element is unlikely to match unless the data was actually copied
+    ASSERT_NE(pFuture[3], 4);
+}
+
+TEST_F(GNAMemoryTest, canStore2BlobsALIGNED) {
+    float input [] = {1,2,3,4,5,6,7,8};
+    float* pFuture = nullptr;
+    float* pFuture2 = nullptr;
+
+    mem.push_ptr(&pFuture, input, 3*4, 8);
+    mem.push_ptr(&pFuture2, input, 3*4, 16);
+    mem.commit();
+
+    ASSERT_EQ(32, mem.getTotalBytes());
+
+    ASSERT_NE(pFuture, nullptr);
+
+    ASSERT_EQ(pFuture[0], 1);
+    ASSERT_EQ(pFuture[1], 2);
+    ASSERT_EQ(pFuture[2], 3);
+    // the second blob lands at its 16-byte-aligned offset (float index 4)
+    ASSERT_EQ(pFuture[4], 1);
+    ASSERT_EQ(pFuture[5], 2);
+    ASSERT_EQ(pFuture[6], 3);
+
+}
+
+TEST_F(GNAMemoryTest, canReserveData) {
+
+    float* pFuture = nullptr;
+    mem.reserve_ptr(&pFuture, 3*4);
+    mem.commit();
+
+    ASSERT_NE(pFuture, nullptr);
+}
+
+TEST_F(GNAMemoryTest, canReserveDataByVoid) {
+    mem.reserve_ptr(nullptr, 3*4);
+    ASSERT_NO_THROW(mem.commit());
+}
+
+
+TEST_F(GNAMemoryTest, canReserveAndPushData) {
+
+    float input[] = {1, 2, 3};
+    float *pFuture = nullptr;
+    float* pFuture2 = nullptr;
+    size_t len = sizeof(input) ;
+
+    mem.push_ptr(&pFuture, input, len);
+    mem.reserve_ptr(&pFuture2, 3*4);
+    mem.commit();
+
+    ASSERT_NE(pFuture, nullptr);
+    ASSERT_NE(pFuture2, nullptr);
+    ASSERT_NE(pFuture, input);
+    ASSERT_NE(pFuture2, pFuture);
+
+    pFuture2[0] = -1;
+    pFuture2[1] = -1;
+    pFuture2[2] = -1;
+
+    ASSERT_EQ(pFuture[0], 1);
+    ASSERT_EQ(pFuture[1], 2);
+    ASSERT_EQ(pFuture[2], 3);
+}
+
+TEST_F(GNAMemoryTest, canBindAndResolve) {
+
+    float input[] = {1, 2, 3};
+    float *pFuture = nullptr;
+    float *pFuture2 = nullptr;
+    float *pFuture3 = nullptr;
+    size_t len = sizeof(input);
+
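+    // bind_ptr ties an output pointer to another future pointer; both resolve to the same address at commit()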
+    mem.bind_ptr(&pFuture3, &pFuture);
+    mem.push_ptr(&pFuture, input, len);
+    mem.bind_ptr(&pFuture2, &pFuture);
+
+    mem.commit();
+
+    ASSERT_NE(pFuture, input);
+    ASSERT_NE(pFuture2, nullptr);
+    ASSERT_EQ(pFuture2, pFuture);
+    ASSERT_EQ(pFuture3, pFuture);
+
+    ASSERT_EQ(pFuture2[0], 1);
+    ASSERT_EQ(pFuture2[1], 2);
+    ASSERT_EQ(pFuture2[2], 3);
+}
+
+TEST_F(GNAMemoryTest, canBindTransitivelyAndResolve) {
+
+    float input[] = {1, 2, 3};
+    float *pFuture = nullptr;
+    float *pFuture3 = nullptr;
+    float *pFuture4 = nullptr;
+    size_t len = sizeof(input);
+
+    mem.bind_ptr(&pFuture4, &pFuture3);
+    mem.bind_ptr(&pFuture3, &pFuture);
+    mem.push_ptr(&pFuture, input, len);
+
+    mem.commit();
+
+    ASSERT_NE(pFuture, input);
+    ASSERT_EQ(pFuture3, pFuture);
+    ASSERT_EQ(pFuture4, pFuture);
+
+    ASSERT_NE(pFuture4, nullptr);
+
+    ASSERT_EQ(pFuture4[0], 1);
+    ASSERT_EQ(pFuture4[1], 2);
+    ASSERT_EQ(pFuture4[2], 3);
+}
+
+TEST_F(GNAMemoryTest, canBindTransitivelyWithOffsetsAndResolve) {
+
+    float input[] = {1, 2, 3};
+    float *pFuture = nullptr;
+    float *pFuture3 = nullptr;
+    float *pFuture4 = nullptr;
+    size_t len = sizeof(input);
+
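+    // each bind adds a 4-byte offset, so the chained pointers resolve one and two floats past pFuture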
+    mem.bind_ptr(&pFuture4, &pFuture3, 4);
+    mem.bind_ptr(&pFuture3, &pFuture, 4);
+    mem.push_ptr(&pFuture, input, len);
+
+    mem.commit();
+
+    ASSERT_NE(pFuture, input);
+    ASSERT_EQ(pFuture3, pFuture + 1);
+    ASSERT_EQ(pFuture4, pFuture + 2);
+
+    ASSERT_NE(pFuture, nullptr);
+
+    ASSERT_EQ(pFuture[0], 1);
+    ASSERT_EQ(pFuture[1], 2);
+    ASSERT_EQ(pFuture[2], 3);
+}
+
+TEST_F(GNAMemoryTest, canBindWithOffsetAndResolve) {
+
+    float input[] = {1, 2, 3};
+    float *pFuture = nullptr;
+    float *pFuture2 = nullptr;
+    float *pFuture3 = nullptr;
+    size_t len = sizeof(input);
+
+    mem.bind_ptr(&pFuture3, &pFuture, 4);
+    mem.push_ptr(&pFuture, input, len);
+    mem.bind_ptr(&pFuture2, &pFuture);
+
+    mem.commit();
+
+    ASSERT_NE(pFuture, input);
+    ASSERT_NE(pFuture2, nullptr);
+    ASSERT_EQ(pFuture2, pFuture);
+    ASSERT_NE(pFuture3, nullptr);
+    ASSERT_EQ(pFuture3, pFuture + 1);
+
+    ASSERT_EQ(pFuture2[0], 1);
+    ASSERT_EQ(pFuture2[1], 2);
+    ASSERT_EQ(pFuture2[2], 3);
+    ASSERT_EQ(pFuture3[0], 2);
+}
+
+
+TEST_F(GNAMemoryTest, canPushLocal) {
+
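+    // self-referencing init: pFuture holds &pFuture, so passing it by value still hands commit() the slot to patch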
+    float* pFuture = (float*)&pFuture;
+
+    {
+        std::vector<float> input = {1.0f, 2.0f, 3.0f, 4.0f};
+        mem.push_local_ptr(pFuture, &*input.begin(), 4 * 4, 1);
+    }
+
+    // poison the stack to prove commit() copied the data rather than kept a reference
+    float input [] = {11,21,31,41};
+    mem.commit();
+
+    ASSERT_FLOAT_EQ(pFuture[0], 1);
+    ASSERT_FLOAT_EQ(pFuture[1], 2);
+    ASSERT_FLOAT_EQ(pFuture[2], 3);
+    ASSERT_FLOAT_EQ(pFuture[3], 4);
+}
+
+TEST_F(GNAMemoryTest, canPushValue) {
+
+    float* pFuture = (float*)&pFuture;
+    float* pFuture2 = (float*)&pFuture2;
+
+    {
+        mem.push_value(pFuture, 3.f,  2);
+        mem.push_value(pFuture2, 13.f, 2);
+    }
+
+    mem.commit();
+
+    ASSERT_FLOAT_EQ(pFuture[0], 3);
+    ASSERT_FLOAT_EQ(pFuture[1], 3);
+    ASSERT_FLOAT_EQ(pFuture[2], 13);
+    ASSERT_FLOAT_EQ(pFuture[3], 13);
+}
+
+TEST_F(GNAMemoryTest, canPushReadOnlyValue) {
+
+    float* pFuture = (float*)&pFuture;
+    float* pFuture2 = (float*)&pFuture2;
+
+    {
+        mem.push_value(pFuture, 3.f,  2);
+        mem.readonly().push_value(pFuture2, 13.f, 2);
+    }
+
+    mem.commit();
+
+    ASSERT_FLOAT_EQ(pFuture[0], 3);
+    ASSERT_FLOAT_EQ(pFuture[1], 3);
+    ASSERT_FLOAT_EQ(pFuture[2], 13);
+    ASSERT_FLOAT_EQ(pFuture[3], 13);
+}
+
+TEST_F(GNAMemoryTest, canCalculateReadWriteSectionSize) {
+
+    mem.push_value(nullptr, 3.f,  2);
+    mem.readonly().push_value(nullptr, 13.f, 2);
+    mem.commit();
+
+    ASSERT_EQ(mem.getTotalBytes(), 4 * sizeof(float));
+    ASSERT_EQ(mem.getRWBytes(), 2 * sizeof(float));
+}
+
+TEST_F(GNAMemoryTest, canCalculateReadWriteSectionSizeWithAlignment) {
+
+    GNAMemory<std::allocator<uint8_t>> memAligned(64);
+
+    memAligned.push_value(nullptr, 3.f,  2);
+    memAligned.readonly().push_value(nullptr, 13.f, 2);
+    memAligned.commit();
+
+    ASSERT_EQ(memAligned.getTotalBytes(), 128);
+    ASSERT_EQ(memAligned.getRWBytes(), 64);
+}
+
+TEST_F(GNAMemoryTest, canSetUpReadWriteSectionPtr) {
+
+    float* pFuture2 = (float*)&pFuture2;
+    float* pFuture1 = (float*)&pFuture1;
+    float* pFuture3 = (float*)&pFuture3;
+
+
+    mem.readonly().push_value(pFuture1, 3.f,  2);
+    mem.push_value(pFuture2, 13.f, 3);
+    mem.readonly().push_value(pFuture3, 32.f,  4);
+    mem.commit();
+
+    ASSERT_EQ(mem.getTotalBytes(), (2+3+4) * sizeof(float));
+    ASSERT_EQ(mem.getRWBytes(), 3 * sizeof(float));
+
+    ASSERT_LT(&pFuture2[0], &pFuture1[0]);
+    ASSERT_LT(&pFuture1[0], &pFuture3[0]);
+
+    ASSERT_FLOAT_EQ(pFuture1[0], 3.f);
+    ASSERT_FLOAT_EQ(pFuture1[1], 3.f);
+
+    ASSERT_FLOAT_EQ(pFuture2[0], 13.f);
+    ASSERT_FLOAT_EQ(pFuture2[1], 13.f);
+    ASSERT_FLOAT_EQ(pFuture2[2], 13.f);
+
+    ASSERT_FLOAT_EQ(pFuture3[0], 32.f);
+    ASSERT_FLOAT_EQ(pFuture3[1], 32.f);
+    ASSERT_FLOAT_EQ(pFuture3[2], 32.f);
+    ASSERT_FLOAT_EQ(pFuture3[3], 32.f);
+}
+
+
+TEST_F(GNAMemoryTest, canUpdateSizeOfPushRequestWithBindRequest) {
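+    // a bind request carrying an offset and size can grow the allocation past the original push (total becomes 4 * len)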
+    float input[]  = {1, 2, 3};
+
+    float *pFuture = nullptr;
+    float *pFuture2 = nullptr;
+    float *pFuture3 = nullptr;
+
+    size_t len = sizeof(input);
+
+    mem.push_ptr(&pFuture, input, len);
+    mem.bind_ptr(&pFuture2, &pFuture, len, len);
+    mem.bind_ptr(&pFuture3, &pFuture2, 2 * len, len);
+
+    mem.commit();
+
+    ASSERT_EQ(mem.getTotalBytes(), 4 * len);
+    ASSERT_NE(pFuture, nullptr);
+    ASSERT_EQ(pFuture2, pFuture + 3);
+    ASSERT_EQ(pFuture3, pFuture + 9);
+
+    ASSERT_FLOAT_EQ(pFuture[0], 1);
+    ASSERT_FLOAT_EQ(pFuture[1], 2);
+    ASSERT_FLOAT_EQ(pFuture[2], 3);
+    ASSERT_FLOAT_EQ(pFuture[3], 0);
+    ASSERT_FLOAT_EQ(pFuture[4], 0);
+    ASSERT_FLOAT_EQ(pFuture[5], 0);
+    ASSERT_FLOAT_EQ(pFuture[6], 0);
+    ASSERT_FLOAT_EQ(pFuture[7], 0);
+    ASSERT_FLOAT_EQ(pFuture[8], 0);
+}
+
+TEST_F(GNAMemoryTest, canUpdateSizeOfPushRequestWithBindRequestWhenPush) {
+    float input[]  = {1, 2, 3};
+    float input2[]  = {6, 7, 8};
+
+    float *pFutureInput2 = nullptr;
+    float *pFuture = nullptr;
+    float *pFuture2 = nullptr;
+
+    size_t len = sizeof(input);
+
+    mem.push_ptr(&pFuture, input, len);
+    mem.bind_ptr(&pFuture2, &pFuture, len, len);
+    mem.push_ptr(&pFutureInput2, input2, len);
+
+    mem.commit();
+
+    ASSERT_EQ(mem.getTotalBytes(), 3 * len);
+    ASSERT_NE(pFuture, nullptr);
+    ASSERT_NE(pFutureInput2, nullptr);
+    ASSERT_EQ(pFuture2, pFuture + 3);
+
+    ASSERT_FLOAT_EQ(pFuture[0], 1);
+    ASSERT_FLOAT_EQ(pFuture[1], 2);
+    ASSERT_FLOAT_EQ(pFuture[2], 3);
+    ASSERT_FLOAT_EQ(pFuture[3], 0);
+    ASSERT_FLOAT_EQ(pFuture[4], 0);
+
+    ASSERT_FLOAT_EQ(pFutureInput2[0], 6);
+    ASSERT_FLOAT_EQ(pFutureInput2[1], 7);
+    ASSERT_FLOAT_EQ(pFutureInput2[2], 8);
+}
+
+TEST_F(GNAMemoryTest, canUpdateSizeOfPushRequestWithBindRequestWhenAlloc) {
+    float input[]  = {1, 2, 3};
+
+    float *pFutureInput = nullptr;
+    float *pFuture = nullptr;
+    float *pFuture2 = nullptr;
+
+    size_t len = sizeof(input);
+
+    mem.reserve_ptr(&pFuture, len);
+    mem.bind_ptr(&pFuture2, &pFuture, len, len);
+    mem.push_ptr(&pFutureInput, input, len);
+
+    mem.commit();
+
+    ASSERT_EQ(mem.getTotalBytes(), 3 * len);
+    ASSERT_NE(pFuture, nullptr);
+    ASSERT_NE(pFutureInput, nullptr);
+    ASSERT_EQ(pFuture2, pFuture + 3);
+
+    ASSERT_FLOAT_EQ(pFuture[0], 0);
+    ASSERT_FLOAT_EQ(pFuture[1], 0);
+    ASSERT_FLOAT_EQ(pFuture[2], 0);
+    ASSERT_FLOAT_EQ(pFuture[3], 0);
+    ASSERT_FLOAT_EQ(pFuture[4], 0);
+
+    ASSERT_FLOAT_EQ(pFutureInput[0], 1);
+    ASSERT_FLOAT_EQ(pFutureInput[1], 2);
+    ASSERT_FLOAT_EQ(pFutureInput[2], 3);
+}
\ No newline at end of file
diff --git a/inference-engine/tests/unit/engines/gna/gna_mock_api.hpp b/inference-engine/tests/unit/engines/gna/gna_mock_api.hpp
new file mode 100644 (file)
index 0000000..230c5ab
--- /dev/null
@@ -0,0 +1,70 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <gmock/gmock-generated-function-mockers.h>
+
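+/**
+ * @brief mocks the C entry points of the GNA library so plugin tests run without the hardware or the real driver
+ */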
+class GNACppApi {
+
+ public:
+    GNACppApi();
+    ~GNACppApi();
+    MOCK_METHOD10(GNAScoreGaussians, intel_gna_status_t(
+        //intel_gna_handle_t          nGNADevice,            // handle to GNA accelerator
+        //const intel_feature_type_t* pFeatureType,
+        const intel_feature_t*      pFeatureData,
+        const intel_gmm_type_t*     pModelType,
+        const intel_gmm_t*          pModelData,
+        const uint32_t*             pActiveGMMIndices,
+        uint32_t                    nActiveGMMIndices,
+        uint32_t                    uMaximumScore,
+        intel_gmm_mode_t            nGMMMode,
+        uint32_t*                   pScores,
+        uint32_t*                   pReqId,
+        intel_gna_proc_t            nAccelerationType));
+
+
+    MOCK_METHOD6(GNAPropagateForward, intel_gna_status_t (
+        intel_gna_handle_t          nGNADevice,            // handle to GNA accelerator
+        const intel_nnet_type_t*    pNeuralNetwork,
+        const uint32_t*             pActiveIndices,
+        uint32_t                    nActiveIndices,
+        uint32_t*                   pReqId,
+        intel_gna_proc_t            nAccelerationType));
+
+    MOCK_METHOD3(GNAAlloc, void *(
+        intel_gna_handle_t nGNADevice,   // handle to GNA accelerator
+        uint32_t           sizeRequested,
+        uint32_t*          sizeGranted));
+
+    MOCK_METHOD1(GNAFree, intel_gna_status_t (intel_gna_handle_t nGNADevice));
+
+    MOCK_METHOD1(GNADeviceOpen, intel_gna_handle_t (intel_gna_status_t* status));
+
+    MOCK_METHOD2(GNADeviceOpenSetThreads, intel_gna_handle_t (intel_gna_status_t* status, uint8_t n_threads));
+    MOCK_METHOD1(GNADeviceClose, intel_gna_status_t (intel_gna_handle_t nGNADevice));
+
+    MOCK_METHOD3(GNAWait, intel_gna_status_t(
+                 intel_gna_handle_t nGNADevice,            // handle to GNA accelerator
+                 uint32_t           nTimeoutMilliseconds,
+                 uint32_t           reqId                  // IN score request ID
+    ));
+
+    MOCK_METHOD4(GNAWaitPerfRes, intel_gna_status_t(
+                 intel_gna_handle_t nGNADevice,            // handle to GNA accelerator
+                 uint32_t           nTimeoutMilliseconds,
+                 uint32_t           reqId,                 // IN score request ID
+                 intel_gna_perf_t*  nGNAPerfResults
+    ));
+
+    MOCK_METHOD6(GNADumpXnn, void* (
+        const intel_nnet_type_t*    neuralNetwork,
+        const uint32_t*             activeIndices,
+        uint32_t                    activeIndicesCount,
+        intel_gna_model_header*     modelHeader,
+        intel_gna_status_t*         status,
+        intel_gna_alloc_cb          customAlloc));
+
+    MOCK_METHOD1(gmmSetThreads, intel_gna_handle_t (uint8_t num));
+};
diff --git a/inference-engine/tests/unit/engines/gna/gna_proc_type_test.cpp b/inference-engine/tests/unit/engines/gna/gna_proc_type_test.cpp
new file mode 100644 (file)
index 0000000..de17de7
--- /dev/null
@@ -0,0 +1,40 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include <gtest/gtest.h>
+#include <mock_icnn_network.hpp>
+#include <gmock/gmock-generated-actions.h>
+#include <gna/gna_config.hpp>
+#include "gna_plugin.hpp"
+#include "gna_mock_api.hpp"
+#include "gna_matcher.hpp"
+
+using namespace std;
+using namespace InferenceEngine;
+using namespace GNAPluginNS;
+using namespace ::testing;
+
+class GNAProcTypeTest : public GNATest {
+
+ protected:
+};
+
+TEST_F(GNAProcTypeTest, defaultProcTypeIsSWEXACT) {
+    assert_that().onInfer1AFModel().gna().propagate_forward().called_with().proc_type(GNA_SOFTWARE & GNA_HARDWARE);
+}
+
+TEST_F(GNAProcTypeTest, canPassHWProcTypeToGNA) {
+    assert_that().onInfer1AFModel().withGNADeviceMode("GNA_HW").gna().propagate_forward().called_with().proc_type(GNA_HARDWARE);
+}
+
+TEST_F(GNAProcTypeTest, canPassSWProcTypeToGNA) {
+    assert_that().onInfer1AFModel().withGNADeviceMode("GNA_SW").gna().propagate_forward().called_with().proc_type(GNA_SOFTWARE);
+}
+
+TEST_F(GNAProcTypeTest, canPassSWEXACTProcTypeToGNA) {
+    assert_that().onInfer1AFModel().withGNADeviceMode("GNA_SW_EXACT").gna().
+        propagate_forward().called_with().proc_type(GNA_SOFTWARE & GNA_HARDWARE);
+}
\ No newline at end of file
diff --git a/inference-engine/tests/unit/engines/gna/gna_pwl_test.cpp b/inference-engine/tests/unit/engines/gna/gna_pwl_test.cpp
new file mode 100644 (file)
index 0000000..408deec
--- /dev/null
@@ -0,0 +1,214 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include <gtest/gtest.h>
+#include "gna_matcher.hpp"
+
+class PWLApproximationTest : public GNATest {
+ protected:
+    void SetUp() override  {
+    }
+};
+using namespace GNATestIRs;
+
+// Recursive Algorithm
+// Precision Threshold
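+// each threshold is the maximum tolerated PWL approximation error for that activation (the exact values appear empirical)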
+
+TEST_F(PWLApproximationTest, forTanhOnRecursiveAlgoWithPrecisionThresholdIsSuccess) {
+    assert_that().onInferModel(TanhActivationModel())
+                                .inNotCompactMode()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActTanh)
+                                .pwl_quantization_precision_threshold(0.0053);
+}
+
+TEST_F(PWLApproximationTest, forSigmoidOnRecursiveAlgoWithPrecisionThresholdIsSuccess) {
+    assert_that().onInferModel(SigmoidActivationModel())
+                                .inNotCompactMode()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActSigmoid)
+                                .pwl_quantization_precision_threshold(0.0027);
+}
+
+TEST_F(PWLApproximationTest, forReLUOnRecursiveAlgoWithPrecisionThresholdIsSuccess) {
+    assert_that().onInferModel(ReLUActivationModel())
+                                .inNotCompactMode()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActRelu)
+                                .pwl_quantization_precision_threshold(0.0001);
+}
+
+TEST_F(PWLApproximationTest, forLeakyReLUOnRecursiveAlgoWithPrecisionThresholdIsSuccess) {
+    assert_that().onInferModel(LeakyReLUActivationModel())
+                                .inNotCompactMode()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActLeakyRelu)
+                                .pwl_quantization_precision_threshold(0.0003);
+}
+
+TEST_F(PWLApproximationTest, DISABLED_forIdentityOnRecursiveAlgoWithPrecisionThresholdIsSuccess) {
+    assert_that().onInferModel(IdentityActivationModel())
+                                .inNotCompactMode()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActIdentity)
+                                .pwl_quantization_precision_threshold(0.0003);
+}
+
+TEST_F(PWLApproximationTest, forClampOnRecursiveAlgoWithPrecisionThresholdIsSuccess) {
+    assert_that().onInferModel(ClampActivationModel())
+                                .inNotCompactMode()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActKaldiLstmClipping)
+                                .pwl_quantization_precision_threshold(0.0001);
+}
+
+// Uniform Algorithm
+// Precision Threshold
+
+TEST_F(PWLApproximationTest, forTanhOnUniformAlgoWithPrecisionThresholdIsSuccess) {
+    assert_that().onInferModel(TanhActivationModel())
+                                .inNotCompactMode()
+                                .withUniformPWLAlgo()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActTanh)
+                                .pwl_quantization_precision_threshold(0.0009);
+}
+
+TEST_F(PWLApproximationTest, forSigmoidOnUniformAlgoWithPrecisionThresholdIsSuccess) {
+    assert_that().onInferModel(SigmoidActivationModel())
+                                .inNotCompactMode()
+                                .withUniformPWLAlgo()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActSigmoid)
+                                .pwl_quantization_precision_threshold(0.0004);
+}
+
+TEST_F(PWLApproximationTest, DISABLED_forIdentityOnUniformAlgoWithPrecisionThresholdIsSuccess) {
+    assert_that().onInferModel(IdentityActivationModel())
+                                .inNotCompactMode()
+                                .withUniformPWLAlgo()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActIdentity)
+                                .pwl_quantization_precision_threshold(0.0003);
+}
+
+TEST_F(PWLApproximationTest, forClampOnUniformAlgoWithPrecisionThresholdIsSuccess) {
+    assert_that().onInferModel(ClampActivationModel())
+                                .inNotCompactMode()
+                                .withUniformPWLAlgo()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActKaldiLstmClipping)
+                                .pwl_quantization_precision_threshold(0.0001);
+}
+
+// Recursive Algorithm
+// Segment Threshold
+
+TEST_F(PWLApproximationTest, forSigmoidOnRecursiveAlgoWithSegmentThresholdIsSuccess) {
+    assert_that().onInferModel(SigmoidActivationModel())
+                                .inNotCompactMode()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActSigmoid)
+                                .pwl_quantization_segments_threshold(12);
+}
+
+TEST_F(PWLApproximationTest, forTanhOnRecursiveAlgoWithSegmentThresholdIsSuccess) {
+    assert_that().onInferModel(TanhActivationModel())
+                                .inNotCompactMode()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActTanh)
+                                .pwl_quantization_segments_threshold(12);
+}
+
+TEST_F(PWLApproximationTest, forReLUOnRecursiveAlgoWithSegmentThresholdIsSuccess) {
+    assert_that().onInferModel(ReLUActivationModel())
+                                .inNotCompactMode()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActRelu)
+                                .pwl_quantization_segments_threshold(2);
+}
+
+TEST_F(PWLApproximationTest, forLeakyReLUOnRecursiveAlgoWithSegmentThresholdIsSuccess) {
+    assert_that().onInferModel(LeakyReLUActivationModel())
+                                .inNotCompactMode()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActLeakyRelu)
+                                .pwl_quantization_segments_threshold(2);
+}
+
+TEST_F(PWLApproximationTest, DISABLED_forIdentityOnRecursiveAlgoWithSegmentThresholdIsSuccess) {
+    assert_that().onInferModel(IdentityActivationModel())
+                                .inNotCompactMode()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActIdentity)
+                                .pwl_quantization_segments_threshold(3);
+}
+
+TEST_F(PWLApproximationTest, forClampOnRecursiveAlgoWithSegmentThresholdIsSuccess) {
+    assert_that().onInferModel(ClampActivationModel())
+                                .inNotCompactMode()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActKaldiLstmClipping)
+                                .pwl_quantization_segments_threshold(3);
+}
+
+// Uniform Algorithm
+// Segment Threshold
+
+TEST_F(PWLApproximationTest, forSigmoidOnUniformAlgoWithSegmentThresholdIsSuccess) {
+    assert_that().onInferModel(SigmoidActivationModel())
+                                .inNotCompactMode()
+                                .withUniformPWLAlgo()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActSigmoid)
+                                .pwl_quantization_segments_threshold(65);
+}
+
+TEST_F(PWLApproximationTest, forTanhOnUniformAlgoWithSegmentThresholdIsSuccess) {
+    assert_that().onInferModel(TanhActivationModel())
+                                .inNotCompactMode()
+                                .withUniformPWLAlgo()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActTanh)
+                                .pwl_quantization_segments_threshold(65);
+}
+
+TEST_F(PWLApproximationTest, DISABLED_forIdentityOnUniformAlgoWithSegmentThresholdIsSuccess) {
+    assert_that().onInferModel(IdentityActivationModel())
+                                .inNotCompactMode()
+                                .withUniformPWLAlgo()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActIdentity)
+                                .pwl_quantization_segments_threshold(3);
+}
+
+TEST_F(PWLApproximationTest, forClampOnUniformAlgoWithSegmentThresholdIsSuccess) {
+    assert_that().onInferModel(ClampActivationModel())
+                                .inNotCompactMode()
+                                .withUniformPWLAlgo()
+                                .propagate_forward()
+                                .called_with()
+                                .pwl_quantization_activation(DnnActivationType::kActKaldiLstmClipping)
+                                .pwl_quantization_segments_threshold(3);
+}
diff --git a/inference-engine/tests/unit/engines/gna/gna_query_state_tests.cpp b/inference-engine/tests/unit/engines/gna/gna_query_state_tests.cpp
new file mode 100644 (file)
index 0000000..f61aecd
--- /dev/null
@@ -0,0 +1,25 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include <gtest/gtest.h>
+#include "gna_matcher.hpp"
+
+class QueryStateTest : public GNATest {
+ protected:
+    void SetUp() override  {
+    }
+};
+using namespace GNATestIRs;
+
+// QueryState must reflect whether the IR contains memory (state) layers
+
+TEST_F(QueryStateTest, returnEmptyCollectionOfStatesIfNoMemoryInIR) {
+    assert_that().afterLoadingModel(TanhActivationModel()).queryState().isEmpty();
+}
+
+TEST_F(QueryStateTest, returnNonEmptyCollectionOfStatesForMemoryIR) {
+    assert_that().afterLoadingModel(affineToMemoryModel()).queryState().isNotEmpty();
+}
diff --git a/inference-engine/tests/unit/engines/gna/i16_quantisation_test.cpp b/inference-engine/tests/unit/engines/gna/i16_quantisation_test.cpp
new file mode 100644 (file)
index 0000000..c8767b0
--- /dev/null
@@ -0,0 +1,381 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include <gtest/gtest.h>
+#include <inference_engine/layer_transform.hpp>
+#include "gna_plugin/quantization/model_quantizer.hpp"
+#include "gna_plugin/quantization/layer_quantizer.hpp"
+#include "gna_matcher.hpp"
+
+using namespace InferenceEngine;
+using namespace GNAPluginNS;
+using namespace GNATestIRs;
+
+class I16QuantisationTest : public GNATest {
+ protected:
+    LayersQuantizer<QuantI16> lc = LayersQuantizer<QuantI16>(1.0f);
+
+    InferenceEngine::CNNLayerPtr quantize(InferenceEngine::CNNLayerPtr lp) {
+        auto newLayer = InferenceEngine::injectData<QuantizedLayerParams>(lp);
+        transformLayer(newLayer, lc);
+        return newLayer;
+    }
+
+
+    void SetUp() override  {
+    }
+
+};
+
+template <class T>
+T  setWeights(T blob) {
+    blob->allocate();
+    // the actual quantisation algorithm is involved, so provide weights that quantize with a scale factor of 1
+    for (auto && w : *blob) {
+        w = MAX_VAL_2B_WEIGHT;
+    }
+    return blob;
+}
+
+template <>
+TBlob<uint8_t>::Ptr  setWeights(TBlob<uint8_t>::Ptr blob) {
+    blob->allocate();
+    auto buf = blob->buffer();
+    auto ptr = buf.as<float*>();
+
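+    // the U8 blob is reinterpreted as floats, hence byteSize() / 4 elements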
+    for (int i = 0; i != blob->byteSize() / 4; i++) {
+        ptr[i] = MAX_VAL_2B_WEIGHT;
+    }
+    return blob;
+}
+
+
+// TODO: add test for FC weights after quantization
+TEST_F(I16QuantisationTest, canQuantizeFCLayer){
+
+    auto fc = std::make_shared<FullyConnectedLayer>(LayerParams{"name", "type", Precision::FP32});
+    fc->_out_num = 9;
+    fc->_weights = setWeights(make_shared_blob<float>(Precision::FP32, {1, 1}));
+    fillWeights(fc->_weights);
+    fc->_biases  = make_shared_blob<float>(Precision::FP32, Layout::NC, {1, 1});
+    fc->_biases->allocate();
+    fillWeights(fc->_biases);
+
+    std::shared_ptr<Data> outData = std::make_shared<Data>("data", SizeVector({1, 1}), Precision::FP32, Layout::NC);
+    fc->outData.push_back(outData);
+    fc->insData.push_back(outData);
+
+
+    ASSERT_NO_THROW(quantize(fc));
+}
+
+TEST_F(I16QuantisationTest, canQuantizeActivation){
+
+    auto sigmoid = std::make_shared<GenericLayer >(LayerParams{"name", "type", Precision::FP32});
+    sigmoid->params["value"] = 2;
+    sigmoid->type = "Activation";
+
+    ASSERT_NO_THROW(quantize(sigmoid));
+}
+
+TEST_F(I16QuantisationTest, outputAffinePrecisionIs32Bits){
+
+    ModelQuantizer<QuantI16> q;
+
+    CNNNetReader net_reader;
+    ASSERT_NO_THROW(net_reader.ReadNetwork(Fc2DOutputModel().data(), Fc2DOutputModel().length()));
+
+    auto weights = make_shared_blob<uint8_t>(Precision::U8, C, {440});
+    weights->allocate();
+    fillWeights(weights);
+    net_reader.SetWeights(weights);
+
+    auto newNet = q.quantize(net_reader.getNetwork(), 1000);
+    InputsDataMap inputs;
+    newNet->getInputsInfo(inputs);
+    auto affineDataPtr = inputs.begin()->second->getInputData()->inputTo.begin()->second->outData.front();
+
+    ASSERT_EQ(affineDataPtr->precision, Precision::I32);
+}
+
+
+TEST_F(I16QuantisationTest, canQuantizeLstmLikeTopology) {
+    ModelQuantizer<QuantI16> q;
+
+    CNNNetReader net_reader;
+    ASSERT_NO_THROW(net_reader.ReadNetwork(affineToMemoryModel().data(), affineToMemoryModel().length()));
+
+    auto weights = setWeights(make_shared_blob<uint8_t >(Precision::U8, C, {440}));
+    //std::fill_n(weights->buffer().as<float*>(), weights->byteSize()/sizeof(float), 0);
+    net_reader.SetWeights(weights);
+
+    ASSERT_NO_THROW(q.quantize(net_reader.getNetwork(), 1000));
+}
+
+TEST_F(I16QuantisationTest, DISABLED_outputScaleFactorForAffineIsCorrect){
+
+    ModelQuantizer<QuantI16> q;
+
+    CNNNetReader net_reader;
+    ASSERT_NO_THROW(net_reader.ReadNetwork(Fc2DOutputModel().data(), Fc2DOutputModel().length()));
+
+    auto weights = make_shared_blob<uint8_t >(Precision::U8, C, {440});
+    weights->allocate();
+    fillWeights(weights, 100);
+    net_reader.SetWeights(weights);
+
+    auto newNet = q.quantize(net_reader.getNetwork(), 1000);
+    InputsDataMap inputs;
+    newNet->getInputsInfo(inputs);
+    auto affineLayerPtr = inputs.begin()->second->getInputData()->inputTo.begin()->second;
+
+    auto quantParams = getInjectedData<QuantizedLayerParams>(affineLayerPtr);
+
+
+    ASSERT_FLOAT_EQ(quantParams->_dst_quant.scale, 100);
+    ASSERT_FLOAT_EQ(quantParams->_weights_quant.scale, 100);
+}
+
+TEST_F(I16QuantisationTest, OnlyAffine_NoActivationInsertion) {
+    assert_that()
+        .onInferModel(Fc2DOutputModel())
+        .inNotCompactMode()
+        .gna().propagate_forward().called_without().pwl_inserted_into_nnet();
+}
+
+TEST_F(I16QuantisationTest, OnlyAffine_NoActivationInsertion_ProfilingEnabled) {
+    assert_that()
+        .onInferModel(Fc2DOutputModel())
+        .inNotCompactMode()
+        .gna().propagate_forward().called_without().pwl_inserted_into_nnet().profiling_counters();
+}
+
+TEST_F(I16QuantisationTest, OnlyAffineWithNanScaleFactorFails) {
+    gna()
+        .onInferModel(Fc2DOutputModel())
+        .withNanScaleFactor()
+        .propagate_forward().throws();
+}
+
+TEST_F(I16QuantisationTest, OnlyAffineWithInfScaleFactorFails) {
+    gna()
+        .onInferModel(Fc2DOutputModel())
+        .withInfScaleFactor()
+        .propagate_forward().throws();
+}
+
+TEST_F(I16QuantisationTest, AffineToMemoryWillResultInActivationInsertion) {
+    assert_that()
+        .onInferModel(affineToMemoryModel())
+        .inNotCompactMode()
+        .gna().propagate_forward().called_with().pwl_inserted_into_nnet();
+}
+
+TEST_F(I16QuantisationTest, EltwiseToMemoryWithNoOutputActivationInsertion) {
+    assert_that().onInferModel(eltwiseToMemoryModelNoOutput(), [](CNNNetwork & net){
+            net.addOutput("Eltwise_8");
+        }).inNotCompactMode().gna().propagate_forward().called_with().pwl_inserted_into_nnet();
+}
+
+TEST_F(I16QuantisationTest, EltwiseToMemory_ActivationInsertion) {
+    assert_that().onInferModel(eltwiseToMemoryModel())
+        .inNotCompactMode().gna().propagate_forward().called_with().pwl_inserted_into_nnet();
+}
+
+
+TEST_F(I16QuantisationTest, SplitFollowedByActivation_DummyDiagonalAffineInsertion) {
+    assert_that().onInferModel(activationAfterSplitModel())
+        .inNotCompactMode().gna().propagate_forward().called_with().diagonal_inserted_into_nnet();
+}
+
+TEST_F(I16QuantisationTest, SplitFollowedByFCAndEltwiseOnCPU) {
+    std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                                    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+    std::vector<float> expected_result = {12.0, 12.0, 12.0, 12.0, 12.0,
+                                          12.0, 12.0, 12.0, 12.0, 12.0};
+    assert_that().onInferModel(FCWithPaddingAfterSplitModel())
+        .inNotCompactMode().gna().propagate_forward().onCPU()
+        .called_with_input_and_expected_output(input_data, expected_result);
+}
+
+TEST_F(I16QuantisationTest, SliceFollowedByFCAndEltwiseOnCPU) {
+    std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                                    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+    std::vector<float> expected_result = {14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0};
+    assert_that().onInferModel(FCWithPaddingAfterSliceModel())
+        .inNotCompactMode().gna().propagate_forward().onCPU()
+        .called_with_input_and_expected_output(input_data, expected_result);
+}
+
+TEST_F(I16QuantisationTest, SliceFollowedByAlignedFCAndEltwiseOnCPU) {
+    std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                                    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+    std::vector<float> expected_result = {18.0, 18.0, 18.0, 18.0};
+    assert_that().onInferModel(SliceModelWithAlignedOutputs())
+        .inNotCompactMode().gna().propagate_forward().onCPU()
+        .called_with_input_and_expected_output(input_data, expected_result);
+}
+
+TEST_F(I16QuantisationTest, SliceFollowedBy2FCsAnd2EltwisesOnCPU) {
+    std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                                    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+    std::vector<float> expected_result = {27.0, 27.0, 27.0, 27.0, 27.0, 27.0, 27.0, 27.0};
+    assert_that().onInferModel(twoFCWithPaddingAfterSliceModel())
+        .inNotCompactMode().gna().propagate_forward().onCPU()
+        .called_with_input_and_expected_output(input_data, expected_result);
+}
+
+TEST_F(I16QuantisationTest, EltwiseSum_onlyOneIdentityInsertion) {
+    assert_that().onInferModel(eltwiseSummModel())
+        .inNotCompactMode().gna().propagate_forward().called_with().pwl_inserted_into_nnet().once();
+}
+
+
+TEST_F(I16QuantisationTest, canDetectLeakyRelu) {
+    assert_that().onInferModel(TFLeakyReluModel())
+        .inNotCompactMode().gna().propagate_forward().called_with().pwl_inserted_into_nnet();
+}
+
+TEST_F(I16QuantisationTest, MaxPool_followedAfterActivation) {
+    assert_that().onInferModel(maxpoolAfterRelu())
+        .inNotCompactMode().gna().propagate_forward().called_with()
+        .convolution_inserted_into_nnet()
+        .And()
+        .pwl_inserted_into_nnet()
+        .And()
+        .max_pooling_inserted_into_nnet();
+}
+
+TEST_F(I16QuantisationTest, EltwiseMul_willInsertTwoIdentities) {
+    assert_that().onInferModel(eltwiseMulModel())
+        .inNotCompactMode().gna().propagate_forward().called_with().pwl_inserted_into_nnet().twice();
+}
+
+TEST_F(I16QuantisationTest, ConcatPropagateForwardWithSuccessOnCPU) {
+    std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                                    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+    std::vector<float> expected_result = {121.0, 121.0, 121.0, 121.0, 121.0,
+                                          121.0, 121.0, 121.0, 121.0, 121.0,
+                                          121.0, 121.0, 121.0, 121.0, 121.0,
+                                          121.0, 121.0, 121.0, 121.0, 121.0};
+
+    assert_that().onInferModel(concatModel())
+        .inNotCompactMode().gna().propagate_forward().onCPU()
+        .called_with_input_and_expected_output(input_data, expected_result);
+}
+
+TEST_F(I16QuantisationTest, DoubleConcatPropagateForwardWithSuccessOnCPU) {
+    std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                                    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                                    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                                    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+    std::vector<float> expected_result = {141.0, 141.0, 141.0, 141.0, 141.0,
+                                          141.0, 141.0, 141.0, 141.0, 141.0,
+                                          141.0, 141.0, 141.0, 141.0, 141.0,
+                                          141.0, 141.0, 141.0, 141.0, 141.0,
+                                          141.0, 141.0, 141.0, 141.0, 141.0,
+                                          141.0, 141.0, 141.0, 141.0, 141.0,
+                                          141.0, 141.0, 141.0, 141.0, 141.0,
+                                          141.0, 141.0, 141.0, 141.0, 141.0};
+
+    assert_that().onInferModel(doubleConcatModel())
+        .inNotCompactMode().gna().propagate_forward().onCPU()
+        .called_with_input_and_expected_output(input_data, expected_result);
+}
+
+TEST_F(I16QuantisationTest, ScaleShift_Affine_WillResultInIdentityInsertion) {
+    assert_that().onInferModel(scaleShiftAffineModel())
+        .inNotCompactMode().gna().propagate_forward().called_with().pwl_inserted_into_nnet().once();
+}
+
+TEST_F(I16QuantisationTest, ClampFollowedByTanh_ResultInDiagonalInsertion) {
+    assert_that().onInferModel(clampFollowedByTanhModel())
+        .inNotCompactMode().gna().propagate_forward().called_with().diagonal_inserted_into_nnet().twice();
+}
+
+TEST_F(I16QuantisationTest, EltwiseWithMemoryAndActivationInput_ResultInDiagonalInsertion) {
+    assert_that().onInferModel(eltwiseWithMemoryAndActivationInputModel())
+        .inNotCompactMode().gna().propagate_forward().called_with().diagonal_inserted_into_nnet().once();
+}
+
+TEST_F(I16QuantisationTest, AffineWith2AffineOutputs_ResultInOnlyOneIdentityInsertion) {
+    // one Identity activation from first FC, and one Identity activation for eltwise
+    assert_that().onInferModel(AffineWith2AffineOutputsModel())
+        .inNotCompactMode().gna().propagate_forward().called_with().pwl_inserted_into_nnet().twice();
+}
+
+// TODO: this mode has not been required in real-life scenarios so far
+TEST_F(I16QuantisationTest, DISABLED_AffineWithOutputToMemoryAndToAnotherNode_ResultInCopyInsertion) {
+    assert_that().onInferModel(affineToMemoryModel()).inNotCompactMode().gna().propagate_forward().
+        called_with().copy_inserted_into_nnet();
+}
+
+TEST_F(I16QuantisationTest, CropWithoutOffsetPropagateForwardWithSuccessOnCPU) {
+    std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                                     0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    std::vector<float> expected_result = {11.0, 11.0, 11.0, 11.0, 11.0,
+                                          11.0, 11.0, 11.0, 11.0, 11.0};
+
+    assert_that().onInferModel(cropWithoutOffsetModel())
+    .inNotCompactMode().gna().propagate_forward().onCPU()
+    .called_with_input_and_expected_output(input_data, expected_result);
+}
+
+TEST_F(I16QuantisationTest, CropWithAlignedOffsetPropagateForwardWithSuccessOnCPU) {
+    std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                                     0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    std::vector<float> expected_result = {3.0, 3.0, 3.0, 3.0, 3.0,
+                                          3.0, 3.0, 3.0, 3.0, 3.0};
+
+    assert_that().onInferModel(cropWithAlignedOffsetModel())
+        .inNotCompactMode().gna().propagate_forward().onCPU()
+        .called_with_input_and_expected_output(input_data, expected_result);
+}
+
+TEST_F(I16QuantisationTest, CropWithOffsetPropagateForwardWithSuccessOnCPU) {
+    std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                                     0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    std::vector<float> expected_result = {7.0, 7.0, 7.0, 7.0, 7.0,
+                                          7.0, 7.0, 7.0, 7.0, 7.0};
+
+    assert_that().onInferModel(cropWithOffsetModel())
+        .inNotCompactMode().gna().propagate_forward().onCPU()
+        .called_with_input_and_expected_output(input_data, expected_result);
+}
+
+TEST_F(I16QuantisationTest, CropWithMaxOffsetPropagateForwardWithSuccessOnCPU) {
+    std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                                     0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    std::vector<float> expected_result = {1.0, 1.0, 1.0, 1.0, 1.0,
+                                          1.0, 1.0, 1.0, 1.0, 1.0};
+
+    assert_that().onInferModel(cropWithMaxOffsetModel())
+        .inNotCompactMode().gna().propagate_forward().onCPU()
+        .called_with_input_and_expected_output(input_data, expected_result);
+}
+
+TEST_F(I16QuantisationTest, CropWithOffsetAfterFCPropagateForwardWithSuccessOnCPU) {
+    std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                                     0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    std::vector<float> expected_result = {111.0, 111.0, 111.0, 111.0, 111.0,
+                                          111.0, 111.0, 111.0, 111.0, 111.0};
+
+    assert_that().onInferModel(cropWithOffsetExtendedModel())
+        .inNotCompactMode().gna().propagate_forward().onCPU()
+        .called_with_input_and_expected_output(input_data, expected_result);
+}
+
+TEST_F(I16QuantisationTest, CopySimpleCasePropagateForwardWithSuccessOnCPU) {
+    std::vector<float> input_data = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                                     0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    std::vector<float> expected_result = {12.0, 12.0, 12.0, 12.0, 12.0,
+                                          12.0, 12.0, 12.0, 12.0, 12.0,
+                                          11.0, 11.0, 11.0, 11.0, 11.0,
+                                          11.0, 11.0, 11.0, 11.0, 11.0};
+
+    assert_that().onInferModel(copyModel())
+        .inNotCompactMode().gna().propagate_forward().onCPU()
+        .called_with_input_and_expected_output(input_data, expected_result);
+}
diff --git a/inference-engine/tests/unit/engines/gna/matchers/conv_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/conv_matcher.hpp
new file mode 100644 (file)
index 0000000..4d59470
--- /dev/null
@@ -0,0 +1,34 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include"gna-api.h"
+#include "nnet_base_matcher.hpp"
+#include "quantization/quantization.h"
+
+class ConvolutionLayerMatcher : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
+    bool matchInserted;
+    int matchQuantity;
+ public:
+    ConvolutionLayerMatcher(bool matchInserted, int matchQuantity) : matchInserted(matchInserted), matchQuantity(matchQuantity) {}
+    bool MatchAndExplain(const intel_nnet_type_t *foo, ::testing::MatchResultListener *listener) const override {
+        if (foo == nullptr)
+            return false;
+        for (int i = 0; i < foo->nLayers; i++) {
+            if (foo->pLayers[i].nLayerKind != INTEL_CONVOLUTIONAL) continue;
+
+            return matchInserted;
+        }
+        return !matchInserted;
+    }
+    void DescribeTo(::std::ostream *os) const override {
+        *os << "should " << (matchInserted ? "" : "not ") << "have Convolution primitive as part of nnet structure";
+    }
+};
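+
+// Illustrative only: unlike the matchers below, this one has no factory helper
+// in this file; a test could wrap it the same way HasCopyLayer does, e.g.:
+//
+//   std::unique_ptr<NNetComponentMatcher> c(new NNetComponentMatcher());
+//   c->add(new ConvolutionLayerMatcher(true, -1));
+//   ::testing::Matcher<const intel_nnet_type_t*> m = ::testing::MakeMatcher(c.release());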
+
+
+
diff --git a/inference-engine/tests/unit/engines/gna/matchers/copy_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/copy_matcher.hpp
new file mode 100644 (file)
index 0000000..c947ecd
--- /dev/null
@@ -0,0 +1,32 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include "nnet_base_matcher.hpp"
+class CopyLayerMatcher : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
+    bool matchInserted;
+    const int matchQuantity;
+ public:
+    CopyLayerMatcher(bool matchInserted, int matchQuantity) : matchInserted(matchInserted), matchQuantity(matchQuantity) {}
+    bool MatchAndExplain(const intel_nnet_type_t *foo, ::testing::MatchResultListener *listener) const override {
+        if (foo == nullptr)
+            return false;
+        for (int i = 0; i < foo->nLayers; i++) {
+            if (foo->pLayers[i].nLayerKind != INTEL_COPY) continue;
+            return matchInserted;
+        }
+        return !matchInserted;
+    }
+    void DescribeTo(::std::ostream *os) const override {
+        *os << "should " << (matchInserted ? "" : "not ") << "have Copy primitive as part of nnet structure";
+    }
+};
+
+inline ::testing::Matcher<const intel_nnet_type_t*> HasCopyLayer(bool matchInserted = false, int matchQuantity = -1) {
+    std::unique_ptr<NNetComponentMatcher> c (new NNetComponentMatcher());
+    c->add(new CopyLayerMatcher(matchInserted, matchQuantity));
+    return ::testing::MakeMatcher(c.release());
+}
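+
+// Illustrative only: a hedged sketch of a possible call site; `nnet` stands for
+// a hypothetical, already-built network produced by the plugin under test:
+//
+//   ASSERT_THAT(nnet, HasCopyLayer(true));   // expect at least one Copy layer
+//   ASSERT_THAT(nnet, HasCopyLayer());       // expect no Copy layer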
+
+
diff --git a/inference-engine/tests/unit/engines/gna/matchers/diag_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/diag_matcher.hpp
new file mode 100644 (file)
index 0000000..cd6c246
--- /dev/null
@@ -0,0 +1,51 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include"gna-api.h"
+#include "nnet_base_matcher.hpp"
+#include "quantization/quantization.h"
+
+class DiagLayerMatcher : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
+    bool matchInserted;
+    int matchQuantity;
+ public:
+    DiagLayerMatcher(bool matchInserted, int matchQuantity) : matchInserted(matchInserted), matchQuantity(matchQuantity) {}
+    bool MatchAndExplain(const intel_nnet_type_t *foo, ::testing::MatchResultListener *listener) const override {
+        if (foo == nullptr)
+            return false;
+        for (int i = 0; i < foo->nLayers; i++) {
+            if (foo->pLayers[i].nLayerKind != INTEL_AFFINE_DIAGONAL) continue;
+            // a diagonal layer has to have 1 for weights and 0 for biases
+
+            auto diag = (intel_affine_func_t*)foo->pLayers[i].pLayerStruct;
+
+            bool bWeightsOK = true;
+            for (int j = 0; j < foo->pLayers[i].nOutputRows; j++) {
+                auto weights = (int16_t*)diag->pWeights;
+                auto biases = (int32_t*)diag->pBiases;
+                // identity matrix transformed to 16384 values
+                if (weights[j] != MAX_VAL_2B_WEIGHT || biases[j] != 0) {
+                    bWeightsOK = false;
+                    break;
+                }
+            }
+            if (!bWeightsOK) continue;
+
+            return matchInserted;
+        }
+        return !matchInserted;
+    }
+    void DescribeTo(::std::ostream *os) const override {
+        *os << "should " << (matchInserted ? "" : "not ") << "have Identity Diagonal primitive as part of nnet structure";
+    }
+};
+
+inline ::testing::Matcher<const intel_nnet_type_t*> HasDiagonalLayer(bool matchInserted = false, int matchQuantity = -1) {
+    std::unique_ptr<NNetComponentMatcher> c (new NNetComponentMatcher());
+    c->add(new DiagLayerMatcher(matchInserted, matchQuantity));
+    return ::testing::MakeMatcher(c.release());
+}
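+
+// Illustrative only: HasDiagonalLayer(true) matches when an identity diagonal
+// layer (weights equal to MAX_VAL_2B_WEIGHT, biases equal to 0) is present.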
+
+
diff --git a/inference-engine/tests/unit/engines/gna/matchers/nnet_base_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/nnet_base_matcher.hpp
new file mode 100644 (file)
index 0000000..7c1f69b
--- /dev/null
@@ -0,0 +1,86 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+
+class NNetComponentMatcher : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
+    std::vector<std::shared_ptr<::testing::MatcherInterface<const intel_nnet_type_t*>>> matchers;
+    mutable int failIdx = -1;
+    mutable std::stringstream reason;
+    int bitness;
+ public:
+    NNetComponentMatcher(int bitness = 16) : bitness(bitness) {}
+    NNetComponentMatcher& add(::testing::MatcherInterface<const intel_nnet_type_t*> * p) {
+        matchers.push_back(std::shared_ptr<::testing::MatcherInterface<const intel_nnet_type_t*>>(p));
+        return *this;
+    }
+    bool empty() const {
+        return matchers.empty();
+    }
+    bool MatchAndExplain(const intel_nnet_type_t* foo, ::testing::MatchResultListener* listener) const override {
+        if (foo == nullptr)
+            return false;
+        reason.str("");
+        // checking pointers are set
+        for (int i=0; i < foo->nLayers; i++) {
+            if (nullptr == foo->pLayers[i].pInputs ||
+                nullptr == foo->pLayers[i].pOutputs) {
+                reason << "input/output pointers in pLayers[" << i << "] shouldn't be null NULL";
+                return false;
+            }
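+            // each layer's input is expected to carry `bitness`-bit (16-bit by default) elements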
+            if (foo->pLayers[i].nBytesPerInput * 8 != bitness) {
+                reason << "numberOfBytes per input in pLayers[" << i << "] should be " << (bitness/8) << ", but was "
+                    << foo->pLayers[i].nBytesPerInput;
+                return false;
+            }
+
+            if (foo->pLayers[i].nBytesPerOutput * 8 != bitness) {
+                // if this output feeds a bias, this is fine;
+                // it is also fine if this output is de facto a network output, i.e. it is not used as an input anywhere
+                for (int j = 0; j < foo->nLayers; j++) {
+                    // bad: the narrow output is consumed as another layer's input
+                    if (foo->pLayers[j].pInputs == foo->pLayers[i].pOutputs) {
+                        reason << "numberOfBytes per output int pLayers[" << i << "] should be " << (bitness/8) << ", but was "
+                               << foo->pLayers[i].nBytesPerOutput << "cannot use this output as inputs for layer :" << j;
+                        return false;
+                    }
+                    if (foo->pLayers[j].nLayerKind == INTEL_AFFINE ||
+                        foo->pLayers[j].nLayerKind == INTEL_AFFINE_DIAGONAL) {
+                        auto pAffine = reinterpret_cast<intel_affine_func_t*>(foo->pLayers[j].pLayerStruct);
+
+                        if (pAffine->pWeights == foo->pLayers[i].pOutputs) {
+                            reason << "numberOfBytes per output int pLayers[" << i << "] should be " << (bitness/8) << ", but was "
+                                   << foo->pLayers[i].nBytesPerOutput << "cannot use this output as weights for affine layer :" << j;
+                            return false;
+                        }
+                    }
+                }
+            }
+        }
+
+        int i = 0;
+        for (auto && matcher : matchers) {
+            bool res = matcher->MatchAndExplain(foo, listener);
+            if (!res) {
+                failIdx = i;
+                return false;
+            }
+            i++;
+        }
+        return true;
+    }
+
+    void DescribeTo(::std::ostream* os) const override {
+
+        if (failIdx != -1) {
+            matchers[failIdx]->DescribeTo(os);
+            return;
+        }
+
+        *os << reason.str();
+    }
+
+};
+
diff --git a/inference-engine/tests/unit/engines/gna/matchers/pool_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/pool_matcher.hpp
new file mode 100644 (file)
index 0000000..009e61c
--- /dev/null
@@ -0,0 +1,37 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include"gna-api.h"
+#include "nnet_base_matcher.hpp"
+#include "quantization/quantization.h"
+
+class PoolingLayerMatcher : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
+    bool matchInserted;
+    int matchQuantity;
+    bool bMaxPool;
+ public:
+    PoolingLayerMatcher(bool matchInserted, int matchQuantity, bool bMaxPool)
+        : matchInserted(matchInserted), matchQuantity(matchQuantity), bMaxPool(bMaxPool) {}
+    bool MatchAndExplain(const intel_nnet_type_t *foo, ::testing::MatchResultListener *listener) const override {
+        if (foo == nullptr)
+            return false;
+        for(int i = 0; i < foo->nLayers; i++) {
+            if (foo->pLayers[i].nLayerKind != INTEL_CONVOLUTIONAL) continue;
+
+            auto conv = (intel_convolutional_layer_t*)foo->pLayers[i].pLayerStruct;
+            if (conv->poolType != INTEL_MAX_POOLING) continue;
+
+            return matchInserted;
+        }
+        return !matchInserted;
+    };
+    void DescribeTo(::std::ostream *os) const override {
+        *os << "should "<< (matchInserted ? "" : "not ") << "have MaxPooling primitive as part of nnet structure";
+    }
+};
+
+
+
diff --git a/inference-engine/tests/unit/engines/gna/matchers/precision_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/precision_matcher.hpp
new file mode 100644 (file)
index 0000000..9dfdc87
--- /dev/null
@@ -0,0 +1,54 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include "nnet_base_matcher.hpp"
+
+class NNetPrecisionMatcher : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
+    GnaPluginTestEnvironment::NnetPrecision nnetPrecision;
+    intel_layer_kind_t layerKind = (intel_layer_kind_t)-1;
+ public:
+    explicit NNetPrecisionMatcher(GnaPluginTestEnvironment::NnetPrecision nnetPrecision,
+                                  intel_layer_kind_t layerKind = (intel_layer_kind_t)-1) : nnetPrecision(nnetPrecision), layerKind(layerKind) {}
+    bool MatchAndExplain(const intel_nnet_type_t* foo, ::testing::MatchResultListener* listener) const override {
+
+        auto ioPrecision = (foo->pLayers->nBytesPerInput == nnetPrecision.input_precision.size()) &&
+            (foo->pLayers->nBytesPerOutput== nnetPrecision.output_precision.size());
+        if (!ioPrecision) {
+            return false;
+        }
+        if (layerKind != (intel_layer_kind_t)-1) {
+            if (foo->pLayers->nLayerKind != layerKind) {
+                return false;
+            }
+            switch (layerKind) {
+                case INTEL_AFFINE : {
+                    auto affine = (intel_affine_layer_t *) (foo->pLayers->pLayerStruct);
+
+                    return affine->affine.nBytesPerBias == nnetPrecision.biases_precision.size() &&
+                        affine->affine.nBytesPerWeight == nnetPrecision.weights_precision.size();
+                }
+                default :
+                    return false;
+            }
+
+        }
+        return true;
+    }
+
+    void DescribeTo(::std::ostream* os) const override {
+        *os << "intel_nnet_layer_t nBytesPerInput equals " << nnetPrecision.input_precision.size() << std::endl;
+        *os << "intel_nnet_layer_t nBytesPerOutput equals " << nnetPrecision.output_precision.size() << std::endl;
+        *os << "intel_nnet_layer_t nBytesPerWeights equals " << nnetPrecision.weights_precision.size() << std::endl;
+        *os << "intel_nnet_layer_t nBytesPerBises equals " << nnetPrecision.biases_precision.size() << std::endl;
+        *os << "foo->pLayers->nLayerKind INTEL_AFFINE" ;
+    }
+};
+
+inline ::testing::Matcher<const intel_nnet_type_t*> BitnessOfNNetEq(GnaPluginTestEnvironment::NnetPrecision nnetPrecision,
+                                                         intel_layer_kind_t component) {
+    std::unique_ptr<NNetComponentMatcher> c (new NNetComponentMatcher());
+    c->add(new NNetPrecisionMatcher(nnetPrecision, component));
+    return ::testing::MakeMatcher(c.release());
+}
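+
+// Illustrative only: assuming a GnaPluginTestEnvironment::NnetPrecision value
+// `prec` with all four precision fields filled in, a test could assert:
+//
+//   ASSERT_THAT(nnet, BitnessOfNNetEq(prec, INTEL_AFFINE));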
diff --git a/inference-engine/tests/unit/engines/gna/matchers/pwl_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/pwl_matcher.hpp
new file mode 100644 (file)
index 0000000..9060cd5
--- /dev/null
@@ -0,0 +1,61 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include "nnet_base_matcher.hpp"
+
+class PWLMatcher : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
+    bool matchInserted;
+    const int matchQuantity;
+    mutable int timesInserted = 0;
+ public:
+    PWLMatcher(bool inserted, int matchQuantity) : matchInserted(inserted), matchQuantity(matchQuantity) {}
+
+    bool MatchAndExplain(const intel_nnet_type_t *foo, ::testing::MatchResultListener *listener) const override {
+        if (foo == nullptr)
+            return false;
+        timesInserted = 0;
+        for(int i = 0; i < foo->nLayers; i++) {
+            if (foo->pLayers[i].nLayerKind != INTEL_AFFINE &&
+                foo->pLayers[i].nLayerKind != INTEL_AFFINE_DIAGONAL &&
+                foo->pLayers[i].nLayerKind != INTEL_CONVOLUTIONAL) continue;
+            auto affine = reinterpret_cast<intel_affine_layer_t*>(foo->pLayers[i].pLayerStruct);
+            if (affine == nullptr) continue;
+
+            bool hasPwl = affine->pwl.nSegments != 0 && affine->pwl.pSegments != nullptr;
+
+            if (hasPwl) {
+                if (matchQuantity == -1)
+                    return matchInserted;
+                else
+                    timesInserted++;
+            }
+        }
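+        // a concrete matchQuantity demands an exact count below; matchQuantity == -1 means "at least one"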
+        if (matchInserted) {
+            if (matchQuantity != -1) {
+                return timesInserted == matchQuantity;
+            }
+            return timesInserted != 0;
+        }
+
+        return timesInserted == 0;
+    }
+    void DescribeTo(::std::ostream *os) const override {
+        if (!matchInserted) {
+            *os << "should not have PWL layer as part of nnet structure, but it was found " << timesInserted << " times";
+        } else {
+            if (matchQuantity == -1) {
+                *os << "should have PWL layer as part of nnet structure, but it was not found";
+            } else {
+                *os << "should have PWL layer as part of nnet structure " << matchQuantity << " times, but it was found only " << timesInserted << " times";
+            }
+        }
+    }
+};
+
+inline ::testing::Matcher<const intel_nnet_type_t*> HasPwlLayer(bool inserted = true, int matchQuantity = -1) {
+    std::unique_ptr<NNetComponentMatcher> c (new NNetComponentMatcher());
+    c->add(new PWLMatcher(inserted, matchQuantity));
+    return ::testing::MakeMatcher(c.release());
+}
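+
+// Illustrative only: HasPwlLayer() matches when at least one PWL activation is
+// present, HasPwlLayer(true, 2) when exactly two are, and HasPwlLayer(false)
+// when none are.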
diff --git a/inference-engine/tests/unit/engines/gna/matchers/pwl_quantization_metrics_matcher.hpp b/inference-engine/tests/unit/engines/gna/matchers/pwl_quantization_metrics_matcher.hpp
new file mode 100644 (file)
index 0000000..cccd940
--- /dev/null
@@ -0,0 +1,139 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <cmath>
+#include <numeric>
+
+#include "nnet_base_matcher.hpp"
+#include "dnn.h"
+#include "pwl.h"
+#include "iostream"
+
+class PWLQuantizationMetricsMatcher : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
+    const float rmse_threshold;
+    const uint32_t activation_type;
+    const uint16_t segment_threshold;
+ public:
+    PWLQuantizationMetricsMatcher(uint32_t type, float precision_threshold, uint16_t segments) :
+                                                            rmse_threshold(precision_threshold),
+                                                            activation_type(type),
+                                                            segment_threshold(segments) {}
+
+    bool MatchAndExplain(const intel_nnet_type_t *nnet, ::testing::MatchResultListener *listener) const override {
+        float rmse = 0.f;
+        const float test_arg_scale_factor = 16384;
+
+        if (nnet == nullptr)
+            return false;
+
+        for(int i = 0; i < nnet->nLayers; ++i) {
+            if (nnet->pLayers[i].nLayerKind != INTEL_AFFINE &&
+                nnet->pLayers[i].nLayerKind != INTEL_AFFINE_DIAGONAL &&
+                nnet->pLayers[i].nLayerKind != INTEL_CONVOLUTIONAL) continue;
+
+            auto affine = reinterpret_cast<intel_affine_layer_t*>(nnet->pLayers[i].pLayerStruct);
+
+            if (affine == nullptr ||
+                affine->pwl.nSegments == 0 ||
+                affine->pwl.pSegments == nullptr) continue;
+
+            if (affine->pwl.nSegments > segment_threshold) {
+                return false;
+            }
+
+            int32_t domain = 0;
+            std::function<float(float)> activation_func = nullptr;
+            switch (activation_type) {
+                case kActSigmoid:
+                    domain = 10000;
+                    activation_func = [](float x)-> float {
+                                    float exp_value;
+                                    exp_value =
+                                            exp(static_cast<double>(-(x)));
+                                    return  1 / (1 + exp_value);};
+                    break;
+                case kActTanh:
+                    domain = 5000;
+                    activation_func = [](float x)-> float {return tanh(x);};
+                    break;
+                case kActIdentity:
+                    domain = 1000;
+                    activation_func = [](float x)-> float {return x;};
+                    break;
+                case kActRelu:
+                    domain = 1000;
+                    activation_func = [](float x)-> float {return relu(x);};
+                    break;
+                case kActLeakyRelu:
+                    domain = 1000;
+                    activation_func = [](float x)-> float {return leaky_relu(x);};
+                    break;
+                case kActKaldiLstmClipping:
+                    domain = 16000;
+                    activation_func = [](float x)-> float {
+                                        return clipping(x,
+                                                KALDI_LSTM_CLIP_LOWER,
+                                                KALDI_LSTM_CLIP_UPPER);};
+                    break;
+                default:
+                    domain = 50000;
+                    activation_func = [](float x)-> float {return 0;};
+            }
+
+            std::vector<double> y_diviation(2*domain);
+            std::vector<intel_pwl_segment_t*> segments_vector(affine->pwl.nSegments);
+            std::iota(segments_vector.begin(), segments_vector.begin()+affine->pwl.nSegments,
+                                                                                affine->pwl.pSegments);
+
+            auto current_segment = segments_vector.begin();
+            auto diviation_itr = y_diviation.begin();
+
+            for(int i=-domain; i<domain; ++i) {
+                float value = 0.0;
+                const float arg = i/1000.0;
+                while(current_segment != segments_vector.end() &&
+                        arg > static_cast<int32_t>((*current_segment)->xBase & XBASEMASK) / test_arg_scale_factor) {
+                    ++current_segment;
+                }
+                auto prev_segment = std::prev(current_segment,1);
+                value = activation_func(arg);
+
+                float base_arg = static_cast<int32_t>((*prev_segment)->xBase & XBASEMASK) / test_arg_scale_factor;
+                float base_value = static_cast<int32_t>((*prev_segment)->yBase) / ACTIVATION_SCALE_FACTOR;
+
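+                // the low bits of xBase (cleared by XBASEMASK) store the slope scale index;
+                // the stored slope is divided by 2^(8 * (1 + index)) below to recover its real value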
+                uint32_t slope_scale_index = (*prev_segment)->xBase & ~XBASEMASK;
+
+                uint64_t slope_scale = static_cast<uint64_t>(1) << (8 * (1 + slope_scale_index));
+                float slope =
+                        test_arg_scale_factor*(static_cast<float>((*prev_segment)->slope ) / (slope_scale*ACTIVATION_SCALE_FACTOR));
+
+                float quant_value = (arg - base_arg)*slope + base_value;
+
+                *deviation_itr = std::pow(std::abs(value-quant_value),2);
+                ++deviation_itr;
+            }
+
+            // sort ascending so that precision is not lost during accumulation
+            std::sort(y_deviation.begin(), y_deviation.end());
+            double sum = std::accumulate(y_deviation.begin(), y_deviation.end(), 0.0);
+            rmse = std::sqrt(sum/static_cast<float>(y_deviation.size()));
+        }
+
+        return rmse_threshold > rmse;
+    }
+    void DescribeTo(::std::ostream *os) const override {
+        *os << "Has the activation layer type " << activation_type << " rmse less than threshold " << rmse_threshold
+                                                << " and segment count not above threshold " << segment_threshold
+                                                << "?";
+    }
+};
+
+inline ::testing::Matcher<const intel_nnet_type_t*> PrecisionOfQuantizedPwlMetrics(uint32_t type,
+                                                                                    float threshold,
+                                                                                    uint16_t segments) {
+    std::unique_ptr<NNetComponentMatcher> c (new NNetComponentMatcher());
+    c->add(new PWLQuantizationMetricsMatcher(type, threshold, segments));
+    return ::testing::MakeMatcher(c.release());
+}
diff --git a/inference-engine/tests/unit/engines/gna/test_irs.cpp b/inference-engine/tests/unit/engines/gna/test_irs.cpp
new file mode 100644 (file)
index 0000000..f9a0353
--- /dev/null
@@ -0,0 +1,2678 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_irs.hpp"
+
+namespace GNATestIRs {
+
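+// Each helper in this namespace returns a small IR v2 network as a raw XML
+// string; the quantisation tests feed these models to the plugin under test.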
+std::string FCOnlyModel() {
+    return R"V0G0N(
+<Net Name="FullyConnected_Only" version="2" precision="FP32" batch="1">
+       <layers>
+               <layer name="input_1" type="input" id="0" precision="FP32">
+                       <output>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+
+        <layer name="FullyConnected" id="1" type="InnerProduct" precision="FP32">
+
+            <fc out-size="10" />
+
+            <biases offset="0" size="40" />
+            <weights offset="40" size="400" />
+
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </input>
+            <output>
+                <port id="1">
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+               <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+    </edges>
+</Net>
+)V0G0N";
+}
+
+std::string Fc2DOutputModel() {
+    return R"V0G0N(
+<Net Name="FullyConnected_Only" version="2" precision="FP32" batch="1">
+       <layers>
+               <layer name="input_1" type="input" id="0" precision="FP32">
+                       <output>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+
+        <layer name="FullyConnected" id="1" type="InnerProduct" precision="FP32">
+
+            <fc out-size="10" />
+
+            <biases offset="0" size="40" />
+            <weights offset="40" size="400" />
+
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </input>
+            <output>
+                <port id="1">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+               <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+    </edges>
+</Net>
+)V0G0N";
+}
+
+std::string affineToMemoryModel() {
+    return R"V0G0N(
+<Net Name="FullyConnected_ToMemory" version="2" precision="FP32" batch="1">
+       <layers>
+               <layer name="input_1" type="input" id="0" precision="FP32">
+                       <output>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+        <layer name="FullyConnected" id="1" type="InnerProduct" precision="FP32">
+
+            <fc out-size="10" />
+
+            <biases offset="0" size="40" />
+            <weights offset="40" size="400" />
+
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </input>
+            <output>
+                <port id="1">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+
+        <layer name="Eltwise_8" type="Eltwise" id="11" precision="FP32">
+                       <data operation="sum" />
+                       <input>
+                               <port id="0">
+                                       <!--connected to FullyConnected-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                               <port id="1">
+                                       <!--connected to Memory_28-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="2">
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+        <layer name="Memory_27" type="Memory" id="27" precision="FP32">
+                       <data id="r_27-28" index="0" size="2" />
+                       <input>
+                               <port id="60">
+                                       <!--connected to FullyConnected-->
+                    <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </input>
+               </layer>
+
+               <layer name="Memory_28" type="Memory" id="28" precision="FP32">
+                       <data id="r_27-28" index="1" size="2" />
+                       <output>
+                               <port id="59">
+                                       <!--connected to , Eltwise_8-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+    </layers>
+    <edges>
+               <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+        <edge from-layer="1" from-port="1" to-layer="27" to-port="60" />
+        <edge from-layer="1" from-port="1" to-layer="11" to-port="1" />
+        <edge from-layer="28" from-port="59" to-layer="11" to-port="0" />
+    </edges>
+</Net>
+)V0G0N";
+}
+std::string eltwiseToMemoryModelNoOutput() {
+    return R"V0G0N(
+<Net Name="FullyConnected_ToMemory" version="2" precision="FP32" batch="1">
+       <layers>
+               <layer name="input_1" type="input" id="0" precision="FP32">
+                       <output>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+
+        <layer name="Eltwise_8" type="Eltwise" id="11" precision="FP32">
+                       <data operation="sum" />
+                       <input>
+                               <port id="0">
+                                       <!--connected to FullyConnected-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                               <port id="1">
+                                       <!--connected to Memory_28-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="2">
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+
+        <layer name="Memory_27" type="Memory" id="27" precision="FP32">
+                       <data id="r_27-28" index="0" size="2" />
+                       <input>
+                               <port id="60">
+                                       <!--connected to Eltwise_8-->
+                    <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </input>
+               </layer>
+
+               <layer name="Memory_28" type="Memory" id="28" precision="FP32">
+                       <data id="r_27-28" index="1" size="2" />
+                       <output>
+                               <port id="59">
+                                       <!--connected to , Eltwise_8-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+    </layers>
+    <edges>
+               <edge from-layer="0" from-port="0" to-layer="11" to-port="1" />
+        <edge from-layer="11" from-port="2" to-layer="27" to-port="60" />
+        <edge from-layer="28" from-port="59" to-layer="11" to-port="0" />
+    </edges>
+</Net>
+)V0G0N";
+}
+std::string eltwiseToMemoryModel() {
+    return R"V0G0N(
+<Net Name="FullyConnected_ToMemory" version="2" precision="FP32" batch="1">
+       <layers>
+               <layer name="input_1" type="input" id="0" precision="FP32">
+                       <output>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+        <layer name="Eltwise_8" type="Eltwise" id="11" precision="FP32">
+                       <data operation="sum" />
+                       <input>
+                               <port id="0">
+                                       <!--connected to Memory_28-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                               <port id="1">
+                                       <!--connected to input-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="2">
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+        <layer name="Eltwise_9" type="Eltwise" id="12" precision="FP32">
+                       <data operation="sum" />
+                       <input>
+                               <port id="0">
+                                       <!--connected Memory_28 to -->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                               <port id="1">
+                                       <!--connected to Elwise_8-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="2">
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+        <layer name="Memory_27" type="Memory" id="27" precision="FP32">
+                       <data id="r_27-28" index="0" size="2" />
+                       <input>
+                               <port id="60">
+                                       <!--connected to Eltwise_8-->
+                    <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </input>
+               </layer>
+
+               <layer name="Memory_28" type="Memory" id="28" precision="FP32">
+                       <data id="r_27-28" index="1" size="2" />
+                       <output>
+                               <port id="59">
+                                       <!--connected to , Eltwise_8-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                <port id="5010">
+                                       <!--connected to , Eltwise_9-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+    </layers>
+    <edges>
+               <edge from-layer="0" from-port="0" to-layer="11" to-port="1" />
+        <edge from-layer="11" from-port="2" to-layer="27" to-port="60" />
+        <edge from-layer="11" from-port="2" to-layer="12" to-port="1" />
+        <edge from-layer="28" from-port="59" to-layer="11" to-port="0" />
+        <edge from-layer="28" from-port="5010" to-layer="12" to-port="0" />
+    </edges>
+</Net>
+)V0G0N";
+}
+
+std::string activationAfterSplitModel() {
+    return R"V0G0N(
+    <Net Name="activationAfterSplit" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Eltwise_8" type="Eltwise" id="11" precision="FP32">
+                <data operation="sum" />
+                <input>
+                    <port id="0">
+                        <!--connected to split-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                    <port id="1">
+                        <!--connected to tanh_28-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="2">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+
+            <layer name="Split_1" type="Split" id="12" precision="FP32">
+                <data axis="1" />
+                <input>
+                    <port id="0">
+                        <!--connected to input-->
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <!--connected to tanh-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                    <port id="2">
+                        <!--connected to eltwise-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Activation_38" type="Activation" id="38" precision="FP32">
+                <data type="tanh" />
+                <input>
+                    <port id="82">
+                        <!--connected to Eltwise_37-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="83">
+                        <!--connected to , Eltwise_41-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="12" to-port="0" />
+            <edge from-layer="12" from-port="1" to-layer="11" to-port="0" />
+            <edge from-layer="12" from-port="2" to-layer="38" to-port="82" />
+            <edge from-layer="38" from-port="83" to-layer="11" to-port="1" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
+
+std::string FCWithPaddingAfterSplitModel() {
+    return R"V0G0N(
+    <Net Name="FCWithPaddingAfterSplitModel" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Split_1" type="Split" id="1" precision="FP32">
+                <data axis="1" />
+                <input>
+                    <port id="0">
+                        <!--connected to input-->
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <!--connected to eltwise-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                    <port id="2">
+                        <!--connected to fc-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected" id="11" type="InnerProduct" precision="FP32">
+                <fc out-size="10" />
+                <biases offset="0" size="40" />
+                <weights offset="40" size="400" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Eltwise_8" type="Eltwise" id="21" precision="FP32">
+                <data operation="sum" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="2">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+            <edge from-layer="1" from-port="1" to-layer="21" to-port="0" />
+            <edge from-layer="1" from-port="2" to-layer="11" to-port="0" />
+            <edge from-layer="11" from-port="1" to-layer="21" to-port="1" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
+
+std::string twoFCWithPaddingAfterSliceModel() {
+    return R"V0G0N(
+    <Net Name="twoFCWithPaddingAfterSliceModel" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Slice_1" type="Slice" id="1" precision="FP32">
+                <data axis="1" slice_point="8" slice_dim="1"/>
+                <input>
+                    <port id="0">
+                        <!--connected to input-->
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <!--connected to eltwise-->
+                        <dim>1</dim>
+                        <dim>8</dim>
+                    </port>
+                    <port id="2">
+                        <!--connected to fc-->
+                        <dim>1</dim>
+                        <dim>12</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected1" id="11" type="InnerProduct" precision="FP32">
+                <fc out-size="8" />
+                <biases offset="0" size="32" />
+                <weights offset="32" size="384" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>12</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>8</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected2" id="12" type="InnerProduct" precision="FP32">
+                <fc out-size="8" />
+                <biases offset="0" size="32" />
+                <weights offset="32" size="384" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>12</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>8</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Eltwise_1" type="Eltwise" id="21" precision="FP32">
+                <data operation="sum" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>8</dim>
+                    </port>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>8</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="2">
+                        <dim>1</dim>
+                        <dim>8</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Eltwise_2" type="Eltwise" id="22" precision="FP32">
+                <data operation="sum" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>8</dim>
+                    </port>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>8</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="2">
+                        <dim>1</dim>
+                        <dim>8</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+            <edge from-layer="1" from-port="1" to-layer="21" to-port="0" />
+            <edge from-layer="1" from-port="2" to-layer="11" to-port="0" />
+            <edge from-layer="1" from-port="2" to-layer="12" to-port="0" />
+            <edge from-layer="11" from-port="1" to-layer="21" to-port="1" />
+            <edge from-layer="21" from-port="2" to-layer="22" to-port="0" />
+            <edge from-layer="12" from-port="1" to-layer="22" to-port="1" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
+
+std::string FCWithPaddingAfterSliceModel() {
+    return R"V0G0N(
+    <Net Name="FCWithPaddingAfterSliceModel" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Slice_1" type="Slice" id="1" precision="FP32">
+                <data axis="1" slice_point="8" slice_dim="1"/>
+                <input>
+                    <port id="0">
+                        <!--connected to input-->
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <!--connected to eltwise-->
+                        <dim>1</dim>
+                        <dim>8</dim>
+                    </port>
+                    <port id="2">
+                        <!--connected to fc-->
+                        <dim>1</dim>
+                        <dim>12</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected" id="11" type="InnerProduct" precision="FP32">
+                <fc out-size="8" />
+                <biases offset="0" size="32" />
+                <weights offset="32" size="384" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>12</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>8</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Eltwise_8" type="Eltwise" id="21" precision="FP32">
+                <data operation="sum" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>8</dim>
+                    </port>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>8</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="2">
+                        <dim>1</dim>
+                        <dim>8</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+            <edge from-layer="1" from-port="1" to-layer="21" to-port="0" />
+            <edge from-layer="1" from-port="2" to-layer="11" to-port="0" />
+            <edge from-layer="11" from-port="1" to-layer="21" to-port="1" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
+
+std::string SliceModelWithAlignedOutputs() {
+    return R"V0G0N(
+    <Net Name="SliceModelWithAlignedOutputs" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Slice_1" type="Slice" id="1" precision="FP32">
+                <data axis="1" slice_point="8" slice_dim="1"/>
+                <input>
+                    <port id="0">
+                        <!--connected to input-->
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <!--connected to fc-->
+                        <dim>1</dim>
+                        <dim>16</dim>
+                    </port>
+                    <port id="2">
+                        <!--connected to eltwise-->
+                        <dim>1</dim>
+                        <dim>4</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected" id="11" type="InnerProduct" precision="FP32">
+                <fc out-size="4" />
+                <biases offset="0" size="16" />
+                <weights offset="16" size="512" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>16</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>4</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Eltwise_8" type="Eltwise" id="21" precision="FP32">
+                <data operation="sum" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>4</dim>
+                    </port>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>4</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="2">
+                        <dim>1</dim>
+                        <dim>4</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+            <edge from-layer="1" from-port="1" to-layer="11" to-port="0" />
+            <edge from-layer="1" from-port="2" to-layer="21" to-port="0" />
+            <edge from-layer="11" from-port="1" to-layer="21" to-port="1" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
+
+std::string eltwiseSummModel() {
+    return R"V0G0N(
+    <Net Name="activationAfterSplit" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected" id="2" type="InnerProduct" precision="FP32">
+
+                <fc out-size="10" />
+
+                <biases offset="0" size="40" />
+                <weights offset="40" size="400" />
+
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+
+        <layer name="FullyConnected_1" id="3" type="InnerProduct" precision="FP32">
+
+            <fc out-size="10" />
+
+            <biases offset="0" size="40" />
+            <weights offset="40" size="400" />
+
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </input>
+            <output>
+                <port id="1">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+
+            <layer name="Eltwise_8" type="Eltwise" id="11" precision="FP32">
+                <data operation="sum" />
+                <input>
+                    <port id="0">
+                        <!--connected to FC1-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                    <port id="1">
+                        <!--connected to FC2-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="2">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
+            <edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
+            <edge from-layer="2" from-port="1" to-layer="11" to-port="0" />
+            <edge from-layer="3" from-port="1" to-layer="11" to-port="1" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
+
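+// A minimal sketch (not part of these tests) of how such an IR string can be
+// consumed with the 2018 R5 API; `weights` stands for a hypothetical
+// TBlob<uint8_t>::Ptr large enough to back the offsets referenced above:
+//
+//   InferenceEngine::CNNNetReader reader;
+//   std::string model = eltwiseSummModel();
+//   reader.ReadNetwork(model.data(), model.length());
+//   reader.SetWeights(weights);
+//   InferenceEngine::CNNNetwork network = reader.getNetwork();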
+
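+// Same two-FC fan-out topology as eltwiseSummModel(), but the Eltwise layer
+// multiplies the branches instead of summing them.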
+std::string eltwiseMulModel() {
+    return R"V0G0N(
+    <Net Name="eltwiseMul" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected" id="2" type="InnerProduct" precision="FP32">
+
+                <fc out-size="10" />
+
+                <biases offset="0" size="40" />
+                <weights offset="40" size="400" />
+
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+
+        <layer name="FullyConnected_1" id="3" type="InnerProduct" precision="FP32">
+
+            <fc out-size="10" />
+
+            <biases offset="0" size="40" />
+            <weights offset="40" size="400" />
+
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </input>
+            <output>
+                <port id="1">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+
+            <layer name="Eltwise_8" type="Eltwise" id="11" precision="FP32">
+                <data operation="mul" />
+                <input>
+                    <port id="0">
+                        <!--connected to FC1-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                    <port id="1">
+                        <!--connected to FC2-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="2">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
+            <edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
+            <edge from-layer="2" from-port="1" to-layer="11" to-port="0" />
+            <edge from-layer="3" from-port="1" to-layer="11" to-port="1" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
+
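+// IR with a ScaleShift layer placed between the input and a FullyConnected
+// (affine) layer.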
+std::string scaleShiftAffineModel() {
+    return R"V0G0N(
+<Net Name="FullyConnected_Only" version="2" precision="FP32" batch="1">
+       <layers>
+               <layer name="input_1" type="input" id="0" precision="FP32">
+                       <output>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+        <layer name="ScaleShift_21" type="ScaleShift" id="21" precision="FP32">
+                       <input>
+                               <port id="46">
+                                       <!--connected to input-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="45">
+                                       <!--connected to , FullyConnected-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+                       <weights offset="0" size="40" precision="FP32" />
+               </layer>
+
+        <layer name="FullyConnected" id="1" type="InnerProduct" precision="FP32">
+
+            <fc out-size="10" />
+
+            <biases offset="0" size="40" />
+            <weights offset="40" size="400" />
+
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </input>
+            <output>
+                <port id="1">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+               <edge from-layer="0" from-port="0" to-layer="21" to-port="46" />
+        <edge from-layer="21" from-port="45" to-layer="1" to-port="0" />
+    </edges>
+</Net>
+)V0G0N";
+
+}
+
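+// IR chaining a Clamp(-50, 50) layer into a tanh Activation layer.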
+std::string clampFollowedByTanhModel() {
+    return R"V0G0N(
+<Net Name="clampFollowedByTanhModel" version="2" precision="FP32" batch="1">
+       <layers>
+               <layer name="input_1" type="input" id="0" precision="FP32">
+                       <output>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+        <layer name="Clamp_20" type="Clamp" id="20" precision="FP32">
+                       <data max="50" min="-50" />
+                       <input>
+                               <port id="43">
+                                       <!--connected to Eltwise_19-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="44">
+                                       <!--connected to , ScaleShift_21, Activation_24, Memory_4-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+        <layer name="Activation_38" type="Activation" id="38" precision="FP32">
+                <data type="tanh" />
+                <input>
+                    <port id="82">
+                        <!--connected to Eltwise_37-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="83">
+                        <!--connected to , Eltwise_41-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+
+
+    </layers>
+    <edges>
+               <edge from-layer="0" from-port="0" to-layer="20" to-port="43" />
+        <edge from-layer="20" from-port="44" to-layer="38" to-port="82" />
+    </edges>
+</Net>
+)V0G0N";
+
+}
+
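+// Recurrent-style IR: FullyConnected -> tanh, whose output is both written to
+// the memory state (Memory_27/Memory_28 pair "r_27-28") and summed by
+// Eltwise_8 with the value read back from that state.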
+std::string eltwiseWithMemoryAndActivationInputModel() {
+    return R"V0G0N(
+    <Net Name="activationAfterSplit" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+
+        <layer name="Memory_27" type="Memory" id="27" precision="FP32">
+                       <data id="r_27-28" index="0" size="2" />
+                       <input>
+                               <port id="60">
+                                       <!--connected to Activation_38-->
+                    <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </input>
+               </layer>
+
+               <layer name="Memory_28" type="Memory" id="28" precision="FP32">
+                       <data id="r_27-28" index="1" size="2" />
+                       <output>
+                               <port id="59">
+                                       <!--connected to , Eltwise_8-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+            <layer name="FullyConnected" id="2" type="InnerProduct" precision="FP32">
+
+                <fc out-size="10" />
+
+                <biases offset="0" size="40" />
+                <weights offset="40" size="400" />
+
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+
+            <layer name="Activation_38" type="Activation" id="38" precision="FP32">
+                <data type="tanh" />
+                <input>
+                    <port id="82">
+                        <!--connected to Eltwise_37-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="83">
+                        <!--connected to , Eltwise_41-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+
+            <layer name="Eltwise_8" type="Eltwise" id="11" precision="FP32">
+                <data operation="sum" />
+                <input>
+                    <port id="0">
+                        <!--connected to FC1-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                    <port id="1">
+                        <!--connected to FC2-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="2">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
+            <edge from-layer="2" from-port="1" to-layer="38" to-port="82" />
+            <edge from-layer="38" from-port="83" to-layer="11" to-port="0" />
+            <edge from-layer="28" from-port="59" to-layer="11" to-port="1" />
+            <edge from-layer="38" from-port="83" to-layer="27" to-port="60" />
+        </edges>
+    </Net>
+    )V0G0N";
+
+}
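+
+// IR where one affine (FullyConnected) layer feeds two further affine layers
+// whose outputs are then summed by an Eltwise layer.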
+std::string AffineWith2AffineOutputsModel() {
+    return R"V0G0N(
+    <Net Name="eltwiseMul" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected" id="2" type="InnerProduct" precision="FP32">
+
+                <fc out-size="10" />
+
+                <biases offset="0" size="40" />
+                <weights offset="40" size="400" />
+
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+
+        <layer name="FullyConnected_1" id="3" type="InnerProduct" precision="FP32">
+
+            <fc out-size="10" />
+
+            <biases offset="0" size="40" />
+            <weights offset="40" size="400" />
+
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </input>
+            <output>
+                <port id="1">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+
+        <layer name="FullyConnected_5" id="4" type="InnerProduct" precision="FP32">
+
+            <fc out-size="10" />
+
+            <biases offset="0" size="40" />
+            <weights offset="40" size="400" />
+
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </input>
+            <output>
+                <port id="1">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="Eltwise_8" type="Eltwise" id="11" precision="FP32">
+                       <data operation="sum" />
+                       <input>
+                               <port id="0">
+                                       <!--connected to FullyConnected-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                               <port id="1">
+                                       <!--connected to Memory_28-->
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="2">
+                                       <dim>1</dim>
+                                       <dim>10</dim>
+                               </port>
+                       </output>
+               </layer>
+
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
+            <edge from-layer="2" from-port="1" to-layer="3" to-port="0" />
+            <edge from-layer="2" from-port="1" to-layer="4" to-port="0" />
+            <edge from-layer="4" from-port="1" to-layer="11" to-port="0" />
+            <edge from-layer="3" from-port="1" to-layer="11" to-port="1" />
+        </edges>
+    </Net>
+    )V0G0N";
+
+}
+
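+// Minimal IR: a 1x10 input followed by a single sigmoid Activation layer.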
+std::string SigmoidActivationModel() {
+    return R"V0G0N(
+<Net Name="InputLayerWithSigmoidActivation" version="2" precision="FP32" batch="1">
+    <layers>
+        <layer name="input_1" type="input" id="0" precision="FP32">
+            <output>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="Sig_Activation" id="2" type="Activation" precision="FP32">
+            <data type="sigmoid" />
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </input>
+            <output>
+                <port id="1">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
+    </edges>
+</Net>
+)V0G0N";
+}
+
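+// Minimal IR: a 1x10 input followed by a single tanh Activation layer.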
+std::string TanhActivationModel() {
+    return R"V0G0N(
+<Net Name="InputLayerWithTanhActivation" version="2" precision="FP32" batch="1">
+    <layers>
+        <layer name="input_1" type="input" id="0" precision="FP32">
+            <output>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="Tanh_Activation" id="2" type="Activation" precision="FP32">
+            <data type="tanh" />
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </input>
+            <output>
+                <port id="1">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
+    </edges>
+</Net>
+)V0G0N";
+}
+
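+// Minimal IR: a 1x1x10x10 input followed by a ReLU Activation layer
+// (negative_slope = 0).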
+std::string ReLUActivationModel() {
+    return R"V0G0N(
+<Net Name="InputLayerWithReLUActivation" version="2" precision="FP32" batch="1">
+    <layers>
+        <layer name="input_1" type="input" id="0" precision="FP32">
+            <output>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>1</dim>
+                    <dim>10</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="ReLU_Activation" type="Activation" id="2" precision="FP32">
+            <data type="ReLU" negative_slope="0.000000" />
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>1</dim>
+                    <dim>10</dim>
+                    <dim>10</dim>
+                </port>
+            </input>
+            <output>
+                <port id="4">
+                    <dim>1</dim>
+                    <dim>1</dim>
+                    <dim>10</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
+    </edges>
+</Net>
+)V0G0N";
+}
+
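+// Same topology as ReLUActivationModel(), but with negative_slope = 0.01,
+// i.e. a leaky ReLU.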
+std::string LeakyReLUActivationModel() {
+    return R"V0G0N(
+<Net Name="InputLayerWithLeakyReLUActivation" version="2" precision="FP32" batch="1">
+    <layers>
+        <layer name="input_1" type="input" id="0" precision="FP32">
+            <output>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>1</dim>
+                    <dim>10</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="LeakyReLU_Activation" type="Activation" id="2" precision="FP32">
+            <data type="ReLU" negative_slope="0.010000" />
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>1</dim>
+                    <dim>10</dim>
+                    <dim>10</dim>
+                </port>
+            </input>
+            <output>
+                <port id="4">
+                    <dim>1</dim>
+                    <dim>1</dim>
+                    <dim>10</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
+    </edges>
+</Net>
+)V0G0N";
+}
+
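+// Minimal IR: a 1x10 input followed by a clamp Activation bounded to [-5, 5].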
+std::string ClampActivationModel() {
+    return R"V0G0N(
+<Net Name="InputLayerWithClippingActivation" version="2" precision="FP32" batch="1">
+    <layers>
+        <layer name="input_1" type="input" id="0" precision="FP32">
+            <output>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="Clamp_Activation" id="2" type="Activation" precision="FP32">
+            <data type="clamp" min="-5" max="5" />
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </input>
+            <output>
+                <port id="1">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
+    </edges>
+</Net>
+)V0G0N";
+}
+
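+// Minimal IR: a 1x10 input followed by an identity Activation layer.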
+std::string IdentityActivationModel() {
+    return R"V0G0N(
+<Net Name="InputLayerWithIdentityActivation" version="2" precision="FP32" batch="1">
+    <layers>
+        <layer name="input_1" type="input" id="0" precision="FP32">
+            <output>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="Identity_Activation" id="2" type="Activation" precision="FP32">
+            <data type="identity" />
+            <input>
+                <port id="0">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </input>
+            <output>
+                <port id="1">
+                    <dim>1</dim>
+                    <dim>10</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
+    </edges>
+</Net>
+)V0G0N";
+}
+
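+// IR that splits a 1x20 input into two 1x10 halves, applies ReLU1 to one and
+// FullyConnected1 to the other, concatenates them back to 1x20, and feeds the
+// result into FullyConnected2.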
+std::string concatModel() {
+    return R"V0G0N(
+    <Net Name="concatinationModel" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Split1" type="Split" id="1" precision="FP32">
+                <data axis="1" />
+                <input>
+                    <port id="0">
+                        <!--connected to input-->
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <!--connected to eltwise-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                    <port id="2">
+                        <!--connected to fc-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+           <layer name="ReLU1" id="11" type="Activation" precision="FP32">
+                <data type="ReLU" negative_slope="0.000000" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected1" id="12" type="InnerProduct" precision="FP32">
+                <fc out-size="10" />
+                <biases offset="0" size="40" />
+                <weights offset="40" size="400" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="concat1" id="21"  precision="FP32" type="Concat">
+                    <data axis="1"/>
+                    <input>
+                            <port id="0">
+                                    <dim>1</dim>
+                                    <dim>10</dim>
+                            </port>
+                            <port id="1">
+                                    <dim>1</dim>
+                                    <dim>10</dim>
+                            </port>
+                    </input>
+                    <output>
+                            <port id="2">
+                                    <dim>1</dim>
+                                    <dim>20</dim>
+                            </port>
+                    </output>
+            </layer>
+            <layer name="FullyConnected2" id="31" type="InnerProduct" precision="FP32">
+                <fc out-size="20" />
+                <biases offset="0" size="80" />
+                <weights offset="80" size="1600" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+            <edge from-layer="1" from-port="1" to-layer="11" to-port="0" />
+            <edge from-layer="1" from-port="2" to-layer="12" to-port="0" />
+            <edge from-layer="11" from-port="1" to-layer="21" to-port="0" />
+            <edge from-layer="12" from-port="1" to-layer="21" to-port="1" />
+            <edge from-layer="21" from-port="2" to-layer="31" to-port="0" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
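+
+// Model Optimizer style decomposition of TensorFlow's leaky_relu after a
+// convolution: relu(x) - 0.2 * relu(-x), built from ReLU, Power (negate /
+// scale) layers and a final Eltwise sum.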
+std::string TFLeakyReluModel() {
+    return R"V0G0N(
+    <?xml version="1.0" ?>
+    <net batch="1" name="model" version="2">
+        <layers>
+            <layer id="0" name="Placeholder" precision="FP32" type="Input">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>126</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer id="1" name="conv1_node/Conv2D" precision="FP32" type="Convolution">
+                <data dilation-x="1" dilation-y="1" group="1" kernel-x="5" kernel-y="1" output="128" pad-x="0" pad-y="0" stride="1,1,1,1" stride-x="1" stride-y="1"/>
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>126</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="3">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>122</dim>
+                    </port>
+                </output>
+                <blobs>
+                    <weights offset="0" size="327680"/>
+                    <biases offset="327680" size="512"/>
+                </blobs>
+            </layer>
+            <layer id="2" name="conv1_node/Relu" precision="FP32" type="ReLU">
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>122</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>122</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer id="3" name="conv1_node/Neg" precision="FP32" type="Power">
+                <data power="1" scale="-1" shift="0"/>
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>122</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>122</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer id="4" name="conv1_node/Relu_1" precision="FP32" type="ReLU">
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>122</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>122</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer id="5" name="conv1_node/mul" precision="FP32" type="Power">
+                <data power="1" scale="0.20000000298023224" shift="0"/>
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>122</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>122</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer id="47" name="conv1_node/sub/negate_86" precision="FP32" type="Power">
+                <data power="1" scale="-1" shift="0"/>
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>122</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>122</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer id="48" name="conv1_node/sub/add_87" precision="FP32" type="Eltwise">
+                <data operation="sum"/>
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>122</dim>
+                    </port>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>122</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="2">
+                        <dim>1</dim>
+                        <dim>128</dim>
+                        <dim>1</dim>
+                        <dim>122</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+               <edge from-layer="0" from-port="0" to-layer="1" to-port="0"/>
+               <edge from-layer="1" from-port="3" to-layer="2" to-port="0"/>
+               <edge from-layer="1" from-port="3" to-layer="3" to-port="0"/>
+               <edge from-layer="3" from-port="1" to-layer="4" to-port="0"/>
+               <edge from-layer="4" from-port="1" to-layer="5" to-port="0"/>
+               <edge from-layer="5" from-port="1" to-layer="47" to-port="0"/>
+               <edge from-layer="2" from-port="1" to-layer="48" to-port="0"/>
+               <edge from-layer="47" from-port="1" to-layer="48" to-port="1"/>
+        </edges>
+    </net>
+    )V0G0N";
+}
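+
+// Two of the decomposed leaky-ReLU convolution blocks from TFLeakyReluModel(),
+// followed by a max-pooling layer with a 2x1 window (kernel-x=2, stride-x=2).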
+std::string maxpoolAfterRelu() {
+    return R"V0G0N(
+<?xml version="1.0" ?>
+<net batch="1" name="model" version="2">
+       <layers>
+               <layer id="0" name="Placeholder" precision="FP32" type="Input">
+                       <output>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>126</dim>
+                               </port>
+                       </output>
+               </layer>
+               <layer id="1" name="conv1_node/Conv2D" precision="FP32" type="Convolution">
+                       <data dilation-x="1" dilation-y="1" group="1" kernel-x="5" kernel-y="1" output="128" pad-x="0" pad-y="0" stride="1,1,1,1" stride-x="1" stride-y="1"/>
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>126</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="3">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                       </output>
+                       <blobs>
+                               <weights offset="0" size="327680"/>
+                               <biases offset="327680" size="512"/>
+                       </blobs>
+               </layer>
+               <layer id="2" name="conv1_node/Relu" precision="FP32" type="ReLU">
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="1">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                       </output>
+               </layer>
+               <layer id="3" name="conv1_node/Neg" precision="FP32" type="Power">
+                       <data power="1" scale="-1" shift="0"/>
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="1">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                       </output>
+               </layer>
+               <layer id="4" name="conv1_node/Relu_1" precision="FP32" type="ReLU">
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="1">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                       </output>
+               </layer>
+               <layer id="5" name="conv1_node/mul" precision="FP32" type="Power">
+                       <data power="1" scale="0.20000000298023224" shift="0"/>
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="1">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                       </output>
+               </layer>
+               <layer id="47" name="conv1_node/sub/negate_86" precision="FP32" type="Power">
+                       <data power="1" scale="-1" shift="0"/>
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="1">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                       </output>
+               </layer>
+               <layer id="48" name="conv1_node/sub/add_87" precision="FP32" type="Eltwise">
+                       <data operation="sum"/>
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                               <port id="1">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="2">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                       </output>
+               </layer>
+               <layer id="6" name="conv2_node/Conv2D" precision="FP32" type="Convolution">
+                       <data dilation-x="1" dilation-y="1" group="1" kernel-x="5" kernel-y="1" output="128" pad-x="0" pad-y="0" stride="1,1,1,1" stride-x="1" stride-y="1"/>
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>122</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="3">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                       </output>
+                       <blobs>
+                               <weights offset="328192" size="327680"/>
+                               <biases offset="655872" size="512"/>
+                       </blobs>
+               </layer>
+               <layer id="7" name="conv2_node/Relu" precision="FP32" type="ReLU">
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="1">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                       </output>
+               </layer>
+               <layer id="8" name="conv2_node/Neg" precision="FP32" type="Power">
+                       <data power="1" scale="-1" shift="0"/>
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="1">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                       </output>
+               </layer>
+               <layer id="9" name="conv2_node/Relu_1" precision="FP32" type="ReLU">
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="1">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                       </output>
+               </layer>
+               <layer id="10" name="conv2_node/mul" precision="FP32" type="Power">
+                       <data power="1" scale="0.20000000298023224" shift="0"/>
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="1">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                       </output>
+               </layer>
+               <layer id="53" name="conv2_node/sub/negate_92" precision="FP32" type="Power">
+                       <data power="1" scale="-1" shift="0"/>
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="1">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                       </output>
+               </layer>
+               <layer id="54" name="conv2_node/sub/add_93" precision="FP32" type="Eltwise">
+                       <data operation="sum"/>
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                               <port id="1">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="2">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                       </output>
+               </layer>
+               <layer id="11" name="pool1_node/MaxPool" precision="FP32" type="Pooling">
+                       <data exclude-pad="true" kernel-x="2" kernel-y="1" pad-x="0" pad-y="0" pool-method="max" stride="1,1,1,2" stride-x="2" stride-y="1"/>
+                       <input>
+                               <port id="0">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>118</dim>
+                               </port>
+                       </input>
+                       <output>
+                               <port id="1">
+                                       <dim>1</dim>
+                                       <dim>128</dim>
+                                       <dim>1</dim>
+                                       <dim>59</dim>
+                               </port>
+                       </output>
+               </layer>
+        </layers>
+        <edges>
+               <edge from-layer="0" from-port="0" to-layer="1" to-port="0"/>
+               <edge from-layer="1" from-port="3" to-layer="2" to-port="0"/>
+               <edge from-layer="1" from-port="3" to-layer="3" to-port="0"/>
+               <edge from-layer="3" from-port="1" to-layer="4" to-port="0"/>
+               <edge from-layer="4" from-port="1" to-layer="5" to-port="0"/>
+               <edge from-layer="5" from-port="1" to-layer="47" to-port="0"/>
+               <edge from-layer="2" from-port="1" to-layer="48" to-port="0"/>
+               <edge from-layer="47" from-port="1" to-layer="48" to-port="1"/>
+               <edge from-layer="48" from-port="2" to-layer="6" to-port="0"/>
+               <edge from-layer="6" from-port="3" to-layer="7" to-port="0"/>
+               <edge from-layer="6" from-port="3" to-layer="8" to-port="0"/>
+               <edge from-layer="8" from-port="1" to-layer="9" to-port="0"/>
+               <edge from-layer="9" from-port="1" to-layer="10" to-port="0"/>
+               <edge from-layer="10" from-port="1" to-layer="53" to-port="0"/>
+               <edge from-layer="7" from-port="1" to-layer="54" to-port="0"/>
+               <edge from-layer="53" from-port="1" to-layer="54" to-port="1"/>
+               <edge from-layer="54" from-port="2" to-layer="11" to-port="0"/>
+        </edges>
+    </net>
+
+    )V0G0N";
+}
+
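+// IR with a two-level Split/Concat hierarchy: a 1x40 input is split into two
+// 1x20 halves; one half goes through ReLU1, the other is split again into
+// 1x10 quarters (ReLU2 and FullyConnected1) that are joined by concat1;
+// concat2 then restores the full 1x40 tensor, which feeds FullyConnected2.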
+std::string doubleConcatModel() {
+    return R"V0G0N(
+    <Net Name="concatinationModel" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>40</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Split1" type="Split" id="1" precision="FP32">
+                <data axis="1" />
+                <input>
+                    <port id="0">
+                        <!--connected to input-->
+                        <dim>1</dim>
+                        <dim>40</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <!--connected to relu-->
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                    <port id="2">
+                        <!--connected to split-->
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+           <layer name="ReLU1" id="11" type="Activation" precision="FP32">
+                <data type="ReLU" negative_slope="0.000000" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Split2" type="Split" id="12" precision="FP32">
+                <data axis="1" />
+                <input>
+                    <port id="0">
+                        <!--connected to split-->
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <!--connected to relu-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                    <port id="2">
+                        <!--connected to fc-->
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+           <layer name="ReLU2" id="21" type="Activation" precision="FP32">
+                <data type="ReLU" negative_slope="0.000000" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected1" id="22" type="InnerProduct" precision="FP32">
+                <fc out-size="10" />
+                <biases offset="0" size="40" />
+                <weights offset="40" size="400" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="concat1" id="31"  precision="FP32" type="Concat">
+                    <data axis="1"/>
+                    <input>
+                            <port id="0">
+                                    <dim>1</dim>
+                                    <dim>10</dim>
+                            </port>
+                            <port id="1">
+                                    <dim>1</dim>
+                                    <dim>10</dim>
+                            </port>
+                    </input>
+                    <output>
+                            <port id="2">
+                                    <dim>1</dim>
+                                    <dim>20</dim>
+                            </port>
+                    </output>
+            </layer>
+             <layer name="concat2" id="41"  precision="FP32" type="Concat">
+                    <data axis="1"/>
+                    <input>
+                            <port id="0">
+                                    <dim>1</dim>
+                                    <dim>20</dim>
+                            </port>
+                            <port id="1">
+                                    <dim>1</dim>
+                                    <dim>20</dim>
+                            </port>
+                    </input>
+                    <output>
+                            <port id="2">
+                                    <dim>1</dim>
+                                    <dim>40</dim>
+                            </port>
+                    </output>
+            </layer>
+            <layer name="FullyConnected2" id="51" type="InnerProduct" precision="FP32">
+                <fc out-size="40" />
+                <biases offset="400" size="160" />
+                <weights offset="560" size="6960" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>40</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>40</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+            <edge from-layer="1" from-port="1" to-layer="11" to-port="0" />
+            <edge from-layer="1" from-port="2" to-layer="12" to-port="0" />
+            <edge from-layer="11" from-port="1" to-layer="41" to-port="0" />
+            <edge from-layer="12" from-port="1" to-layer="21" to-port="0" />
+            <edge from-layer="12" from-port="2" to-layer="22" to-port="0" />
+            <edge from-layer="21" from-port="1" to-layer="31" to-port="0" />
+            <edge from-layer="22" from-port="1" to-layer="31" to-port="1" />
+            <edge from-layer="31" from-port="2" to-layer="41" to-port="1" />
+            <edge from-layer="41" from-port="2" to-layer="51" to-port="0" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
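+
+// The offset/size attributes on <biases> and <weights> above are byte ranges
+// into the binary weights blob supplied alongside the IR (FP32, 4 bytes per
+// value) -- an inference from the models, not a documented contract. For these
+// synthetic GNA test IRs the values only need to stay inside the blob each
+// test allocates, so the regions are not always tightly packed.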
+
+std::string cropWithoutOffsetModel() {
+    return R"V0G0N(
+    <Net Name="cropWithoutOffsetModel" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Crop1" type="Crop" id="1" precision="FP32">
+                <data axis="1" dim="10" offset="0"/>
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected1" id="2" type="InnerProduct" precision="FP32">
+                <fc out-size="10" />
+                <biases offset="0" size="40" />
+                <weights offset="40" size="400" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+            <edge from-layer="1" from-port="1" to-layer="2" to-port="0" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
+
+std::string cropWithAlignedOffsetModel() {
+    return R"V0G0N(
+    <Net Name="cropWithAlignedOffsetModel" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Crop1" type="Crop" id="1" precision="FP32">
+                <data axis="1" dim="10" offset="8"/>
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected1" id="2" type="InnerProduct" precision="FP32">
+                <fc out-size="12" />
+                <biases offset="0" size="40" />
+                <weights offset="40" size="400" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+            <edge from-layer="1" from-port="1" to-layer="2" to-port="0" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
+
+std::string cropWithOffsetModel() {
+    return R"V0G0N(
+    <Net Name="cropWithOffsetModel" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Crop1" type="Crop" id="1" precision="FP32">
+                <data axis="1" dim="10" offset="5"/>
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected1" id="2" type="InnerProduct" precision="FP32">
+                <fc out-size="10" />
+                <biases offset="0" size="40" />
+                <weights offset="40" size="400" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+            <edge from-layer="1" from-port="1" to-layer="2" to-port="0" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
+
+std::string cropWithMaxOffsetModel() {
+    return R"V0G0N(
+    <Net Name="cropWithOffsetModel" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Crop1" type="Crop" id="1" precision="FP32">
+                <data axis="1" dim="10" offset="10"/>
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected1" id="2" type="InnerProduct" precision="FP32">
+                <fc out-size="10" />
+                <biases offset="0" size="40" />
+                <weights offset="40" size="400" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+            <edge from-layer="1" from-port="1" to-layer="2" to-port="0" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
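+
+// The four crop models above differ only in the Crop offset along axis 1 of a
+// 1x20 FP32 input with dim="10": 0 (no offset), 8 (32 bytes -- presumably the
+// aligned case the name refers to, though the GNA plugin's exact alignment
+// granularity is an assumption here), 5 (unaligned) and 10 (the largest
+// offset at which dim=10 still fits). They let the unit tests exercise each
+// case separately.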
+
+std::string cropWithOffsetExtendedModel() {
+    return R"V0G0N(
+    <Net Name="cropWithOffsetExtendedModel" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected1" id="1" type="InnerProduct" precision="FP32">
+                <fc out-size="20" />
+                <biases offset="0" size="80" />
+                <weights offset="80" size="1920" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Crop1" type="Crop" id="11" precision="FP32">
+                <data axis="1" dim="10" offset="5"/>
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected2" id="12" type="InnerProduct" precision="FP32">
+                <fc out-size="10" />
+                <biases offset="1920" size="40" />
+                <weights offset="1960" size="640" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>10</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+            <edge from-layer="1" from-port="1" to-layer="11" to-port="0" />
+            <edge from-layer="11" from-port="1" to-layer="12" to-port="0" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
+
+std::string copyModel() {
+    return R"V0G0N(
+    <Net Name="cropWithOffsetExtendedModel" version="2" precision="FP32" batch="1">
+        <layers>
+            <layer name="input_1" type="input" id="0" precision="FP32">
+                <output>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="FullyConnected1" id="1" type="InnerProduct" precision="FP32">
+                <fc out-size="20" />
+                <biases offset="0" size="80" />
+                <weights offset="80" size="1920" />
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Copy1" id="2" type="Copy" precision="FP32">
+                <input>
+                    <port id="0">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+            <layer name="Eltwise_1" type="Eltwise" id="11" precision="FP32">
+                <data operation="sum" />
+                <input>
+                    <port id="0">
+                        <!--connected to FullyConnected-->
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                    <port id="1">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </input>
+                <output>
+                    <port id="2">
+                        <dim>1</dim>
+                        <dim>20</dim>
+                    </port>
+                </output>
+            </layer>
+        </layers>
+        <edges>
+            <edge from-layer="0" from-port="0" to-layer="1" to-port="0" />
+            <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
+            <edge from-layer="1" from-port="1" to-layer="11" to-port="0" />
+            <edge from-layer="2" from-port="1" to-layer="11" to-port="1" />
+        </edges>
+    </Net>
+    )V0G0N";
+}
+}  // namespace GNATestIRs
diff --git a/inference-engine/tests/unit/engines/gna/test_irs.hpp b/inference-engine/tests/unit/engines/gna/test_irs.hpp
new file mode 100644 (file)
index 0000000..c7b4b0c
--- /dev/null
@@ -0,0 +1,43 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+
+namespace GNATestIRs {
+
+std::string FCOnlyModel();
+std::string Fc2DOutputModel();
+std::string affineToMemoryModel();
+std::string eltwiseToMemoryModel();
+std::string eltwiseToMemoryModelNoOutput();
+std::string activationAfterSplitModel();
+std::string FCWithPaddingAfterSplitModel();
+std::string SliceModelWithAlignedOutputs();
+std::string FCWithPaddingAfterSliceModel();
+std::string twoFCWithPaddingAfterSliceModel();
+std::string eltwiseSummModel();
+std::string eltwiseMulModel();
+std::string concatModel();
+std::string doubleConcatModel();
+std::string scaleShiftAffineModel();
+std::string clampFollowedByTanhModel();
+std::string eltwiseWithMemoryAndActivationInputModel();
+std::string AffineWith2AffineOutputsModel();
+std::string SigmoidActivationModel();
+std::string TanhActivationModel();
+std::string ReLUActivationModel();
+std::string LeakyReLUActivationModel();
+std::string ClampActivationModel();
+std::string IdentityActivationModel();
+std::string maxpoolAfterRelu();
+std::string TFLeakyReluModel();
+std::string cropWithoutOffsetModel();
+std::string cropWithAlignedOffsetModel();
+std::string cropWithOffsetModel();
+std::string cropWithMaxOffsetModel();
+std::string cropWithOffsetExtendedModel();
+std::string copyModel();
+}  // namespace GNATestIRs
diff --git a/inference-engine/tests/unit/engines/mkldnn/dump_test.cpp b/inference-engine/tests/unit/engines/mkldnn/dump_test.cpp
new file mode 100644 (file)
index 0000000..042f7ac
--- /dev/null
@@ -0,0 +1,136 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gtest/gtest.h>
+
+#include "ie_blob.h"
+#include "blob_factory.hpp"
+#include "utils/blob_dump.h"
+
+using namespace InferenceEngine;
+using namespace MKLDNNPlugin;
+
+TEST(MKLDNNDumpTests, UnallocatedBlob_NoDump) {
+    SizeVector dims {2,3,4,5};
+    Blob::Ptr blob = make_blob_with_precision({Precision::U8, dims, NHWC});
+
+    std::stringstream buff;
+
+    EXPECT_THROW({
+        BlobDumper(blob).dump(buff);
+    }, details::InferenceEngineException);
+}
+
+TEST(MKLDNNDumpTests, EmptyBlob_NoDump) {
+    SizeVector dims {2,3,4,5};
+    Blob::Ptr blob;
+
+    std::stringstream buff;
+
+    EXPECT_THROW({
+        BlobDumper(blob).dump(buff);
+    }, details::InferenceEngineException);
+}
+
+TEST(MKLDNNDumpTests, Ser) {
+    SizeVector dims {2,3,4,5};
+    Blob::Ptr blob = make_blob_with_precision({Precision::U8, dims, NHWC});
+    blob->allocate();
+
+    std::stringstream buff;
+    BlobDumper(blob).dump(buff);
+
+    ASSERT_GT(buff.str().size(), blob->byteSize());
+}
+
+TEST(MKLDNNDumpTests, SerDeser) {
+    SizeVector dims {2,3,4,5};
+    Blob::Ptr blob = make_blob_with_precision({Precision::U8, dims, NCHW});
+    blob->allocate();
+
+    std::stringstream buff;
+
+    BlobDumper(blob).dump(buff);
+    Blob::Ptr deser_blob = BlobDumper::read(buff).get();
+
+    ASSERT_EQ(deser_blob->dims(), blob->dims());
+    ASSERT_EQ(deser_blob->precision(), blob->precision());
+
+    std::vector<uint8_t> data(blob->buffer().as<uint8_t*>(), blob->buffer().as<uint8_t*>() + blob->size());
+    std::vector<uint8_t> deser_data(deser_blob->buffer().as<uint8_t*>(), deser_blob->buffer().as<uint8_t*>()
+                                    + deser_blob->size());
+    ASSERT_EQ(deser_data, data);
+}
+
+TEST(MKLDNNDumpTests, SerDeserWithScales) {
+    SizeVector dims {2,3,4,5};
+    auto blob = make_blob_with_precision({Precision::U8, dims, NCHW});
+    blob->allocate();
+
+    auto scls = make_blob_with_precision({Precision::FP32, {3}, C});
+    scls->allocate();
+
+    std::stringstream buff;
+
+    BlobDumper(blob).withScales(scls).dump(buff);
+    auto deser = BlobDumper::read(buff);
+    auto deser_blob = deser.get();
+    auto deser_scls = deser.getScales();
+
+    ASSERT_EQ(deser_blob->dims(), blob->dims());
+    ASSERT_EQ(deser_blob->precision(), blob->precision());
+
+    std::vector<uint8_t> data(blob->buffer().as<uint8_t*>(), blob->buffer().as<uint8_t*>() + blob->size());
+    std::vector<uint8_t> deser_data(deser_blob->buffer().as<uint8_t*>(), deser_blob->buffer().as<uint8_t*>()
+                                                                         + deser_blob->size());
+    ASSERT_EQ(deser_data, data);
+
+    std::vector<uint8_t> scls_data(scls->buffer().as<uint8_t*>(), scls->buffer().as<uint8_t*>() + scls->size());
+    std::vector<uint8_t> deser_scls_data(deser_scls->buffer().as<uint8_t*>(), deser_scls->buffer().as<uint8_t*>()
+                                                                         + deser_scls->size());
+    ASSERT_EQ(deser_scls_data, scls_data);
+}
+
+
+TEST(MKLDNNDumpTests, SerU8AsTxt) {
+    SizeVector dims {2,3,4,5};
+
+    Blob::Ptr blob = make_blob_with_precision({Precision::U8, dims, NCHW});
+    blob->allocate();
+
+    Blob::Ptr scls = make_blob_with_precision({Precision::FP32, {dims[1]}, C});
+    scls->allocate();
+
+    std::stringstream buff;
+    BlobDumper(blob).withScales(scls).dumpAsTxt(buff);
+
+    std::string deser_header, ref_header = "U8 4D shape: 2 3 4 5 (120)";
+    std::getline(buff, deser_header);
+    ASSERT_EQ(deser_header, ref_header);
+
+    auto num_line = std::count(std::istreambuf_iterator<char>(buff),
+            std::istreambuf_iterator<char>(), '\n');
+    ASSERT_EQ(num_line, blob->size());
+}
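+
+// As SerU8AsTxt above and SerAsTxt below assert, dumpAsTxt emits a one-line
+// header of the form "<precision> <N>D shape: <dims> (<total elements>)"
+// followed by one line per element -- a format inferred from these tests, not
+// from the dumper's documentation.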
+
+TEST(MKLDNNDumpTests, SerAsTxt) {
+    SizeVector dims {2,3};
+
+    Blob::Ptr blob = make_blob_with_precision({Precision::FP32, dims, NC});
+    blob->allocate();
+
+    Blob::Ptr scls = make_blob_with_precision({Precision::FP32, {dims[1]}, C});
+    scls->allocate();
+
+    std::stringstream buff;
+    BlobDumper(blob).withScales(scls).dumpAsTxt(buff);
+
+    std::string deser_header, ref_header = "FP32 2D shape: 2 3 (6)";
+    std::getline(buff, deser_header);
+    ASSERT_EQ(deser_header, ref_header);
+
+    auto num_line = std::count(std::istreambuf_iterator<char>(buff),
+                               std::istreambuf_iterator<char>(), '\n');
+    ASSERT_EQ(num_line, blob->size());
+}
\ No newline at end of file
diff --git a/inference-engine/tests/unit/engines/mkldnn/dumper_test.cpp b/inference-engine/tests/unit/engines/mkldnn/dumper_test.cpp
new file mode 100644 (file)
index 0000000..383a1e7
--- /dev/null
@@ -0,0 +1,99 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gtest/gtest.h>
+
+#include "mkldnn_graph.h"
+#include "mkldnn_graph_dumper.h"
+#include "ie_blob.h"
+#include "ie_util_internal.hpp"
+#include "details/ie_cnn_network_tools.h"
+#include "xml_net_builder.hpp"
+#include "graph_tools.hpp"
+
+#include <string>
+#include <map>
+
+using namespace InferenceEngine;
+using namespace MKLDNNPlugin;
+using std::string;
+using std::map;
+
+class NetGen : testing::V2NetBuilder {
+    string model;
+    TBlob<uint8_t>::Ptr weights;
+
+public:
+    NetGen(): testing::V2NetBuilder(buildNetworkWithOneInput(
+            "SomeNet", {2,3,16,16}, "FP32")) {
+        using prm_t = map<string, string>;
+
+        testing::InOutData inout = {{{2,3,16,16}},{{2,16,16,16}}};
+
+        prm_t conv_prm = {
+                {"stride-x", std::to_string(1)},
+                {"stride-y", std::to_string(1)},
+                {"pad-x",    std::to_string(1)},
+                {"pad-y",    std::to_string(1)},
+                {"kernel-x", std::to_string(3)},
+                {"kernel-y", std::to_string(3)},
+                {"output",   std::to_string(16)},
+                {"group",    std::to_string(1)}
+        };
+        size_t wght = 3*16*3*3*sizeof(float);
+        size_t bias = 16*sizeof(float);
+
+        prm_t relu_prm = {{"negative_slope", std::to_string(0)}};
+
+        addLayer("Convolution", "FP32", &conv_prm, {{{2,3,16,16}},{{2,16,16,16}}}, wght, bias);
+        addLayer("Relu", "FP32", &relu_prm, {{{2,16,16,16}},{{2,16,16,16}}});
+
+        model = finish();
+
+        weights.reset(new TBlob<uint8_t>({Precision::U8, {wght+bias}, C}));
+        weights->allocate();
+    }
+
+    CNNNetwork net() {
+        CNNNetReader net_reader;
+        net_reader.ReadNetwork(model.data(), model.length());
+        net_reader.SetWeights(weights);
+
+        return net_reader.getNetwork();
+    }
+};
+
+TEST(MKLDNNLayersTests, DumpSimpleGraph) {
+    auto net = NetGen().net();
+    MKLDNNGraph graph;
+    MKLDNNExtensionManager::Ptr extMgr;
+    graph.CreateGraph(net, extMgr);
+
+    auto dump_net = dump_graph_as_ie_net(graph);
+    auto layers = details::CNNNetSortTopologically(*dump_net);
+
+    ASSERT_EQ(layers.size(), 4);
+    ASSERT_EQ(layers[0]->type, "Input");
+    ASSERT_EQ(layers[1]->type, "Conv_Activ");
+    ASSERT_EQ(layers[2]->type, "Reorder");
+    ASSERT_EQ(layers[3]->type, "Output");
+}
+
+TEST(MKLDNNLayersTests, DumpSimpleGraphToDot) {
+    auto net = NetGen().net();
+    MKLDNNGraph graph;
+    MKLDNNExtensionManager::Ptr extMgr;
+    graph.CreateGraph(net, extMgr);
+
+    std::stringstream buff;
+    dump_graph_as_dot(graph, buff);
+
+    std::string dot = buff.str();
+    std::cout << dot;
+    ASSERT_EQ(std::count(dot.begin(), dot.end(), '{'), 1); // 1-graph
+    ASSERT_EQ(std::count(dot.begin(), dot.end(), '}'), 1);
+    ASSERT_EQ(std::count(dot.begin(), dot.end(), '['), 10); // 4-node 3-data 3-shape
+    ASSERT_EQ(std::count(dot.begin(), dot.end(), ']'), 10);
+    ASSERT_EQ(std::count(dot.begin(), dot.end(), '>'), 6); // connection
+}
\ No newline at end of file
index 988ab44..4e22a72 100644 (file)
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
-#include <extension/ext_base.hpp>
-#include <extension/ext_base.cpp>
 #include <extension/ext_list.hpp>
+#include <extension/ext_base.cpp>
+
+#include <string>
+#include <map>
+#include <memory>
+#include <algorithm>
+
+using namespace InferenceEngine;
+using namespace Extensions;
+
+struct TestExtensionsHolder {
+    std::map<std::string, Cpu::ext_factory> list;
+    std::map<std::string, IShapeInferImpl::Ptr> si_list;
+};
+
+
+class FakeExtensions : public IExtension {
+ public:
+
+    void SetLogCallback(InferenceEngine::IErrorListener &listener) noexcept override {};
+
+    void Unload() noexcept override {};
+
+    void Release() noexcept override {
+        delete this;
+    };
+
+    static std::shared_ptr<TestExtensionsHolder> GetExtensionsHolder() {
+        static std::shared_ptr<TestExtensionsHolder> localHolder;
+        if (localHolder == nullptr) {
+            localHolder = std::shared_ptr<TestExtensionsHolder>(new TestExtensionsHolder());
+        }
+        return localHolder;
+    }
+
+    static void AddExt(std::string name, Cpu::ext_factory factory) {
+        GetExtensionsHolder()->list[name] = factory;
+    }
+
+    void GetVersion(const Version *&versionInfo) const noexcept override {
+        static Version ExtensionDescription = {
+            {1, 0},    // extension API version
+            "1.0",
+            "ie-cpu-ext"  // extension description message
+        };
+
+        versionInfo = &ExtensionDescription;
+    }
+
+    StatusCode getPrimitiveTypes(char **&types, unsigned int &size, ResponseDesc *resp) noexcept override {
+        collectTypes(types, size, GetExtensionsHolder()->list);
+        return OK;
+    };
+    StatusCode getFactoryFor(ILayerImplFactory *&factory, const CNNLayer *cnnLayer, ResponseDesc *resp) noexcept override {
+        auto &factories = GetExtensionsHolder()->list;
+        if (factories.find(cnnLayer->type) == factories.end()) {
+            std::string errorMsg = std::string("Factory for ") + cnnLayer->type + " wasn't found!";
+            errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
+            return NOT_FOUND;
+        }
+        factory = factories[cnnLayer->type](cnnLayer);
+        return OK;
+    }
+    StatusCode getShapeInferTypes(char **&types, unsigned int &size, ResponseDesc *resp) noexcept override {
+        collectTypes(types, size, GetExtensionsHolder()->si_list);
+        return OK;
+    };
 
-namespace InferenceEngine {
-namespace Extensions {
-namespace Cpu {
+    StatusCode getShapeInferImpl(IShapeInferImpl::Ptr &impl, const char *type, ResponseDesc *resp) noexcept override {
+        auto &factories = GetExtensionsHolder()->si_list;
+        if (factories.find(type) == factories.end()) {
+            std::string errorMsg = std::string("Shape Infer Implementation for ") + type + " wasn't found!";
+            if (resp) errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
+            return NOT_FOUND;
+        }
+        impl = factories[type];
+        return OK;
+    }
 
-class FakeLayerPLNImpl: public ExtLayerBase {
+    template<class T>
+    void collectTypes(char **&types, unsigned int &size, const std::map<std::string, T> &factories) {
+        types = new char *[factories.size()];
+        unsigned count = 0;
+        for (auto it = factories.begin(); it != factories.end(); it++, count++) {
+            types[count] = new char[it->first.size() + 1];
+            std::copy(it->first.begin(), it->first.end(), types[count]);
+            types[count][it->first.size()] = '\0';
+        }
+        size = count;
+    }
+};
+
+class FakeLayerPLNImpl: public Cpu::ExtLayerBase {
 public:
     explicit FakeLayerPLNImpl(const CNNLayer* layer) {
         try {
@@ -27,7 +111,7 @@ public:
     }
 };
 
-class FakeLayerBLKImpl: public ExtLayerBase {
+class FakeLayerBLKImpl: public Cpu::ExtLayerBase {
 public:
     explicit FakeLayerBLKImpl(const CNNLayer* layer) {
         try {
@@ -48,9 +132,24 @@ public:
     }
 };
 
-REG_FACTORY_FOR(ImplFactory<FakeLayerPLNImpl>, FakeLayerPLN);
-REG_FACTORY_FOR(ImplFactory<FakeLayerBLKImpl>, FakeLayerBLK);
+template<typename Ext>
+class FakeRegisterBase {
+ public:
+    explicit FakeRegisterBase(const std::string& type) {
+        FakeExtensions::AddExt(type,
+                              [](const CNNLayer* layer) -> InferenceEngine::ILayerImplFactory* {
+                                  return new Ext(layer);
+                              });
+    }
+};
+
+#define REG_FAKE_FACTORY_FOR(__prim, __type) \
+static FakeRegisterBase<__prim> __reg__##__type(#__type)
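+
+// For reference, REG_FAKE_FACTORY_FOR(Cpu::ImplFactory<FakeLayerPLNImpl>, FakeLayerPLN)
+// expands to
+//   static FakeRegisterBase<Cpu::ImplFactory<FakeLayerPLNImpl>> __reg__FakeLayerPLN("FakeLayerPLN");
+// i.e. a file-scope static whose constructor registers the factory with the
+// shared TestExtensionsHolder before main() runs.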
+
+REG_FAKE_FACTORY_FOR(Cpu::ImplFactory<FakeLayerPLNImpl>, FakeLayerPLN);
+REG_FAKE_FACTORY_FOR(Cpu::ImplFactory<FakeLayerBLKImpl>, FakeLayerBLK);
+
 
-}
-}
-}
+InferenceEngine::IExtensionPtr make_FakeExtensions() {
+    return InferenceEngine::IExtensionPtr(new FakeExtensions());
+}
\ No newline at end of file
diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/gather_tests.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/extensions/gather_tests.cpp
new file mode 100644 (file)
index 0000000..b4300fb
--- /dev/null
@@ -0,0 +1,695 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gtest/gtest.h>
+#include <gmock/gmock-spec-builders.h>
+#include "mkldnn_plugin/mkldnn_graph.h"
+
+#include "test_graph.hpp"
+
+#include "single_layer_common.hpp"
+#include <mkldnn_plugin/mkldnn_extension_utils.h>
+#include <extension/ext_list.hpp>
+#include "tests_common.hpp"
+
+
+using namespace ::testing;
+using namespace std;
+using namespace mkldnn;
+
+
+struct gather_test_params {
+    std::string inIdxPrecision;
+    InferenceEngine::SizeVector inIdx;
+    InferenceEngine::SizeVector inDict;
+    int axis;
+    InferenceEngine::SizeVector out;
+
+    size_t num_prim_desc;
+    int selectedType;
+
+    std::vector<std::function<void(MKLDNNPlugin::PrimitiveDescInfo)>> comp;
+};
+
+
+inline void clipping(int *idx, const int min, const int max) {
+    (*idx) = ((*idx) > min) ? (*idx) : min;
+    (*idx) = ((*idx) < max) ? (*idx) : (max - 1);
+}
+
+template <typename data_t>
+void ref_gather(InferenceEngine::TBlob<data_t> &srcIdx, InferenceEngine::TBlob<float> &srcDct, InferenceEngine::TBlob<float> &dst, size_t axis) {
+    size_t i, j;
+    const data_t *src_dataIdx = srcIdx.data();
+    float* src_dataDict = srcDct.data();
+    float *dst_data = dst.data();
+    size_t src_size = srcIdx.size();
+
+    std::vector<size_t> dims = srcDct.getTensorDesc().getDims();
+    std::vector<size_t> dims_actual;
+
+    //  Remove redundant dimensions
+    for (size_t i = 0; i < dims.size(); i++) {
+        if (dims[i] > 1) {
+            for (size_t j = i; j < dims.size(); j++)
+                dims_actual.push_back(dims[j]);
+            break;
+        }
+    }
+
+    //  Find number of dictionaries, index range and data length
+    size_t numDictionaries = 1;
+    for (i = 0; i < axis; i++)
+        numDictionaries *= dims_actual[i];
+    size_t indexRange = dims_actual[axis];
+    size_t dataLength = 1;
+    for (i = axis + 1; i < dims_actual.size(); i++)
+        dataLength *= dims_actual[i];
+
+    //  The gathering process
+    for (i = 0; i < src_size; i++) {
+        int idx = static_cast<int>(src_dataIdx[i]);
+
+        //  Index clipping
+        clipping(&idx, 0, indexRange);
+
+        //  Copying data to destination from Dictionary
+        for (j = 0; j < numDictionaries; j++) {
+            memcpy(&dst_data[dataLength * (i + j * src_size)],
+                   &src_dataDict[dataLength * (idx + j * indexRange)], sizeof(float)*dataLength);
+        }
+    }
+}
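+
+// Worked example of the index arithmetic above, taking dict dims {2,3,2} and
+// axis = 1: numDictionaries = 2, indexRange = 3, dataLength = 2, so entry i of
+// the index blob (clipped value idx) copies the 2 floats starting at
+// src_dataDict[(idx + j*3)*2] to dst_data[(i + j*src_size)*2] for j = 0, 1.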
+
+class MKLDNNCPUExtGatherTests: public TestsCommon, public WithParamInterface<gather_test_params> {
+    std::string model_t = R"V0G0N(
+<net Name="Gather_net" version="2" precision="FP32" batch="1">
+    <layers>
+        <layer name="InputText" type="Input" precision="_IIDXP_" id="1">
+            <output>
+                <port id="1">
+                    _IIDX_
+                </port>
+            </output>
+        </layer>
+        <layer name="InputDictionary" type="Input" precision="FP32" id="2">
+            <output>
+                <port id="2">
+                    _IDICT_
+                </port>
+            </output>
+        </layer>
+        <layer name="gather" id="3" type="Gather" precision="FP32">
+            <data axis="_AX_"/>
+            <input>
+                <port id="1">
+                    _IDICT_
+                </port>
+                <port id="2">
+                    _IIDX_
+                </port>
+            </input>
+            <output>
+                <port id="3">
+                    _OUT_
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="1" from-port="1" to-layer="3" to-port="2"/>
+        <edge from-layer="2" from-port="2" to-layer="3" to-port="1"/>
+    </edges>
+</net>
+)V0G0N";
+
+    std::string getModel(gather_test_params p) {
+        std::string model = model_t;
+        std::string inIdx;
+        std::string inDict;
+        std::string out;
+
+        for (auto& idx : p.inIdx) {
+            inIdx += "<dim>";
+            inIdx += std::to_string(idx) + "</dim>\n";
+        }
+
+        for (auto& dct : p.inDict) {
+            inDict += "<dim>";
+            inDict += std::to_string(dct) + "</dim>\n";
+        }
+
+        for (auto& dst : p.out) {
+            out += "<dim>";
+            out += std::to_string(dst) + "</dim>\n";
+        }
+
+        REPLACE_WITH_STR(model, "_IIDXP_", p.inIdxPrecision);
+        REPLACE_WITH_STR(model, "_IIDX_", inIdx);
+        REPLACE_WITH_STR(model, "_IDICT_", inDict);
+        REPLACE_WITH_NUM(model, "_AX_", p.axis);
+        REPLACE_WITH_STR(model, "_OUT_", out);
+
+        return model;
+    }
+
+    template <typename data_t>
+    static void fill_data_dbgval(data_t *data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            data[i] = static_cast<data_t>(i & (sizeof(data_t) * 8 - 1));
+        }
+    }
+protected:
+    virtual void TearDown() {
+    }
+
+    virtual void SetUp() {
+        try {
+            TestsCommon::SetUp();
+            gather_test_params p = ::testing::WithParamInterface<gather_test_params>::GetParam();
+            std::string model = getModel(p);
+
+            InferenceEngine::CNNNetReader net_reader;
+            ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
+
+            InferenceEngine::Extension cpuExt(make_so_name("cpu_extension"));
+            MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager());
+            extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){}));
+
+            MKLDNNGraphTestClass graph;
+            graph.CreateGraph(net_reader.getNetwork(), extMgr);
+
+            auto& nodes = graph.getNodes();
+            nodes = graph.getNodes();
+
+            for (auto &node : nodes) {
+                if (node->getName() == "gather") {
+                    ASSERT_EQ(p.num_prim_desc, node->getSupportedPrimitiveDescriptors().size());
+                    for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) {
+                        p.comp.at(j)(node->getSupportedPrimitiveDescriptors().at(j));
+                    }
+                    ASSERT_NE(nullptr, node->getSelectedPrimitiveDescriptor());
+                    ASSERT_EQ(p.selectedType,
+                              node->getSelectedPrimitiveDescriptor()->getImplementationType() & p.selectedType);
+                }
+            }
+            ASSERT_EQ(4, nodes.size());
+
+            // Input Dictionary
+            InferenceEngine::Blob::Ptr srcDict = InferenceEngine::make_shared_blob<float>({ InferenceEngine::Precision::FP32, p.inDict, InferenceEngine::TensorDesc::getLayoutByDims(p.inDict) });
+            srcDict->allocate();
+            fill_data(srcDict->buffer(), srcDict->size());
+            auto * srcDictPtr = dynamic_cast<InferenceEngine::TBlob<float>*>(srcDict.get());
+            if (srcDictPtr == nullptr)
+                FAIL() << "Cannot cast blob to TBlob<float>.";
+
+            // Output Data
+            InferenceEngine::OutputsDataMap out;
+            out = net_reader.getNetwork().getOutputsInfo();
+            InferenceEngine::BlobMap outputBlobs;
+
+            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
+
+            InferenceEngine::TBlob<float>::Ptr output;
+            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
+            output->allocate();
+            outputBlobs[item.first] = output;
+
+            // Output Reference
+            InferenceEngine::TBlob<float> dst_ref(item.second->getTensorDesc());
+            dst_ref.allocate();
+
+            // Input Indexes
+            InferenceEngine::Blob::Ptr srcIdx;
+            if (p.inIdxPrecision == "I32") {
+                srcIdx = InferenceEngine::make_shared_blob<int32_t>({ InferenceEngine::Precision::I32, p.inIdx, InferenceEngine::TensorDesc::getLayoutByDims(p.inIdx) });
+                srcIdx->allocate();
+                fill_data_dbgval(static_cast<int32_t*>(srcIdx->buffer()), srcIdx->size());
+                auto * srcIdxPtr = dynamic_cast<InferenceEngine::TBlob<int32_t>*>(srcIdx.get());
+                if (srcIdxPtr == nullptr)
+                    FAIL() << "Cannot cast blob to TBlob<int32_t>.";
+
+                // Check results
+                ref_gather(*srcIdxPtr, *srcDictPtr, dst_ref, p.axis);
+            }
+            else if (p.inIdxPrecision == "FP32") {
+                srcIdx = InferenceEngine::make_shared_blob<float>({ InferenceEngine::Precision::FP32, p.inIdx, InferenceEngine::TensorDesc::getLayoutByDims(p.inIdx) });
+                srcIdx->allocate();
+                fill_data(srcIdx->buffer(), srcIdx->size());
+                auto * srcIdxPtr = dynamic_cast<InferenceEngine::TBlob<float>*>(srcIdx.get());
+                if (srcIdxPtr == nullptr)
+                    FAIL() << "Cannot cast blob to TBlob<float>.";
+
+                // Check results
+                ref_gather(*srcIdxPtr, *srcDictPtr, dst_ref, p.axis);
+            }
+            else if (p.inIdxPrecision == "U16") {
+                srcIdx = InferenceEngine::make_shared_blob<uint16_t>({ InferenceEngine::Precision::U16, p.inIdx, InferenceEngine::TensorDesc::getLayoutByDims(p.inIdx) });
+                srcIdx->allocate();
+                fill_data_dbgval(static_cast<uint16_t*>(srcIdx->buffer()), srcIdx->size());
+                auto * srcIdxPtr = dynamic_cast<InferenceEngine::TBlob<uint16_t>*>(srcIdx.get());
+                if (srcIdxPtr == nullptr)
+                    FAIL() << "Cannot cast blob to TBlob<uint16_t>.";
+
+                // Check results
+                ref_gather(*srcIdxPtr, *srcDictPtr, dst_ref, p.axis);
+            }
+            else if (p.inIdxPrecision == "I16") {
+                srcIdx = InferenceEngine::make_shared_blob<int16_t>({ InferenceEngine::Precision::I16, p.inIdx, InferenceEngine::TensorDesc::getLayoutByDims(p.inIdx) });
+                srcIdx->allocate();
+                fill_data_dbgval(static_cast<int16_t*>(srcIdx->buffer()), srcIdx->size());
+                auto * srcIdxPtr = dynamic_cast<InferenceEngine::TBlob<int16_t>*>(srcIdx.get());
+                if (srcIdxPtr == nullptr)
+                    FAIL() << "Cannot cast blob to TBlob<int16_t>.";
+
+                // Check results
+                ref_gather(*srcIdxPtr, *srcDictPtr, dst_ref, p.axis);
+            }
+            else if (p.inIdxPrecision == "U8") {
+                srcIdx = InferenceEngine::make_shared_blob<uint8_t>({ InferenceEngine::Precision::U8, p.inIdx, InferenceEngine::TensorDesc::getLayoutByDims(p.inIdx) });
+                srcIdx->allocate();
+                fill_data_dbgval(static_cast<uint8_t*>(srcIdx->buffer()), srcIdx->size());
+                auto * srcIdxPtr = dynamic_cast<InferenceEngine::TBlob<uint8_t>*>(srcIdx.get());
+                if (srcIdxPtr == nullptr)
+                    FAIL() << "Cannot cast blob to TBlob<uint8_t>.";
+
+                // Check results
+                ref_gather(*srcIdxPtr, *srcDictPtr, dst_ref, p.axis);
+            }
+            else if (p.inIdxPrecision == "I8") {
+                srcIdx = InferenceEngine::make_shared_blob<int8_t>({ InferenceEngine::Precision::I8, p.inIdx, InferenceEngine::TensorDesc::getLayoutByDims(p.inIdx) });
+                srcIdx->allocate();
+                fill_data_dbgval(static_cast<int8_t*>(srcIdx->buffer()), srcIdx->size());
+                auto * srcIdxPtr = dynamic_cast<InferenceEngine::TBlob<int8_t>*>(srcIdx.get());
+                if (srcIdxPtr == nullptr)
+                    FAIL() << "Cannot cast blob to TBlob<int8_t>.";
+
+                // Check results
+                ref_gather(*srcIdxPtr, *srcDictPtr, dst_ref, p.axis);
+            }
+            else {
+                return;
+            }
+
+            InferenceEngine::BlobMap srcs;
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("InputDictionary", srcDict));
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("InputText", srcIdx));
+
+            // Infer
+            graph.Infer(srcs, outputBlobs);
+            compare(*output, dst_ref);
+        } catch (const InferenceEngine::details::InferenceEngineException &e) {
+            FAIL() << e.what();
+        }
+    }
+};
+
+TEST_P(MKLDNNCPUExtGatherTests, TestsGather) {}
+
+INSTANTIATE_TEST_CASE_P(
+        TestsGather, MKLDNNCPUExtGatherTests,
+            ::testing::Values(
+                gather_test_params{ "FP32", {1, 1, 12, 256}, {1, 1, 71, 16}, 0, {1, 12, 256, 16}, 1, MKLDNNPlugin::impl_desc_type::unknown },
+                gather_test_params{  "I32", {1, 1, 12, 256}, {1, 1, 71, 16}, 0, {1, 12, 256, 16}, 1, MKLDNNPlugin::impl_desc_type::unknown },
+                gather_test_params{  "I16", {1, 1, 12, 256}, {1, 1, 71, 16}, 0, {1, 12, 256, 16}, 1, MKLDNNPlugin::impl_desc_type::unknown },
+                gather_test_params{   "U8", {1, 1, 12, 256}, {1, 1, 71, 16}, 0, {1, 12, 256, 16}, 1, MKLDNNPlugin::impl_desc_type::unknown },
+                gather_test_params{   "I8", {1, 1, 12, 256}, {1, 1, 71, 16}, 0, {1, 12, 256, 16}, 1, MKLDNNPlugin::impl_desc_type::unknown },
+                gather_test_params{  "I32", {12, 256}, {71, 16}, 0, {12, 256, 16}, 1, MKLDNNPlugin::impl_desc_type::unknown },
+                gather_test_params{  "I32", {3, 4}, {2, 5, 6}, 0, {3, 4, 5, 6}, 1, MKLDNNPlugin::impl_desc_type::unknown },
+                gather_test_params{  "I32", {3, 4}, {5, 1}, 0, {3, 4, 1}, 1, MKLDNNPlugin::impl_desc_type::unknown },
+                gather_test_params{ "FP32", {1, 1, 12, 256}, {1, 1, 71, 16}, 1, {1, 71, 12, 256}, 1, MKLDNNPlugin::impl_desc_type::unknown },
+                gather_test_params{  "I32", {1, 1, 3, 4}, {1, 2, 5, 6}, 1, {2, 3, 4, 6}, 1, MKLDNNPlugin::impl_desc_type::unknown },
+                gather_test_params{  "I32", {1, 1, 3, 4}, {1, 2, 5, 6}, 2, {2, 5, 3, 4}, 1, MKLDNNPlugin::impl_desc_type::unknown },
+                gather_test_params{  "I32", {12, 4, 9, 8}, {6, 13, 10, 3}, 1, {6, 12, 4, 9, 8, 10, 3}, 1, MKLDNNPlugin::impl_desc_type::unknown }
+            ));
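+
+// In each case above the expected output shape is the dictionary shape with
+// its axis dimension replaced by the (squeezed) index shape, e.g. dict
+// {6,13,10,3}, idx {12,4,9,8}, axis 1 -> {6, 12,4,9,8, 10,3}.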
+
+
+struct gatherTF_test_params {
+    InferenceEngine::SizeVector in_dim;
+    std::vector<int32_t> in;
+
+    InferenceEngine::SizeVector dct_dim;
+    std::vector<float> dct;
+
+    int axis;
+
+    InferenceEngine::SizeVector ref_dim;
+    std::vector<float> ref;
+
+    std::vector<std::function<void(MKLDNNPlugin::PrimitiveDescInfo)>> comp;
+};
+
+class MKLDNNCPUExtGatherTFTests : public TestsCommon, public WithParamInterface<gatherTF_test_params> {
+    std::string model_t = R"V0G0N(
+<net Name="Gather_net" version="2" precision="FP32" batch="1">
+    <layers>
+        <layer name="InputText" type="Input" precision="I32" id="1">
+            <output>
+                <port id="1">
+                    _IIDX_
+                </port>
+            </output>
+        </layer>
+        <layer name="InputDictionary" type="Input" precision="FP32" id="2">
+            <output>
+                <port id="2">
+                    _IDICT_
+                </port>
+            </output>
+        </layer>
+        <layer name="gather" id="3" type="Gather" precision="FP32">
+            <data axis="_AX_"/>
+            <input>
+                <port id="1">
+                    _IDICT_
+                </port>
+                <port id="2">
+                    _IIDX_
+                </port>
+            </input>
+            <output>
+                <port id="3">
+                    _OUT_
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="1" from-port="1" to-layer="3" to-port="2"/>
+        <edge from-layer="2" from-port="2" to-layer="3" to-port="1"/>
+    </edges>
+</net>
+)V0G0N";
+
+    std::string getModel(gatherTF_test_params p) {
+        std::string model = model_t;
+        std::string inIdx;
+        std::string inDict;
+        std::string out;
+
+        for (auto& idx : p.in_dim) {
+            inIdx += "<dim>";
+            inIdx += std::to_string(idx) + "</dim>\n";
+        }
+
+        for (auto& dct : p.dct_dim) {
+            inDict += "<dim>";
+            inDict += std::to_string(dct) + "</dim>\n";
+        }
+
+        for (auto& dst : p.ref_dim) {
+            out += "<dim>";
+            out += std::to_string(dst) + "</dim>\n";
+        }
+
+        REPLACE_WITH_STR(model, "_IIDX_", inIdx);
+        REPLACE_WITH_STR(model, "_IDICT_", inDict);
+        REPLACE_WITH_NUM(model, "_AX_", p.axis);
+        REPLACE_WITH_STR(model, "_OUT_", out);
+
+        return model;
+    }
+
+protected:
+    virtual void TearDown() {
+    }
+
+    virtual void SetUp() {
+        try {
+            TestsCommon::SetUp();
+            gatherTF_test_params p = ::testing::WithParamInterface<gatherTF_test_params>::GetParam();
+            std::string model = getModel(p);
+
+            InferenceEngine::CNNNetReader net_reader;
+            ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
+
+            InferenceEngine::Extension cpuExt(make_so_name("cpu_extension"));
+            MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager());
+            extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){}));
+
+            MKLDNNGraphTestClass graph;
+            graph.CreateGraph(net_reader.getNetwork(), extMgr);
+
+            // Input Indexes
+            InferenceEngine::Blob::Ptr srcIdx;
+            srcIdx = InferenceEngine::make_shared_blob<int32_t>({ InferenceEngine::Precision::I32, p.in_dim, InferenceEngine::TensorDesc::getLayoutByDims(p.in_dim) });
+            srcIdx->allocate();
+            memcpy(static_cast<int32_t*>(srcIdx->buffer()), &p.in[0], sizeof(int32_t)*p.in.size());
+            auto * srcIdxPtr = dynamic_cast<InferenceEngine::TBlob<int32_t>*>(srcIdx.get());
+            if (srcIdxPtr == nullptr)
+                FAIL() << "Cannot cast blob to TBlob<int32_t>.";
+
+            //  Input Dictionary
+            InferenceEngine::Blob::Ptr srcDict = InferenceEngine::make_shared_blob<float>({ InferenceEngine::Precision::FP32, p.dct_dim, InferenceEngine::TensorDesc::getLayoutByDims(p.dct_dim) });
+            srcDict->allocate();
+            memcpy(srcDict->buffer(), &p.dct[0], sizeof(float)*p.dct.size());
+            auto * srcDictPtr = dynamic_cast<InferenceEngine::TBlob<float>*>(srcDict.get());
+            if (srcDictPtr == nullptr)
+                FAIL() << "Cannot cast blob to TBlob<float>.";
+
+            //  Output Data
+            InferenceEngine::OutputsDataMap out;
+            out = net_reader.getNetwork().getOutputsInfo();
+            InferenceEngine::BlobMap outputBlobs;
+            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
+            InferenceEngine::TBlob<float>::Ptr output;
+            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
+            output->allocate();
+            outputBlobs[item.first] = output;
+
+            //  Infer
+            InferenceEngine::BlobMap srcs;
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("InputDictionary", srcDict));
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("InputText", srcIdx));
+            graph.Infer(srcs, outputBlobs);
+
+            //  Check results
+            if (memcmp((*output).data(), &p.ref[0], p.ref.size() * sizeof(float)) != 0)
+                FAIL() << "Output does not match the TF reference!";
+        } catch (const InferenceEngine::details::InferenceEngineException &e) {
+            FAIL() << e.what();
+        }
+    }
+};
+
+TEST_P(MKLDNNCPUExtGatherTFTests, TestsGather) {}
+
+//  Test data vectors
+std::vector<int32_t> in0 = { 0, 1, 1, 0 };
+std::vector<int32_t> in1 = { 0, 1, 2, 1 };
+std::vector<float> dict = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f };
+std::vector<float> ref_in0_a0_d223 = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }; // 2x2x2x3
+std::vector<float> ref_in0_a2_d232 = { 1.f, 2.f, 2.f, 1.f, 3.f, 4.f, 4.f, 3.f, 5.f, 6.f, 6.f, 5.f, 7.f, 8.f, 8.f, 7.f, 9.f, 10.f, 10.f, 9.f, 11.f, 12.f, 12.f, 11.f }; // 2x3x2x2
+std::vector<float> ref_in1_a0_d322 = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 5.f, 6.f, 7.f, 8.f }; // 2x2x2x2
+std::vector<float> ref_in1_a1_d232 = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 3.f, 4.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 9.f, 10.f }; // 2x2x2x2
+std::vector<float> ref_in1_a2_d223 = { 1.f, 2.f, 3.f, 2.f, 4.f, 5.f, 6.f, 5.f, 7.f, 8.f, 9.f, 8.f, 10.f, 11.f, 12.f, 11.f }; // 2x2x2x2
+
+INSTANTIATE_TEST_CASE_P(
+        TestsGather, MKLDNNCPUExtGatherTFTests,
+        ::testing::Values(
+        gatherTF_test_params{ { 2, 2 }, in0,{ 2, 2, 3 }, dict, 0, { 2, 2, 2, 3 }, ref_in0_a0_d223 },
+        gatherTF_test_params{ { 2, 2 }, in0,{ 2, 2, 3 }, dict,-3, { 2, 2, 2, 3 }, ref_in0_a0_d223 },
+        gatherTF_test_params{ { 2, 2 }, in0,{ 2, 3, 2 }, dict, 2, { 2, 3, 2, 2 }, ref_in0_a2_d232 },
+        gatherTF_test_params{ { 2, 2 }, in0,{ 2, 3, 2 }, dict,-1, { 2, 3, 2, 2 }, ref_in0_a2_d232 },
+        gatherTF_test_params{ { 2, 2 }, in1,{ 3, 2, 2 }, dict, 0, { 2, 2, 2, 2 }, ref_in1_a0_d322 },
+        gatherTF_test_params{ { 2, 2 }, in1,{ 3, 2, 2 }, dict,-3, { 2, 2, 2, 2 }, ref_in1_a0_d322 },
+        gatherTF_test_params{ { 2, 2 }, in1,{ 2, 3, 2 }, dict, 1, { 2, 2, 2, 2 }, ref_in1_a1_d232 },
+        gatherTF_test_params{ { 2, 2 }, in1,{ 2, 3, 2 }, dict,-2, { 2, 2, 2, 2 }, ref_in1_a1_d232 },
+        gatherTF_test_params{ { 2, 2 }, in1,{ 2, 2, 3 }, dict, 2, { 2, 2, 2, 2 }, ref_in1_a2_d223 },
+        gatherTF_test_params{ { 2, 2 }, in1,{ 2, 2, 3 }, dict,-1, { 2, 2, 2, 2 }, ref_in1_a2_d223 }));
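+
+// The rows are paired so that each negative axis is checked against its
+// positive twin: for the 3-D dictionaries -3 maps to 0, -2 to 1 and -1 to 2,
+// and both rows of a pair must reproduce the same reference output.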
+
+
+class MKLDNNCPUExtGatherHolesTests : public TestsCommon, public WithParamInterface<gatherTF_test_params> {
+    std::string model_t = R"V0G0N(
+<net Name="Gather_net" version="2" precision="FP32" batch="1">
+    <layers>
+        <layer name="InputText" type="Input" precision="I32" id="1">
+            <output>
+                <port id="1">
+                    <dim>2</dim>
+                    <dim>2</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="InputDictionary" type="Input" precision="FP32" id="2">
+            <output>
+                <port id="2">
+                    <dim>3</dim>
+                    <dim>2</dim>
+                    <dim>2</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="Input3" type="Input" precision="FP32" id="3">
+            <output>
+                <port id="3">
+                    <dim>2</dim>
+                    <dim>5</dim>
+                    <dim>2</dim>
+                    <dim>2</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="gather" id="4" type="Gather" precision="FP32">
+            <data axis="0"/>
+            <input>
+                <port id="1">
+                    <dim>3</dim>
+                    <dim>2</dim>
+                    <dim>2</dim>
+                </port>
+                <port id="2">
+                    <dim>2</dim>
+                    <dim>2</dim>
+                </port>
+            </input>
+            <output>
+                <port id="3">
+                    <dim>2</dim>
+                    <dim>2</dim>
+                    <dim>2</dim>
+                    <dim>2</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="con" id="5" type="Concat" precision="FP32">
+            <concat_data axis="1"/>
+            <input>
+                <port id="1">
+                    <dim>2</dim>
+                    <dim>2</dim>
+                    <dim>2</dim>
+                    <dim>2</dim>
+                </port>
+                <port id="2">
+                    <dim>2</dim>
+                    <dim>5</dim>
+                    <dim>2</dim>
+                    <dim>2</dim>
+                </port>
+            </input>
+            <output>
+                <port id="3">
+                    <dim>2</dim>
+                    <dim>7</dim>
+                    <dim>2</dim>
+                    <dim>2</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="1" from-port="1" to-layer="4" to-port="2"/>
+        <edge from-layer="2" from-port="2" to-layer="4" to-port="1"/>
+        <edge from-layer="4" from-port="3" to-layer="5" to-port="1"/>
+        <edge from-layer="3" from-port="3" to-layer="5" to-port="2"/>
+    </edges>
+</net>
+)V0G0N";
+
+    std::string getModel(gatherTF_test_params p) {
+        std::string model = model_t;
+        std::string inIdx;
+        std::string inDict;
+        std::string out;
+
+        for (auto& idx : p.in_dim) {
+            inIdx += "<dim>";
+            inIdx += std::to_string(idx) + "</dim>\n";
+        }
+
+        for (auto& dct : p.dct_dim) {
+            inDict += "<dim>";
+            inDict += std::to_string(dct) + "</dim>\n";
+        }
+
+        for (auto& dst : p.ref_dim) {
+            out += "<dim>";
+            out += std::to_string(dst) + "</dim>\n";
+        }
+
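+        // Note: this fixture's template hardcodes every dimension, so the
+        // replacements below appear to be no-ops kept for symmetry with the
+        // other gather fixtures (model_t above contains no _OUTC_/_IDICT_/
+        // _AX_/_OUT_ placeholders).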
+        REPLACE_WITH_STR(model, "_OUTC_", inIdx);
+        REPLACE_WITH_STR(model, "_IDICT_", inDict);
+        REPLACE_WITH_NUM(model, "_AX_", p.axis);
+        REPLACE_WITH_STR(model, "_OUT_", out);
+
+        return model;
+    }
+
+protected:
+    virtual void TearDown() {
+    }
+
+    virtual void SetUp() {
+        try {
+            TestsCommon::SetUp();
+            gatherTF_test_params p = ::testing::WithParamInterface<gatherTF_test_params>::GetParam();
+            std::string model = getModel(p);
+
+            InferenceEngine::CNNNetReader net_reader;
+            ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
+
+            InferenceEngine::Extension cpuExt(make_so_name("cpu_extension"));
+            MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager());
+            extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){}));
+
+            MKLDNNGraphTestClass graph;
+            graph.CreateGraph(net_reader.getNetwork(), extMgr);
+
+            // Input Indexes
+            InferenceEngine::Blob::Ptr srcIdx;
+            int32_t in_size = 4;
+            InferenceEngine::SizeVector in_dim = {2, 2};
+            srcIdx = InferenceEngine::make_shared_blob<int32_t>({ InferenceEngine::Precision::I32, in_dim, InferenceEngine::TensorDesc::getLayoutByDims(in_dim) });
+            srcIdx->allocate();
+            memcpy(static_cast<int32_t*>(srcIdx->buffer()), &p.in[0], sizeof(int32_t)*in_size);
+            auto * srcIdxPtr = dynamic_cast<InferenceEngine::TBlob<int32_t>*>(srcIdx.get());
+            if (srcIdxPtr == nullptr)
+                FAIL() << "Cannot cast blob to TBlob<int32_t>.";
+
+            //  Input Dictionary
+            InferenceEngine::Blob::Ptr srcDict = InferenceEngine::make_shared_blob<float>({ InferenceEngine::Precision::FP32, p.dct_dim, InferenceEngine::TensorDesc::getLayoutByDims(p.dct_dim) });
+            srcDict->allocate();
+            memcpy(srcDict->buffer(), &p.dct[0], sizeof(float)*p.dct.size());
+            auto * srcDictPtr = dynamic_cast<InferenceEngine::TBlob<float>*>(srcDict.get());
+            if (srcDictPtr == nullptr)
+                FAIL() << "Cannot cast blob to TBlob<float>.";
+
+            //  Output Data
+            InferenceEngine::OutputsDataMap out;
+            out = net_reader.getNetwork().getOutputsInfo();
+            InferenceEngine::BlobMap outputBlobs;
+            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
+            InferenceEngine::TBlob<float>::Ptr output;
+            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
+            output->allocate();
+            outputBlobs[item.first] = output;
+
+            //  Infer
+            InferenceEngine::BlobMap srcs;
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("InputDictionary", srcDict));
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("InputText", srcIdx));
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("Input3", srcIdx));
+            graph.Infer(srcs, outputBlobs);
+
+            //  Check results
+            if (memcmp((*output).data(), &p.ref[0], p.ref.size() * sizeof(float)) != 0)
+                FAIL() << "Wrong result compared to the TF reference!";
+        }
+        catch (const InferenceEngine::details::InferenceEngineException &e) {
+            FAIL() << e.what();
+        }
+    }
+};
+
+TEST_P(MKLDNNCPUExtGatherHolesTests, TestsGather) {}
+
+INSTANTIATE_TEST_CASE_P(
+    TestsGather, MKLDNNCPUExtGatherHolesTests,
+    ::testing::Values(
+        gatherTF_test_params{ { 1, 5, 2, 2 }, in1, { 1, 3, 2, 2 }, dict, 1, { 2, 2, 2, 2 }, ref_in1_a0_d322 }));
+
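Reviewer note: across this patch the statically linked InferenceEngine::Extensions::Cpu::CpuExtensions is replaced by the cpu_extension shared library, loaded via InferenceEngine::Extension and make_so_name. Since cpuExt is a stack object, it is handed to the extension manager as an IExtensionPtr with a no-op deleter so the shared_ptr never frees it. A minimal sketch of that non-owning shared_ptr idiom, independent of the Inference Engine types:

    #include <memory>

    struct Resource { int value = 42; };

    int main() {
        Resource stackObj;  // lifetime owned by the enclosing scope
        // Non-owning view: the empty deleter suppresses delete on scope exit.
        std::shared_ptr<Resource> view(&stackObj, [](Resource*) {});
        return view->value == 42 ? 0 : 1;
    }

The caller must guarantee the pointee outlives every copy of the pointer; in these tests cpuExt outlives extMgr for the duration of SetUp().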
index d4f1bf7..49e62bc 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -1361,9 +1360,10 @@ TEST_F(MKLDNNGraphGenericTests, ExecuteNotInLineGRN) {
         <edge from-layer="2" from-port="4" to-layer="3" to-port="6"/>
     </edges>
 </net>)V0G0N";
-    std::shared_ptr<InferenceEngine::IExtension> cpuExt(new InferenceEngine::Extensions::Cpu::CpuExtensions());
+    InferenceEngine::Extension cpuExt(make_so_name("cpu_extension"));
     MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager());
-    extMgr->AddExtension(cpuExt);
+
+    extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){}));
 
     InferenceEngine::CNNNetReader net_reader;
     ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
@@ -1503,9 +1503,10 @@ TEST_F(MKLDNNGraphGenericTests, ExecuteInLineGRN) {
         <edge from-layer="3" from-port="4" to-layer="4" to-port="6"/>
     </edges>
 </net>)V0G0N";
-    std::shared_ptr<InferenceEngine::IExtension> cpuExt(new InferenceEngine::Extensions::Cpu::CpuExtensions());
+
+    InferenceEngine::Extension cpuExt(make_so_name("cpu_extension"));
     MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager());
-    extMgr->AddExtension(cpuExt);
+    extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){}));
 
     InferenceEngine::CNNNetReader net_reader;
     ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
index a37f135..6bc9b75 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -11,7 +10,6 @@
 
 #include "single_layer_common.hpp"
 #include <mkldnn_plugin/mkldnn_extension_utils.h>
-#include <extension/ext_list.hpp>
 #include "tests_common.hpp"
 
 
@@ -192,9 +190,9 @@ protected:
             InferenceEngine::CNNNetReader net_reader;
             ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
 
-            std::shared_ptr<InferenceEngine::IExtension> cpuExt(new InferenceEngine::Extensions::Cpu::CpuExtensions());
+            InferenceEngine::Extension cpuExt(make_so_name("cpu_extension"));
             MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager());
-            extMgr->AddExtension(cpuExt);
+            extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){}));
 
             MKLDNNGraphTestClass graph;
             graph.CreateGraph(net_reader.getNetwork(), extMgr);
index ec2a232..84511a1 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <mkldnn_plugin/mkldnn_extension_utils.h>
 #include <extension/ext_list.hpp>
 #include "tests_common.hpp"
+#include "ir_gen_helper.hpp"
 
-
+using namespace InferenceEngine;
 using namespace ::testing;
 using namespace std;
 using namespace mkldnn;
+using namespace single_layer_tests;
 
 
 struct mvn_test_params {
-    struct {
-        size_t n;
-        size_t c;
-        size_t h;
-        size_t w;
-    } in;
+    // Formats: NCHW, NCDHW
+    vector<size_t> dims;
 
     int across_channels;
     int normalize_variance;
@@ -36,53 +33,84 @@ struct mvn_test_params {
     bool isBlockedFormat;
     int selectedType;
 
-    std::vector<std::function<void(MKLDNNPlugin::PrimitiveDescInfo)>> comp;
+    vector<std::function<void(MKLDNNPlugin::PrimitiveDescInfo)>> comp;
 };
 
+extern InferenceEngine::IExtensionPtr make_FakeExtensions();
+
 template <typename data_t>
-void ref_mvn(const InferenceEngine::TBlob<data_t> &src, InferenceEngine::TBlob<data_t> &dst, mvn_test_params prm) {
+void ref_mvn(const TBlob<data_t> &src, TBlob<data_t> &dst, mvn_test_params prm) {
     const data_t *src_data = src.readOnly();
     data_t *dst_data = dst.data();
+    size_t dims_size = prm.dims.size();
 
-    size_t N = prm.in.n;
-    size_t C = prm.in.c;
-    size_t H = prm.in.h;
-    size_t W = prm.in.w;
+    size_t N = prm.dims[0];
+    size_t C = prm.dims[1];
+    size_t D = dims_size > 4 ? prm.dims[dims_size - 3lu] : 1lu;
+    size_t H = dims_size > 3 ? prm.dims[dims_size - 2lu] : 1lu;
+    size_t W = prm.dims[dims_size - 1lu];
 
     float eps = prm.eps;
 
-    for (int b = 0; b < N; b++) {
+    size_t C1 = H * W;
+    size_t C2 = C1 * D;
+    size_t C3 = C2 * C;
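+    // C1/C2/C3: element counts of one HW plane, one DHW volume and one CDHW sample in the flat NCDHW buffer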
+
+    for (size_t b = 0lu; b < N; b++) {
+        size_t cb = b * C3;
         // Calculate mean value
         if (prm.across_channels) {
-            double mean = 0;
-            for (int c = 0; c < C; c++) {
-                for (int h = 0; h < H; h++) {
-                    for (int w = 0; w < W; w++) {
-                        mean += src_data[b*C*H*W + c*H*W + h*W + w];
+            double mean = 0.0;
+            for (size_t c = 0lu; c < C; c++) {
+                size_t cc = cb + c * C2;
+                for (size_t d = 0lu; d < D; d++) {
+                    size_t cd = cc + d * C1;
+                    for (size_t h = 0lu; h < H; h++) {
+                        size_t ch = cd + h * W;
+                        for (size_t w = 0lu; w < W; w++) {
+                            mean += src_data[ch + w];
+                        }
                     }
                 }
             }
-            mean /= C*H*W;
-            for (int c = 0; c < C; c++) {
-                for (int h = 0; h < H; h++) {
-                    for (int w = 0; w < W; w++) {
-                        dst_data[b*C*H*W + c*H*W + h*W + w] = src_data[b*C*H*W + c*H*W + h*W + w] - mean;
+            mean /= (double)C3;
+            for (size_t c = 0lu; c < C; c++) {
+                size_t cc = cb + c * C2;
+                for (size_t d = 0lu; d < D; d++) {
+                    size_t cd = cc + d * C1;
+                    for (size_t h = 0lu; h < H; h++) {
+                        size_t ch = cd + h * W;
+                        for (size_t w = 0lu; w < W; w++) {
+                            size_t index = ch + w;
+                            dst_data[index] = src_data[index] - mean;
+                        }
                     }
                 }
             }
         } else {
-            for (int c = 0; c < C; c++) {
-                double mean = 0;
-                for (int h = 0; h < H; h++) {
-                    for (int w = 0; w < W; w++) {
-                        mean += src_data[b*C*H*W + c*H*W + h*W + w];
+            for (size_t c = 0lu; c < C; c++) {
+                size_t cc = cb + c * C2;
+                double mean = 0.0;
+                for (size_t d = 0lu; d < D; d++) {
+                    size_t cd = cc + d * C1;
+                    for (size_t h = 0lu; h < H; h++) {
+                        size_t ch = cd + h * W;
+                        for (size_t w = 0lu; w < W; w++) {
+                            mean += src_data[ch + w];
+                        }
                     }
                 }
-                mean /= H*W;
 
-                for (int h = 0; h < H; h++) {
-                    for (int w = 0; w < W; w++) {
-                        dst_data[b*C*H*W + c*H*W + h*W + w] = src_data[b*C*H*W + c*H*W + h*W + w] - mean;
+                mean /= (double)C2;
+
+                for (size_t d = 0lu; d < D; d++) {
+                    size_t cd = cc + d * C1;
+                    for (size_t h = 0lu; h < H; h++) {
+                        size_t ch = cd + h * W;
+                        for (size_t w = 0lu; w < W; w++) {
+                            size_t index = ch + w;
+                            dst_data[index] = src_data[index] - mean;
+                        }
                     }
                 }
             }
@@ -90,41 +118,61 @@ void ref_mvn(const InferenceEngine::TBlob<data_t> &src, InferenceEngine::TBlob<d
     }
 
     if (prm.normalize_variance) {
-        for (int b = 0; b < N; b++) {
+        for (size_t b = 0; b < N; b++) {
+            size_t cb = b * C3;
             // Calculate variances value
             if (prm.across_channels) {
-                double variance = 0;
-                for (int c = 0; c < C; c++) {
-                    for (int h = 0; h < H; h++) {
-                        for (int w = 0; w < W; w++) {
-                            variance += std::pow(dst_data[b*C*H*W + c*H*W + h*W + w], 2);
+                double variance = 0.0;
+                for (size_t c = 0lu; c < C; c++) {
+                    size_t cc = cb + c * C2;
+                    for (size_t d = 0lu; d < D; d++) {
+                        size_t cd = cc + d * C1;
+                        for (size_t h = 0lu; h < H; h++) {
+                            size_t ch = cd + h * W;
+                            for (size_t w = 0lu; w < W; w++) {
+                                variance += std::pow(dst_data[ch + w], 2);
+                            }
                         }
                     }
                 }
-                variance /= C*H*W;
-                variance = std::pow(variance, 0.5f);
+                variance /= C3;
                 variance += eps;
-                for (int c = 0; c < C; c++) {
-                    for (int h = 0; h < H; h++) {
-                        for (int w = 0; w < W; w++) {
-                            dst_data[b*C*H*W + c*H*W + h*W + w] /= variance;
+                variance = std::pow(variance, 0.5f);
+                for (size_t c = 0lu; c < C; c++) {
+                    size_t cc = cb + c * C2;
+                    for (size_t d = 0lu; d < D; d++) {
+                        size_t cd = cc + d * C1;
+                        for (size_t h = 0lu; h < H; h++) {
+                            size_t ch = cd + h * W;
+                            for (size_t w = 0lu; w < W; w++) {
+                                dst_data[ch + w] /= variance;
+                            }
                         }
                     }
                 }
             } else {
-                for (int c = 0; c < C; c++) {
-                    double variance = 0;
-                    for (int h = 0; h < H; h++) {
-                        for (int w = 0; w < W; w++) {
-                            variance += std::pow(dst_data[b*C*H*W + c*H*W + h*W + w], 2);
+                for (size_t c = 0lu; c < C; c++) {
+                    size_t cc = cb + c * C2;
+                    double variance = 0.0;
+                    for (size_t d = 0lu; d < D; d++) {
+                        size_t cd = cc + d * C1;
+                        for (size_t h = 0lu; h < H; h++) {
+                            size_t ch = cd + h * W;
+                            for (size_t w = 0lu; w < W; w++) {
+                                variance += std::pow(dst_data[ch + w], 2);
+                            }
                         }
                     }
-                    variance /= H*W;
-                    variance = std::pow(variance, 0.5f);
+                    variance /= C2;
                     variance += eps;
-                    for (int h = 0; h < H; h++) {
-                        for (int w = 0; w < W; w++) {
-                            dst_data[b*C*H*W + c*H*W + h*W + w] /= variance;
+                    variance = std::pow(variance, 0.5f);
+                    for (size_t d = 0lu; d < D; d++) {
+                        size_t cd = cc + d * C1;
+                        for (size_t h = 0lu; h < H; h++) {
+                            size_t ch = cd + h * W;
+                            for (size_t w = 0lu; w < W; w++) {
+                                dst_data[ch + w] /= variance;
+                            }
                         }
                     }
                 }
@@ -134,34 +182,16 @@ void ref_mvn(const InferenceEngine::TBlob<data_t> &src, InferenceEngine::TBlob<d
 }
 
 class MKLDNNCPUExtMVNTests: public TestsCommon, public WithParamInterface<mvn_test_params> {
-    std::string model_t = R"V0G0N(
-<Net Name="MVN_net" version="2" precision="FP32" batch="1">
-    <layers>
-        <layer name="in1" type="Input" precision="FP32" id="0">
-            <output>
-                <port id="0">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
-                </port>
-            </output>
-        </layer>
+    std::string layers_t = R"V0G0N(
         <layer name="fakeLayer" id="1" type="_FL_" precision="FP32">
             <input>
                 <port id="1">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                    __SRC_DIMS__
                 </port>
             </input>
             <output>
                 <port id="2">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                    __SRC_DIMS__
                 </port>
             </output>
         </layer>
@@ -169,45 +199,42 @@ class MKLDNNCPUExtMVNTests: public TestsCommon, public WithParamInterface<mvn_te
             <data across_channels="_AC_" normalize_variance="_NV_" eps="_EPS_"/>
             <input>
                 <port id="3">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                    __SRC_DIMS__
                 </port>
             </input>
             <output>
                 <port id="4">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                    __SRC_DIMS__
                 </port>
             </output>
         </layer>
-    </layers>
-    <edges>
+)V0G0N";
+
+    std::string edges_t = R"V0G0N(
         <edge from-layer="0" from-port="0" to-layer="1" to-port="1"/>
         <edge from-layer="1" from-port="2" to-layer="2" to-port="3"/>
-    </edges>
-</Net>
 )V0G0N";
 
     std::string getModel(mvn_test_params p) {
-        std::string model = model_t;
+        std::string model = layers_t;
         if (p.isBlockedFormat)
             REPLACE_WITH_STR(model, "_FL_", "FakeLayerBLK");
         else
             REPLACE_WITH_STR(model, "_FL_", "FakeLayerPLN");
 
-        REPLACE_WITH_NUM(model, "_IW_", p.in.w);
-        REPLACE_WITH_NUM(model, "_IH_", p.in.h);
-        REPLACE_WITH_NUM(model, "_IC_", p.in.c);
-        REPLACE_WITH_NUM(model, "_IN_", p.in.n);
+        std::string s_dims;
+        for (auto& dim : p.dims) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__SRC_DIMS__", s_dims);
 
         REPLACE_WITH_NUM(model, "_AC_", p.across_channels);
         REPLACE_WITH_NUM(model, "_NV_", p.normalize_variance);
         REPLACE_WITH_NUM(model, "_EPS_", p.eps);
 
+        model = IRTemplateGenerator::getIRTemplate("MVN_Only", p.dims, "FP32", model, edges_t);
+
         return model;
     }
 
@@ -221,12 +248,14 @@ protected:
             mvn_test_params p = ::testing::WithParamInterface<mvn_test_params>::GetParam();
             std::string model = getModel(p);
 
-            InferenceEngine::CNNNetReader net_reader;
+            CNNNetReader net_reader;
             ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
 
-            std::shared_ptr<InferenceEngine::IExtension> cpuExt(new InferenceEngine::Extensions::Cpu::CpuExtensions());
+            InferenceEngine::Extension cpuExt(make_so_name("cpu_extension"));
             MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager());
-            extMgr->AddExtension(cpuExt);
+            extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){}));
+            extMgr->AddExtension(make_FakeExtensions());
 
             MKLDNNGraphTestClass graph;
             graph.CreateGraph(net_reader.getNetwork(), extMgr);
@@ -250,38 +279,48 @@ protected:
             else
                ASSERT_EQ(5, nodes.size()); // TODO: should be 4 (redundant reorder when both layers are in-place)
 
-            InferenceEngine::SizeVector dims_src = {p.in.w, p.in.h, p.in.c, p.in.n};
+            SizeVector dims_src = p.dims;
+
+            Layout layout = ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = NCHW;
+                    break;
+                case 5:
+                    layout = NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NHWC, dims_src);
+            Blob::Ptr src = make_shared_blob<float, const SizeVector>(Precision::FP32, layout, dims_src);
             src->allocate();
             fill_data(src->buffer(), src->size());
 
-            auto * srcPtr = dynamic_cast<InferenceEngine::TBlob<float>*>(src.get());
+            auto * srcPtr = dynamic_cast<TBlob<float>*>(src.get());
 
             if (srcPtr == nullptr)
                 FAIL() << "Cannot cast blob to TBlob<float>.";
 
-            InferenceEngine::BlobMap srcs;
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src));
+            BlobMap srcs;
+            srcs.insert(std::pair<std::string, Blob::Ptr>("in1", src));
 
-            InferenceEngine::OutputsDataMap out;
+            OutputsDataMap out;
             out = net_reader.getNetwork().getOutputsInfo();
-            InferenceEngine::BlobMap outputBlobs;
+            BlobMap outputBlobs;
 
-            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
+            std::pair<std::string, DataPtr> item = *out.begin();
 
-            InferenceEngine::TBlob<float>::Ptr output;
-            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
+            TBlob<float>::Ptr output;
+            output = make_shared_blob<float>(item.second->getTensorDesc());
             output->allocate();
             outputBlobs[item.first] = output;
 
             graph.Infer(srcs, outputBlobs);
 
-            InferenceEngine::TBlob<float> dst_ref(item.second->getTensorDesc());
+            TBlob<float> dst_ref(item.second->getTensorDesc());
             dst_ref.allocate();
             ref_mvn(*srcPtr, dst_ref, p);
-            compare(*output, dst_ref);
-        } catch (const InferenceEngine::details::InferenceEngineException &e) {
+            compare(*output, dst_ref, 0.0001f);
+        } catch (const details::InferenceEngineException &e) {
             FAIL() << e.what();
         }
     }
@@ -292,7 +331,7 @@ TEST_P(MKLDNNCPUExtMVNTests, TestsMVN) {}
 INSTANTIATE_TEST_CASE_P(
         TestsMVN, MKLDNNCPUExtMVNTests,
         ::testing::Values(
-                mvn_test_params{{2, 64, 15, 15}, 0, 0, 0.00001, 2, false, MKLDNNPlugin::impl_desc_type::unknown },
+        /*0*/   mvn_test_params{{2, 64, 15, 15}, 0, 0, 0.00001, 2, false, MKLDNNPlugin::impl_desc_type::unknown },
                 mvn_test_params{{2,  2, 33, 65}, 0, 0, 0.00001, 2, false, MKLDNNPlugin::impl_desc_type::unknown },
                 mvn_test_params{{2, 64, 15, 15}, 0, 1, 0.00001, 2, false, MKLDNNPlugin::impl_desc_type::unknown },
                 mvn_test_params{{2,  2, 33, 65}, 0, 1, 0.00001, 2, false, MKLDNNPlugin::impl_desc_type::unknown },
@@ -301,10 +340,22 @@ INSTANTIATE_TEST_CASE_P(
                 mvn_test_params{{2, 64, 15, 15}, 1, 1, 0.00001, 2, false, MKLDNNPlugin::impl_desc_type::unknown },
                 mvn_test_params{{2,  2, 33, 65}, 1, 1, 0.00001, 2, false, MKLDNNPlugin::impl_desc_type::unknown },
                 mvn_test_params{{2, 64, 15, 15}, 0, 0, 0.00001, 2, true, MKLDNNPlugin::impl_desc_type::unknown },
-                mvn_test_params{{2,  2, 33, 65}, 0, 0, 0.00001, 2, true, MKLDNNPlugin::impl_desc_type::unknown },
+        /*9*/   mvn_test_params{{2,  2, 33, 65}, 0, 0, 0.00001, 2, true, MKLDNNPlugin::impl_desc_type::unknown },
                 mvn_test_params{{2, 64, 15, 15}, 0, 1, 0.00001, 2, true, MKLDNNPlugin::impl_desc_type::unknown },
                 mvn_test_params{{2,  2, 33, 65}, 0, 1, 0.00001, 2, true, MKLDNNPlugin::impl_desc_type::unknown },
                 mvn_test_params{{2, 64, 15, 15}, 1, 0, 0.00001, 2, true, MKLDNNPlugin::impl_desc_type::unknown },
                 mvn_test_params{{2,  2, 33, 65}, 1, 0, 0.00001, 2, true, MKLDNNPlugin::impl_desc_type::unknown },
-                mvn_test_params{{2,640, 15, 15}, 1, 1, 0.00001, 2, true, MKLDNNPlugin::impl_desc_type::unknown },
-                mvn_test_params{{2,  2, 33, 65}, 1, 1, 0.00001, 2, true, MKLDNNPlugin::impl_desc_type::unknown }));
+        /*14*/  mvn_test_params{{2,640, 15, 15}, 1, 1, 0.00001, 2, true, MKLDNNPlugin::impl_desc_type::unknown },
+                mvn_test_params{{2,  2, 33, 65}, 1, 1, 0.00001, 2, true, MKLDNNPlugin::impl_desc_type::unknown },
+
+                // 5D
+        /*16*/  mvn_test_params{{2, 64, 24, 32, 40}, 0, 0, 0.00001f, 2, false, MKLDNNPlugin::impl_desc_type::unknown },
+                mvn_test_params{{2, 64, 24, 32, 40}, 0, 1, 0.00001f, 2, false, MKLDNNPlugin::impl_desc_type::unknown },
+                mvn_test_params{{2, 64, 24, 32, 40}, 1, 0, 0.00001f, 2, false, MKLDNNPlugin::impl_desc_type::unknown },
+                mvn_test_params{{2, 64, 24, 32, 40}, 1, 1, 0.00001f, 2, false, MKLDNNPlugin::impl_desc_type::unknown },
+                mvn_test_params{{2, 64, 24, 32, 40}, 0, 0, 0.00001f, 2, true, MKLDNNPlugin::impl_desc_type::unknown },
+                mvn_test_params{{2, 64, 24, 32, 40}, 0, 1, 0.00001f, 2, true, MKLDNNPlugin::impl_desc_type::unknown },
+                mvn_test_params{{2, 64, 24, 32, 40}, 1, 0, 0.00001f, 2, true, MKLDNNPlugin::impl_desc_type::unknown },
+        /*23*/  mvn_test_params{{2, 64, 24, 32, 40}, 1, 1, 0.00001f, 2, true, MKLDNNPlugin::impl_desc_type::unknown },
+                mvn_test_params{{1, 64, 32, 32, 32}, 0, 1, 0.001f, 2, true, MKLDNNPlugin::impl_desc_type::unknown }
+            ));
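For reference, the updated ref_mvn computes the following (note the patch also moves eps inside the square root: variance += eps now precedes pow(variance, 0.5f)):

    y_{n,c,d,h,w} = \frac{x_{n,c,d,h,w} - \mu}{\sqrt{\sigma^2 + \varepsilon}}, \qquad
    \mu = \frac{1}{|S|} \sum_{s \in S} x_s, \qquad
    \sigma^2 = \frac{1}{|S|} \sum_{s \in S} (x_s - \mu)^2

where S spans C x D x H x W when across_channels is set and D x H x W per channel otherwise; normalize_variance controls whether the division is applied at all.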
index 71b86cc..f3e4bad 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -11,7 +10,6 @@
 
 #include "single_layer_common.hpp"
 #include <mkldnn_plugin/mkldnn_extension_utils.h>
-#include <extension/ext_list.hpp>
 #include "tests_common.hpp"
 
 using namespace ::testing;
@@ -42,6 +40,8 @@ static inline float triangleCoeff(float x) {
     return std::max(0.0f, 1 - std::abs(x));
 }
 
+extern InferenceEngine::IExtensionPtr make_FakeExtensions();
+
 template <typename data_t>
 void ref_resample(const InferenceEngine::TBlob<data_t> &src, InferenceEngine::TBlob<data_t> &dst, resample_test_params prm) {
     const data_t *src_data = src.readOnly();
@@ -222,9 +222,10 @@ protected:
             InferenceEngine::CNNNetReader net_reader;
             ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
 
-            std::shared_ptr<InferenceEngine::IExtension> cpuExt(new InferenceEngine::Extensions::Cpu::CpuExtensions());
+            InferenceEngine::Extension cpuExt(make_so_name("cpu_extension"));
             MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager());
-            extMgr->AddExtension(cpuExt);
+            extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){}));
+            extMgr->AddExtension(make_FakeExtensions());
 
             MKLDNNGraphTestClass graph;
             graph.CreateGraph(net_reader.getNetwork(), extMgr);
index 380ccb3..a0898b5 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -21,12 +20,8 @@ struct activation_test_params {
     float alpha;
     float beta;
 
-    struct {
-        size_t n;
-        size_t c;
-        size_t h;
-        size_t w;
-    } in;
+    // Formats: NCHW, NCDHW
+    vector<size_t> dims;
 
     size_t num_prim_desc;
 
@@ -56,10 +51,17 @@ T bounded_relu_fwd(T s, A alpha) {
     return s > alpha ? (T)(alpha) : s;
 }
 
+template <typename T> T tanh_fwd(T s) {
+    return static_cast<T>(::tanhf((float)s));
+}
+
 template <typename data_t>
 void ref_activation(const InferenceEngine::TBlob<data_t> &src, InferenceEngine::TBlob<data_t> &dst, activation_test_params prm) {
-    size_t IW = src.dims()[3];
-    size_t IH = src.dims()[2];
+    auto dims_size = src.dims().size();
+
+    size_t IW = src.dims()[dims_size - 1];
+    size_t IH = src.dims()[dims_size - 2];
+    size_t ID = dims_size == 5 ? src.dims()[dims_size - 3] : 1u;
     size_t IC = src.dims()[1];
     size_t MB = src.dims()[0];
 
@@ -68,18 +70,23 @@ void ref_activation(const InferenceEngine::TBlob<data_t> &src, InferenceEngine::
 
     for(int mb = 0; mb < MB; mb++) {
         for(int c = 0; c < IC; c++) {
-            for(int h = 0; h < IH; h++) {
-                for(int w = 0; w < IW; w++) {
-                    int idx = mb * IC * IH * IW
-                              + c * IH * IW
-                              + h * IW + w;
-
-                    switch (prm.alg) {
-                        case eltwise_relu:         dst_data[idx] = relu_fwd(src_data[idx], prm.alpha);         break;
-                        case eltwise_elu:          dst_data[idx] = elu_fwd(src_data[idx], prm.alpha);          break;
-                        case eltwise_logistic:     dst_data[idx] = logistic_fwd(src_data[idx]);                break;
-                        case eltwise_bounded_relu: dst_data[idx] = bounded_relu_fwd(src_data[idx], prm.alpha); break;
-                        default: assert(!"unknown alg_kind");
+            for(int d = 0; d < ID; d++) {
+                for(int h = 0; h < IH; h++) {
+                    for(int w = 0; w < IW; w++) {
+                        int idx = mb * IC * ID * IH * IW
+                                  + c * ID * IH * IW
+                                  + d * IH * IW
+                                  + h * IW
+                                  + w;
+
+                        switch (prm.alg) {
+                            case eltwise_relu:         dst_data[idx] = relu_fwd(src_data[idx], prm.alpha);         break;
+                            case eltwise_elu:          dst_data[idx] = elu_fwd(src_data[idx], prm.alpha);          break;
+                            case eltwise_logistic:     dst_data[idx] = logistic_fwd(src_data[idx]);                break;
+                            case eltwise_bounded_relu: dst_data[idx] = bounded_relu_fwd(src_data[idx], prm.alpha); break;
+                            case eltwise_tanh:         dst_data[idx] = tanh_fwd(src_data[idx]); break;
+                            default: assert(!"unknown alg_kind");
+                        }
                     }
                 }
             }
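The expanded 5D offset in the loops above is the standard row-major NCDHW Horner form. A hypothetical helper with the same arithmetic (name and signature are illustrative, not part of the patch):

    // Hypothetical helper: identical arithmetic to the expanded index above,
    // i.e. mb*IC*ID*IH*IW + c*ID*IH*IW + d*IH*IW + h*IW + w.
    static inline int ncdhw_index(int mb, int c, int d, int h, int w,
                                  int IC, int ID, int IH, int IW) {
        return (((mb * IC + c) * ID + d) * IH + h) * IW + w;
    }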
@@ -90,24 +97,26 @@ void ref_activation(const InferenceEngine::TBlob<data_t> &src, InferenceEngine::
 class MKLDNNGraphActivationTests: public TestsCommon,
                                      public WithParamInterface<activation_test_params> {
     std::string model_t = R"V0G0N(
-<Net Name="Activation" version="2" precision="FP32" batch="1">
+<Net Name="Activation" version="3" precision="FP32" batch="1">
     <layers>
         <layer name="in1" type="Input" precision="FP32" id="0">
             <output>
                 <port id="0">
                     <dim>_IN_</dim>
                     <dim>_IC_</dim>
+                    <dim>_ID_</dim>
                     <dim>_IH_</dim>
                     <dim>_IW_</dim>
                 </port>
             </output>
         </layer>
         <layer name="activation" id="1" type="_LT_" precision="FP32">
-            <data _P1_NAME_="_P1_VAL_" _P2_NAME_="_P2_VAL_" PrimitivesPriority="_IMPLS_"/>
+            <data _P1_ _P2_ PrimitivesPriority="_IMPLS_"/>
             <input>
                 <port id="1">
                     <dim>_IN_</dim>
                     <dim>_IC_</dim>
+                    <dim>_ID_</dim>
                     <dim>_IH_</dim>
                     <dim>_IW_</dim>
                 </port>
@@ -116,6 +125,7 @@ class MKLDNNGraphActivationTests: public TestsCommon,
                 <port id="2">
                     <dim>_IN_</dim>
                     <dim>_IC_</dim>
+                    <dim>_ID_</dim>
                     <dim>_IH_</dim>
                     <dim>_IW_</dim>
                 </port>
@@ -134,30 +144,49 @@ protected:
 
     std::string getModel(activation_test_params p) {
         std::string model = model_t;
+        auto dims_size = p.dims.size();
+
+        switch (dims_size) {
+            case 3:
+                REMOVE_LINE(model, "<dim>_IH_</dim>");
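+                // deliberate fall-through: 3D shapes must drop both _IH_ and _ID_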
+            case 4:
+                REMOVE_LINE(model, "<dim>_ID_</dim>");
+        }
 
         switch (p.alg) {
             case eltwise_relu:         REPLACE_WITH_STR(model, "_LT_", "ReLU"); break;
             case eltwise_elu:          REPLACE_WITH_STR(model, "_LT_", "ELU"); break;
             case eltwise_logistic:     REPLACE_WITH_STR(model, "_LT_", "Sigmoid"); break;
             case eltwise_bounded_relu: REPLACE_WITH_STR(model, "_LT_", "ReLU6"); break;
+            case eltwise_tanh:         REPLACE_WITH_STR(model, "_LT_", "Activation"); break;
             default: assert(!"unknown alg_kind");
         }
 
-        if (p.alg == eltwise_relu)
-            REPLACE_WITH_STR(model, "_P1_NAME_", "negative_slope");
-        else if (p.alg == eltwise_bounded_relu)
-            REPLACE_WITH_STR(model, "_P1_NAME_", "n");
-        else
-            REPLACE_WITH_STR(model, "_P1_NAME_", "alpha");
-        REPLACE_WITH_NUM(model, "_P1_VAL_", p.alpha);
-
-        REPLACE_WITH_STR(model, "_P2_NAME_", "beta");
-        REPLACE_WITH_NUM(model, "_P2_VAL_", p.beta);
-
-        REPLACE_WITH_NUM(model, "_IW_", p.in.w);
-        REPLACE_WITH_NUM(model, "_IH_", p.in.h);
-        REPLACE_WITH_NUM(model, "_IC_", p.in.c);
-        REPLACE_WITH_NUM(model, "_IN_", p.in.n);
+        string P1, P2;
+        if (p.alg == eltwise_relu) {
+            P1 = string("negative_slope=\"") + to_string(p.alpha) + string("\"");
+            P2 = string("beta=\"") + to_string(p.beta) + string("\"");
+        } else if (p.alg == eltwise_bounded_relu) {
+            P1 = string("n=\"") + to_string(p.alpha) + string("\"");
+            P2 = string("beta=\"") + to_string(p.beta) + string("\"");
+        } else if (p.alg == eltwise_tanh) {
+            P1 = string("type=\"tanh\"");
+        } else {
+            P1 = string("alpha=\"") + to_string(p.alpha) + string("\"");
+            P2 = string("beta=\"") + to_string(p.beta) + string("\"");
+        }
+        REPLACE_WITH_STR(model, "_P1_", P1);
+        REPLACE_WITH_STR(model, "_P2_", P2);
+
+        REPLACE_WITH_NUM(model, "_IW_", p.dims[dims_size - 1]);
+        REPLACE_WITH_NUM(model, "_IC_", p.dims[1]);
+        REPLACE_WITH_NUM(model, "_IN_", p.dims[0]);
+        switch (dims_size) {
+            case 5:
+                REPLACE_WITH_NUM(model, "_ID_", p.dims[dims_size - 3]);
+            case 4:
+                REPLACE_WITH_NUM(model, "_IH_", p.dims[dims_size - 2]);
+        }
 
         std::string impls;
         for (const auto& preferType : p.preferTypes) {
@@ -194,9 +223,18 @@ protected:
                 }
             }
 
-            InferenceEngine::SizeVector dims_src = {p.in.n, p.in.c, p.in.h, p.in.w};
+            InferenceEngine::SizeVector dims_src = p.dims;
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src->allocate();
             fill_data(src->buffer(), src->size());
 
@@ -226,7 +264,7 @@ protected:
 
             ref_activation(*srcPtr, dst_ref, p);
 
-            compare(*output, dst_ref);
+            compare(*output, dst_ref, 0.0005f);
         } catch (const InferenceEngine::details::InferenceEngineException &e) {
             FAIL() << e.what();
         }
@@ -265,7 +303,9 @@ INSTANTIATE_TEST_CASE_P(
                 activation_test_params{eltwise_bounded_relu, 6.0f, 0.0f, {1, 32, 128, 256}, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
                 activation_test_params{eltwise_bounded_relu, 6.0f, 0.0f, {4, 3, 228, 228}, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
                 activation_test_params{eltwise_bounded_relu, 0.1f, 0.0f, {1, 32, 128, 256}, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
-                activation_test_params{eltwise_bounded_relu, 0.1f, 0.0f, {4, 3, 228, 228}, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}}
+                activation_test_params{eltwise_bounded_relu, 0.1f, 0.0f, {4, 3, 228, 228}, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                // 5D
+                activation_test_params{eltwise_tanh, 0.f, 0.f, {1, 1, 64, 64, 64}, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}}
         ));
 
 class MKLDNNGraphDynBatchActivationTests: public MKLDNNGraphActivationTests {
@@ -275,7 +315,7 @@ protected:
             TestsCommon::SetUp();
             activation_test_params p = ::testing::WithParamInterface<activation_test_params>::GetParam();
             std::string model = getModel(p);
-            size_t MB = p.in.n;
+            size_t MB = p.dims[0];
             if (MB < 2)
                 MB = 2;
 
@@ -292,9 +332,18 @@ protected:
             graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}});
             graph.CreateGraph(net_reader.getNetwork());
 
-            InferenceEngine::SizeVector dims_src = {MB, p.in.c, p.in.h, p.in.w};
+            InferenceEngine::SizeVector dims_src = p.dims;
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src->allocate();
             fill_data(src->buffer(), src->size());
 
index 2ec54e4..7396700 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -19,17 +18,11 @@ using namespace ::testing;
 using namespace std;
 using namespace mkldnn;
 
-struct dim4 {
-    size_t n;
-    size_t c;
-    size_t h;
-    size_t w;
-};
 
 struct concat_test_params {
-    dim4 in1;
-
-    dim4 in2;
+    // Formats: NCHW, NCDHW
+    vector<size_t> in1;
+    vector<size_t> in2;
 
     size_t axis;
 
@@ -43,50 +36,30 @@ struct concat_test_params {
 class MKLDNNGraphConcatTests: public TestsCommon,
                               public WithParamInterface<concat_test_params> {
     std::string model_t = R"V0G0N(
-<net name="ConcatOnly" version="2" precision="FP32" batch="1">
+<net name="ConcatOnly" version="3" precision="FP32" batch="1">
     <layers>
         <layer name="in1" type="Input" precision="FP32" id="1">
             <output>
-                <port id="1">
-                    <dim>_IN1_</dim>
-                    <dim>_IC1_</dim>
-                    <dim>_IH1_</dim>
-                    <dim>_IW1_</dim>
+                <port id="1">__SRC_DIMS_1__
                 </port>
             </output>
         </layer>
         <layer name="in2" type="Input" precision="FP32" id="2">
             <output>
-                <port id="2">
-                    <dim>_IN2_</dim>
-                    <dim>_IC2_</dim>
-                    <dim>_IH2_</dim>
-                    <dim>_IW2_</dim>
+                <port id="2">__SRC_DIMS_2__
                 </port>
             </output>
         </layer>
         <layer name="con" id="3" type="Concat" precision="FP32">
             <concat_data axis="_AXIS_"/>
             <input>
-                <port id="1">
-                    <dim>_IN1_</dim>
-                    <dim>_IC1_</dim>
-                    <dim>_IH1_</dim>
-                    <dim>_IW1_</dim>
+                <port id="1">__SRC_DIMS_1__
                 </port>
-                <port id="2">
-                    <dim>_IN2_</dim>
-                    <dim>_IC2_</dim>
-                    <dim>_IH2_</dim>
-                    <dim>_IW2_</dim>
+                <port id="2">__SRC_DIMS_2__
                 </port>
             </input>
             <output>
-                <port id="3">
-                    <dim>_ON_</dim>
-                    <dim>_OC_</dim>
-                    <dim>_OH_</dim>
-                    <dim>_OW_</dim>
+                <port id="3">__DST_DIMS__
                 </port>
             </output>
         </layer>
@@ -100,20 +73,27 @@ class MKLDNNGraphConcatTests: public TestsCommon,
 
     std::string getModel(concat_test_params p) {
         std::string model = model_t;
-        REPLACE_WITH_NUM(model, "_IN1_", p.in1.n);
-        REPLACE_WITH_NUM(model, "_IC1_", p.in1.c);
-        REPLACE_WITH_NUM(model, "_IW1_", p.in1.w);
-        REPLACE_WITH_NUM(model, "_IH1_", p.in1.h);
+        std::string s_dims;
+        for (auto& dim : p.in1) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__SRC_DIMS_1__", s_dims);
 
-        REPLACE_WITH_NUM(model, "_IN2_", p.in2.n);
-        REPLACE_WITH_NUM(model, "_IC2_", p.in2.c);
-        REPLACE_WITH_NUM(model, "_IW2_", p.in2.w);
-        REPLACE_WITH_NUM(model, "_IH2_", p.in2.h);
+        s_dims = "";
+        for (auto& dim : p.in2) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__SRC_DIMS_2__", s_dims);
 
-        REPLACE_WITH_NUM(model, "_ON_", p.axis == 0 ? p.in1.n + p.in2.n : p.in1.n);
-        REPLACE_WITH_NUM(model, "_OC_", p.axis == 1 ? p.in1.c + p.in2.c : p.in1.c);
-        REPLACE_WITH_NUM(model, "_OH_", p.axis == 2 ? p.in1.h + p.in2.h : p.in1.h);
-        REPLACE_WITH_NUM(model, "_OW_", p.axis == 3 ? p.in1.w + p.in2.w : p.in1.w);
+        s_dims = "";
+        for (size_t i = 0; i < p.in1.size(); i++) {
+            size_t dim = p.axis == i ? p.in1[i] + p.in2[i] : p.in1[i];
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__DST_DIMS__", s_dims);
 
         REPLACE_WITH_NUM(model, "_AXIS_", p.axis);
         return model;
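The <dim> string-building loop above is repeated in several getModel() helpers across this patch. A hypothetical extraction, purely illustrative:

    #include <string>
    #include <vector>

    // Hypothetical consolidation of the repeated loop; not part of the patch.
    static std::string dimsToXml(const std::vector<size_t>& dims) {
        std::string s;
        for (size_t dim : dims)
            s += "\n                    <dim>" + std::to_string(dim) + "</dim>";
        return s;
    }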
@@ -147,14 +127,23 @@ protected:
             }
             ASSERT_LE(3, nodes.size());
 
-            InferenceEngine::SizeVector dims_src1 = {p.in1.n, p.in1.c, p.in1.h, p.in1.w};
-            InferenceEngine::SizeVector dims_src2 = {p.in2.n, p.in2.c, p.in2.h, p.in2.w};
+            InferenceEngine::SizeVector dims_src1 = p.in1;
+            InferenceEngine::SizeVector dims_src2 = p.in2;
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
+            switch (p.in1.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src1);
+            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src1);
             src1->allocate();
 
             fill_data(src1->buffer(), src1->size());
-            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src2);
+            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src2);
             src2->allocate();
             fill_data(src2->buffer(), src2->size());
             InferenceEngine::BlobMap srcs;
@@ -215,6 +204,35 @@ protected:
 
 TEST_P(MKLDNNGraphConcatTests, TestsConcat) {}
 
+INSTANTIATE_TEST_CASE_P(
+        TestsConcat, MKLDNNGraphConcatTests,
+        ::testing::Values(
+                concat_test_params {
+                        {1, 3, 3, 5},
+                        {1, 3, 3, 5},
+                        1, 2
+                },
+                concat_test_params {
+                        {1, 7, 1, 5},
+                        {1, 7, 9, 5},
+                        2, 1, MKLDNNPlugin::impl_desc_type::ref
+                },
+                concat_test_params {
+                        {1, 2, 3, 5, 3},
+                        {1, 5, 3, 5, 3},
+                        1, 2
+                },
+                concat_test_params {
+                        {1, 32, 3, 4, 5},
+                        {1, 32, 3, 4, 5},
+                        1, 6, MKLDNNPlugin::impl_desc_type::unknown
+                },
+                concat_test_params {
+                        {1, 64, 16, 16, 16, 1},
+                        {1, 64, 16, 16, 16, 1},
+                        5, 1, MKLDNNPlugin::impl_desc_type::ref
+                }));
+
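The rank-to-layout switch (4 maps to NCHW, 5 to NCDHW, anything else stays ANY) also recurs throughout these tests. A hypothetical helper using the same InferenceEngine::Layout values, illustrative only and assuming the headers these tests already include:

    // Hypothetical refactoring of the repeated switch; not part of the patch.
    static InferenceEngine::Layout layoutByRank(size_t rank) {
        switch (rank) {
            case 4:  return InferenceEngine::NCHW;
            case 5:  return InferenceEngine::NCDHW;
            default: return InferenceEngine::ANY;
        }
    }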
 class MKLDNNGraphDynBatchConcatTests: public TestsCommon, public WithParamInterface<concat_test_params> {
     std::string model_t = R"V0G0N(
 <net name="ConcatOnly" version="2" precision="FP32" batch="1">
@@ -222,20 +240,14 @@ class MKLDNNGraphDynBatchConcatTests: public TestsCommon, public WithParamInterf
         <layer name="in1" type="Input" precision="FP32" id="1">
             <output>
                 <port id="1">
-                    <dim>1</dim>
-                    <dim>_IC1_</dim>
-                    <dim>_IH1_</dim>
-                    <dim>_IW1_</dim>
+                    <dim>1</dim>__SRC_DIMS_1__
                 </port>
             </output>
         </layer>
         <layer name="in2" type="Input" precision="FP32" id="2">
             <output>
                 <port id="2">
-                    <dim>1</dim>
-                    <dim>_IC2_</dim>
-                    <dim>_IH2_</dim>
-                    <dim>_IW2_</dim>
+                    <dim>1</dim>__SRC_DIMS_2__
                 </port>
             </output>
         </layer>
@@ -243,24 +255,15 @@ class MKLDNNGraphDynBatchConcatTests: public TestsCommon, public WithParamInterf
             <concat_data axis="_AXIS_"/>
             <input>
                 <port id="1">
-                    <dim>1</dim>
-                    <dim>_IC1_</dim>
-                    <dim>_IH1_</dim>
-                    <dim>_IW1_</dim>
+                    <dim>1</dim>__SRC_DIMS_1__
                 </port>
                 <port id="2">
-                    <dim>1</dim>
-                    <dim>_IC2_</dim>
-                    <dim>_IH2_</dim>
-                    <dim>_IW2_</dim>
+                    <dim>1</dim>__SRC_DIMS_2__
                 </port>
             </input>
             <output>
                 <port id="3">
-                    <dim>1</dim>
-                    <dim>_OC_</dim>
-                    <dim>_OH_</dim>
-                    <dim>_OW_</dim>
+                    <dim>1</dim>__DST_DIMS__
                 </port>
             </output>
         </layer>
@@ -274,20 +277,27 @@ class MKLDNNGraphDynBatchConcatTests: public TestsCommon, public WithParamInterf
 
     std::string getModel(concat_test_params p) {
         std::string model = model_t;
-        REPLACE_WITH_NUM(model, "_IN1_", p.in1.n);
-        REPLACE_WITH_NUM(model, "_IC1_", p.in1.c);
-        REPLACE_WITH_NUM(model, "_IW1_", p.in1.w);
-        REPLACE_WITH_NUM(model, "_IH1_", p.in1.h);
+        std::string s_dims;
+        for (size_t i = 1; i < p.in1.size(); i++) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(p.in1[i]) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__SRC_DIMS_1__", s_dims);
 
-        REPLACE_WITH_NUM(model, "_IN2_", p.in2.n);
-        REPLACE_WITH_NUM(model, "_IC2_", p.in2.c);
-        REPLACE_WITH_NUM(model, "_IW2_", p.in2.w);
-        REPLACE_WITH_NUM(model, "_IH2_", p.in2.h);
+        s_dims = "";
+        for (size_t i = 1; i < p.in2.size(); i++) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(p.in2[i]) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__SRC_DIMS_2__", s_dims);
 
-        REPLACE_WITH_NUM(model, "_ON_", p.axis == 0 ? p.in1.n + p.in2.n : p.in1.n);
-        REPLACE_WITH_NUM(model, "_OC_", p.axis == 1 ? p.in1.c + p.in2.c : p.in1.c);
-        REPLACE_WITH_NUM(model, "_OH_", p.axis == 2 ? p.in1.h + p.in2.h : p.in1.h);
-        REPLACE_WITH_NUM(model, "_OW_", p.axis == 3 ? p.in1.w + p.in2.w : p.in1.w);
+        s_dims = "";
+        for (size_t i = 1; i < p.in1.size(); i++) {
+            size_t dim = p.axis == i ? p.in1[i] + p.in2[i] : p.in1[i];
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__DST_DIMS__", s_dims);
 
         REPLACE_WITH_NUM(model, "_AXIS_", p.axis);
         return model;
@@ -302,7 +312,7 @@ protected:
             TestsCommon::SetUp();
             concat_test_params p = ::testing::WithParamInterface<concat_test_params>::GetParam();
             std::string model = getModel(p);
-            size_t MB = p.in1.n;
+            size_t MB = p.in1[0];
             if (MB < 2)
                 MB = 2;
 
@@ -319,14 +329,23 @@ protected:
             graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}});
             graph.CreateGraph(net_reader.getNetwork());
 
-            InferenceEngine::SizeVector dims_src1 = {MB, p.in1.c, p.in1.h, p.in1.w};
-            InferenceEngine::SizeVector dims_src2 = {MB, p.in2.c, p.in2.h, p.in2.w};
+            InferenceEngine::SizeVector dims_src1 = p.in1;
+            InferenceEngine::SizeVector dims_src2 = p.in2;
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
+            switch (p.in1.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src1);
+            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src1);
             src1->allocate();
 
             fill_data(src1->buffer(), src1->size());
-            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src2);
+            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src2);
             src2->allocate();
             fill_data(src2->buffer(), src2->size());
             InferenceEngine::BlobMap srcs;
@@ -396,6 +415,11 @@ INSTANTIATE_TEST_CASE_P(
                         {2, 2, 3, 3},
                         {2, 3, 3, 3},
                         1, 2, MKLDNNPlugin::impl_desc_type::unknown
+                },
+                concat_test_params {
+                        {2, 2, 3, 3, 3},
+                        {2, 3, 3, 3, 3},
+                        1, 2, MKLDNNPlugin::impl_desc_type::unknown
                 }));
 
 struct concat_param {
@@ -406,9 +430,10 @@ struct concat_param {
 };
 
 struct two_concat_test_params {
-    dim4 in1;
-    dim4 in2;
-    dim4 in3;
+    // Formats: NCHW, NCDHW
+    vector<size_t> in1;
+    vector<size_t> in2;
+    vector<size_t> in3;
 
     concat_param concat1;
     concat_param concat2;
@@ -421,31 +446,19 @@ class MKLDNNGraphTwoConcatTests: public TestsCommon,
     <layers>
         <layer name="in1" type="Input" precision="FP32" id="1">
             <output>
-                <port id="1">
-                    <dim>_IN1_</dim>
-                    <dim>_IC1_</dim>
-                    <dim>_IH1_</dim>
-                    <dim>_IW1_</dim>
+                <port id="1">__SRC_DIMS_1__
                 </port>
             </output>
         </layer>
         <layer name="in2" type="Input" precision="FP32" id="2">
             <output>
-                <port id="1">
-                    <dim>_IN2_</dim>
-                    <dim>_IC2_</dim>
-                    <dim>_IH2_</dim>
-                    <dim>_IW2_</dim>
+                <port id="1">__SRC_DIMS_2__
                 </port>
             </output>
         </layer>
         <layer name="in3" type="Input" precision="FP32" id="3">
             <output>
-                <port id="1">
-                    <dim>_IN3_</dim>
-                    <dim>_IC3_</dim>
-                    <dim>_IH3_</dim>
-                    <dim>_IW3_</dim>
+                <port id="1">__SRC_DIMS_3__
                 </port>
             </output>
         </layer>
@@ -455,22 +468,20 @@ class MKLDNNGraphTwoConcatTests: public TestsCommon,
                 <port id="1">
                     <dim>_CI41N_</dim>
                     <dim>_CI41C_</dim>
+                    <dim>_CI41D_</dim>
                     <dim>_CI41H_</dim>
                     <dim>_CI41W_</dim>
                 </port>
                 <port id="2">
                     <dim>_CI42N_</dim>
                     <dim>_CI42C_</dim>
+                    <dim>_CI42D_</dim>
                     <dim>_CI42H_</dim>
                     <dim>_CI42W_</dim>
                 </port>
             </input>
             <output>
-                <port id="3">
-                    <dim>_CON1_</dim>
-                    <dim>_COC1_</dim>
-                    <dim>_COH1_</dim>
-                    <dim>_COW1_</dim>
+                <port id="3">__CO_DIMS_1__
                 </port>
             </output>
         </layer>
@@ -480,22 +491,20 @@ class MKLDNNGraphTwoConcatTests: public TestsCommon,
                 <port id="1">
                     <dim>_CI51N_</dim>
                     <dim>_CI51C_</dim>
+                    <dim>_CI51D_</dim>
                     <dim>_CI51H_</dim>
                     <dim>_CI51W_</dim>
                 </port>
                 <port id="2">
                     <dim>_CI52N_</dim>
                     <dim>_CI52C_</dim>
+                    <dim>_CI52D_</dim>
                     <dim>_CI52H_</dim>
                     <dim>_CI52W_</dim>
                 </port>
             </input>
             <output>
-                <port id="3">
-                    <dim>_CON2_</dim>
-                    <dim>_COC2_</dim>
-                    <dim>_COH2_</dim>
-                    <dim>_COW2_</dim>
+                <port id="3">__CO_DIMS_2__
                 </port>
             </output>
         </layer>
@@ -508,7 +517,7 @@ class MKLDNNGraphTwoConcatTests: public TestsCommon,
     </edges>
 </net>
 )V0G0N";
-    void changeEdgeToLayer(std::string& model, int f_l, int f_p, int t_l, int t_p, dim4 dims) {
+    void changeEdgeToLayer(std::string& model, int f_l, int f_p, int t_l, int t_p, vector<size_t> dims) {
         std::string TL = "_FL" + std::to_string(f_l) + std::to_string(f_p) + "_";
         std::string TP = "_FP" + std::to_string(f_l) + std::to_string(f_p) + "_";
         if (!FIND_STR(model, TL) || !FIND_STR(model, TP)) {
@@ -526,31 +535,40 @@ class MKLDNNGraphTwoConcatTests: public TestsCommon,
         }
 
         std::string CI = "_CI" + std::to_string(t_l) + std::to_string(t_p);
-        REPLACE_WITH_NUM(model, CI + "N_", dims.n);
-        REPLACE_WITH_NUM(model, CI + "C_", dims.c);
-        REPLACE_WITH_NUM(model, CI + "H_", dims.h);
-        REPLACE_WITH_NUM(model, CI + "W_", dims.w);
+        auto dims_size = dims.size();
+        REPLACE_WITH_NUM(model, CI + "N_", dims[0]);
+        REPLACE_WITH_NUM(model, CI + "C_", dims[1]);
+        REPLACE_WITH_NUM(model, CI + "H_", dims[dims_size - 2]);
+        REPLACE_WITH_NUM(model, CI + "W_", dims[dims_size - 1]);
+        if (dims_size < 5) REMOVE_LINE(model, std::string("<dim>") + CI + std::string("D_") + "</dim>");
+        else REPLACE_WITH_NUM(model, CI + "D_", dims[dims_size - 3]);
     }
 
 
     std::string getModel(two_concat_test_params p) {
         std::string model = model_t;
-        REPLACE_WITH_NUM(model, "_IN1_", p.in1.n);
-        REPLACE_WITH_NUM(model, "_IC1_", p.in1.c);
-        REPLACE_WITH_NUM(model, "_IW1_", p.in1.w);
-        REPLACE_WITH_NUM(model, "_IH1_", p.in1.h);
-
-        REPLACE_WITH_NUM(model, "_IN2_", p.in2.n);
-        REPLACE_WITH_NUM(model, "_IC2_", p.in2.c);
-        REPLACE_WITH_NUM(model, "_IW2_", p.in2.w);
-        REPLACE_WITH_NUM(model, "_IH2_", p.in2.h);
-
-        REPLACE_WITH_NUM(model, "_IN3_", p.in3.n);
-        REPLACE_WITH_NUM(model, "_IC3_", p.in3.c);
-        REPLACE_WITH_NUM(model, "_IW3_", p.in3.w);
-        REPLACE_WITH_NUM(model, "_IH3_", p.in3.h);
-
-        dim4 concat11;
+        std::string s_dims;
+        for (size_t i = 0; i < p.in1.size(); i++) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(p.in1[i]) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__SRC_DIMS_1__", s_dims);
+
+        s_dims = "";
+        for (size_t i = 0; i < p.in2.size(); i++) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(p.in2[i]) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__SRC_DIMS_2__", s_dims);
+
+        s_dims = "";
+        for (size_t i = 0; i < p.in3.size(); i++) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(p.in3[i]) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__SRC_DIMS_3__", s_dims);
+
+        vector<size_t> concat11;
         switch (p.concat1.input1) {
             case 1:
                 changeEdgeToLayer(model, 2, 1, 4, 1, p.in2);
@@ -565,7 +583,7 @@ class MKLDNNGraphTwoConcatTests: public TestsCommon,
                 concat11 = p.in1;
         }
 
-        dim4 concat12;
+        vector<size_t> concat12;
         switch (p.concat1.input2) {
             case 1:
                 changeEdgeToLayer(model, 2, 1, 4, 2, p.in2);
@@ -580,7 +598,7 @@ class MKLDNNGraphTwoConcatTests: public TestsCommon,
                 concat12 = p.in1;
         }
 
-        dim4 concat21;
+        vector<size_t> concat21;
         switch (p.concat2.input1) {
             case 1:
                 changeEdgeToLayer(model, 2, 1, 5, 1, p.in2);
@@ -595,7 +613,7 @@ class MKLDNNGraphTwoConcatTests: public TestsCommon,
                 concat21 = p.in1;
         }
 
-        dim4 concat22;
+        vector<size_t> concat22;
         switch (p.concat2.input2) {
             case 1:
                 changeEdgeToLayer(model, 2, 1, 5, 2, p.in2);
@@ -610,17 +628,25 @@ class MKLDNNGraphTwoConcatTests: public TestsCommon,
                 concat22 = p.in1;
         }
 
-        REPLACE_WITH_NUM(model, "_CON1_", p.concat1.axis == 0 ? concat11.n + concat12.n : concat21.n);
-        REPLACE_WITH_NUM(model, "_COC1_", p.concat1.axis == 1 ? concat11.c + concat12.c : concat21.c);
-        REPLACE_WITH_NUM(model, "_COH1_", p.concat1.axis == 2 ? concat11.h + concat12.h : concat21.h);
-        REPLACE_WITH_NUM(model, "_COW1_", p.concat1.axis == 3 ? concat11.w + concat12.w : concat21.w);
+        s_dims = "";
+        for (size_t i = 0; i < p.in2.size(); i++) {
+            size_t concat = p.concat1.axis == i ? concat11[i] + concat12[i] : concat21[i];
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(concat) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__CO_DIMS_1__", s_dims);
+
         REPLACE_WITH_NUM(model, "_CONCAT1_AXIS_", p.concat1.axis);
         REPLACE_WITH_STR(model, "_CONCAT1_NAME_", p.concat1.name);
 
-        REPLACE_WITH_NUM(model, "_CON2_", p.concat2.axis == 0 ? concat21.n + concat22.n : concat21.n);
-        REPLACE_WITH_NUM(model, "_COC2_", p.concat2.axis == 1 ? concat21.c + concat22.c : concat21.c);
-        REPLACE_WITH_NUM(model, "_COH2_", p.concat2.axis == 2 ? concat21.h + concat22.h : concat21.h);
-        REPLACE_WITH_NUM(model, "_COW2_", p.concat2.axis == 3 ? concat21.w + concat22.w : concat21.w);
+        s_dims = "";
+        for (size_t i = 0; i < p.in2.size(); i++) {
+            size_t concat = p.concat2.axis == i ? concat21[i] + concat22[i] : concat21[i];
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(concat) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__CO_DIMS_2__", s_dims);
+
         REPLACE_WITH_NUM(model, "_CONCAT2_AXIS_", p.concat2.axis);
         REPLACE_WITH_STR(model, "_CONCAT2_NAME_", p.concat2.name);
         return model;
@@ -642,19 +668,28 @@ protected:
             MKLDNNGraphTestClass graph;
             graph.CreateGraph(net_reader.getNetwork());
 
-            InferenceEngine::SizeVector dims_src1 = {p.in1.n, p.in1.c, p.in1.h, p.in1.w};
-            InferenceEngine::SizeVector dims_src2 = {p.in2.n, p.in2.c, p.in2.h, p.in2.w};
-            InferenceEngine::SizeVector dims_src3 = {p.in3.n, p.in3.c, p.in3.h, p.in3.w};
+            InferenceEngine::SizeVector dims_src1 = p.in1;
+            InferenceEngine::SizeVector dims_src2 = p.in2;
+            InferenceEngine::SizeVector dims_src3 = p.in3;
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
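+            // Pick the blob layout from the tensor rank: 4D -> NCHW, 5D -> NCDHW; other ranks keep ANY.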
+            switch (p.in1.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src1);
+            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src1);
             src1->allocate();
             fill_data(src1->buffer(), src1->size());
 
-            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src2);
+            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src2);
             src2->allocate();
             fill_data(src2->buffer(), src2->size());
 
-            InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src3);
+            InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src3);
             src3->allocate();
             fill_data(src3->buffer(), src3->size());
 
@@ -996,46 +1031,26 @@ class MKLDNNGraphIncorrectConcatTests: public TestsCommon,
     <layers>
         <layer name="in1" type="Input" precision="FP32" id="1">
             <output>
-                <port id="1">
-                    <dim>_IN1_</dim>
-                    <dim>_IC1_</dim>
-                    <dim>_IH1_</dim>
-                    <dim>_IW1_</dim>
+                <port id="1">__SRC_DIMS_1__
                 </port>
             </output>
         </layer>
         <layer name="in2" type="Input" precision="FP32" id="2">
             <output>
-                <port id="2">
-                    <dim>_IN2_</dim>
-                    <dim>_IC2_</dim>
-                    <dim>_IH2_</dim>
-                    <dim>_IW2_</dim>
+                <port id="2">__SRC_DIMS_2__
                 </port>
             </output>
         </layer>
         <layer name="con" id="3" type="Concat" precision="FP32">
             <concat_data axis="_AXIS_"/>
             <input>
-                <port id="1">
-                    <dim>_IN1_</dim>
-                    <dim>_IC1_</dim>
-                    <dim>_IH1_</dim>
-                    <dim>_IW1_</dim>
+                <port id="1">__SRC_DIMS_1__
                 </port>
-                <port id="2">
-                    <dim>_IN2_</dim>
-                    <dim>_IC2_</dim>
-                    <dim>_IH2_</dim>
-                    <dim>_IW2_</dim>
+                <port id="2">__SRC_DIMS_2__
                 </port>
             </input>
             <output>
-                <port id="3">
-                    <dim>_ON_</dim>
-                    <dim>_OC_</dim>
-                    <dim>_OH_</dim>
-                    <dim>_OW_</dim>
+                <port id="3">__DST_DIMS__
                 </port>
             </output>
         </layer>
@@ -1049,20 +1064,27 @@ class MKLDNNGraphIncorrectConcatTests: public TestsCommon,
 
     std::string getModel(concat_test_params p) {
         std::string model = model_t;
-        REPLACE_WITH_NUM(model, "_IN1_", p.in1.n);
-        REPLACE_WITH_NUM(model, "_IC1_", p.in1.c);
-        REPLACE_WITH_NUM(model, "_IW1_", p.in1.w);
-        REPLACE_WITH_NUM(model, "_IH1_", p.in1.h);
+        std::string s_dims;
+        for (auto& dim : p.in1) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__SRC_DIMS_1__", s_dims);
 
-        REPLACE_WITH_NUM(model, "_IN2_", p.in2.n);
-        REPLACE_WITH_NUM(model, "_IC2_", p.in2.c);
-        REPLACE_WITH_NUM(model, "_IW2_", p.in2.w);
-        REPLACE_WITH_NUM(model, "_IH2_", p.in2.h);
+        s_dims = "";
+        for (auto& dim : p.in2) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__SRC_DIMS_2__", s_dims);
 
-        REPLACE_WITH_NUM(model, "_ON_", p.axis == 0 ? p.in1.n + p.in2.n : p.in1.n);
-        REPLACE_WITH_NUM(model, "_OC_", p.axis == 1 ? p.in1.c + p.in2.c : p.in1.c);
-        REPLACE_WITH_NUM(model, "_OH_", p.axis == 2 ? p.in1.h + p.in2.h : p.in1.h);
-        REPLACE_WITH_NUM(model, "_OW_", p.axis == 3 ? p.in1.w + p.in2.w : p.in1.w);
+        s_dims = "";
+        for (size_t i = 0; i < p.in1.size(); i++) {
+            size_t dim = p.axis == i ? p.in1[i] + p.in2[i] : p.in1[i];
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__DST_DIMS__", s_dims);
 
         REPLACE_WITH_NUM(model, "_AXIS_", p.axis);
         return model;
@@ -1079,10 +1101,8 @@ protected:
             std::string model = getModel(p);
 
             InferenceEngine::CNNNetReader net_reader;
-            ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
-
-            MKLDNNGraphTestClass graph;
-            ASSERT_THROW(graph.CreateGraph(net_reader.getNetwork()), InferenceEngine::details::InferenceEngineException);
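+            // Invalid concat dims are now rejected while parsing the IR, so the throw is expected from ReadNetwork itself.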
+            ASSERT_THROW(net_reader.ReadNetwork(model.data(), model.length()),
+                         InferenceEngine::details::InferenceEngineException);
         } catch (const InferenceEngine::details::InferenceEngineException &e) {
             FAIL() << e.what();
         }
index 7d12352..dbfbc06 100644
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #define XBYAK_UNDEF_JNL
 #include "../../../../../../../thirdparty/mkl-dnn/src/cpu/xbyak/xbyak_util.h"
 
+using namespace InferenceEngine;
 using namespace ::testing;
 using namespace std;
 using namespace mkldnn;
 
-
 struct conv_test_params {
-    struct {
-        size_t n;
-        size_t c;
-        size_t h;
-        size_t w;
-    } in;
-
-    size_t krn_w;
-    size_t krn_h;
-    size_t str_w;
-    size_t str_h;
-    size_t pad_w;
-    size_t pad_h;
+    // Formats: NCHW, NCDHW
+    vector<size_t> dims;
+    // Formats: WH, WHD (X-first; also applies to strides and pads)
+    vector<size_t> kernel;
+    vector<size_t> strides;
+    vector<size_t> pads_begin;
+    vector<size_t> pads_end;
 
     size_t out_c;
     size_t grp_c;
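+    // When empty, no auto_pad attribute is emitted and the explicit pads are used.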
+    string auto_pad;
 
     size_t num_prim_desc;
 
     int selectedType;
-    std::vector<MKLDNNPlugin::impl_desc_type> preferTypes;
+    vector<MKLDNNPlugin::impl_desc_type> preferTypes;
 
-    std::vector<std::function<void(MKLDNNPlugin::PrimitiveDescInfo)>> comp;
+    vector<std::function<void(MKLDNNPlugin::PrimitiveDescInfo)>> comp;
 };
 
 template <typename data_t>
-void ref_conv(const InferenceEngine::TBlob<data_t> &src, const data_t *weights, const size_t weightsSize,
-                InferenceEngine::TBlob<data_t> &dst, conv_test_params prm) {
-    size_t KW = prm.krn_w;
-    size_t KH = prm.krn_h;
+void ref_conv(const TBlob<data_t> &src, const data_t *weights, const size_t weightsSize,
+                TBlob<data_t> &dst, conv_test_params prm) {
+    auto dims_size = src.dims().size();
+
+    size_t KW = prm.kernel[X_AXIS];
+    size_t KH = prm.kernel[Y_AXIS];
+    size_t KD = dims_size == 5 ? prm.kernel[Z_AXIS] : 1u;
     size_t GC = prm.grp_c;
 
     size_t IC = src.dims()[1];
-    size_t IH = src.dims()[2];
-    size_t IW = src.dims()[3];
+    size_t ID = dims_size == 5 ? src.dims()[dims_size - 3] : 1u;
+    size_t IH = src.dims()[dims_size - 2];
+    size_t IW = src.dims()[dims_size - 1];
 
-    size_t OW = (IW + 2 * prm.pad_w - prm.krn_w) / prm.str_w + 1;
-    size_t OH = (IH + 2 * prm.pad_h - prm.krn_h) / prm.str_h + 1;
+    size_t OW = (IW + 2u * prm.pads_begin[X_AXIS] - prm.kernel[X_AXIS]) / prm.strides[X_AXIS] + 1u;
+    size_t OH = (IH + 2u * prm.pads_begin[Y_AXIS] - prm.kernel[Y_AXIS]) / prm.strides[Y_AXIS] + 1u;
+    size_t OD = dims_size == 5 ? (ID + 2u * prm.pads_begin[Z_AXIS] - prm.kernel[Z_AXIS]) / prm.strides[Z_AXIS] + 1u : 1u;
     size_t OC = prm.out_c;
 
 
     const data_t *src_data = src.readOnly();
     const data_t *weights_data = weights;
-    const data_t *bias_data = weights_data + KW * KH * OC * IC / GC;
+    const data_t *bias_data = weights_data + KW * KH * KD * OC * IC / GC;
     data_t *dst_data = dst.data();
 
-    IE_ASSERT(KW * KH * OC * IC / GC + OC == weightsSize);
+    IE_ASSERT(KW * KH * KD * OC * IC / GC + OC == weightsSize);
     IE_ASSERT(OW == dst.dims()[0]);
     IE_ASSERT(OH == dst.dims()[1]);
+
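+    // Flattened dst strides: SC1 = one HxW plane, SC2 = one output channel (DxHxW), SC3 = output channels per group, SC4 = one group.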
+    size_t SC1 = OH * OW;
+    size_t SC2 = SC1 * OD;
+    size_t SC3 = OC / GC;
+    size_t SC4 = SC2 * SC3;
+
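+    // Matching src strides: IC1 = HxW plane, IC2 = per-channel volume, IC3 = input channels per group, IC4 = one input group.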
+    size_t IC1 = IH * IW;
+    size_t IC2 = IC1 * ID;
+    size_t IC3 = IC / GC;
+    size_t IC4 = IC2 * IC3;
+
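+    // Weight strides: KC1 = one kernel plane, KC2 = one kernel volume, KC3 = weights per output channel, KC4 = weights per group.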
+    size_t KC1 = KH * KW;
+    size_t KC2 = KC1 * KD;
+    size_t KC3 = IC3 * KC2;
+    size_t KC4 = SC3 * KC3;
 
     for (uint32_t g = 0; g < GC; g++) {
+        size_t gc = g * SC4;
+        size_t goc = g * SC3;
+        size_t gic = g * IC4;
+        size_t gkc = g * KC4;
         for (uint32_t oc = 0; oc < OC / GC; oc++) {
-            for (uint32_t oh = 0; oh < OH; oh++) {
-                for (uint32_t ow = 0; ow < OW; ow++) {
-                    size_t oidx = g * OC / GC * OH * OW
-                                  + oc * OH * OW + oh * OW + ow;
-                    dst_data[oidx] = bias_data[g * OC / GC + oc];
-
-                    for (size_t ic = 0; ic < IC / GC; ic++) {
-                        for (size_t kh = 0; kh < KH; kh++) {
-                            for (size_t kw = 0; kw < KW; kw++) {
-                                int32_t iw = ow * prm.str_w - prm.pad_w + kw;
-                                int32_t ih = oh * prm.str_h - prm.pad_h + kh;
-                                if (iw < 0 || iw >= (int32_t)IW || ih < 0
-                                    || ih >= (int32_t)IH)
-                                    continue;
-                                size_t iidx = g * IC / GC * IH * IW
-                                              + ic * IH * IW + ih * IW + iw;
-                                size_t widx = g * OC / GC * IC / GC * KH * KW
-                                              + oc * IC / GC * KH * KW
-                                              + ic * KH * KW + kh * KW + kw;
-
-                                dst_data[ oidx] += src_data[iidx] * weights_data[widx];
+            size_t cc = gc + oc * SC2;
+            size_t gooc = goc + oc;
+            size_t gkoc = gkc + oc * KC3;
+            for (uint32_t od = 0; od < OD; od++) {
+                size_t dc = cc + od * SC1;
+                for (uint32_t oh = 0; oh < OH; oh++) {
+                    size_t hc = dc + oh * OW;
+                    for (uint32_t ow = 0; ow < OW; ow++) {
+                        size_t oidx = hc + ow;
+
+                        dst_data[oidx] = bias_data[gooc];
+
+                        for (size_t ic = 0; ic < IC / GC; ic++) {
+                            size_t icc = gkoc + ic * KC2;
+                            size_t kicc = gic + ic * IC2;
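+                            // Note: despite the names, icc walks the weights and kicc walks the src.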
+                            for (size_t kd = 0; kd < KD; kd++) {
+                                int32_t id = dims_size == 5 ? od * prm.strides[Z_AXIS] - prm.pads_begin[Z_AXIS] + kd : 0;
+                                if (id < 0 || id >= (int32_t)ID) continue;
+                                size_t kidc = kicc + id * IC1;
+                                size_t kdc = icc + kd * KC1;
+                                for (size_t kh = 0; kh < KH; kh++) {
+                                    int32_t ih = oh * prm.strides[Y_AXIS] - prm.pads_begin[Y_AXIS] + kh;
+                                    if (ih < 0 || ih >= (int32_t)IH) continue;
+                                    size_t kihc = kidc + ih * IW;
+                                    size_t khc = kdc + kh * KW;
+                                    for (size_t kw = 0; kw < KW; kw++) {
+                                        int32_t iw = ow * prm.strides[X_AXIS] - prm.pads_begin[X_AXIS] + kw;
+                                        if (iw < 0 || iw >= (int32_t)IW) continue;
+
+                                        size_t iidx = kihc + iw;
+                                        size_t widx = khc + kw;
+
+                                        dst_data[oidx] += src_data[iidx] * weights_data[widx];
+                                    }
+                                }
                             }
                         }
                     }
@@ -108,42 +140,32 @@ void ref_conv(const InferenceEngine::TBlob<data_t> &src, const data_t *weights,
 
 class MKLDNNGraphConvolutionTests: public TestsCommon,
                                    public WithParamInterface<conv_test_params> {
-    std::string model_t = R"V0G0N(
-<Net Name="Convolution_Only" version="2" precision="FP32" batch="1">
+    std::string model_t_5D = R"V0G0N(
+<net name="Convolution_Only" version="3" precision="FP32" batch="1">
     <layers>
         <layer name="in1" type="Input" precision="FP32" id="0">
             <output>
-                <port id="0">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                <port id="0">__SRC_DIMS__
                 </port>
             </output>
         </layer>
         <layer name="conv1" id="1" type="Convolution" precision="FP32">
-            <convolution stride-x="_SW_" stride-y="_SH_"
-                         pad-x="_PW_"    pad-y="_PH_"
-                         kernel-x="_KW_" kernel-y="_KH_"
-                         output="_OC_"   group="_GC_" PrimitivesPriority="_IMPLS_"/>
+            <convolution _AP_ kernel="_K_"
+                         pads_begin="_PB_"  pads_end="_PE_"
+                         strides="_KS_"
+                         output="_OC_"  group="_GC_" PrimitivesPriority="_IMPLS_"/>
 
             <weights offset="0" size="_S1_" />
             <biases offset="_S1_" size="_S2_" />
 
             <input>
-                <port id="1">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                <port id="1">__SRC_DIMS__
                 </port>
             </input>
             <output>
                 <port id="2">
                     <dim>_IN_</dim>
-                    <dim>_OC_</dim>
-                    <dim>_OH_</dim>
-                    <dim>_OW_</dim>
+                    <dim>_OC_</dim>__DST_DIMS__
                 </port>
             </output>
         </layer>
@@ -151,33 +173,53 @@ class MKLDNNGraphConvolutionTests: public TestsCommon,
     <edges>
         <edge from-layer="0" from-port="0" to-layer="1" to-port="1"/>
     </edges>
-</Net>
+</net>
 )V0G0N";
 
 protected:
     std::string getModel(conv_test_params p) {
-        std::string model = model_t;
-        REPLACE_WITH_NUM(model, "_IW_", p.in.w);
-        REPLACE_WITH_NUM(model, "_IH_", p.in.h);
-        REPLACE_WITH_NUM(model, "_IC_", p.in.c);
-        REPLACE_WITH_NUM(model, "_IN_", p.in.n);
-
-        REPLACE_WITH_NUM(model, "_KW_", p.krn_w);
-        REPLACE_WITH_NUM(model, "_KH_", p.krn_h);
-        REPLACE_WITH_NUM(model, "_SW_", p.str_w);
-        REPLACE_WITH_NUM(model, "_SH_", p.str_h);
-        REPLACE_WITH_NUM(model, "_PW_", p.pad_w);
-        REPLACE_WITH_NUM(model, "_PH_", p.pad_h);
+        std::string model = model_t_5D;
+        std::string s_dims;
+        for (auto& dim : p.dims) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__SRC_DIMS__", s_dims);
+
+        s_dims = "";
+        int k_len = p.kernel.size();
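+        // kernel/strides/pads are X-first ({W, H[, D]}) while dims are [N, C, (D,) H, W], so index from the back.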
+        for (size_t i = 2; i < p.dims.size(); i++) {
+            size_t inx = k_len - i + 1;
+            size_t dim = (p.dims[i] + 2lu * p.pads_begin[inx] - p.kernel[inx]) / p.strides[inx] + 1lu;
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__DST_DIMS__", s_dims);
+
+        REPLACE_WITH_NUM(model, "_IN_", p.dims[0]);
+
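+        // The _REVERSE helpers presumably emit the X-first vectors back-to-front, matching the outermost-first order of the IR attributes.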
+        REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_K_", p.kernel);
+        REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_KS_", p.strides);
+        REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_PB_", p.pads_begin);
+        REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_PE_", p.pads_end);
+        string auto_pad;
+        if (!p.auto_pad.empty()) auto_pad = string("auto_pad=") + string("\"") + p.auto_pad + string("\"");
+        REPLACE_WITH_STR(model, "_AP_", auto_pad);
 
         REPLACE_WITH_NUM(model, "_GC_", p.grp_c);
         REPLACE_WITH_NUM(model, "_OC_", p.out_c);
-        REPLACE_WITH_NUM(model, "_OH_", (p.in.h + 2 * p.pad_h - p.krn_h) / p.str_h + 1);
-        REPLACE_WITH_NUM(model, "_OW_", (p.in.w + 2 * p.pad_w - p.krn_w) / p.str_w + 1);
 
-        size_t w_data_size = (p.krn_w * p.krn_h * p.out_c * p.in.c / p.grp_c) * sizeof(float);
+        size_t w_data_size = 1;
+        for (auto ker : p.kernel) {
+            w_data_size *= ker;
+        }
+
+        w_data_size = (w_data_size * p.out_c * p.dims[1] / p.grp_c) * sizeof(float);
         size_t b_data_size = p.out_c * sizeof(float);
+
         REPLACE_WITH_NUM(model, "_S1_", w_data_size);
         REPLACE_WITH_NUM(model, "_S2_", b_data_size);
+
         std::string impls;
         for (const auto& preferType : p.preferTypes) {
             if (!impls.empty())
@@ -185,6 +227,7 @@ protected:
             impls += "cpu:" + MKLDNNGraphTestClass::getStrPrimitiveDescriptorType(preferType);
         }
         REPLACE_WITH_STR(model, "_IMPLS_", impls);
+
         return model;
     }
 
@@ -197,19 +240,28 @@ protected:
             conv_test_params p = ::testing::WithParamInterface<conv_test_params>::GetParam();
             std::string model = getModel(p);
 
-            InferenceEngine::CNNNetReader net_reader;
+            CNNNetReader net_reader;
             ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
 
-            InferenceEngine::TBlob<uint8_t> *weights = new InferenceEngine::TBlob<uint8_t>(InferenceEngine::Precision::U8, InferenceEngine::C, {(p.krn_w * p.krn_h * p.out_c * p.in.c / p.grp_c + p.out_c)
-                                                              * sizeof(float)});
+            size_t blob_size = p.out_c * p.dims[1] / p.grp_c;
+            for (auto k : p.kernel) {
+                blob_size *= k;
+            }
+            blob_size = (blob_size + p.out_c) * sizeof(float);
+            TBlob<uint8_t> *weights = new TBlob<uint8_t>
+                    (Precision::U8, C, {blob_size});
             weights->allocate();
+
             fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
-            InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
+            size_t w_buffer_len = weights->size() / sizeof(float);
+
+            TBlob<uint8_t>::Ptr weights_ptr = TBlob<uint8_t>::Ptr(weights);
 
             net_reader.SetWeights(weights_ptr);
+            CNNNetwork network = net_reader.getNetwork();
 
             MKLDNNGraphTestClass graph;
-            graph.CreateGraph(net_reader.getNetwork());
+            graph.CreateGraph(network);
 
             auto& nodes = graph.getNodes();
             nodes = graph.getNodes();
@@ -241,39 +293,47 @@ protected:
                 }
             }
 
-            InferenceEngine::SizeVector dims_src = {p.in.n, p.in.c, p.in.h, p.in.w};
+            Layout layout = ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = NCHW;
+                    break;
+                case 5:
+                    layout = NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            Blob::Ptr src = make_shared_blob<float, const SizeVector>
+                    (Precision::FP32, layout, p.dims);
             src->allocate();
             fill_data(src->buffer(), src->size());
 
-            auto * srcPtr = dynamic_cast<InferenceEngine::TBlob<float>*>(src.get());
+            auto * srcPtr = dynamic_cast<TBlob<float>*>(src.get());
 
             if (srcPtr == nullptr)
                 FAIL() << "Cannot cast blob to TBlob<float>.";
 
-            InferenceEngine::BlobMap srcs;
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src));
+            BlobMap srcs;
+            srcs.insert(std::pair<std::string, Blob::Ptr>("in1", src));
 
-            InferenceEngine::OutputsDataMap out;
-            out = net_reader.getNetwork().getOutputsInfo();
-            InferenceEngine::BlobMap outputBlobs;
+            OutputsDataMap out;
+            out = network.getOutputsInfo();
+            BlobMap outputBlobs;
 
-            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
+            std::pair<std::string, DataPtr> item = *out.begin();
 
-            InferenceEngine::TBlob<float>::Ptr output;
-            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
+            TBlob<float>::Ptr output;
+            output = make_shared_blob<float>(item.second->getTensorDesc());
             output->allocate();
             outputBlobs[item.first] = output;
 
             graph.Infer(srcs, outputBlobs);
 
-
-            InferenceEngine::TBlob<float> dst_ref(item.second->getTensorDesc());
+            TBlob<float> dst_ref(item.second->getTensorDesc());
             dst_ref.allocate();
             ref_conv(*srcPtr, (const float *)weights->buffer(), weights->size() / sizeof(float), dst_ref, p);
-            compare(*output, dst_ref);
-        } catch (const InferenceEngine::details::InferenceEngineException &e) {
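+            // A small tolerance (2e-4) allows for FP accumulation differences between optimized kernels and the reference loop.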
+            compare(*output, dst_ref, 0.0002f);
+        } catch (const details::InferenceEngineException &e) {
             FAIL() << e.what();
         }
     }
@@ -284,31 +344,68 @@ TEST_P(MKLDNNGraphConvolutionTests, TestsConvolution) {}
 INSTANTIATE_TEST_CASE_P(
         TestConvolution, MKLDNNGraphConvolutionTests,
         ::testing::Values(
-                conv_test_params{{1, 9, 16, 32},
-                                 1, 1, 1, 1, 0, 0, 17, 1, 6, MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_1x1},
+        /*0*/   conv_test_params{{1, 9, 16, 32},
+                                 {1, 1}, {1, 1}, {0, 0}, {0, 0}, 17, 1, "", 6, MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_1x1 },
                 conv_test_params{{1, 9, 32, 16},
-                                 2, 4, 1, 1, 0, 0, 17, 1, 5, MKLDNNPlugin::impl_desc_type::jit },
+                                 {2, 4}, {1, 1}, {0, 0}, {0, 0}, 17, 1, "", 5, MKLDNNPlugin::impl_desc_type::jit },
                 conv_test_params{{1, 9, 32, 16},
-                                 2, 4, 2, 1, 0, 0, 17, 1, 5, MKLDNNPlugin::impl_desc_type::jit },
+                                 {2, 4}, {2, 1}, {0, 0}, {0, 0}, 17, 1, "", 5, MKLDNNPlugin::impl_desc_type::jit },
                 conv_test_params{{1, 3, 40, 40},
-                                 3, 3, 1, 2, 0, 0, 20, 1, 5, MKLDNNPlugin::impl_desc_type::jit },
+                                 {3, 3}, {1, 2}, {0, 0}, {0, 0}, 20, 1, "", 5, MKLDNNPlugin::impl_desc_type::jit },
                 conv_test_params{{1, 1, 40, 40},
-                                 3, 3, 1, 2, 0, 0, 20, 1, 5, MKLDNNPlugin::impl_desc_type::jit },
+                                 {3, 3}, {1, 2}, {0, 0}, {0, 0}, 20, 1, "", 5, MKLDNNPlugin::impl_desc_type::jit },
                 conv_test_params{{1, 1, 32, 16},
-                                 2, 4, 2, 1, 0, 0, 17, 1, 5, MKLDNNPlugin::impl_desc_type::jit },
-                /*conv_test_params{{1, 9, 16, 32},
-                                 1, 1, 1, 1, 0, 0, 17, 1, 6, MKLDNNPlugin::impl_desc_type::gemm,
+                                 {2, 4}, {2, 1}, {0, 0}, {0, 0}, 17, 1, "", 5, MKLDNNPlugin::impl_desc_type::jit },
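+                // gemm cases below are only compiled when the CPU plugin is built with Intel MKL.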
+#ifdef USE_MKL
+                conv_test_params{{1, 9, 16, 32},
+                                 {1, 1}, {1, 1}, {0, 0}, {0, 0}, 17, 1, "", 6, MKLDNNPlugin::impl_desc_type::gemm,
                                  {MKLDNNPlugin::impl_desc_type::gemm_any,
                                   MKLDNNPlugin::impl_desc_type::gemm_blas,
                                   MKLDNNPlugin::impl_desc_type::gemm_avx512,
                                   MKLDNNPlugin::impl_desc_type::gemm_avx2,
-                                  MKLDNNPlugin::impl_desc_type::gemm_sse42}
-                },*/
+                                  MKLDNNPlugin::impl_desc_type::gemm_sse42} },
+#endif
                 conv_test_params{{1, 9, 32, 16},
-                                 2, 4, 1, 1, 0, 0, 17, 1, 5, MKLDNNPlugin::impl_desc_type::ref_any, {MKLDNNPlugin::impl_desc_type::ref_any} },
+                                 {2, 4}, {1, 1}, {0, 0}, {0, 0}, 17, 1, "", 5, MKLDNNPlugin::impl_desc_type::ref_any,
+                                 {MKLDNNPlugin::impl_desc_type::ref_any} },
                 conv_test_params{{1, 4, 54, 96},
-                                 3, 3, 1, 1, 1, 1, 64, 1, 3, MKLDNNPlugin::impl_desc_type::ref_any,
-                                 {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd, MKLDNNPlugin::impl_desc_type::ref_any}}));
+                                 {3, 3}, {1, 1}, {1, 1}, {0, 0}, 64, 1, "", 3, MKLDNNPlugin::impl_desc_type::ref_any,
+                                 {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd, MKLDNNPlugin::impl_desc_type::ref_any}},
+                // 5D
+        /*9*/   conv_test_params{{1, 3, 15, 20, 20},
+                                 {3, 3, 3}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}, 64, 1, "", 2, MKLDNNPlugin::impl_desc_type::ref_any,
+                                 {MKLDNNPlugin::impl_desc_type::ref_any} },
+                conv_test_params{{1, 24, 15, 20, 20},
+                                 {3, 3, 3}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}, 64, 1, "", 2, MKLDNNPlugin::impl_desc_type::ref_any,
+                                 {MKLDNNPlugin::impl_desc_type::ref_any} },
+                conv_test_params{{1, 32, 15, 20, 20},
+                                 {3, 3, 3}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}, 64, 1, "", 2, MKLDNNPlugin::impl_desc_type::ref_any,
+                                 {MKLDNNPlugin::impl_desc_type::ref_any} },
+                conv_test_params{{1, 3, 15, 25, 20},
+                                 {3, 3, 3}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}, 64, 1, "", 2, MKLDNNPlugin::impl_desc_type::jit },
+                conv_test_params{{1, 24, 15, 25, 20},
+                                 {3, 3, 3}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}, 64, 1, "", 2, MKLDNNPlugin::impl_desc_type::jit },
+        /*14*/  conv_test_params{{1, 32, 15, 25, 20},
+                                 {3, 3, 3}, {2, 2, 2}, {0, 0, 0}, {0, 0, 0}, 64, 1, "", 2, MKLDNNPlugin::impl_desc_type::jit },
+#ifdef USE_MKL
+                conv_test_params{{1, 5, 15, 20, 20},
+                                 {3, 3, 3}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}, 64, 1, "", 2, MKLDNNPlugin::impl_desc_type::gemm_blas },
+                conv_test_params{{1, 5, 15, 20, 20},
+                                 {3, 3, 3}, {3, 2, 1}, {0, 0, 0}, {0, 0, 0}, 64, 1, "", 2, MKLDNNPlugin::impl_desc_type::gemm_blas },
+                conv_test_params{{1, 5, 15, 20, 20},
+                                 {3, 3, 3}, {1, 1, 1}, {2, 2, 2}, {1, 1, 1}, 64, 1, "", 2, MKLDNNPlugin::impl_desc_type::gemm_blas },
+                conv_test_params{{1, 16, 30, 30, 10},
+                                 {5, 5, 5}, {1, 1, 1}, {2, 2, 2}, {2, 2, 2}, 16, 1, "", 2, MKLDNNPlugin::impl_desc_type::gemm_blas,
+                                 {MKLDNNPlugin::impl_desc_type::gemm_blas} },
+                conv_test_params{{1, 4, 16, 16, 16},
+                                 {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, 8, 1, "same_upper", 2, MKLDNNPlugin::impl_desc_type::gemm_blas },
+#endif
+        /*20*/  conv_test_params{{1, 16, 30, 30, 10},
+                                 {5, 5, 5}, {1, 1, 1}, {2, 2, 2}, {2, 2, 2}, 16, 1, "", 2, MKLDNNPlugin::impl_desc_type::jit },
+                conv_test_params{{1, 16, 30, 30, 10},
+                                 {5, 5, 5}, {1, 1, 1}, {2, 2, 2}, {2, 2, 2}, 16, 1, "", 2, MKLDNNPlugin::impl_desc_type::ref_any,
+                                 {MKLDNNPlugin::impl_desc_type::ref_any} }));
+
 
 class MKLDNNGraphDynBatchConvolutionTests: public MKLDNNGraphConvolutionTests {
 protected:
@@ -317,52 +414,66 @@ protected:
             TestsCommon::SetUp();
             conv_test_params p = ::testing::WithParamInterface<conv_test_params>::GetParam();
             std::string model = getModel(p);
-            size_t MB = p.in.n;
-            if (MB < 2)
-                MB = 2;
+            std::vector<size_t> dims = p.dims;
+            if (dims[0] < 2)
+                dims[0] = 2;
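+            // Dynamic-batch checks reshape between batch 1 and dims[0], so at least two batches are needed.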
 
-            InferenceEngine::CNNNetReader net_reader;
+            CNNNetReader net_reader;
             ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
 
-            InferenceEngine::TBlob<uint8_t> *weights = new InferenceEngine::TBlob<uint8_t>(InferenceEngine::Precision::U8, InferenceEngine::C,
-                    {(p.krn_w * p.krn_h * p.out_c * p.in.c / p.grp_c + p.out_c) * sizeof(float)});
+            size_t blob_size = p.out_c * dims[1] / p.grp_c;
+            for (auto k : p.kernel) {
+                blob_size *= k;
+            }
+            blob_size = (blob_size + p.out_c) * sizeof(float);
+            TBlob<uint8_t> *weights = new TBlob<uint8_t>(Precision::U8, C,
+                    {blob_size});
             weights->allocate();
             fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
-            InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
+            TBlob<uint8_t>::Ptr weights_ptr = TBlob<uint8_t>::Ptr(weights);
 
             net_reader.SetWeights(weights_ptr);
-            InferenceEngine::CNNNetwork network = net_reader.getNetwork();
-            auto implNet = dynamic_cast<InferenceEngine::details::CNNNetworkImpl *>(&((InferenceEngine::ICNNNetwork&)network));
+            CNNNetwork network = net_reader.getNetwork();
+            auto implNet = dynamic_cast<details::CNNNetworkImpl *>(&((ICNNNetwork&)network));
             ASSERT_NE(nullptr, implNet) << "Failed to cast ICNNNetwork to CNNNetworkImpl";
-            InferenceEngine::ResponseDesc resp;
-            InferenceEngine::StatusCode sts  = implNet->setBatchSizeReshape(MB, &resp);
-            ASSERT_EQ((int)InferenceEngine::StatusCode::OK, sts) << resp.msg;
+            ResponseDesc resp;
+            StatusCode sts = implNet->setBatchSizeReshape(dims[0], &resp);
+            ASSERT_EQ((int)StatusCode::OK, sts) << resp.msg;
 
             MKLDNNGraphTestClass graph;
-            graph.CreateGraph(net_reader.getNetwork());
-
-            InferenceEngine::SizeVector dims_src = {MB, p.in.c, p.in.h, p.in.w};
+            graph.CreateGraph(network);
+
+            Layout layout = ANY;
+            switch (dims.size()) {
+                case 4:
+                    layout = NCHW;
+                    break;
+                case 5:
+                    layout = NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            Blob::Ptr src = make_shared_blob<float, const SizeVector>
+                    (Precision::FP32, layout, dims);
             src->allocate();
             fill_data(src->buffer(), src->size());
 
-            auto * srcPtr = dynamic_cast<InferenceEngine::TBlob<float>*>(src.get());
+            auto * srcPtr = dynamic_cast<TBlob<float>*>(src.get());
 
             if (srcPtr == nullptr)
                 FAIL() << "Cannot cast blob to TBlob<float>.";
 
-            InferenceEngine::BlobMap srcs;
-            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src));
+            BlobMap srcs;
+            srcs.insert(std::pair<std::string, Blob::Ptr>("in1", src));
 
-            InferenceEngine::OutputsDataMap out;
-            out = net_reader.getNetwork().getOutputsInfo();
-            InferenceEngine::BlobMap outputBlobs;
+            OutputsDataMap out;
+            out = network.getOutputsInfo();
+            BlobMap outputBlobs;
 
-            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
+            std::pair<std::string, DataPtr> item = *out.begin();
 
-            InferenceEngine::TBlob<float>::Ptr output;
-            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
+            TBlob<float>::Ptr output;
+            output = make_shared_blob<float>(item.second->getTensorDesc());
             output->allocate();
             outputBlobs[item.first] = output;
 
@@ -373,9 +484,9 @@ protected:
                        node->getType() == MKLDNNPlugin::Convolution_Sum_Activation;
             };
 
-            graph.checkDynBatch(srcs, outputBlobs, MB, MB, checkConvolution, MKLDNNGraphTestClass::CheckDynBatchType::Child);
-            graph.checkDynBatch(srcs, outputBlobs, 1, MB, checkConvolution, MKLDNNGraphTestClass::CheckDynBatchType::Child);
-        } catch (const InferenceEngine::details::InferenceEngineException &e) {
+            graph.checkDynBatch(srcs, outputBlobs, dims[0], dims[0], checkConvolution, MKLDNNGraphTestClass::CheckDynBatchType::Child);
+            graph.checkDynBatch(srcs, outputBlobs, 1, dims[0], checkConvolution, MKLDNNGraphTestClass::CheckDynBatchType::Child);
+        } catch (const details::InferenceEngineException &e) {
             FAIL() << e.what();
         }
     }
@@ -387,25 +498,25 @@ INSTANTIATE_TEST_CASE_P(
         TestDynBatchConvolution, MKLDNNGraphDynBatchConvolutionTests,
         ::testing::Values(
                 conv_test_params{{1, 8, 16, 32},
-                                 1, 1, 1, 1, 0, 0, 17, 1, 7, MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_1x1,
+                                 {1, 1}, {1, 1}, {0, 0}, {0, 0}, 17, 1, "", 7, MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_1x1,
                                  {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd}},
                 conv_test_params{{1, 9, 32, 16},
-                                 2, 4, 1, 1, 0, 0, 17, 1, 5, MKLDNNPlugin::impl_desc_type::jit,
-                                 {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd}},
+                                 {2, 4}, {1, 1}, {0, 0}, {0, 0}, 17, 1, "", 5, MKLDNNPlugin::impl_desc_type::jit,
+                                 {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd} },
                 conv_test_params{{1, 9, 32, 16},
-                                 2, 4, 2, 1, 0, 0, 17, 1, 5, MKLDNNPlugin::impl_desc_type::jit,
-                                 {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd}},
+                                 {2, 4}, {2, 1}, {0, 0}, {0, 0}, 17, 1, "", 5, MKLDNNPlugin::impl_desc_type::jit,
+                                 {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd} },
                 conv_test_params{{1, 3, 40, 40},
-                                 3, 3, 1, 2, 0, 0, 20, 1, 5, MKLDNNPlugin::impl_desc_type::jit,
-                                 {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd}},
+                                 {3, 3}, {1, 2}, {0, 0}, {0, 0}, 20, 1, "", 5, MKLDNNPlugin::impl_desc_type::jit,
+                                 {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd} },
                 conv_test_params{{1, 1, 40, 40},
-                                 3, 3, 1, 2, 0, 0, 20, 1, 5, MKLDNNPlugin::impl_desc_type::jit,
-                                 {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd}},
+                                 {3, 3}, {1, 2}, {0, 0}, {0, 0}, 20, 1, "", 5, MKLDNNPlugin::impl_desc_type::jit,
+                                 {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd} },
                 conv_test_params{{1, 1, 32, 16},
-                                 2, 4, 2, 1, 0, 0, 17, 1, 5, MKLDNNPlugin::impl_desc_type::jit,
-                                 {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd}},
+                                 {2, 4}, {2, 1}, {0, 0}, {0, 0}, 17, 1, "", 5, MKLDNNPlugin::impl_desc_type::jit,
+                                 {MKLDNNPlugin::impl_desc_type::jit_avx512_winograd} },
                 conv_test_params{{1, 9, 16, 32},
-                                 1, 1, 1, 1, 0, 0, 17, 1, 7, MKLDNNPlugin::impl_desc_type::gemm,
+                                 {1, 1}, {1, 1}, {0, 0}, {0, 0}, 17, 1, "", 7, MKLDNNPlugin::impl_desc_type::gemm,
                                  {MKLDNNPlugin::impl_desc_type::gemm_any,
                                   MKLDNNPlugin::impl_desc_type::gemm_blas,
                                   MKLDNNPlugin::impl_desc_type::gemm_avx512,
@@ -413,4 +524,4 @@ INSTANTIATE_TEST_CASE_P(
                                   MKLDNNPlugin::impl_desc_type::gemm_sse42}
                 },
                 conv_test_params{{1, 9, 32, 16},
-                                 2, 4, 1, 1, 0, 0, 17, 1, 5, MKLDNNPlugin::impl_desc_type::ref_any, {MKLDNNPlugin::impl_desc_type::ref_any} }));
+                                 {2, 4}, {1, 1}, {0, 0}, {0, 0}, 17, 1, "", 5, MKLDNNPlugin::impl_desc_type::ref_any, {MKLDNNPlugin::impl_desc_type::ref_any} }));
index 86dbb0b..b263511 100644
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "tests_common.hpp"
 
 
+using namespace InferenceEngine;
 using namespace ::testing;
 using namespace std;
 using namespace mkldnn;
 
 
 struct deconv_test_params {
-    struct {
-        size_t n;
-        size_t c;
-        size_t h;
-        size_t w;
-    } in;
-
-    size_t krn_w;
-    size_t krn_h;
-    size_t str_w;
-    size_t str_h;
-    size_t pad_w;
-    size_t pad_h;
+    // Formats: NCHW, NCDHW
+    vector<size_t> dims;
+    // Formats: WH, WHD (X-first; also applies to strides and pads)
+    vector<size_t> kernel;
+    vector<size_t> strides;
+    vector<size_t> pads_begin;
+    vector<size_t> pads_end;
 
     size_t out_c;
     size_t grp_c;
 
     bool with_bias;
+    string auto_pad;
 
     size_t num_prim_desc;
 
@@ -51,19 +46,24 @@ struct deconv_test_params {
 template <typename data_t>
 void ref_deconv(const InferenceEngine::TBlob<data_t> &src, const InferenceEngine::Blob::Ptr &weights, const InferenceEngine::Blob::Ptr &bias,
                 InferenceEngine::TBlob<data_t> &dst, deconv_test_params prm) {
+    auto dims_size = src.dims().size();
 
     size_t G  = prm.grp_c;
-    size_t KW = prm.krn_w;
-    size_t KH = prm.krn_h;
+    size_t KW = prm.kernel[X_AXIS];
+    size_t KH = prm.kernel[Y_AXIS];
+    size_t KD = prm.kernel.size() > Z_AXIS ? prm.kernel[Z_AXIS] : 1u;
 
-    size_t PW = prm.pad_w;
-    size_t PH = prm.pad_h;
+    size_t PW = prm.pads_begin[X_AXIS];
+    size_t PH = prm.pads_begin[Y_AXIS];
+    size_t PD = prm.pads_begin.size() > Z_AXIS ? prm.pads_begin[Z_AXIS] : 0u;
 
-    size_t SW = prm.str_w;
-    size_t SH = prm.str_h;
+    size_t SW = prm.strides[X_AXIS];
+    size_t SH = prm.strides[Y_AXIS];
+    size_t SD = prm.strides.size() > Z_AXIS ? prm.strides[Z_AXIS] : 1u;
 
-    size_t IW = src.dims()[3];
-    size_t IH = src.dims()[2];
+    size_t IW = src.dims()[dims_size - 1];
+    size_t IH = src.dims()[dims_size - 2];
+    size_t ID = dims_size == 5 ? src.dims()[dims_size - 3] : 1u;
     size_t IC = src.dims()[1];
     size_t MB = src.dims()[0];
 
@@ -71,6 +71,7 @@ void ref_deconv(const InferenceEngine::TBlob<data_t> &src, const InferenceEngine
 
     size_t OW = SW * (IW - 1) + KW - 2 * PW;
     size_t OH = SH * (IH - 1) + KH - 2 * PH;
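+    // Depth uses the same transposed-convolution size formula, collapsing to 1 for 4D inputs.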
+    size_t OD = dims_size == 5 ? (SD * (ID - 1) + KD - 2 * PD) : 1u;
 
     const data_t *src_data = src.readOnly();
     const data_t *weights_data = weights->buffer().as<data_t*>();
@@ -78,43 +79,69 @@ void ref_deconv(const InferenceEngine::TBlob<data_t> &src, const InferenceEngine
 
     data_t *dst_data = dst.data();
 
-    for (int g = 0; g < G; ++g) {
-        for (int mb = 0; mb < MB; ++mb) {
-            for (int oc = 0; oc < OC / G; ++oc) {
-                for (int oh = 0; oh < OH; ++oh) {
-                    for (int ow = 0; ow < OW; ++ow) {
-                        size_t didx = mb * OC * OH * OW
-                                      + (g * OC / G + oc) * OH * OW + oh * OW + ow;
-
-                        dst_data[didx] = data_t(0);
-                        if (prm.with_bias) dst_data[didx] += bias_data[oc];
-
-                        for (int ic = 0; ic < IC / G; ic++) {
-                            for (int kh = 0; kh < KH; kh++) {
-                                for (int kw = 0; kw < KW; kw++) {
-                                    if (ow + PW < kw || oh + PH < kh)
-                                        continue;
-
-                                    size_t iw = ow - kw + PW;
-                                    size_t ih = oh - kh + PH;
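+    // Flattened strides: CS* index the dst tensor, CI* the src tensor, CK* the weights.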
+    size_t CS1 = OH * OW;
+    size_t CS2 = CS1 * OD;
+    size_t CS3 = CS2 * OC;
 
-                                    if (iw % SW != 0 || ih % SH != 0)
-                                        continue;
+    size_t CI1 = IH * IW;
+    size_t CI2 = CI1 * ID;
+    size_t CI3 = CI2 * IC;
 
-                                    iw /= SW;
-                                    ih /= SH;
+    size_t CK1 = KH * KW;
+    size_t CK2 = CK1 * KD;
+    size_t CK3 = CK2 * (OC / G);
+    size_t CK4 = CK3 * (IC / G);
 
-                                    if (ih < IH && iw < IW) {
-                                        size_t sidx = mb * IC * IH * IW
-                                                      + (g * IC / G + ic) * IH * IW + ih * IW
-                                                      + iw;
-
-                                        size_t widx = g * (IC / G) * (OC / G) * KH * KW +
-                                                      ic * (OC / G) * KH * KW +
-                                                      + oc * KH * KW + kh * KW
-                                                      + kw;
-
-                                        dst_data[didx] += src_data[sidx] * weights_data[widx];
+    for (int g = 0; g < G; ++g) {
+        for (int mb = 0; mb < MB; ++mb) {
+            for (int oc = 0; oc < OC / G; ++oc) {
+                for (int od = 0; od < OD; ++od) {
+                    for (int oh = 0; oh < OH; ++oh) {
+                        for (int ow = 0; ow < OW; ++ow) {
+                            size_t didx = mb * CS3
+                                          + (g * OC / G + oc) * CS2
+                                          + od * CS1
+                                          + oh * OW
+                                          + ow;
+
+                            dst_data[didx] = data_t(0);
+                            if (prm.with_bias) dst_data[didx] += bias_data[g * OC / G + oc];
+
+                            for (int ic = 0; ic < IC / G; ic++) {
+                                for (int kd = 0; kd < KD; kd++) {
+                                    for (int kh = 0; kh < KH; kh++) {
+                                        for (int kw = 0; kw < KW; kw++) {
+                                            if (ow + PW < kw || oh + PH < kh || od + PD < kd)
+                                                continue;
+
+                                            size_t iw = ow - kw + PW;
+                                            size_t ih = oh - kh + PH;
+                                            size_t id = od - kd + PD;
+
+                                            if (iw % SW != 0 || ih % SH != 0 || id % SD != 0)
+                                                continue;
+
+                                            iw /= SW;
+                                            ih /= SH;
+                                            id /= SD;
+
+                                            if (ih < IH && iw < IW && id < ID) {
+                                                size_t sidx = mb * CI3
+                                                              + (g * IC / G + ic) * CI2
+                                                              + id * CI1
+                                                              + ih * IW
+                                                              + iw;
+
+                                                size_t widx = g * CK4
+                                                              + ic * CK3
+                                                              + oc * CK2
+                                                              + kd * CK1
+                                                              + kh * KW
+                                                              + kw;
+
+                                                dst_data[didx] += src_data[sidx] * weights_data[widx];
+                                            }
+                                        }
                                     }
                                 }
                             }
@@ -128,42 +155,32 @@ void ref_deconv(const InferenceEngine::TBlob<data_t> &src, const InferenceEngine
 
 class MKLDNNGraphDeconvolutionalTests: public TestsCommon,
                                      public WithParamInterface<deconv_test_params> {
-    std::string model_t = R"V0G0N(
-<Net Name="Deconvolution_Only" version="2" precision="FP32" batch="1">
+    std::string model_t_5D = R"V0G0N(
+<net name="Deconvolution_Only" version="3" precision="FP32" batch="1">
     <layers>
         <layer name="in1" type="Input" precision="FP32" id="0">
             <output>
-                <port id="0">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                <port id="0">__SRC_DIMS__
                 </port>
             </output>
         </layer>
         <layer name="deconv1" id="1" type="Deconvolution" precision="FP32">
-            <deconvolution stride-x="_SW_" stride-y="_SH_"
-                         pad-x="_PW_"    pad-y="_PH_"
-                         kernel-x="_KW_" kernel-y="_KH_"
-                         output="_OC_"   group="_GC_"/>
+            <deconvolution _AP_ kernel="_K_"
+                         pads_begin="_PB_"  pads_end="_PE_"
+                         strides="_KS_"
+                         output="_OC_" group="_GC_" PrimitivesPriority="_IMPLS_"/>
 
             <weights offset="0" size="_S1_" />
-            <biases offset="_OFF2_" size="_S2_" />
+            <biases offset="_S1_" size="_S2_" />
 
             <input>
-                <port id="1">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                <port id="1">__SRC_DIMS__
                 </port>
             </input>
             <output>
                 <port id="2">
                     <dim>_IN_</dim>
-                    <dim>_OC_</dim>
-                    <dim>_OH_</dim>
-                    <dim>_OW_</dim>
+                    <dim>_OC_</dim>__DST_DIMS__
                 </port>
             </output>
         </layer>
@@ -171,38 +188,61 @@ class MKLDNNGraphDeconvolutionalTests: public TestsCommon,
     <edges>
         <edge from-layer="0" from-port="0" to-layer="1" to-port="1"/>
     </edges>
-</Net>
+</net>
 )V0G0N";
 
 protected:
     std::string getModel(deconv_test_params p) {
-        std::string model = model_t;
-
-        REPLACE_WITH_NUM(model, "_IW_", p.in.w);
-        REPLACE_WITH_NUM(model, "_IH_", p.in.h);
-        REPLACE_WITH_NUM(model, "_IC_", p.in.c);
-        REPLACE_WITH_NUM(model, "_IN_", p.in.n);
+        std::string model = model_t_5D;
+        auto dims_size = p.dims.size();
+        std::string s_dims;
+        for (auto& dim : p.dims) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__SRC_DIMS__", s_dims);
+
+        s_dims = "";
+        int k_len = p.kernel.size();
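+        // Same X-first ordering as the convolution test: spatial dim i maps to kernel index k_len - i + 1.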
+        for (size_t i = 2; i < p.dims.size(); i++) {
+            size_t inx = k_len - i + 1;
+            size_t dim = p.strides[inx] * (p.dims[i] - 1) + p.kernel[inx] - 2 * p.pads_begin[inx];
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+       REPLACE_WITH_STR(model, "__DST_DIMS__", s_dims);
+        REPLACE_WITH_NUM(model, "_IN_", p.dims[0]);
 
-        REPLACE_WITH_NUM(model, "_KW_", p.krn_w);
-        REPLACE_WITH_NUM(model, "_KH_", p.krn_h);
-        REPLACE_WITH_NUM(model, "_SW_", p.str_w);
-        REPLACE_WITH_NUM(model, "_SH_", p.str_h);
-        REPLACE_WITH_NUM(model, "_PW_", p.pad_w);
-        REPLACE_WITH_NUM(model, "_PH_", p.pad_h);
+        if (!p.with_bias) REMOVE_LINE(model, "<biases offset=\"_S1_\" size=\"_S2_\" />");
 
+        REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_K_", p.kernel);
+        REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_KS_", p.strides);
+        REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_PB_", p.pads_begin);
+        REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_PE_", p.pads_end);
         REPLACE_WITH_NUM(model, "_GC_", p.grp_c);
         REPLACE_WITH_NUM(model, "_OC_", p.out_c);
-        REPLACE_WITH_NUM(model, "_OH_", p.str_h * (p.in.h - 1) + p.krn_h - 2 * p.pad_h);
-        REPLACE_WITH_NUM(model, "_OW_", p.str_w * (p.in.w - 1) + p.krn_w - 2 * p.pad_w);
+        string auto_pad;
+        if (!p.auto_pad.empty()) auto_pad = string("auto_pad=") + string("\"") + p.auto_pad + string("\"");
+        REPLACE_WITH_STR(model, "_AP_", auto_pad);
 
-        size_t w_data_size = (p.krn_w * p.krn_h * p.out_c * (p.in.c / p.grp_c)) * sizeof(float);
+        size_t blob_size = p.out_c * (p.dims[1] / p.grp_c);
+        for (auto k : p.kernel) {
+            blob_size *= k;
+        }
+        size_t w_data_size = blob_size * sizeof(float);
         REPLACE_WITH_NUM(model, "_S1_", w_data_size);
 
-        if (!p.with_bias) REMOVE_LINE(model, "<biases offset=\"_OFF2_\" size=\"_S2_\" />");
         size_t b_data_size = p.out_c * sizeof(float);
-        REPLACE_WITH_NUM(model, "_OFF2_", w_data_size);
         REPLACE_WITH_NUM(model, "_S2_", b_data_size);
 
+        std::string impls;
+        for (const auto& preferType : p.preferTypes) {
+            if (!impls.empty())
+                impls += ",";
+            impls += "cpu:" + MKLDNNGraphTestClass::getStrPrimitiveDescriptorType(preferType);
+        }
+        REPLACE_WITH_STR(model, "_IMPLS_", impls);
+
         return model;
     }
 
@@ -218,7 +258,11 @@ protected:
             InferenceEngine::CNNNetReader net_reader;
             ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
 
-            InferenceEngine::SizeVector dims_weights = {p.krn_w * p.krn_h * p.out_c * (p.in.c / p.grp_c)};
+            size_t blob_size = p.out_c * (p.dims[1] / p.grp_c);
+            for (auto k : p.kernel) {
+                blob_size *= k;
+            }
+            InferenceEngine::SizeVector dims_weights = { blob_size };
 
             std::vector<InferenceEngine::Blob::Ptr> blob_to_model;
             InferenceEngine::Blob::Ptr weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32, InferenceEngine::C, dims_weights);
@@ -262,9 +306,18 @@ protected:
                 }
             }
 
-            InferenceEngine::SizeVector dims_src = {p.in.n, p.in.c, p.in.h, p.in.w};
+            InferenceEngine::SizeVector dims_src = p.dims;
 
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Layout layout = ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
+            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src->allocate();
             fill_data(src->buffer(), src->size());
 
@@ -294,7 +347,7 @@ protected:
 
             ref_deconv(*srcPtr, weights, bias, dst_ref, p);
 
-            compare(*output, dst_ref);
+            compare(*output, dst_ref, 0.0002f);
         } catch (const InferenceEngine::details::InferenceEngineException &e) {
             FAIL() << e.what();
         }
@@ -307,24 +360,59 @@ TEST_P(MKLDNNGraphDeconvolutionalTests, TestsDeconvolution) {}
 INSTANTIATE_TEST_CASE_P(
         TestDeconvolution, MKLDNNGraphDeconvolutionalTests,
         ::testing::Values(
-                deconv_test_params{{1, 3, 3, 3}, 3, 3, 1, 1, 0, 0, 2, 1, false, 2, {MKLDNNPlugin::impl_desc_type::jit} },
-                deconv_test_params{{3, 3, 3, 3}, 4, 3, 1, 1, 0, 0, 2, 1, false, 2, {MKLDNNPlugin::impl_desc_type::jit} },
-                deconv_test_params{{1, 3, 3, 3}, 4, 3, 1, 2, 0, 0, 2, 1, false, 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
-                deconv_test_params{{1, 3, 3, 3}, 4, 3, 2, 2, 0, 0, 2, 1, false, 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
-                deconv_test_params{{4, 17, 3, 3}, 4, 3, 2, 2, 0, 0, 2, 1, false, 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
-                /*deconv_test_params{{2, 8, 5, 5}, 4, 4, 2, 2, 1, 1, 8, 2, false, 3, {MKLDNNPlugin::impl_desc_type::gemm}},*/
-                deconv_test_params{{2, 8, 5, 5}, 4, 4, 2, 2, 1, 1, 8, 8, false, 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
-                deconv_test_params{{2, 8, 5, 5}, 8, 8, 4, 4, 1, 1, 8, 8, false, 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
-                deconv_test_params{{2, 8, 5, 5}, 4, 8, 2, 4, 1, 1, 8, 8, false, 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
-                deconv_test_params{{1, 3, 3, 3}, 3, 3, 1, 1, 0, 0, 2, 1, true, 2, {MKLDNNPlugin::impl_desc_type::jit} },
-                deconv_test_params{{3, 3, 3, 3}, 4, 3, 1, 1, 0, 0, 2, 1, true, 2, {MKLDNNPlugin::impl_desc_type::jit} },
-                deconv_test_params{{1, 3, 3, 3}, 4, 3, 1, 2, 0, 0, 2, 1, true, 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
-                deconv_test_params{{1, 3, 3, 3}, 4, 3, 2, 2, 0, 0, 2, 1, true, 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
-                deconv_test_params{{4, 17, 3, 3}, 4, 3, 2, 2, 0, 0, 2, 1, true, 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
-                /*deconv_test_params{{2, 8, 5, 5}, 4, 4, 2, 2, 1, 1, 8, 2, true, 3, {MKLDNNPlugin::impl_desc_type::gemm}},*/
-                deconv_test_params{{2, 8, 5, 5}, 4, 4, 2, 2, 1, 1, 8, 8, true, 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
-                deconv_test_params{{2, 8, 5, 5}, 8, 8, 4, 4, 1, 1, 8, 8, true, 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
-                deconv_test_params{{2, 8, 5, 5}, 4, 8, 2, 4, 1, 1, 8, 8, true, 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}}
+        /*0*/   deconv_test_params{{1, 3, 3, 3}, {3, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{3, 3, 3, 3}, {4, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{1, 3, 3, 3}, {4, 3}, {1, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{1, 3, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{4, 17, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{2, 8, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 8, 8, false, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
+                deconv_test_params{{2, 8, 5, 5}, {8, 8}, {4, 4}, {1, 1}, {0, 0}, 8, 8, false, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
+                deconv_test_params{{2, 8, 5, 5}, {4, 8}, {2, 4}, {1, 1}, {0, 0}, 8, 8, false, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
+        /*8*/   deconv_test_params{{1, 3, 3, 3}, {3, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, true, "", 2, {MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{3, 3, 3, 3}, {4, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, true, "", 2, {MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{1, 3, 3, 3}, {4, 3}, {1, 2}, {0, 0}, {0, 0}, 2, 1, true, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{1, 3, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, true, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{4, 17, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, true, "", 2, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{2, 8, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 8, 8, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
+                deconv_test_params{{2, 8, 5, 5}, {8, 8}, {4, 4}, {1, 1}, {0, 0}, 8, 8, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
+                deconv_test_params{{2, 8, 5, 5}, {4, 8}, {2, 4}, {1, 1}, {0, 0}, 8, 8, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
+                deconv_test_params{{1, 3, 3, 3}, {3, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, false, "", 2, {MKLDNNPlugin::impl_desc_type::ref_any}, 
+                                    {MKLDNNPlugin::impl_desc_type::ref_any}},
+        /*17*/  deconv_test_params{{2, 8, 5, 5}, {1, 3}, {1, 1}, {0, 1}, {0, 1}, 8, 8, true, "", 2,
+                    {MKLDNNPlugin::impl_desc_type::ref_any}, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                deconv_test_params{{1, 6, 6, 5}, {3, 1}, {1, 1}, {1, 0}, {1, 0}, 9, 3, true, "", 2,
+                    {MKLDNNPlugin::impl_desc_type::ref_any}, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                deconv_test_params{{2, 24, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 24, 3, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit}},
+                deconv_test_params{{2, 24, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 24, 1, true, "", 3, {MKLDNNPlugin::impl_desc_type::jit}},
+                deconv_test_params{{2, 72, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 72, 3, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit}},
+                deconv_test_params{{1, 12, 2, 2}, {4, 4}, {2, 2}, {1, 1}, {1, 1}, 12, 12, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit}},
+#ifdef USE_MKL
+                deconv_test_params{{2, 8, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 8, 2, false, "", 3, {MKLDNNPlugin::impl_desc_type::gemm}},
+                deconv_test_params{{2, 8, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 8, 2, true, "", 3, {MKLDNNPlugin::impl_desc_type::gemm}},
+                deconv_test_params{{1, 6, 6, 5}, {3, 1}, {1, 1}, {1, 0}, {1, 0}, 9, 3, true, "", 2,
+                    {MKLDNNPlugin::impl_desc_type::gemm_blas}},
+                deconv_test_params{{1, 64, 12, 12, 2}, {2, 2, 2}, {2, 2, 2}, {0, 0, 0}, {1, 0, 0}, 32, 1, true, "", 4,
+                    {MKLDNNPlugin::impl_desc_type::gemm_blas}},
+                deconv_test_params{{1, 32, 12, 12, 2}, {2, 2, 2}, {2, 2, 2}, {0, 0, 0}, {1, 0, 0}, 16, 1, true, "", 4, 
+                    {MKLDNNPlugin::impl_desc_type::gemm_blas} },
+                deconv_test_params{{1, 25, 1, 1, 1}, {4, 4, 4}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}, 64, 1, true, "valid", 3,
+                    {MKLDNNPlugin::impl_desc_type::gemm_blas} },
+                deconv_test_params{{1, 32, 16, 16, 16}, {4, 4, 4}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, 1, 1, true, "same_upper", 3,
+                    {MKLDNNPlugin::impl_desc_type::gemm_blas} },
+                deconv_test_params{{1, 64, 12, 12, 2}, {2, 2, 2}, {2, 2, 2}, {0, 0, 0}, {1, 0, 0}, 32, 1, true, "same_upper", 3,
+                    {MKLDNNPlugin::impl_desc_type::gemm_blas} },
+                deconv_test_params{{1, 50, 1, 1, 1}, {4, 4, 4}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}, 128, 1, true, "", 3,
+                    {MKLDNNPlugin::impl_desc_type::gemm_blas}, {MKLDNNPlugin::impl_desc_type::gemm_blas}},
+#endif
+                // 5D
+                deconv_test_params{{1, 2, 8, 5, 5}, {3, 3, 3}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}, 4, 1, true, "", 4,
+                    {MKLDNNPlugin::impl_desc_type::ref_any}, {MKLDNNPlugin::impl_desc_type::ref_any} }
+
+                // Blocked, with biases
+                // TODO: enable once jit supports 5D blocked deconvolution
+//                deconv_test_params{{2, 24, 5, 5, 5}, {4, 4, 4}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, 24, 3, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit}},
+//                deconv_test_params{{2, 24, 5, 5, 5}, {4, 4, 4}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, 24, 1, true, "", 3, {MKLDNNPlugin::impl_desc_type::jit}},
+//                deconv_test_params{{2, 72, 5, 5, 5}, {4, 4, 4}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, 72, 3, true, "", 4, {MKLDNNPlugin::impl_desc_type::jit}}
         ));
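
Each initializer above now reads, in order: input dims, kernel, strides, pads_begin, pads_end, out_c, group count, with_bias, auto_pad, num_prim_desc, and one or two lists of implementation types. A hedged reconstruction of the fields these braces fill (names inferred from their use in getModel; the authoritative definition appears earlier in this file):

    #include <string>
    #include <vector>

    // Assumed layout of deconv_test_params, inferred from the initializers.
    struct deconv_test_params_sketch {
        std::vector<size_t> dims;        // NCHW or NCDHW input shape
        std::vector<size_t> kernel;      // 2D or 3D kernel extents
        std::vector<size_t> strides;
        std::vector<size_t> pads_begin;
        std::vector<size_t> pads_end;
        size_t out_c;                    // output channels
        size_t grp_c;                    // group count
        bool with_bias;
        std::string auto_pad;            // "", "valid", "same_upper", ...
        size_t num_prim_desc;            // expected primitive descriptors
        std::vector<MKLDNNPlugin::impl_desc_type> selectedTypes;
        std::vector<MKLDNNPlugin::impl_desc_type> preferTypes;  // optional
    };
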
 
 class MKLDNNGraphDynBatchDeconvolutionalTests: public MKLDNNGraphDeconvolutionalTests {
@@ -334,14 +422,18 @@ protected:
             TestsCommon::SetUp();
             deconv_test_params p = ::testing::WithParamInterface<deconv_test_params>::GetParam();
             std::string model = getModel(p);
-            size_t MB = p.in.n;
+            size_t MB = p.dims[0];
             if (MB < 2)
                 MB = 2;
 
             InferenceEngine::CNNNetReader net_reader;
             ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
-
-            InferenceEngine::SizeVector dims_weights = {p.krn_w * p.krn_h * p.out_c * (p.in.c / p.grp_c)};
+            
+            size_t blob_size = 1;
+            for (auto k : p.kernel) {
+                blob_size *= k;
+            }
+            InferenceEngine::SizeVector dims_weights = {blob_size * p.out_c * (p.dims[1] / p.grp_c)};
 
             std::vector<InferenceEngine::Blob::Ptr> blob_to_model;
             InferenceEngine::Blob::Ptr weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32, InferenceEngine::C, dims_weights);
@@ -379,8 +471,18 @@ protected:
             graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}});
             graph.CreateGraph(net_reader.getNetwork());
 
-            InferenceEngine::SizeVector dims_src = {MB, p.in.c, p.in.h, p.in.w};
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::SizeVector dims_src = p.dims;
+
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
+            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             InferenceEngine::TBlob<float>* srcPtr = dynamic_cast<InferenceEngine::TBlob<float>*>(src.get());
             if (srcPtr == nullptr)
                 FAIL() << "Cannot cast blob to TBlob<float>.";
@@ -419,13 +521,13 @@ TEST_P(MKLDNNGraphDynBatchDeconvolutionalTests, TestsDynBatchDeconvolutional) {}
 INSTANTIATE_TEST_CASE_P(
         TestsDynBatchDeconvolutional, MKLDNNGraphDynBatchDeconvolutionalTests,
         ::testing::Values(
-                deconv_test_params{{1, 3, 3, 3}, 3, 3, 1, 1, 0, 0, 2, 1, false, 5, {MKLDNNPlugin::impl_desc_type::jit} },
-                deconv_test_params{{3, 3, 3, 3}, 4, 3, 1, 1, 0, 0, 2, 1, false, 5, {MKLDNNPlugin::impl_desc_type::jit} },
-                deconv_test_params{{1, 3, 3, 3}, 4, 3, 1, 2, 0, 0, 2, 1, false, 4, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
-                deconv_test_params{{1, 3, 3, 3}, 4, 3, 2, 2, 0, 0, 2, 1, false, 3, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
-                deconv_test_params{{4, 17, 3, 3}, 4, 3, 2, 2, 0, 0, 2, 1, false, 3, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
-                deconv_test_params{{2, 8, 5, 5}, 4, 4, 2, 2, 1, 1, 8, 2, false, 3, {MKLDNNPlugin::impl_desc_type::gemm}},
-                deconv_test_params{{2, 8, 5, 5}, 4, 4, 2, 2, 1, 1, 8, 8, false, 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
-                deconv_test_params{{2, 8, 5, 5}, 8, 8, 4, 4, 1, 1, 8, 8, false, 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
-                deconv_test_params{{2, 8, 5, 5}, 4, 8, 2, 4, 1, 1, 8, 8, false, 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}}
+                deconv_test_params{{1, 3, 3, 3}, {3, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, false, "", 5, {MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{3, 3, 3, 3}, {4, 3}, {1, 1}, {0, 0}, {0, 0}, 2, 1, false, "", 5, {MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{1, 3, 3, 3}, {4, 3}, {1, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 4, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{1, 3, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 3, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{4, 17, 3, 3}, {4, 3}, {2, 2}, {0, 0}, {0, 0}, 2, 1, false, "", 3, {MKLDNNPlugin::impl_desc_type::gemm, MKLDNNPlugin::impl_desc_type::jit} },
+                deconv_test_params{{2, 8, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 8, 2, false, "", 3, {MKLDNNPlugin::impl_desc_type::gemm}},
+                deconv_test_params{{2, 8, 5, 5}, {4, 4}, {2, 2}, {1, 1}, {0, 0}, 8, 8, false, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
+                deconv_test_params{{2, 8, 5, 5}, {8, 8}, {4, 4}, {1, 1}, {0, 0}, 8, 8, false, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}},
+                deconv_test_params{{2, 8, 5, 5}, {4, 8}, {2, 4}, {1, 1}, {0, 0}, 8, 8, false, "", 4, {MKLDNNPlugin::impl_desc_type::jit | MKLDNNPlugin::impl_desc_type::_dw}}
         ));
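
The same rank-to-layout switch recurs in every SetUp in this series. A hypothetical helper (not introduced by this patch) would make the mapping explicit while keeping ANY as the fallback for unsupported ranks:

    // Hypothetical refactoring sketch: map tensor rank to a layout.
    static InferenceEngine::Layout layoutForRank(size_t rank) {
        switch (rank) {
            case 4: return InferenceEngine::NCHW;
            case 5: return InferenceEngine::NCDHW;
            default: return InferenceEngine::ANY;
        }
    }
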
index 4f07b8b..27bd241 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -19,12 +18,8 @@ using namespace mkldnn;
 struct depthwise_test_params {
     mkldnn::algorithm alg;
 
-    struct {
-        size_t n;
-        size_t c;
-        size_t h;
-        size_t w;
-    } in;
+    // Formats: NCHW, NCDHW
+    vector<size_t> dims;
 
     bool isBroadcast;
 
@@ -39,8 +34,11 @@ struct depthwise_test_params {
 template <typename data_t>
 void ref_depthwise(const InferenceEngine::TBlob<data_t> &src, const data_t *weights, const size_t weightsSize,
                    InferenceEngine::TBlob<data_t> &dst, depthwise_test_params prm) {
-    size_t IW = src.dims()[3];
-    size_t IH = src.dims()[2];
+    auto dims_size = src.dims().size();
+
+    size_t IW = src.dims()[dims_size - 1];
+    size_t IH = src.dims()[dims_size - 2];
+    size_t ID = dims_size == 5 ? src.dims()[2] : 1u;
     size_t IC = src.dims()[1];
     size_t MB = src.dims()[0];
 
@@ -50,21 +48,28 @@ void ref_depthwise(const InferenceEngine::TBlob<data_t> &src, const data_t *weig
     const data_t *bias_data = weights_data + bias_offset;
     data_t *dst_data = dst.data();
 
-    for(int mb = 0; mb < MB; mb++) {
-        for(int c = 0; c < IC; c++) {
-            for(int h = 0; h < IH; h++) {
-                for(int w = 0; w < IW; w++) {
-                    int idx = mb * IC * IH * IW
-                              + c * IH * IW
-                              + h * IW + w;
-
-                    int widx = prm.isBroadcast ? 0 : c;
-                    int bidx = prm.isBroadcast ? 0 : c;
-
-                    if (prm.alg == depthwise_scale_shift)
-                        dst_data[idx] = src_data[idx] * weights_data[widx] + bias_data[bidx];
-                    else if (prm.alg == depthwise_prelu)
-                        dst_data[idx] = src_data[idx] > 0 ? src_data[idx] : src_data[idx]*weights_data[widx];
+    // NCDHW strides: c1 = one depth slice, c2 = one channel, c3 = one batch.
+    size_t c1 = IH * IW;
+    size_t c2 = ID * c1;
+    size_t c3 = IC * c2;
+    for (int mb = 0; mb < MB; mb++) {
+        size_t m1 = mb * c3;
+        for (int c = 0; c < IC; c++) {
+            size_t m2 = m1 + c * c2;
+            for (int d = 0; d < ID; d++) {
+                size_t m3 = m2 + d * c1;
+                for (int h = 0; h < IH; h++) {
+                    size_t m4 = m3 + h * IW;
+                    for (int w = 0; w < IW; w++) {
+                        int idx = m4 + w;
+
+                        int widx = prm.isBroadcast ? 0 : c;
+                        int bidx = prm.isBroadcast ? 0 : c;
+
+                        if (prm.alg == depthwise_scale_shift)
+                            dst_data[idx] = src_data[idx] * weights_data[widx] + bias_data[bidx];
+                        else if (prm.alg == depthwise_prelu)
+                            dst_data[idx] = src_data[idx] > 0 ? src_data[idx] : src_data[idx]*weights_data[widx];
+                    }
                 }
             }
         }
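
A minimal self-check, assuming NCDHW ordering, that the factored strides above reproduce the canonical linear index (standalone sketch, not part of the patch):

    #include <cassert>
    #include <cstddef>

    int main() {
        const size_t IC = 3, ID = 4, IH = 5, IW = 6;
        const size_t c1 = IH * IW, c2 = ID * c1, c3 = IC * c2;
        for (size_t mb = 0; mb < 2; mb++)
            for (size_t c = 0; c < IC; c++)
                for (size_t d = 0; d < ID; d++)
                    for (size_t h = 0; h < IH; h++)
                        for (size_t w = 0; w < IW; w++)
                            assert(mb * c3 + c * c2 + d * c1 + h * IW + w ==
                                   (((mb * IC + c) * ID + d) * IH + h) * IW + w);
        return 0;
    }
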
@@ -73,7 +78,7 @@ void ref_depthwise(const InferenceEngine::TBlob<data_t> &src, const data_t *weig
 
 class MKLDNNGraphDepthwiseTests: public TestsCommon,
                                      public WithParamInterface<depthwise_test_params> {
-    std::string model_t = R"V0G0N(
+    std::string model_t_4D = R"V0G0N(
 <Net Name="Lrn_Only" version="2" precision="FP32" batch="1">
     <layers>
         <layer name="in1" type="Input" precision="FP32" id="0">
@@ -115,14 +120,72 @@ class MKLDNNGraphDepthwiseTests: public TestsCommon,
 </Net>
 )V0G0N";
 
+
+    std::string model_t_5D = R"V0G0N(
+<Net Name="Lrn_Only" version="2" precision="FP32" batch="1">
+    <layers>
+        <layer name="in1" type="Input" precision="FP32" id="0">
+            <output>
+                <port id="0">
+                    <dim>_IN_</dim>
+                    <dim>_IC_</dim>
+                    <dim>_ID_</dim>
+                    <dim>_IH_</dim>
+                    <dim>_IW_</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="depthwise" id="1" type="_LT_" precision="FP32">
+            <data _P_NAME_="_P_VAL_"  PrimitivesPriority="_IMPLS_"/>
+            <weights offset="0" size="_S1_" />
+            <biases offset="_S1_" size="_S2_" />
+
+            <input>
+                <port id="1">
+                    <dim>_IN_</dim>
+                    <dim>_IC_</dim>
+                    <dim>_ID_</dim>
+                    <dim>_IH_</dim>
+                    <dim>_IW_</dim>
+                </port>
+            </input>
+            <output>
+                <port id="2">
+                    <dim>_IN_</dim>
+                    <dim>_IC_</dim>
+                    <dim>_ID_</dim>
+                    <dim>_IH_</dim>
+                    <dim>_IW_</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="0" from-port="0" to-layer="1" to-port="1"/>
+    </edges>
+</Net>
+)V0G0N";
+
 protected:
     std::string getModel(depthwise_test_params p) {
-        std::string model = model_t;
+        std::string model;
+        auto dims_size = p.dims.size();
+        if (dims_size == 4) {
+            model = model_t_4D;
+        } else if (dims_size == 5) {
+            model = model_t_5D;
+        }
+
+        REPLACE_WITH_NUM(model, "_IW_", p.dims[dims_size - 1]);
+        REPLACE_WITH_NUM(model, "_IC_", p.dims[1]);
+        REPLACE_WITH_NUM(model, "_IN_", p.dims[0]);
 
-        REPLACE_WITH_NUM(model, "_IW_", p.in.w);
-        REPLACE_WITH_NUM(model, "_IH_", p.in.h);
-        REPLACE_WITH_NUM(model, "_IC_", p.in.c);
-        REPLACE_WITH_NUM(model, "_IN_", p.in.n);
+        switch (dims_size) {
+            case 5:
+                REPLACE_WITH_NUM(model, "_ID_", p.dims[dims_size - 3]);
+                // fall through: 5D models substitute _IH_ as well
+            case 4:
+                REPLACE_WITH_NUM(model, "_IH_", p.dims[dims_size - 2]);
+        }
 
         if (p.alg == depthwise_scale_shift) {
             REPLACE_WITH_STR(model, "_LT_", "ScaleShift");
@@ -135,7 +198,7 @@ protected:
             REPLACE_WITH_NUM(model, "_P_VAL_", p.isBroadcast ? 1 : 0);
         }
 
-        size_t array_size =  p.isBroadcast ? 1 : p.in.c;
+        size_t array_size =  p.isBroadcast ? 1 : p.dims[1];
         size_t w_data_size = array_size * sizeof(float);
         size_t b_data_size = array_size * sizeof(float);
         REPLACE_WITH_NUM(model, "_S1_", w_data_size);
@@ -161,7 +224,7 @@ protected:
             InferenceEngine::CNNNetReader net_reader;
             ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
 
-            size_t weightSize = 2*p.in.c*sizeof(float);
+            size_t weightSize = 2 * p.dims[1] * sizeof(float);
             InferenceEngine::TBlob<uint8_t> *weights = new InferenceEngine::TBlob<uint8_t>(InferenceEngine::Precision::U8, InferenceEngine::C, {weightSize});
             weights->allocate();
             fill_data( weights->data().as<float*>(), weights->size() / sizeof(float));
@@ -185,9 +248,18 @@ protected:
                 }
             }
 
-            InferenceEngine::SizeVector dims_src = {p.in.n, p.in.c, p.in.h, p.in.w};
+            InferenceEngine::SizeVector dims_src = p.dims;
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src->allocate();
             fill_data(src->buffer(), src->size());
 
@@ -252,7 +324,17 @@ INSTANTIATE_TEST_CASE_P(
                 depthwise_test_params{depthwise_prelu, {1, 1, 1, 1}, false, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
                 depthwise_test_params{depthwise_prelu, {1, 4, 5, 5}, false, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
                 depthwise_test_params{depthwise_prelu, {4, 4, 10, 10}, true, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
-                depthwise_test_params{depthwise_prelu, {1, 32, 128, 256}, true, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}}
+                depthwise_test_params{depthwise_prelu, {1, 32, 128, 256}, true, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                // 5D
+                // mkl-dnn does not support 5D depthwise on jit yet
+//                depthwise_test_params{depthwise_scale_shift, {1, 32, 16, 128, 256}, false, 3, MKLDNNPlugin::impl_desc_type::jit},
+//                depthwise_test_params{depthwise_scale_shift, {4, 3, 16, 228, 228}, false, 3, MKLDNNPlugin::impl_desc_type::jit},
+//                depthwise_test_params{depthwise_scale_shift, {1, 1, 1, 1, 1}, false, 3, MKLDNNPlugin::impl_desc_type::jit},
+//                depthwise_test_params{depthwise_scale_shift, {4, 4, 4, 10, 10}, true, 3, MKLDNNPlugin::impl_desc_type::jit},
+//                depthwise_test_params{depthwise_scale_shift, {1, 32, 16, 128, 256}, false, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+//                depthwise_test_params{depthwise_scale_shift, {4, 3, 16, 228, 228}, false, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                depthwise_test_params{depthwise_scale_shift, {1, 1, 1, 1, 1}, false, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                depthwise_test_params{depthwise_scale_shift, {4, 4, 4, 10, 10}, true, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}}
         ));
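
Both depthwise algorithms reduce to a per-element formula with a per-channel weight (index 0 when isBroadcast is set): scale-shift computes x * w + b, while PReLU keeps positive inputs and scales negative ones. A scalar sketch of the two ops exercised above:

    // Scalar forms of the depthwise ops used by ref_depthwise.
    inline float depthwise_scale_shift_ref(float x, float w, float b) {
        return x * w + b;
    }
    inline float depthwise_prelu_ref(float x, float w) {
        return x > 0.f ? x : x * w;
    }
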
 
 class MKLDNNGraphDynBatchDepthwiseTests: public MKLDNNGraphDepthwiseTests {
@@ -263,14 +345,14 @@ protected:
             TestsCommon::SetUp();
             depthwise_test_params p = ::testing::WithParamInterface<depthwise_test_params>::GetParam();
             std::string model = getModel(p);
-            size_t MB = p.in.n;
+            size_t MB = p.dims[0];
             if (MB < 2)
                 MB = 2;
 
             InferenceEngine::CNNNetReader net_reader;
             ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
 
-            InferenceEngine::TBlob<uint8_t> *weights = new InferenceEngine::TBlob<uint8_t>(InferenceEngine::Precision::U8, InferenceEngine::C, {p.in.c * 4 * sizeof(float)});
+            InferenceEngine::TBlob<uint8_t> *weights = new InferenceEngine::TBlob<uint8_t>(InferenceEngine::Precision::U8, InferenceEngine::C, {p.dims[1] * 4 * sizeof(float)});
             weights->allocate();
             fill_data( weights->data().as<float*>(), weights->size() / sizeof(float));
             float * data = weights->buffer();
@@ -293,8 +375,17 @@ protected:
             graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}});
             graph.CreateGraph(net_reader.getNetwork());
 
-            InferenceEngine::SizeVector dims_src = {MB, p.in.c, p.in.h, p.in.w};
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::SizeVector dims_src = p.dims;
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
+            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             InferenceEngine::TBlob<float>* srcPtr = dynamic_cast<InferenceEngine::TBlob<float>*>(src.get());
             if (srcPtr == nullptr)
                 FAIL() << "Cannot cast blob to TBlob<float>.";
index ebb2df4..e1d288d 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -19,12 +18,8 @@ using namespace std;
 using namespace mkldnn;
 
 struct eltwise_test_params {
-    struct {
-        size_t n;
-        size_t c;
-        size_t h;
-        size_t w;
-    } in;
+    // Formats: NCHW, NCDHW
+    vector<size_t> dims;
 
     enum opType {
         Sum = 0, Prod = 1, Max = 2
@@ -100,66 +95,38 @@ void ref_eltwise(const std::vector<InferenceEngine::TBlob<data_t>> &src, Inferen
 class MKLDNNGraphEltwiseTests: public TestsCommon,
                                      public WithParamInterface<eltwise_test_params> {
     std::string model_t = R"V0G0N(
-<net name="EltwiseOnly" version="2" precision="FP32" batch="1">
+<net name="EltwiseOnly" version="3" precision="FP32" batch="1">
     <layers>
         <layer name="in1" type="Input" precision="FP32" id="1">
             <output>
-                <port id="1">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                <port id="1">__SRC_DIMS__
                 </port>
             </output>
         </layer>
         <layer name="in2" type="Input" precision="FP32" id="2">
             <output>
-                <port id="2">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                <port id="2">__SRC_DIMS__
                 </port>
             </output>
         </layer>
         <layer name="in3" type="Input" precision="FP32" id="3">
             <output>
-                <port id="3">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                <port id="3">__SRC_DIMS__
                 </port>
             </output>
         </layer>
         <layer name="con" id="4" type="Eltwise" precision="FP32">
-            <elementwise_data operation="_OP_" coeff="_COEFF_"/>
+            <data operation="_OP_" _COEFF_/>
             <input>
-                <port id="1">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                <port id="1">__SRC_DIMS__
                 </port>
-                <port id="2">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                <port id="2">__SRC_DIMS__
                 </port>
-                <port id="3">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                <port id="3">__SRC_DIMS__
                 </port>
             </input>
             <output>
-                <port id="4">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                <port id="4">__SRC_DIMS__
                 </port>
             </output>
         </layer>
@@ -185,12 +152,19 @@ protected:
             op = "max";
         }
 
-        REPLACE_WITH_NUM(model, "_IW_", p.in.w);
-        REPLACE_WITH_NUM(model, "_IH_", p.in.h);
-        REPLACE_WITH_NUM(model, "_IC_", p.in.c);
-        REPLACE_WITH_NUM(model, "_IN_", p.in.n);
+        std::string src_dims;
+        for (auto& dim : p.dims) {
+            src_dims += "\n                    <dim>";
+            src_dims += std::to_string(dim) + "</dim>";
+        }
+        REPLACE_WITH_STR(model, "__SRC_DIMS__", src_dims);
+
+        std::string scale;
+        if (!p.scales.empty()) {
+            scale = std::string("coeff=\"") + p.scales + std::string("\"");
+        }
         REPLACE_WITH_STR(model, "_OP_", op);
-        REPLACE_WITH_STR(model, "_COEFF_", p.scales);
+        REPLACE_WITH_STR(model, "_COEFF_", scale);
         return model;
     }
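
For dims {1, 3, 5, 5}, the loop above expands the __SRC_DIMS__ placeholder into one <dim> entry per axis, each on its own line with the indentation baked into the string literal:

    <dim>1</dim>
    <dim>3</dim>
    <dim>5</dim>
    <dim>5</dim>
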
 
@@ -221,9 +195,18 @@ protected:
                 }
             }
 
-            InferenceEngine::SizeVector dims_src = {p.in.n, p.in.c, p.in.h, p.in.w};
+            InferenceEngine::SizeVector dims_src = p.dims;
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src1->allocate();
 
             InferenceEngine::TBlob<float>* srcPtr1 = dynamic_cast<InferenceEngine::TBlob<float>*>(src1.get());
@@ -232,7 +215,7 @@ protected:
                 FAIL() << "Cannot cast blob to TBlob<float>.";
 
             fill_data(src1->buffer(), src1->size());
-            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src2->allocate();
 
             InferenceEngine::TBlob<float>* srcPtr2 = dynamic_cast<InferenceEngine::TBlob<float>*>(src2.get());
@@ -240,7 +223,7 @@ protected:
             if (srcPtr2 == nullptr)
                 FAIL() << "Cannot cast blob to TBlob<float>.";
             fill_data(src2->buffer(), src2->size());
-            InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src3->allocate();
 
             InferenceEngine::TBlob<float>* srcPtr3 = dynamic_cast<InferenceEngine::TBlob<float>*>(src3.get());
@@ -273,7 +256,7 @@ protected:
 
             ref_eltwise(src_vec, dst_ref, p);
 
-            compare(*output, dst_ref);
+            compare(*output, dst_ref, 0.0005f);
         } catch (const InferenceEngine::details::InferenceEngineException &e) {
             FAIL() << e.what();
         }
@@ -338,6 +321,17 @@ INSTANTIATE_TEST_CASE_P(
                             ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
                             ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
                         }
+                } },
+                eltwise_test_params{{1, 32, 16, 16, 16}, eltwise_test_params::opType::Sum, "", 3, MKLDNNPlugin::impl_desc_type::ref, {
+                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
+                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
+                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
+                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
+                            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(0).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(1).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(2).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().outConfs.at(0).desc.getLayout());
+                        }
                 } }
         ));
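
When p.scales is non-empty, it becomes the coeff attribute on the Eltwise layer; for a sum, each input is presumably weighted by its coefficient, with coefficients defaulting to 1 when the attribute is omitted. A three-input sketch under that assumption:

    // Assumed semantics of <data operation="sum" coeff="c0,c1,c2"/>.
    inline float eltwise_sum3(float a, float b, float c,
                              float c0 = 1.f, float c1 = 1.f, float c2 = 1.f) {
        return c0 * a + c1 * b + c2 * c;
    }
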
 
@@ -348,7 +342,7 @@ protected:
             TestsCommon::SetUp();
             eltwise_test_params p = ::testing::WithParamInterface<eltwise_test_params>::GetParam();
             std::string model = getModel(p);
-            size_t MB = p.in.n;
+            size_t MB = p.dims[0];
             if (MB < 2)
                 MB = 2;
 
@@ -365,9 +359,18 @@ protected:
             graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}});
             graph.CreateGraph(net_reader.getNetwork());
 
-            InferenceEngine::SizeVector dims_src = {MB, p.in.c, p.in.h, p.in.w};
+            InferenceEngine::SizeVector dims_src = p.dims;
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src1->allocate();
 
             InferenceEngine::TBlob<float>* srcPtr1 = dynamic_cast<InferenceEngine::TBlob<float>*>(src1.get());
@@ -376,7 +379,7 @@ protected:
                 FAIL() << "Cannot cast blob to TBlob<float>.";
 
             fill_data(src1->buffer(), src1->size());
-            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src2->allocate();
 
             InferenceEngine::TBlob<float>* srcPtr2 = dynamic_cast<InferenceEngine::TBlob<float>*>(src2.get());
@@ -384,7 +387,7 @@ protected:
             if (srcPtr2 == nullptr)
                 FAIL() << "Cannot cast blob to TBlob<float>.";
             fill_data(src2->buffer(), src2->size());
-            InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src3->allocate();
 
             InferenceEngine::TBlob<float>* srcPtr3 = dynamic_cast<InferenceEngine::TBlob<float>*>(src3.get());
index a4ece62..dcf001f 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -20,12 +19,8 @@ using namespace std;
 using namespace mkldnn;
 
 struct fc_test_params {
-    struct {
-        size_t n;
-        size_t c;
-        size_t h;
-        size_t w;
-    } in;
+    // Formats: NCHW, NCDHW
+    vector<size_t> in_dims;
 
     size_t out_c;
 
@@ -41,32 +36,44 @@ struct fc_test_params {
 template <typename data_t>
 void ref_innerproduct(const InferenceEngine::TBlob<data_t> &src, const data_t *weights, const size_t weightsSize,
                       InferenceEngine::TBlob<data_t> &dst, fc_test_params prm) {
-    size_t IW = src.dims()[3];
-    size_t IH = src.dims()[2];
-    size_t IC = src.dims()[1];
+    auto dims_size = src.dims().size();
+
     size_t IB = src.dims()[0];
+    size_t IC = src.dims()[1];
+    size_t ID = dims_size == 5 ? src.dims()[dims_size - 3] : 1u;
+    size_t IH = src.dims()[dims_size - 2];
+    size_t IW = src.dims()[dims_size - 1];
 
     size_t OC = prm.out_c;
 
     const data_t *src_data = src.readOnly();
     const data_t *weights_data = weights;
-    const data_t *bias_data = weights_data + IW*IH*IC*OC;
+    const data_t *bias_data = weights_data + IW*IH*ID*IC*OC;
     data_t *dst_data = dst.data();
 
-    IE_ASSERT( IW*IH*IC*OC + OC == weightsSize);
-    IE_ASSERT( OC == dst.dims()[0]);
+    IE_ASSERT( IW*IH*ID*IC*OC + OC == weightsSize );
+    IE_ASSERT( OC == dst.dims()[0] );
 
     for (size_t n = 0; n < IB; n++) {
         for (size_t oc = 0; oc < OC; oc++) {
             dst_data[n*OC + oc] = bias_data[oc];
             for (size_t ic = 0; ic < IC; ic++) {
-                for (size_t kh = 0; kh < IH; kh++) {
-                    for (size_t kw = 0; kw < IW; kw++) {
-                        size_t iidx = n * IC * IH * IW + ic * IH * IW + kh * IW + kw;
-                        size_t widx = oc * IC * IH * IW
-                                      + ic * IH * IW + kh * IW + kw;
-
-                        dst_data[n*OC + oc] += src_data[iidx] * weights_data[widx];
+                for (size_t kd = 0; kd < ID; kd++) {
+                    for (size_t kh = 0; kh < IH; kh++) {
+                        for (size_t kw = 0; kw < IW; kw++) {
+                            size_t iidx = n * IC * ID * IH * IW
+                                        + ic * ID * IH * IW
+                                        + kd * IH * IW
+                                        + kh * IW
+                                        + kw;
+                            size_t widx = oc * IC * ID * IH * IW
+                                          + ic * ID * IH * IW 
+                                          + kd * IH * IW 
+                                          + kh * IW 
+                                          + kw;
+
+                            dst_data[n*OC + oc] += src_data[iidx] * weights_data[widx];
+                        }
                     }
                 }
             }
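
The 5D-aware reference above is equivalent to a dot product over the flattened non-batch dimensions. A compact sketch of the same computation:

    #include <cstddef>

    // Flattened view of ref_innerproduct: ISZ = IC * ID * IH * IW.
    void fc_flat_ref(const float* src, const float* weights, const float* bias,
                     float* dst, size_t IB, size_t OC, size_t ISZ) {
        for (size_t n = 0; n < IB; ++n)
            for (size_t oc = 0; oc < OC; ++oc) {
                float acc = bias[oc];
                for (size_t i = 0; i < ISZ; ++i)
                    acc += src[n * ISZ + i] * weights[oc * ISZ + i];
                dst[n * OC + oc] = acc;
            }
    }
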
@@ -77,15 +84,11 @@ void ref_innerproduct(const InferenceEngine::TBlob<data_t> &src, const data_t *w
 class MKLDNNGraphFullyConnectedTests: public TestsCommon,
                                       public WithParamInterface<fc_test_params> {
     std::string model_t = R"V0G0N(
-<Net Name="FullyConnected_Only" version="2" precision="FP32" batch="1">
+<Net Name="FullyConnected_Only" version="3" precision="FP32" batch="1">
     <layers>
         <layer name="in1" type="Input" precision="FP32" id="0">
             <output>
-                <port id="0">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                <port id="0">__SRC_DIMS__
                 </port>
             </output>
         </layer>
@@ -96,11 +99,7 @@ class MKLDNNGraphFullyConnectedTests: public TestsCommon,
             <biases offset="_S1_" size="_S2_" />
 
             <input>
-                <port id="1">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                <port id="1">__SRC_DIMS__
                 </port>
             </input>
             <output>
@@ -120,14 +119,19 @@ class MKLDNNGraphFullyConnectedTests: public TestsCommon,
 protected:
     std::string getModel(fc_test_params p) {
         std::string model = model_t;
-        REPLACE_WITH_NUM(model, "_IW_", p.in.w);
-        REPLACE_WITH_NUM(model, "_IH_", p.in.h);
-        REPLACE_WITH_NUM(model, "_IC_", p.in.c);
-        REPLACE_WITH_NUM(model, "_IN_", p.in.n);
+        std::string s_dims;
+        for (auto& dim : p.in_dims) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+        REPLACE_WITH_STR(model, "__SRC_DIMS__", s_dims);
 
+        REPLACE_WITH_NUM(model, "_IN_", p.in_dims[0]);
         REPLACE_WITH_NUM(model, "_OC_", p.out_c);
 
-        size_t w_data_size = (p.in.w * p.in.h * p.in.c * p.out_c )* sizeof(float);
+        size_t w_data_size = p.out_c * sizeof(float);
+        for (int i = 1; i < p.in_dims.size(); i++)
+            w_data_size *= p.in_dims[i];
         size_t b_data_size = p.out_c * sizeof(float);
         REPLACE_WITH_NUM(model, "_S1_", w_data_size);
         REPLACE_WITH_NUM(model, "_S2_", b_data_size);
@@ -153,7 +157,12 @@ protected:
             InferenceEngine::CNNNetReader net_reader;
             ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
 
-            InferenceEngine::TBlob<uint8_t> *weights = new InferenceEngine::TBlob<uint8_t>(InferenceEngine::Precision::U8, InferenceEngine::C, {(p.in.w * p.in.h * p.in.c * p.out_c + p.out_c) * sizeof(float)});
+            size_t weights_size = p.out_c;
+            for (int i = 1; i < p.in_dims.size(); i++) {
+                weights_size *= p.in_dims[i];
+            }
+            weights_size = (weights_size + p.out_c) * sizeof(float);
+            InferenceEngine::TBlob<uint8_t> *weights = new InferenceEngine::TBlob<uint8_t>(InferenceEngine::Precision::U8, InferenceEngine::C, {weights_size});
             weights->allocate();
             fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
             InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
@@ -174,9 +183,18 @@ protected:
                 }
             }
 
-            InferenceEngine::SizeVector dims_src = {p.in.n, p.in.c, p.in.h, p.in.w};
+            InferenceEngine::SizeVector dims_src = p.in_dims;
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
+            switch (p.in_dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src->allocate();
             fill_data(src->buffer(), src->size());
 
@@ -224,7 +242,10 @@ INSTANTIATE_TEST_CASE_P(
                 fc_test_params{{1, 4, 227, 227}, 10, 6, MKLDNNPlugin::impl_desc_type::gemm },
                 fc_test_params{{1, 3, 227, 227}, 96, 6, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
                 fc_test_params{{1, 4, 227, 227}, 8, 6, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
-                fc_test_params{{1, 4, 227, 227}, 10, 6, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}}));
+                fc_test_params{{1, 4, 227, 227}, 10, 6, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                // 5D
+                fc_test_params{{1, 4, 32, 32, 32}, 10, 6, MKLDNNPlugin::impl_desc_type::gemm },
+                fc_test_params{{1, 3, 32, 32, 32}, 96, 6, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}}));
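
These tests pack weights and biases into a single U8 blob: out_c * prod(in_dims[1:]) FP32 weights followed by out_c FP32 biases, matching the _S1_/_S2_ offsets in the model template. A sketch of the byte-size computation used in SetUp:

    #include <cstddef>
    #include <vector>

    // Bytes needed for the packed FP32 weights+bias blob.
    size_t fc_blob_bytes(const std::vector<size_t>& in_dims, size_t out_c) {
        size_t weights = out_c;
        for (size_t i = 1; i < in_dims.size(); ++i)
            weights *= in_dims[i];
        return (weights + out_c) * sizeof(float);
    }
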
 
 class MKLDNNGraphDynBatchFullyConnectedTests: public MKLDNNGraphFullyConnectedTests {
     virtual void SetUp() {
@@ -232,14 +253,19 @@ class MKLDNNGraphDynBatchFullyConnectedTests: public MKLDNNGraphFullyConnectedTe
             TestsCommon::SetUp();
             fc_test_params p = ::testing::WithParamInterface<fc_test_params>::GetParam();
             std::string model = getModel(p);
-            size_t MB = p.in.n;
+            size_t MB = p.in_dims[0];
             if (MB < 2)
                 MB = 2;
 
             InferenceEngine::CNNNetReader net_reader;
             ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
 
-            InferenceEngine::TBlob<uint8_t> *weights = new InferenceEngine::TBlob<uint8_t>(InferenceEngine::Precision::U8, InferenceEngine::C, {(p.in.w * p.in.h * p.in.c * p.out_c + p.out_c) * sizeof(float)});
+            size_t weights_size = p.out_c;
+            for (int i = 1; i < p.in_dims.size(); i++) {
+                weights_size *= p.in_dims[i];
+            }
+            weights_size = (weights_size + p.out_c) * sizeof(float);
+            InferenceEngine::TBlob<uint8_t> *weights = new InferenceEngine::TBlob<uint8_t>(InferenceEngine::Precision::U8, InferenceEngine::C, {weights_size});
             weights->allocate();
             fill_data((float *) weights->buffer(), weights->size() / sizeof(float));
             InferenceEngine::TBlob<uint8_t>::Ptr weights_ptr = InferenceEngine::TBlob<uint8_t>::Ptr(weights);
@@ -255,9 +281,18 @@ class MKLDNNGraphDynBatchFullyConnectedTests: public MKLDNNGraphFullyConnectedTe
             graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}});
             graph.CreateGraph(net_reader.getNetwork());
 
-            InferenceEngine::SizeVector dims_src = {MB, p.in.c, p.in.h, p.in.w};
+            InferenceEngine::SizeVector dims_src = p.in_dims;
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
+            switch (p.in_dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src->allocate();
             fill_data(src->buffer(), src->size());
 
diff --git a/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_gemm_test.cpp b/inference-engine/tests/unit/engines/mkldnn/graph/layers/internal/graph_gemm_test.cpp
new file mode 100644 (file)
index 0000000..8a2acf0
--- /dev/null
@@ -0,0 +1,627 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gtest/gtest.h>
+#include <gmock/gmock-spec-builders.h>
+#include "mkldnn_plugin/mkldnn_graph.h"
+
+#include "test_graph.hpp"
+
+#include "single_layer_common.hpp"
+#include <mkldnn_plugin/mkldnn_extension_utils.h>
+#include <inference_engine/cnn_network_impl.hpp>
+#include "tests_common.hpp"
+
+using namespace ::testing;
+using namespace std;
+using namespace mkldnn;
+
+struct gemm_test_params {
+    struct {
+        size_t MB1_A;
+        size_t MB2_A;
+        size_t MB1_B;
+        size_t MB2_B;
+        size_t MB1_C;
+        size_t MB2_C;
+        size_t MB1_D;
+        size_t MB2_D;
+    } batches;
+
+    size_t M;
+    size_t N;
+    size_t K;
+
+    float alpha;
+    float beta;
+
+    bool transposeA;
+    bool transposeB;
+
+    size_t num_prim_desc;
+
+    MKLDNNPlugin::impl_desc_type selectedType;
+
+    std::vector<std::function<void(MKLDNNPlugin::PrimitiveDescInfo)>> comp;
+};
+
+template<typename data_t>
+void ref_gemm(const std::vector<InferenceEngine::TBlob<data_t>> &src, InferenceEngine::TBlob<data_t> &dst,
+              gemm_test_params prm) {
+    const data_t *src0_data = src[0].readOnly();
+    const data_t *src1_data = src[1].readOnly();
+    const data_t *src2_data = src.size() == 3 ? src[2].readOnly() : dst.readOnly();
+    data_t *dst_data = dst.data();
+
+    size_t MB1 = prm.batches.MB1_D;
+    size_t MB2 = prm.batches.MB2_D;
+    size_t M  = prm.M;
+    size_t N  = prm.N;
+    size_t K  = prm.K;
+
+    for (int mb1 = 0; mb1 < MB1; mb1++) {
+        const data_t *a_data = src0_data;
+        const data_t *b_data = src1_data;
+        const data_t *c_data = src2_data;
+        data_t *d_data = dst_data;
+
+        for (int mb2 = 0; mb2 < MB2; mb2++) {
+            for (int i = 0; i < M; i++) {
+                for (int j = 0; j < N; j++) {
+                    d_data[i * N + j] = src.size() == 3 ? prm.beta * c_data[i * N + j] : 0;
+
+                    for (int k = 0; k < K; k++) {
+                        size_t src0_off = prm.transposeA ? k * M + i : i * K + k;
+                        size_t src1_off = prm.transposeB ? j * K + k : k * N + j;
+                        d_data[i * N + j] += prm.alpha * a_data[src0_off] * b_data[src1_off];
+                    }
+                }
+            }
+            a_data += prm.batches.MB2_A == MB2 ? M*K : 0;
+            b_data += prm.batches.MB2_B == MB2 ? K*N : 0;
+            c_data += prm.batches.MB2_C == MB2 ? M*N : 0;
+            d_data += M*N;
+        }
+
+        src0_data += prm.batches.MB1_A == MB1 ? prm.batches.MB2_A*M*K : 0;
+        src1_data += prm.batches.MB1_B == MB1 ? prm.batches.MB2_B*K*N : 0;
+        src2_data += prm.batches.MB1_C == MB1 ? prm.batches.MB2_C*M*N : 0;
+        dst_data += prm.batches.MB2_D*M*N;
+    }
+}
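
ref_gemm broadcasts the two outer (batch) dimensions: a source matrix advances through memory only when its batch extent matches the destination's; a batch extent of 1 reuses the same matrix, which is the "? ... : 0" pointer step above. A sketch of that rule:

    #include <cstddef>

    // Broadcast rule used by ref_gemm: a source advances by one matrix per
    // inner batch only when its batch extent matches the destination's.
    inline size_t batchStep(size_t srcBatch, size_t dstBatch, size_t matSize) {
        return srcBatch == dstBatch ? matSize : 0;
    }

For example, batchStep(prm.batches.MB2_A, MB2, M * K) reproduces the a_data advance in the inner loop.
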
+
+class MKLDNNGraphGemmTests: public TestsCommon,
+                                     public WithParamInterface<gemm_test_params> {
+    std::string model_t = R"V0G0N(
+<net name="gemmOnly" version="2" precision="FP32" batch="1">
+    <layers>
+        <layer name="in1" type="Input" precision="FP32" id="1">
+            <output>
+                <port id="1">
+                    <dim>_MB1_A_</dim>
+                    <dim>_MB2_A_</dim>
+                    <dim>_M_</dim>
+                    <dim>_K_</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="in2" type="Input" precision="FP32" id="2">
+            <output>
+                <port id="1">
+                    <dim>_MB1_B_</dim>
+                    <dim>_MB2_B_</dim>
+                    <dim>_K_</dim>
+                    <dim>_N_</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="in3" type="Input" precision="FP32" id="3">
+            <output>
+                <port id="1">
+                    <dim>_MB1_C_</dim>
+                    <dim>_MB2_C_</dim>
+                    <dim>_M_</dim>
+                    <dim>_N_</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="gemm" id="4" type="GEMM" precision="FP32">
+            <data alpha="_A_" beta="_B_" transpose_a="_TA_" transpose_b="_TB_"/>
+            <input>
+                <port id="1">
+                    <dim>_MB1_A_</dim>
+                    <dim>_MB2_A_</dim>
+                    <dim>_M_</dim>
+                    <dim>_K_</dim>
+                </port>
+                <port id="2">
+                    <dim>_MB1_B_</dim>
+                    <dim>_MB2_B_</dim>
+                    <dim>_K_</dim>
+                    <dim>_N_</dim>
+                </port>
+                <port id="3">
+                    <dim>_MB1_C_</dim>
+                    <dim>_MB2_C_</dim>
+                    <dim>_M_</dim>
+                    <dim>_N_</dim>
+                </port>
+            </input>
+            <output>
+                <port id="4">
+                    <dim>_MB1_D_</dim>
+                    <dim>_MB2_D_</dim>
+                    <dim>_M_</dim>
+                    <dim>_N_</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="1" from-port="1" to-layer="4" to-port="1"/>
+        <edge from-layer="2" from-port="1" to-layer="4" to-port="2"/>
+        <edge from-layer="3" from-port="1" to-layer="4" to-port="3"/>
+    </edges>
+</net>
+)V0G0N";
+
+protected:
+    std::string getModel(gemm_test_params p) {
+        std::string model = model_t;
+        std::string op;
+
+        REPLACE_WITH_NUM(model, "_MB1_A_", p.batches.MB1_A);
+        REPLACE_WITH_NUM(model, "_MB2_A_", p.batches.MB2_A);
+        REPLACE_WITH_NUM(model, "_MB1_B_", p.batches.MB1_B);
+        REPLACE_WITH_NUM(model, "_MB2_B_", p.batches.MB2_B);
+        REPLACE_WITH_NUM(model, "_MB1_C_", p.batches.MB1_C);
+        REPLACE_WITH_NUM(model, "_MB2_C_", p.batches.MB2_C);
+        REPLACE_WITH_NUM(model, "_MB1_D_", p.batches.MB1_D);
+        REPLACE_WITH_NUM(model, "_MB2_D_", p.batches.MB2_D);
+
+        REPLACE_WITH_NUM(model, "_M_", p.M);
+        REPLACE_WITH_NUM(model, "_N_", p.N);
+        REPLACE_WITH_NUM(model, "_K_", p.K);
+
+        REPLACE_WITH_NUM(model, "_A_", p.alpha);
+        REPLACE_WITH_NUM(model, "_B_", p.beta);
+        REPLACE_WITH_NUM(model, "_TA_", p.transposeA);
+        REPLACE_WITH_NUM(model, "_TB_", p.transposeB);
+
+        return model;
+    }
+
+    virtual void TearDown() {
+    }
+
+    virtual void SetUp() {
+        try {
+            TestsCommon::SetUp();
+            gemm_test_params p = ::testing::WithParamInterface<gemm_test_params>::GetParam();
+            std::string model = getModel(p);
+
+            InferenceEngine::CNNNetReader net_reader;
+            ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
+
+            MKLDNNGraphTestClass graph;
+            graph.CreateGraph(net_reader.getNetwork());
+
+            auto& nodes = graph.getNodes();
+            for (int i = 0; i < nodes.size(); i++) {
+                if (nodes[i]->getType() == MKLDNNPlugin::Gemm) {
+                    ASSERT_EQ(p.num_prim_desc, nodes[i]->getSupportedPrimitiveDescriptors().size());
+                    for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) {
+                        p.comp.at(j)(nodes[i]->getSupportedPrimitiveDescriptors().at(j));
+                    }
+                    ASSERT_NE(nullptr, nodes[i]->getSelectedPrimitiveDescriptor());
+                    ASSERT_EQ(p.selectedType, nodes[i]->getSelectedPrimitiveDescriptor()->getImplementationType());
+                }
+            }
+
+            InferenceEngine::SizeVector dims_src1 = {p.batches.MB1_A, p.batches.MB2_A, p.M, p.K};
+            InferenceEngine::SizeVector dims_src2 = {p.batches.MB1_B, p.batches.MB2_B, p.K, p.N};
+            InferenceEngine::SizeVector dims_src3 = {p.batches.MB1_C, p.batches.MB2_C, p.M, p.N};
+            InferenceEngine::SizeVector dims_dst  = {p.batches.MB1_D, p.batches.MB2_D, p.M, p.N};
+
+            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src1);
+            src1->allocate();
+            InferenceEngine::TBlob<float>* srcPtr1 = dynamic_cast<InferenceEngine::TBlob<float>*>(src1.get());
+            if (srcPtr1 == nullptr)
+                FAIL() << "Cannot cast blob to TBlob<float>.";
+            fill_data(src1->buffer(), src1->size());
+
+            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src2);
+            src2->allocate();
+            InferenceEngine::TBlob<float>* srcPtr2 = dynamic_cast<InferenceEngine::TBlob<float>*>(src2.get());
+            if (srcPtr2 == nullptr)
+                FAIL() << "Cannot cast blob to TBlob<float>.";
+            fill_data(src2->buffer(), src2->size());
+
+            InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src3);
+            src3->allocate();
+            InferenceEngine::TBlob<float>* srcPtr3 = dynamic_cast<InferenceEngine::TBlob<float>*>(src3.get());
+            if (srcPtr3 == nullptr)
+                FAIL() << "Cannot cast blob to TBlob<float>.";
+            fill_data(src3->buffer(), src3->size());
+
+            InferenceEngine::BlobMap srcs;
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src1));
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in2", src2));
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in3", src3));
+
+            InferenceEngine::OutputsDataMap out;
+            out = net_reader.getNetwork().getOutputsInfo();
+            InferenceEngine::BlobMap outputBlobs;
+
+            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
+
+            InferenceEngine::TBlob<float>::Ptr output;
+            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
+            output->allocate();
+            outputBlobs[item.first] = output;
+
+            graph.Infer(srcs, outputBlobs);
+
+            InferenceEngine::TBlob<float> dst_ref(item.second->getTensorDesc());
+            dst_ref.allocate();
+
+            std::vector<InferenceEngine::TBlob<float>> src_vec = {*srcPtr1, *srcPtr2, *srcPtr3};
+
+            ref_gemm(src_vec, dst_ref, p);
+
+            compare(*output, dst_ref);
+        } catch (const InferenceEngine::details::InferenceEngineException &e) {
+            FAIL() << e.what();
+        }
+    }
+};
+
+TEST_P(MKLDNNGraphGemmTests, TestsGemm) {}
+
+INSTANTIATE_TEST_CASE_P(
+        TestsGemm, MKLDNNGraphGemmTests,
+        ::testing::Values(
+                gemm_test_params{{2, 1, 2, 1, 2, 1, 2, 1}, 3, 3, 2, 1, 1, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any, {
+                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
+                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::gemm_any, impl.getImplementationType());
+                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
+                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
+                        }
+                } },
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 8, 5, 4, 1, 1, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any, {
+                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
+                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::gemm_any, impl.getImplementationType());
+                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
+                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
+                        }
+                } },
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 16, 10, 12, 1, 1, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any, {
+                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
+                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::gemm_any, impl.getImplementationType());
+                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
+                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
+                        }
+                } },
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 11, 10, 20, 1, 1, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any, {
+                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
+                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::gemm_any, impl.getImplementationType());
+                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
+                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
+                        }
+                } },
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 5, 13, 2, 1, 1, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any, {
+                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
+                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::gemm_any, impl.getImplementationType());
+                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
+                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
+                        }
+                } },
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 5, 15, 10, 1, 1, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any, {
+                        [](MKLDNNPlugin::PrimitiveDescInfo impl) {
+                            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::gemm_any, impl.getImplementationType());
+                            ASSERT_EQ(3, impl.getConfig().inConfs.size());
+                            ASSERT_EQ(1, impl.getConfig().outConfs.size());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(1).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(2).desc.getLayout());
+                            ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
+                        }
+                } },
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 5, 6, 7, 2, 0, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 5, 6, 7, 0, 2, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 3, 7, 4, 2, 3, true, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 7, 3, 4, 2, 3, true, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 7, 4, 3, 2, 3, true, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 3, 7, 4, 2, 3, false, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 7, 3, 4, 2, 3, false, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 7, 4, 3, 2, 3, false, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 3, 7, 4, 2, 3, true, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 7, 3, 4, 2, 3, true, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{3, 2, 3, 2, 3, 2, 3, 2}, 7, 4, 3, 2, 3, true, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 3, 2, 3, 2, 3, 2, 3}, 7, 4, 3, 2, 3, true, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 3, 2, 3, 1, 3, 2, 3}, 7, 4, 3, 2, 3, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{2, 3, 1, 3, 1, 3, 2, 3}, 7, 4, 3, 2, 3, false, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{5, 3, 5, 1, 5, 3, 5, 3}, 7, 4, 3, 2, 3, true, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{5, 3, 5, 1, 5, 1, 5, 3}, 7, 4, 3, 2, 3, false, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{5, 1, 5, 1, 5, 3, 5, 3}, 7, 4, 3, 2, 3, true, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 1, 5, 3, 5, 3, 5, 3}, 7, 4, 3, 2, 3, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 1, 1, 1, 5, 3, 5, 3}, 7, 4, 3, 2, 3, true, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{5, 4, 1, 1, 1, 1, 5, 4}, 7, 4, 3, 2, 3, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any}
+        ));
+
+class MKLDNNGraphDynBatchGemmTests: public MKLDNNGraphGemmTests {
+protected:
+    virtual void SetUp() {
+        try {
+            TestsCommon::SetUp();
+            gemm_test_params p = ::testing::WithParamInterface<gemm_test_params>::GetParam();
+            std::string model = getModel(p);
+            size_t MB = p.batches.MB1_D;
+            if (MB < 2)
+                MB = 2;
+
+            InferenceEngine::CNNNetReader net_reader;
+            ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
+            InferenceEngine::CNNNetwork network = net_reader.getNetwork();
+            auto implNet = dynamic_cast<InferenceEngine::details::CNNNetworkImpl *>(&((InferenceEngine::ICNNNetwork&)network));
+            ASSERT_NE(nullptr, implNet) << "Failed to cast ICNNNetwork to CNNNetworkImpl";
+            InferenceEngine::ResponseDesc resp;
+            InferenceEngine::StatusCode sts = implNet->setBatchSizeReshape(MB, &resp);
+            ASSERT_EQ((int)InferenceEngine::StatusCode::OK, sts) << resp.msg;
+
+            MKLDNNGraphTestClass graph;
+            graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}});
+            graph.CreateGraph(net_reader.getNetwork());
+
+            InferenceEngine::SizeVector dims_src1 = {MB, p.batches.MB2_A, p.M, p.K};
+            InferenceEngine::SizeVector dims_src2 = {MB, p.batches.MB2_B, p.K, p.N};
+            InferenceEngine::SizeVector dims_src3 = {MB, p.batches.MB2_C, p.M, p.N};
+            InferenceEngine::SizeVector dims_dst  = {MB, p.batches.MB2_D, p.M, p.N};
+
+            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src1);
+            src1->allocate();
+            InferenceEngine::TBlob<float>* srcPtr1 = dynamic_cast<InferenceEngine::TBlob<float>*>(src1.get());
+            if (srcPtr1 == nullptr)
+                FAIL() << "Cannot cast blob to TBlob<float>.";
+            fill_data(src1->buffer(), src1->size());
+
+            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src2);
+            src2->allocate();
+            InferenceEngine::TBlob<float>* srcPtr2 = dynamic_cast<InferenceEngine::TBlob<float>*>(src2.get());
+            if (srcPtr2 == nullptr)
+                FAIL() << "Cannot cast blob to TBlob<float>.";
+            fill_data(src2->buffer(), src2->size());
+
+            InferenceEngine::Blob::Ptr src3 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src3);
+            src3->allocate();
+            InferenceEngine::TBlob<float>* srcPtr3 = dynamic_cast<InferenceEngine::TBlob<float>*>(src3.get());
+            if (srcPtr3 == nullptr)
+                FAIL() << "Cannot cast blob to TBlob<float>.";
+            fill_data(src3->buffer(), src3->size());
+
+            InferenceEngine::BlobMap srcs;
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src1));
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in2", src2));
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in3", src3));
+
+            InferenceEngine::OutputsDataMap out;
+            out = net_reader.getNetwork().getOutputsInfo();
+            InferenceEngine::BlobMap outputBlobs;
+
+            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
+
+            InferenceEngine::TBlob<float>::Ptr output;
+            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
+            output->allocate();
+            outputBlobs[item.first] = output;
+
+            auto check = [](const MKLDNNPlugin::MKLDNNNodePtr& node) {
+                return node->getType() == MKLDNNPlugin::Gemm;
+            };
+
+            graph.checkDynBatch(srcs, outputBlobs, MB, MB, check);
+            graph.checkDynBatch(srcs, outputBlobs, 1, MB, check);
+        } catch (const InferenceEngine::details::InferenceEngineException &e) {
+            FAIL() << e.what();
+        }
+    }
+};
+
+TEST_P(MKLDNNGraphDynBatchGemmTests, TestsDynBatchGemm) {}
+
+INSTANTIATE_TEST_CASE_P(
+        TestsDynBatchGemm, MKLDNNGraphDynBatchGemmTests,
+        ::testing::Values(
+                gemm_test_params{{1, 3, 1, 3, 1, 3, 1, 3}, 3, 3, 3, 1, 1, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 3, 1, 1, 1, 3, 1, 3}, 16, 15, 12, 1, 1, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any}
+));
+
+class MKLDNNGraphSingleBatchDimGemmTests: public TestsCommon,
+                                     public WithParamInterface<gemm_test_params> {
+    std::string model_t = R"V0G0N(
+<net name="gemmOnly" version="2" precision="FP32" batch="1">
+    <layers>
+        <layer name="in1" type="Input" precision="FP32" id="1">
+            <output>
+                <port id="1">
+                    <dim>_MB_A_</dim>
+                    <dim>_M_</dim>
+                    <dim>_K_</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="in2" type="Input" precision="FP32" id="2">
+            <output>
+                <port id="1">
+                    <dim>_MB_B_</dim>
+                    <dim>_K_</dim>
+                    <dim>_N_</dim>
+                </port>
+            </output>
+        </layer>
+        <layer name="gemm" id="3" type="GEMM" precision="FP32">
+            <data alpha="_A_" beta="_B_" transpose_a="_TA_" transpose_b="_TB_"/>
+            <input>
+                <port id="1">
+                    <dim>_MB_A_</dim>
+                    <dim>_M_</dim>
+                    <dim>_K_</dim>
+                </port>
+                <port id="2">
+                    <dim>_MB_B_</dim>
+                    <dim>_K_</dim>
+                    <dim>_N_</dim>
+                </port>
+            </input>
+            <output>
+                <port id="3">
+                    <dim>_MB_D_</dim>
+                    <dim>_M_</dim>
+                    <dim>_N_</dim>
+                </port>
+            </output>
+        </layer>
+    </layers>
+    <edges>
+        <edge from-layer="1" from-port="1" to-layer="3" to-port="1"/>
+        <edge from-layer="2" from-port="1" to-layer="3" to-port="2"/>
+    </edges>
+</net>
+)V0G0N";
+
+protected:
+    std::string getModel(gemm_test_params p) {
+        std::string model = model_t;
+
+        REPLACE_WITH_NUM(model, "_MB_A_", p.batches.MB2_A);
+        REPLACE_WITH_NUM(model, "_MB_B_", p.batches.MB2_B);
+        REPLACE_WITH_NUM(model, "_MB_D_", p.batches.MB2_D);
+
+        REPLACE_WITH_NUM(model, "_M_", p.M);
+        REPLACE_WITH_NUM(model, "_N_", p.N);
+        REPLACE_WITH_NUM(model, "_K_", p.K);
+
+        REPLACE_WITH_NUM(model, "_A_", p.alpha);
+        REPLACE_WITH_NUM(model, "_B_", p.beta);
+        REPLACE_WITH_NUM(model, "_TA_", p.transposeA);
+        REPLACE_WITH_NUM(model, "_TB_", p.transposeB);
+
+        return model;
+    }
+
+    virtual void TearDown() {
+    }
+
+    virtual void SetUp() {
+        try {
+            TestsCommon::SetUp();
+            gemm_test_params p = ::testing::WithParamInterface<gemm_test_params>::GetParam();
+            std::string model = getModel(p);
+
+            InferenceEngine::CNNNetReader net_reader;
+            ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
+
+            MKLDNNGraphTestClass graph;
+            graph.CreateGraph(net_reader.getNetwork());
+
+            auto& nodes = graph.getNodes();
+            for (size_t i = 0; i < nodes.size(); i++) {
+                if (nodes[i]->getType() == MKLDNNPlugin::Gemm) {
+                    ASSERT_EQ(p.num_prim_desc, nodes[i]->getSupportedPrimitiveDescriptors().size());
+                    for (size_t j = 0; j < p.num_prim_desc && j < p.comp.size(); j++) {
+                        p.comp.at(j)(nodes[i]->getSupportedPrimitiveDescriptors().at(j));
+                    }
+                    ASSERT_NE(nullptr, nodes[i]->getSelectedPrimitiveDescriptor());
+                    ASSERT_EQ(p.selectedType, nodes[i]->getSelectedPrimitiveDescriptor()->getImplementationType());
+                }
+            }
+
+            InferenceEngine::SizeVector dims_src1 = {p.batches.MB2_A, p.M, p.K};
+            InferenceEngine::SizeVector dims_src2 = {p.batches.MB2_B, p.K, p.N};
+            InferenceEngine::SizeVector dims_dst  = {p.batches.MB2_D, p.M, p.N};
+
+            InferenceEngine::Blob::Ptr src1 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::CHW, dims_src1);
+            src1->allocate();
+            InferenceEngine::TBlob<float>* srcPtr1 = dynamic_cast<InferenceEngine::TBlob<float>*>(src1.get());
+            if (srcPtr1 == nullptr)
+                FAIL() << "Cannot cast blob to TBlob<float>.";
+            fill_data(src1->buffer(), src1->size());
+
+            InferenceEngine::Blob::Ptr src2 = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::CHW, dims_src2);
+            src2->allocate();
+            InferenceEngine::TBlob<float>* srcPtr2 = dynamic_cast<InferenceEngine::TBlob<float>*>(src2.get());
+            if (srcPtr2 == nullptr)
+                FAIL() << "Cannot cast blob to TBlob<float>.";
+            fill_data(src2->buffer(), src2->size());
+
+            InferenceEngine::BlobMap srcs;
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src1));
+            srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in2", src2));
+
+            InferenceEngine::OutputsDataMap out;
+            out = net_reader.getNetwork().getOutputsInfo();
+            InferenceEngine::BlobMap outputBlobs;
+
+            std::pair<std::string, InferenceEngine::DataPtr> item = *out.begin();
+
+            InferenceEngine::TBlob<float>::Ptr output;
+            output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
+            output->allocate();
+            outputBlobs[item.first] = output;
+
+            graph.Infer(srcs, outputBlobs);
+
+            InferenceEngine::TBlob<float> dst_ref(item.second->getTensorDesc());
+            dst_ref.allocate();
+
+            std::vector<InferenceEngine::TBlob<float>> src_vec = {*srcPtr1, *srcPtr2};
+
+            ref_gemm(src_vec, dst_ref, p);
+
+            compare(*output, dst_ref);
+        } catch (const InferenceEngine::details::InferenceEngineException &e) {
+            FAIL() << e.what();
+        }
+    }
+};
+
+TEST_P(MKLDNNGraphSingleBatchDimGemmTests, TestsGemm) {}
+
+INSTANTIATE_TEST_CASE_P(
+        TestsGemm, MKLDNNGraphSingleBatchDimGemmTests,
+        ::testing::Values(
+                gemm_test_params{{1, 1, 1, 1, 1, 1, 1, 1}, 7, 4, 3, 2, 3, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 3, 1, 3, 1, 1, 1, 3}, 7, 4, 3, 2, 3, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 3, 1, 1, 1, 1, 1, 3}, 7, 4, 3, 2, 3, false, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 1, 1, 1, 1, 1, 1, 1}, 7, 4, 3, 2, 3, true, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 3, 1, 3, 1, 1, 1, 3}, 7, 4, 3, 2, 3, true, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 3, 1, 1, 1, 1, 1, 3}, 7, 4, 3, 2, 3, true, false, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 1, 1, 1, 1, 1, 1, 1}, 7, 4, 3, 2, 3, false, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 3, 1, 3, 1, 1, 1, 3}, 7, 4, 3, 2, 3, false, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 3, 1, 1, 1, 1, 1, 3}, 7, 4, 3, 2, 3, false, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 1, 1, 1, 1, 1, 1, 1}, 7, 4, 3, 2, 3, true, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 3, 1, 3, 1, 1, 1, 3}, 7, 4, 3, 2, 3, true, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any},
+                gemm_test_params{{1, 3, 1, 1, 1, 1, 1, 3}, 7, 4, 3, 2, 3, true, true, 1, MKLDNNPlugin::impl_desc_type::gemm_any}
+        ));
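
For reference, the arithmetic these GEMM cases exercise, condensed into a minimal standalone sketch (not part of the patch): D = alpha * op(A) * op(B) + beta * C over two outer batch dimensions, where a batch dimension of size 1 is broadcast against the output batch, which is what parameter sets such as {5, 3, 5, 1, 5, 3, 5, 3} above rely on. The helper name and the row-major layout below are illustrative assumptions, not the suite's actual ref_gemm.

    #include <cstddef>
    #include <vector>

    // Naive 4-D batched GEMM: D = alpha * op(A) * op(B) + beta * C.
    // Assumes each input batch dim is either 1 (broadcast) or equal to the
    // corresponding output batch dim, as in the parameter sets above.
    static void naive_batched_gemm(const std::vector<float>& A, const std::vector<float>& B,
                                   const std::vector<float>& C, std::vector<float>& D,
                                   size_t MB1, size_t MB2,  // output batch dims
                                   size_t MB1_A, size_t MB2_A, size_t MB1_B, size_t MB2_B,
                                   size_t MB1_C, size_t MB2_C,
                                   size_t M, size_t N, size_t K,
                                   float alpha, float beta, bool transA, bool transB) {
        for (size_t b1 = 0; b1 < MB1; b1++) {
            for (size_t b2 = 0; b2 < MB2; b2++) {
                // A batch dim of size 1 always maps to index 0 (broadcast).
                const float* a = &A[((b1 % MB1_A) * MB2_A + (b2 % MB2_A)) * M * K];
                const float* b = &B[((b1 % MB1_B) * MB2_B + (b2 % MB2_B)) * K * N];
                const float* c = &C[((b1 % MB1_C) * MB2_C + (b2 % MB2_C)) * M * N];
                float* d = &D[(b1 * MB2 + b2) * M * N];
                for (size_t m = 0; m < M; m++) {
                    for (size_t n = 0; n < N; n++) {
                        float acc = 0.f;
                        for (size_t k = 0; k < K; k++) {
                            // transA/transB flip how the M*K and K*N buffers are read.
                            float av = transA ? a[k * M + m] : a[m * K + k];
                            float bv = transB ? b[n * K + k] : b[k * N + n];
                            acc += av * bv;
                        }
                        d[m * N + n] = alpha * acc + beta * c[m * N + n];
                    }
                }
            }
        }
    }
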
index b685882..793e3d4 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -21,7 +20,7 @@ public:
     MKLDNNTestExecNetwork(InferenceEngine::ICNNNetwork &network, const MKLDNNPlugin::Config &cfg)
             : MKLDNNExecNetwork(network, cfg, {}) {}
     MKLDNNPlugin::MKLDNNGraph& getGraph() {
-        return *graph;
+        return *graphs[0];
     }
 };
 
index f920566..a40add1 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -193,7 +192,9 @@ INSTANTIATE_TEST_CASE_P(
                 permute_test_params{{2, 3, 4, 5, 6}, {0, 3, 2, 4, 1}, 1, MKLDNNPlugin::impl_desc_type::unknown},
                 permute_test_params{{2, 8, 2, 2, 4, 5}, {0, 1, 4, 2, 5, 3}, 1, MKLDNNPlugin::impl_desc_type::unknown},
                 permute_test_params{{2, 8, 3, 3, 4, 5}, {0, 1, 4, 2, 5, 3}, 1, MKLDNNPlugin::impl_desc_type::unknown},
-                permute_test_params{{2, 8, 3, 4}, {3, 0, 1, 2}, 2, MKLDNNPlugin::impl_desc_type::unknown}
+                permute_test_params{{2, 8, 3, 4}, {3, 0, 1, 2}, 2, MKLDNNPlugin::impl_desc_type::unknown},
+                permute_test_params{{2, 12, 9}, {0, 2, 1}, 1, MKLDNNPlugin::impl_desc_type::unknown},
+                permute_test_params{{2, 8, 3, 3, 4, 5}, {0, 3, 4, 1, 5, 2}, 1, MKLDNNPlugin::impl_desc_type::unknown}
         ));
 
 class MKLDNNGraphDynBatchPermuteTests: public MKLDNNGraphPermuteTests {
@@ -271,5 +272,7 @@ INSTANTIATE_TEST_CASE_P(
                 permute_test_params{{2, 3, 4, 5, 6}, {0, 2, 4, 3, 1}, 1, MKLDNNPlugin::impl_desc_type::unknown},
                 permute_test_params{{2, 3, 4, 5, 6}, {0, 3, 2, 4, 1}, 1, MKLDNNPlugin::impl_desc_type::unknown},
                 permute_test_params{{2, 8, 2, 2, 4, 5}, {0, 1, 4, 2, 5, 3}, 1, MKLDNNPlugin::impl_desc_type::unknown},
-                permute_test_params{{2, 8, 3, 3, 4, 5}, {0, 1, 4, 2, 5, 3}, 1, MKLDNNPlugin::impl_desc_type::unknown}
+                permute_test_params{{2, 8, 3, 3, 4, 5}, {0, 1, 4, 2, 5, 3}, 1, MKLDNNPlugin::impl_desc_type::unknown},
+                permute_test_params{{2, 12, 9}, {0, 2, 1}, 1, MKLDNNPlugin::impl_desc_type::unknown},
+                permute_test_params{{2, 8, 3, 3, 4, 5}, {0, 3, 4, 1, 5, 2}, 1, MKLDNNPlugin::impl_desc_type::unknown}
         ));
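
The two new cases above add a plain 3-D transpose ({0, 2, 1}) and another 6-D shuffle. As a reference for the order convention (output axis i is fed by source axis order[i]), here is a minimal N-D permute sketch; the name and signature are illustrative, not the test suite's reference:

    #include <cstddef>
    #include <vector>

    // Naive N-D permute: out_dims[i] = in_dims[order[i]], and every output
    // element is read from the source coordinate mapped through `order`.
    // E.g. dims {2, 12, 9} with order {0, 2, 1} swaps the last two axes.
    static std::vector<float> naive_permute(const std::vector<float>& src,
                                            const std::vector<size_t>& in_dims,
                                            const std::vector<size_t>& order) {
        size_t n = in_dims.size();
        std::vector<size_t> out_dims(n);
        for (size_t i = 0; i < n; i++) out_dims[i] = in_dims[order[i]];

        // Row-major strides of the source tensor.
        std::vector<size_t> in_strides(n, 1);
        for (size_t i = n - 1; i > 0; i--) in_strides[i - 1] = in_strides[i] * in_dims[i];

        std::vector<float> dst(src.size());
        std::vector<size_t> idx(n, 0);  // current output coordinate
        for (size_t flat = 0; flat < dst.size(); flat++) {
            size_t src_off = 0;
            for (size_t i = 0; i < n; i++) src_off += idx[i] * in_strides[order[i]];
            dst[flat] = src[src_off];
            // Row-major increment of the output coordinate.
            for (size_t i = n; i-- > 0;) {
                if (++idx[i] < out_dims[i]) break;
                idx[i] = 0;
            }
        }
        return dst;
    }
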
index 80725d9..a1ee6bd 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include "test_graph.hpp"
 
 #include "single_layer_common.hpp"
+#include <ie_layers.h>
 #include <mkldnn_plugin/mkldnn_extension_utils.h>
 #include <inference_engine/cnn_network_impl.hpp>
 #include "tests_common.hpp"
+#include "ir_gen_helper.hpp"
+#include <math.h>
 
-
+using namespace InferenceEngine;
 using namespace ::testing;
 using namespace std;
 using namespace mkldnn;
+using namespace single_layer_tests;
 
 struct pooling_test_params {
-    struct {
-        size_t n;
-        size_t c;
-        size_t h;
-        size_t w;
-    } in;
-
-    size_t krn_w;
-    size_t krn_h;
-    size_t str_w;
-    size_t str_h;
-    size_t pad_w;
-    size_t pad_h;
+    // Formats: NCHW, NCDHW
+    vector<size_t> dims;
+    // Formats: WH, WHD (spatial dims listed width-first, reversed relative to NCHW/NCDHW)
+    vector<size_t> kernel;
+    vector<size_t> strides;
+    vector<size_t> pads_begin;
+    vector<size_t> pads_end;
+
+    PoolingLayer::PoolType _type;
+    bool _exclude_pad;
 
     size_t num_prim_desc;
 
     MKLDNNPlugin::impl_desc_type selectedType;
-    std::vector<MKLDNNPlugin::impl_desc_type> preferTypes;
+    vector<MKLDNNPlugin::impl_desc_type> preferTypes;
 
-    std::vector<std::function<void(MKLDNNPlugin::PrimitiveDescInfo)>> comp;
+    vector<std::function<void(MKLDNNPlugin::PrimitiveDescInfo)>> comp;
 };
 
 template <typename data_t>
 void ref_pool(const InferenceEngine::TBlob<data_t> &src, InferenceEngine::TBlob<data_t> &dst, pooling_test_params prm)
 {
-    size_t KW = prm.krn_w;
-    size_t KH = prm.krn_h;
-
-    size_t IW = prm.in.w;
-    size_t IH = prm.in.h;
-
-    size_t OW = (IW + 2 * prm.pad_w - prm.krn_w) / prm.str_w + 1;
-    size_t OH = (IH + 2 * prm.pad_h - prm.krn_h) / prm.str_h + 1;
-    size_t OC = prm.in.c;
+    int dims_size = prm.dims.size();
+
+    int KW = prm.kernel[X_AXIS];
+    int KH = prm.kernel[Y_AXIS];
+    int KD = dims_size == 5 ? prm.kernel[Z_AXIS] : 1;
+
+    int SW = prm.strides[X_AXIS];
+    int SH = prm.strides[Y_AXIS];
+    int SD = prm.strides.size() > Z_AXIS ? prm.strides[Z_AXIS] : 1;
+
+    int IW = prm.dims[dims_size - 1];
+    int IH = prm.dims[dims_size - 2];
+    int ID = dims_size == 5 ? prm.dims[dims_size - 3] : 1;
+
+    int PWB = prm.pads_begin[X_AXIS];
+    int PHB = prm.pads_begin[Y_AXIS];
+    int PDB = prm.pads_begin.size() > Z_AXIS ? prm.pads_begin[Z_AXIS] : 0;
+    int PWE = prm.pads_end[X_AXIS];
+    int PHE = prm.pads_end[Y_AXIS];
+    int PDE = prm.pads_end.size() > Z_AXIS ? prm.pads_end[Z_AXIS] : 0;
+
+    int OW = (IW + PWB + PWE - KW) / SW + 1;
+    int OH = (IH + PHB + PHE - KH) / SH + 1;
+    int OD = dims_size == 5 ? (ID + PDB + PDE - KD) / SD + 1 : 1;
+    int OC = prm.dims[1];
 
     const data_t *src_data = src.readOnly();
     data_t *dst_data = dst.data();
 
-    IE_ASSERT( OC == dst.dims()[2]);
-
-    for (size_t c = 0; c < OC; c++) {
-        for (size_t oh = 0; oh < OH; oh++) {
-            for (size_t ow = 0; ow < OW; ow++) {
-                size_t oidx = c * OH * OW
-                              + oh * OW + ow;
-                data_t out_ref = data_t(0);
-                bool is_initialized = false;
-                for (uint32_t kh = 0; kh < KH; kh++) {
-                    for (uint32_t kw = 0; kw < KW; kw++) {
-                        int32_t iw = ow * prm.str_w - prm.pad_w + kw;
-                        int32_t ih = oh * prm.str_h - prm.pad_h + kh;
-                        if (iw < 0 || iw >= IW || ih < 0
-                            || ih >= IH)
-                            continue;
-                        uint32_t iidx = c * IH * IW + ih * IW + iw;
-
-                        data_t d = src_data[iidx];
-                        if (!is_initialized) {
-                            out_ref = d;
-                            is_initialized = true;
-                        } else {
-                            if (out_ref < d)
-                                out_ref = d;
+    IE_ASSERT(OC == dst.dims()[dims_size - 2]);
+
+    int k1 = OH * OW,
+        k2 = k1 * OD,
+        k3 = IH * IW,
+        k4 = k3 * ID;
+
+    if (prm._type == PoolingLayer::MAX) {
+        for (int c = 0; c < OC; c++) {
+            int cc = c * k2;
+            for (int od = 0; od < OD; od++) {
+                int cd = cc + od * k1;
+                for (int oh = 0; oh < OH; oh++) {
+                    int ch = cd + oh * OW;
+                    for (int ow = 0; ow < OW; ow++) {
+
+                        int oidx = ch + ow;
+                        data_t out_ref = data_t(0);
+                        bool is_initialized = false;
+
+                        for (int kd = 0; kd < KD; kd++) {
+                            int id = dims_size == 5 ? od * SD - PDB + kd : 0;
+                            if (id < 0 || id >= ID) continue;
+                            for (int kh = 0; kh < KH; kh++) {
+                                int ih = oh * SH - PHB + kh;
+                                if (ih < 0 || ih >= IH) continue;
+                                for (int kw = 0; kw < KW; kw++) {
+                                    int iw = ow * SW - PWB + kw;
+                                    if (iw < 0 || iw >= IW) continue;
+                                    int iidx = c * k4
+                                                + id * k3
+                                                + ih * IW
+                                                + iw;
+
+                                    data_t d = src_data[iidx];
+                                    if (!is_initialized) {
+                                        out_ref = d;
+                                        is_initialized = true;
+                                    } else {
+                                        if (out_ref < d)
+                                            out_ref = d;
+                                    }
+                                }
+                            }
                         }
+                        dst_data[oidx] = out_ref;
                     }
                 }
-                dst_data[oidx] = out_ref;
             }
         }
-    }
+    } else if (prm._type == PoolingLayer::AVG) {
+
+        // Count padded positions in the averaging divisor only when exclude-pad
+        // is off and at least one begin pad is non-zero.
+        bool has_begin_pad = false;
+        for (auto pad : prm.pads_begin) {
+            if (pad) {
+                has_begin_pad = true;
+                break;
+            }
+        }
+        bool include_padding = !prm._exclude_pad && has_begin_pad;
+
+        int PDBKD = KD - PDB,
+            PHBKH = KH - PHB,
+            PWBKW = KW - PWB,
+            IDPDE = ID + PDE,
+            IHPHE = IH + PHE,
+            IWPWE = IW + PWE;
+
+        for (int c = 0; c < OC; c++) {
+            int cc = c * k2;
+            for (int od = 0; od < OD; od++) {
+                int cd = cc + od * k1;
+                int id_start = od * SD - PDB;
+                int id_end = std::min(od * SD + PDBKD, IDPDE);
+                for (int oh = 0; oh < OH; oh++) {
+                    int ch = cd + oh * OW;
+                    int ih_start = oh * SH - PHB;
+                    int ih_end = std::min(oh * SH + PHBKH, IHPHE);
+                    for (int ow = 0; ow < OW; ow++) {
+                        size_t oidx = ch + ow;
+                        dst_data[oidx] = (data_t)0;
+                        int iw_start = ow * SW - PWB;
+                        int iw_end = std::min(ow * SW + PWBKW, IWPWE);
+
+                        // Divisor over the full padded window (used when include_padding is set).
+                        double num_summands = (ih_end - ih_start) * (iw_end - iw_start) * (id_end - id_start);
+
+                        // Clamp into local copies so the per-plane/per-row start/end values are
+                        // not mutated across ow iterations (mutating them would change the
+                        // include-padding divisor after the first output column).
+                        int id_s = std::max(id_start, 0);
+                        int ih_s = std::max(ih_start, 0);
+                        int iw_s = std::max(iw_start, 0);
+                        int id_e = std::min(id_end, ID);
+                        int ih_e = std::min(ih_end, IH);
+                        int iw_e = std::min(iw_end, IW);
+
+                        if (!include_padding)
+                            num_summands = (id_e - id_s) * (ih_e - ih_s) * (iw_e - iw_s);
+                        if (num_summands == 0.0) continue;
+
+                        double dst = 0.0;
+                        for (int id = id_s; id < id_e; ++id) {
+                            for (int ih = ih_s; ih < ih_e; ++ih) {
+                                for (int iw = iw_s; iw < iw_e; ++iw) {
+                                    size_t iidx = c * k4
+                                                + id * k3
+                                                + ih * IW
+                                                + iw;
+
+                                    dst += (double)src_data[iidx];
+                                }
+                            }
+                        }
+
+                        dst_data[oidx] = (data_t)(dst / num_summands);
+                    }
+                }
+            }
+        }
+    }
 }
 
 class MKLDNNGraphPoolingTests: public TestsCommon,
                                      public WithParamInterface<pooling_test_params> {
-    std::string model_t = R"V0G0N(
-<Net Name="Pooling_Only" version="2" precision="FP32" batch="1">
-    <layers>
-        <layer name="in1" type="Input" precision="FP32" id="0">
-            <output>
-                <port id="0">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
-                </port>
-            </output>
-        </layer>
+    std::string layers_t = R"V0G0N(
         <layer name="pool" id="1" type="Pooling" precision="FP32">
 
-            <pooling stride-x="_SW_" stride-y="_SH_"
-                     pad-x="_PW_" pad-y="_PH_"
-                     kernel-x="_KW_" kernel-y="_KH_"
-                     method="MAX" round="Ceil" PrimitivesPriority="_IMPLS_"/>
+            <pooling kernel="_K_"
+                     strides="_KS_"
+                     pads_begin="_PB_" pads_end="_PE_"
+                     pool-method="_PM_" exclude-pad="_EP_" rounding_type="floor"
+                     PrimitivesPriority="_IMPLS_"/>
 
             <input>
                 <port id="1">
-                    <dim>_IN_</dim>
-                    <dim>_IC_</dim>
-                    <dim>_IH_</dim>
-                    <dim>_IW_</dim>
+                    __SRC_DIMS__
                 </port>
             </input>
             <output>
                 <port id="1">
                     <dim>_IN_</dim>
                     <dim>_IC_</dim>
-                    <dim>_OH_</dim>
-                    <dim>_OW_</dim>
+                    __DST_DIMS__
                 </port>
             </output>
         </layer>
-    </layers>
-    <edges>
+)V0G0N";
+
+    std::string edges_t = R"V0G0N(
         <edge from-layer="0" from-port="0" to-layer="1" to-port="1"/>
-    </edges>
-</Net>
 )V0G0N";
 
 protected:
     std::string getModel(pooling_test_params p) {
-        std::string model = model_t;
+        std::string model = layers_t;
 
-        REPLACE_WITH_NUM(model, "_IW_", p.in.w);
-        REPLACE_WITH_NUM(model, "_IH_", p.in.h);
-        REPLACE_WITH_NUM(model, "_IC_", p.in.c);
-        REPLACE_WITH_NUM(model, "_IN_", p.in.n);
+        std::string s_dims;
+        for (auto& dim : p.dims) {
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+        REPLACE_WITH_STR(model, "__SRC_DIMS__", s_dims);
+
+        s_dims = "";
+        int k_len = p.kernel.size();
+        for (size_t i = 2lu; i < p.dims.size(); i++) {
+            size_t inx = k_len - i + 1lu;
+            size_t dim = (p.dims[i] + p.pads_begin[inx] + p.pads_end[inx] - p.kernel[inx]) / p.strides[inx] + 1lu;
+            s_dims += "\n                    <dim>";
+            s_dims += std::to_string(dim) + "</dim>";
+        }
+        REPLACE_WITH_STR(model, "__DST_DIMS__", s_dims);
+
+        std::string pool_method;
+        switch (p._type) {
+            case PoolingLayer::AVG: pool_method = "avg";
+                break;
+            case PoolingLayer::ROI: pool_method = "roi";
+                break;
+            default: pool_method = "max";
+        }
+        REPLACE_WITH_STR(model, "_PM_", pool_method);
+
+        std::string exclude_pad = "false";
+        if (p._exclude_pad) exclude_pad = "true";
+        REPLACE_WITH_STR(model, "_EP_", exclude_pad);
 
-        REPLACE_WITH_NUM(model, "_KW_", p.krn_w);
-        REPLACE_WITH_NUM(model, "_KH_", p.krn_h);
-        REPLACE_WITH_NUM(model, "_SW_", p.str_w);
-        REPLACE_WITH_NUM(model, "_SH_", p.str_h);
-        REPLACE_WITH_NUM(model, "_PW_", p.pad_w);
-        REPLACE_WITH_NUM(model, "_PH_", p.pad_h);
+        REPLACE_WITH_NUM(model, "_IN_", p.dims[0]);
+        REPLACE_WITH_NUM(model, "_IC_", p.dims[1]);
 
-        REPLACE_WITH_NUM(model, "_OW_", (p.in.w + 2 * p.pad_w - p.krn_w) / p.str_w + 1);
-        REPLACE_WITH_NUM(model, "_OH_", (p.in.h + 2 * p.pad_h - p.krn_h) / p.str_h + 1);
+        REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_K_", p.kernel);
+        REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_KS_", p.strides);
+        REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_PB_", p.pads_begin);
+        REPLACE_WITH_NUM_VECTOR_REVERSE(model, "_PE_", p.pads_end);
 
         std::string impls;
         for (const auto& preferType : p.preferTypes) {
@@ -164,6 +270,9 @@ protected:
             impls += "cpu:" + MKLDNNGraphTestClass::getStrPrimitiveDescriptorType(preferType);
         }
         REPLACE_WITH_STR(model, "_IMPLS_", impls);
+
+        model = IRTemplateGenerator::getIRTemplate("Pooling_Only", p.dims, "FP32", model, edges_t);
+
         return model;
     }
 
@@ -193,9 +302,18 @@ protected:
                 }
             }
 
-            InferenceEngine::SizeVector dims_src = {p.in.n, p.in.c, p.in.h, p.in.w};
+            InferenceEngine::Layout layout = ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src =
+                InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, p.dims);
             src->allocate();
             fill_data(src->buffer(), src->size());
 
@@ -225,7 +343,7 @@ protected:
 
             ref_pool(*srcPtr, dst_ref, p);
 
-            compare(*output, dst_ref);
+            compare(*output, dst_ref, 0.0001f);
         } catch (const InferenceEngine::details::InferenceEngineException &e) {
             FAIL() << e.what();
         }
@@ -237,12 +355,64 @@ TEST_P(MKLDNNGraphPoolingTests, TestsPooling) {}
 INSTANTIATE_TEST_CASE_P(
         TestsPooling, MKLDNNGraphPoolingTests,
         ::testing::Values(
-                pooling_test_params{{1, 3, 228, 228}, 2, 2, 2, 2, 0, 0, 6, MKLDNNPlugin::impl_desc_type::jit},
-                pooling_test_params{{1, 3, 228, 228}, 4, 2, 2, 2, 0, 0, 4, MKLDNNPlugin::impl_desc_type::jit},
-                pooling_test_params{{1, 3, 228, 228}, 4, 2, 2, 1, 0, 0, 4, MKLDNNPlugin::impl_desc_type::jit},
-                pooling_test_params{{1, 3, 228, 228}, 2, 2, 2, 2, 0, 0, 6, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
-                pooling_test_params{{1, 3, 228, 228}, 4, 2, 2, 2, 0, 0, 4, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
-                pooling_test_params{{1, 3, 228, 228}, 4, 2, 2, 1, 0, 0, 4, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}}));
+        /*0*/   pooling_test_params{{1, 3, 228, 228}, {2, 2}, {2, 2}, {0, 0}, {0, 0}, PoolingLayer::MAX, false, 6, MKLDNNPlugin::impl_desc_type::jit},
+                pooling_test_params{{1, 3, 228, 228}, {4, 2}, {2, 2}, {0, 0}, {0, 0}, PoolingLayer::MAX, false, 4, MKLDNNPlugin::impl_desc_type::jit},
+                pooling_test_params{{1, 3, 228, 228}, {4, 2}, {2, 1}, {0, 0}, {0, 0}, PoolingLayer::MAX, false, 4, MKLDNNPlugin::impl_desc_type::jit},
+                pooling_test_params{{1, 3, 228, 228}, {2, 2}, {2, 2}, {0, 0}, {0, 0}, PoolingLayer::MAX, false, 6, MKLDNNPlugin::impl_desc_type::ref,
+                            {MKLDNNPlugin::impl_desc_type::ref_any}},
+                pooling_test_params{{1, 3, 228, 228}, {4, 2}, {2, 2}, {0, 0}, {0, 0}, PoolingLayer::MAX, false, 4, MKLDNNPlugin::impl_desc_type::ref,
+                            {MKLDNNPlugin::impl_desc_type::ref_any}},
+                pooling_test_params{{1, 3, 228, 228}, {4, 2}, {2, 1}, {0, 0}, {0, 0}, PoolingLayer::MAX, false, 4, MKLDNNPlugin::impl_desc_type::ref,
+                            {MKLDNNPlugin::impl_desc_type::ref_any}},
+                pooling_test_params{{1u, 4u, 128u, 128u}, {2u, 2u}, {2u, 2u}, {1u, 0u}, {0u, 0u}, PoolingLayer::AVG, false, 3u,
+                            MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                pooling_test_params{{1u, 4u, 128u, 128u}, {2u, 2u}, {2u, 2u}, {1u, 0u}, {0u, 0u}, PoolingLayer::AVG, false, 3u,
+                            MKLDNNPlugin::impl_desc_type::jit },
+                pooling_test_params{{1u, 4u, 128u, 128u}, {2u, 2u}, {2u, 2u}, {0u, 0u}, {0u, 0u}, PoolingLayer::AVG, true, 3u,
+                            MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+        /*9*/   pooling_test_params{{1u, 4u, 128u, 128u}, {2u, 2u}, {2u, 2u}, {0u, 0u}, {0u, 0u}, PoolingLayer::AVG, true, 3u,
+                            MKLDNNPlugin::impl_desc_type::jit },
+                pooling_test_params{{1u, 4u, 128u, 128u}, {2u, 2u}, {2u, 2u}, {2u, 2u}, {2u, 2u}, PoolingLayer::AVG, true, 3u,
+                            MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                pooling_test_params{{1u, 4u, 128u, 128u}, {2u, 2u}, {2u, 2u}, {2u, 2u}, {2u, 2u}, PoolingLayer::AVG, false, 3u,
+                            MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                pooling_test_params{{1u, 4u, 128u, 128u}, {2u, 2u}, {2u, 2u}, {2u, 2u}, {2u, 2u}, PoolingLayer::MAX, false, 3u,
+                            MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                // TODO: fix the jit implementation for non-zero end paddings.
+//                pooling_test_params{{1u, 4u, 128u, 128u}, {2u, 2u}, {2u, 2u}, {2u, 2u}, {2u, 0u}, PoolingLayer::AVG, true, 3u,
+//                            MKLDNNPlugin::impl_desc_type::jit },
+//                pooling_test_params{{1u, 4u, 128u, 128u}, {2u, 2u}, {2u, 2u}, {2u, 2u}, {2u, 0u}, PoolingLayer::AVG, false, 3u,
+//                            MKLDNNPlugin::impl_desc_type::jit },
+//                pooling_test_params{{1u, 4u, 128u, 128u}, {2u, 2u}, {2u, 2u}, {2u, 2u}, {2u, 0u}, PoolingLayer::MAX, false, 3u,
+//                            MKLDNNPlugin::impl_desc_type::jit },
+
+                // 5D tensor
+                pooling_test_params{{1u, 3u, 16u, 32u, 32u}, {2u, 2u, 2u}, {1u, 1u, 1u}, {0u, 0u, 0u}, {0u, 0u, 0u}, PoolingLayer::MAX, false, 3u,
+                            MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                pooling_test_params{{1u, 3u, 16u, 32u, 32u}, {2u, 2u, 2u}, {1u, 1u, 1u}, {0u, 0u, 0u}, {0u, 0u, 0u}, PoolingLayer::MAX, false, 3u,
+                            MKLDNNPlugin::impl_desc_type::jit },
+                pooling_test_params{{1u, 3u, 16u, 32u, 32u}, {2u, 2u, 2u}, {1u, 1u, 1u}, {1u, 1u, 1u}, {1u, 1u, 1u}, PoolingLayer::MAX, false, 3u,
+                            MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                pooling_test_params{{1u, 32u, 60u, 60u, 60u}, {2u, 3u, 4u}, {2u, 2u, 2u}, {1u, 1u, 1u}, {1u, 2u, 3u}, PoolingLayer::MAX, false, 3u,
+                            MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+        /*20*/  pooling_test_params{{1u, 3u, 16u, 32u, 32u}, {2u, 2u, 2u}, {1u, 1u, 1u}, {1u, 2u, 3u}, {1u, 2u, 3u}, PoolingLayer::MAX, false, 3u,
+                            MKLDNNPlugin::impl_desc_type::jit },
+                pooling_test_params{{1u, 4u, 128u, 128u, 128u}, {2u, 2u, 2u}, {2u, 2u, 2u}, {1u, 0u, 0u}, {0u, 0u, 0u}, PoolingLayer::AVG, false, 3u,
+                            MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                pooling_test_params{{1u, 4u, 128u, 128u, 128u}, {2u, 2u, 2u}, {2u, 2u, 2u}, {1u, 0u, 0u}, {0u, 0u, 0u}, PoolingLayer::AVG, false, 3u,
+                            MKLDNNPlugin::impl_desc_type::jit },
+                pooling_test_params{{1u, 4u, 128u, 128u, 128u}, {2u, 2u, 2u}, {2u, 2u, 2u}, {0u, 0u, 0u}, {0u, 0u, 0u}, PoolingLayer::AVG, true, 3u,
+                            MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                pooling_test_params{{1u, 4u, 128u, 128u, 128u}, {2u, 2u, 2u}, {2u, 2u, 2u}, {0u, 0u, 0u}, {0u, 0u, 0u}, PoolingLayer::AVG, true, 3u,
+                            MKLDNNPlugin::impl_desc_type::jit },
+                pooling_test_params{{1u, 4u, 128u, 128u, 128u}, {2u, 2u, 2u}, {2u, 2u, 2u}, {2u, 2u, 2u}, {0u, 0u, 0u}, PoolingLayer::AVG, true, 3u,
+                            MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                pooling_test_params{{1u, 4u, 128u, 128u, 128u}, {2u, 2u, 2u}, {2u, 2u, 2u}, {2u, 2u, 2u}, {2u, 2u, 2u}, PoolingLayer::AVG, true, 3u,
+                            MKLDNNPlugin::impl_desc_type::jit },
+                pooling_test_params{{1u, 4u, 128u, 128u, 128u}, {2u, 2u, 2u}, {2u, 2u, 2u}, {2u, 2u, 2u}, {2u, 2u, 2u}, PoolingLayer::AVG, false, 3u,
+                            MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                pooling_test_params{{1u, 4u, 128u, 128u, 128u}, {2u, 2u, 2u}, {2u, 2u, 2u}, {2u, 2u, 2u}, {2u, 2u, 2u}, PoolingLayer::AVG, false, 3u,
+                            MKLDNNPlugin::impl_desc_type::jit } ));
 
 
 class MKLDNNGraphDynBatchPoolingTests: public MKLDNNGraphPoolingTests {
@@ -252,7 +422,7 @@ protected:
             TestsCommon::SetUp();
             pooling_test_params p = ::testing::WithParamInterface<pooling_test_params>::GetParam();
             std::string model = getModel(p);
-            size_t MB = p.in.n;
+            size_t MB = p.dims[0];
             if (MB < 2)
                 MB = 2;
 
@@ -269,9 +439,18 @@ protected:
             graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}});
             graph.CreateGraph(net_reader.getNetwork());
 
-            InferenceEngine::SizeVector dims_src = {MB, p.in.c, p.in.h, p.in.w};
 
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Layout layout = ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
+            InferenceEngine::Blob::Ptr src =
+                InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, p.dims);
             src->allocate();
             fill_data(src->buffer(), src->size());
 
@@ -310,7 +489,7 @@ TEST_P(MKLDNNGraphDynBatchPoolingTests, TestsDynBatchPooling) {}
 INSTANTIATE_TEST_CASE_P(
         TestsDynBatchPooling, MKLDNNGraphDynBatchPoolingTests,
         ::testing::Values(
-                pooling_test_params{{1, 3, 228, 228}, 4, 2, 2, 1, 0, 0, 4, MKLDNNPlugin::impl_desc_type::jit},
-                pooling_test_params{{1, 3, 228, 228}, 2, 2, 2, 2, 0, 0, 6, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
-                pooling_test_params{{1, 3, 228, 228}, 4, 2, 2, 2, 0, 0, 4, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
-                pooling_test_params{{1, 3, 228, 228}, 4, 2, 2, 1, 0, 0, 4, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}}));
+                pooling_test_params{{1, 3, 228, 228}, {4, 2}, {2, 1}, {0, 0}, {0, 0}, PoolingLayer::MAX, false, 4, MKLDNNPlugin::impl_desc_type::jit},
+                pooling_test_params{{1, 3, 228, 228}, {2, 2}, {2, 2}, {0, 0}, {0, 0}, PoolingLayer::MAX, false, 6, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                pooling_test_params{{1, 3, 228, 228}, {4, 2}, {2, 2}, {0, 0}, {0, 0}, PoolingLayer::MAX, false, 4, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}},
+                pooling_test_params{{1, 3, 228, 228}, {4, 2}, {2, 1}, {0, 0}, {0, 0}, PoolingLayer::MAX, false, 4, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref_any}}));
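
A condensed sanity reference for the parameter layout above (dims, kernel, strides, pads_begin, pads_end, pool type, exclude-pad, ...): each output extent follows floor((I + pad_begin + pad_end - K) / S) + 1, matching the __DST_DIMS__ substitution in getModel(), and exclude-pad only changes the averaging divisor. The 2-D sketch below is illustrative only; note the in-tree reference additionally falls back to the in-bounds divisor when all begin pads are zero.

    #include <algorithm>
    #include <vector>

    // Pooled extent of one axis, as used for the expected output dims.
    static int pooled_extent(int I, int K, int S, int pad_begin, int pad_end) {
        return (I + pad_begin + pad_end - K) / S + 1;
    }

    // One 2-D average-pooling output value. With exclude_pad the divisor counts
    // only in-bounds elements; otherwise it counts the window clipped to the
    // end-padded extent, as in the reference implementation above.
    static float avg_at(const std::vector<float>& src, int IH, int IW,
                        int oh, int ow, int KH, int KW, int SH, int SW,
                        int phb, int pwb, int phe, int pwe, bool exclude_pad) {
        int ih0 = oh * SH - phb, ih1 = std::min(oh * SH - phb + KH, IH + phe);
        int iw0 = ow * SW - pwb, iw1 = std::min(ow * SW - pwb + KW, IW + pwe);
        double divisor = double(ih1 - ih0) * (iw1 - iw0);   // include-padding count
        int h0 = std::max(ih0, 0), h1 = std::min(ih1, IH);
        int w0 = std::max(iw0, 0), w1 = std::min(iw1, IW);
        if (exclude_pad) divisor = double(h1 - h0) * (w1 - w0);
        double sum = 0.0;
        for (int h = h0; h < h1; h++)
            for (int w = w0; w < w1; w++)
                sum += src[h * IW + w];
        return divisor > 0.0 ? float(sum / divisor) : 0.0f;
    }
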
index 53feb58..ce860c2 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -20,12 +19,8 @@ using namespace mkldnn;
 
 
 struct relu_test_params {
-    struct {
-        size_t n;
-        size_t c;
-        size_t h;
-        size_t w;
-    } in;
+    // Formats: NCHW, NCDHW
+    vector<size_t> dims;
 
     float n_clope;
 
@@ -39,22 +34,29 @@ struct relu_test_params {
 template <typename data_t>
 void ref_relu(const InferenceEngine::TBlob<data_t> &src, InferenceEngine::TBlob<data_t> &dst, relu_test_params prm)
 {
-    size_t IW = prm.in.w;
-    size_t IH = prm.in.h;
-    size_t IC = prm.in.c;
+    auto dims_size = src.dims().size();
+
+    size_t IW = src.dims()[dims_size - 1];
+    size_t IH = src.dims()[dims_size - 2];
+    size_t ID = dims_size == 5 ? src.dims()[dims_size - 3] : 1u;
+    size_t IC = src.dims()[1];
 
     const data_t *src_data = src.readOnly();
     data_t *dst_data = dst.data();
 
     for (uint32_t c = 0; c < IC; c++) {
-        for (uint32_t h = 0; h < IH; h++) {
-            for (uint32_t w = 0; w < IW; w++) {
-                uint32_t oidx = c * IH * IW
-                                + h * IW + w;
-
-                dst_data[oidx] = src_data[oidx] >= 0.0 ?
-                                 src_data[oidx] :
-                                 src_data[oidx] * prm.n_clope;
+        for (uint32_t d = 0; d < ID; d++) {
+            for (uint32_t h = 0; h < IH; h++) {
+                for (uint32_t w = 0; w < IW; w++) {
+                    uint32_t oidx = c * ID * IH * IW
+                                    + d * IH * IW
+                                    + h * IW
+                                    + w;
+
+                    dst_data[oidx] = src_data[oidx] >= 0.0 ?
+                                     src_data[oidx] :
+                                     src_data[oidx] * prm.n_clope;
+                }
             }
         }
     }
@@ -63,13 +65,14 @@ void ref_relu(const InferenceEngine::TBlob<data_t> &src, InferenceEngine::TBlob<
 class MKLDNNGraphReluTests: public TestsCommon,
                                      public WithParamInterface<relu_test_params> {
     std::string model_t = R"V0G0N(
-<Net Name="Relu_Only" version="2" precision="FP32" batch="1">
+<Net Name="Relu_Only" version="3" precision="FP32" batch="1">
     <layers>
         <layer name="in1" type="Input" precision="FP32" id="0">
             <output>
                 <port id="0">
                     <dim>_IN_</dim>
                     <dim>_IC_</dim>
+                    <dim>_ID_</dim>
                     <dim>_IH_</dim>
                     <dim>_IW_</dim>
                 </port>
@@ -80,6 +83,7 @@ class MKLDNNGraphReluTests: public TestsCommon,
                 <port id="1">
                     <dim>_IN_</dim>
                     <dim>_IC_</dim>
+                    <dim>_ID_</dim>
                     <dim>_IH_</dim>
                     <dim>_IW_</dim>
                 </port>
@@ -88,6 +92,7 @@ class MKLDNNGraphReluTests: public TestsCommon,
                 <port id="2">
                     <dim>_IN_</dim>
                     <dim>_IC_</dim>
+                    <dim>_ID_</dim>
                     <dim>_IH_</dim>
                     <dim>_IW_</dim>
                 </port>
@@ -102,11 +107,24 @@ class MKLDNNGraphReluTests: public TestsCommon,
 
     std::string getModel(relu_test_params p) {
         std::string model = model_t;
+        auto dims_size = p.dims.size();
+
+        switch (dims_size) {
+            case 3:
+                REMOVE_LINE(model, "<dim>_IH_</dim>");
+                // fall through: a 3-D shape drops the depth dim as well
+            case 4:
+                REMOVE_LINE(model, "<dim>_ID_</dim>");
+        }
 
-        REPLACE_WITH_NUM(model, "_IW_", p.in.w);
-        REPLACE_WITH_NUM(model, "_IH_", p.in.h);
-        REPLACE_WITH_NUM(model, "_IC_", p.in.c);
-        REPLACE_WITH_NUM(model, "_IN_", p.in.n);
+        REPLACE_WITH_NUM(model, "_IW_", p.dims[dims_size - 1]);
+        REPLACE_WITH_NUM(model, "_IC_", p.dims[1]);
+        REPLACE_WITH_NUM(model, "_IN_", p.dims[0]);
+        switch (dims_size) {
+            case 5:
+                REPLACE_WITH_NUM(model, "_ID_", p.dims[dims_size - 3]);
+                // fall through: 5-D also substitutes the height dim
+            case 4:
+                REPLACE_WITH_NUM(model, "_IH_", p.dims[dims_size - 2]);
+        }
 
         return model;
     }
@@ -138,9 +156,18 @@ protected:
                 }
             }
 
-            InferenceEngine::SizeVector dims_src = {p.in.n, p.in.c, p.in.h, p.in.w};
+            InferenceEngine::SizeVector dims_src = p.dims;
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src->allocate();
             fill_data(src->buffer(), src->size());
 
@@ -170,7 +197,7 @@ protected:
 
             ref_relu(*srcPtr, dst_ref, p);
 
-            compare(*output, dst_ref);
+            compare(*output, dst_ref, 0.0005f);
         } catch (const InferenceEngine::details::InferenceEngineException &e) {
             FAIL() << e.what();
         }
@@ -199,4 +226,22 @@ INSTANTIATE_TEST_CASE_P(
                                     ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
                                     ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
                                 }
-                        }}));
+                        }},
+                relu_test_params{
+                        {1, 64, 32, 32, 32}, 0.0f, 3, MKLDNNPlugin::impl_desc_type::ref_any, {
+                                [](MKLDNNPlugin::PrimitiveDescInfo impl) {
+                                    ASSERT_TRUE(impl.getImplementationType() & MKLDNNPlugin::impl_desc_type::ref_any);
+                                    ASSERT_EQ(1, impl.getConfig().inConfs.size());
+                                    ASSERT_EQ(1, impl.getConfig().outConfs.size());
+                                    ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(0).desc.getLayout());
+                                    ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().outConfs.at(0).desc.getLayout());
+                                },
+                                [](MKLDNNPlugin::PrimitiveDescInfo impl) {
+                                    ASSERT_TRUE(impl.getImplementationType() & MKLDNNPlugin::impl_desc_type::ref_any);
+                                    ASSERT_EQ(1, impl.getConfig().inConfs.size());
+                                    ASSERT_EQ(1, impl.getConfig().outConfs.size());
+                                    ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(0).desc.getLayout());
+                                    ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().outConfs.at(0).desc.getLayout());
+                                }
+                        }}
+        ));
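
The ref_any expectations above are flag tests: impl_desc_type values appear to combine bit flags, so the intended membership check is a bitwise AND against a non-zero mask (an OR against a non-zero flag would make the assertion vacuously true). A one-line sketch of the predicate, assuming the plain-enum flag semantics the ASSERT_TRUE usage implies:

    // Sketch: flag-membership test for implementation descriptors.
    bool matches_ref_any(MKLDNNPlugin::impl_desc_type t) {
        return (t & MKLDNNPlugin::impl_desc_type::ref_any) != 0;
    }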
index 50b2900..d85aaa5 100644
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -81,14 +80,14 @@ __DST_DIMS__
 
                std::string src_dims;
                for (auto& dim : p.in) {
-                       src_dims += "<dim>";
+                       src_dims += "                    <dim>";
                        src_dims += std::to_string(dim) + "</dim>\n";
                }
                REPLACE_WITH_STR(model, "__SRC_DIMS__", src_dims);
 
                std::string dst_dims;
                for (auto& dim : p.out) {
-                       dst_dims += "<dim>";
+                       dst_dims += "\t\t<dim>";
                        dst_dims += std::to_string(dim) + "</dim>\n";
                }
                REPLACE_WITH_STR(model, "__DST_DIMS__", dst_dims);
@@ -176,7 +175,7 @@ TEST_P(MKLDNNGraphReshapeTests, TestsReshape) {}
 INSTANTIATE_TEST_CASE_P(
         TestsReshape, MKLDNNGraphReshapeTests,
         ::testing::Values(
-        reshape_test_params{ {1, 3, 228, 228}, {1, 24, 2, 3249}, {1, 24, 2, 3249}, 0, -1, 2,
+        reshape_test_params{ {1, 3, 228, 228}, {1, 24, 2, 3249}, {1, 24, 2, 3249}, 0, -1, 1,
             MKLDNNPlugin::impl_desc_type::unknown, { [](MKLDNNPlugin::PrimitiveDescInfo impl) {
                 ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
                 ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -184,7 +183,7 @@ INSTANTIATE_TEST_CASE_P(
                 ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
                 ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } },
-        reshape_test_params{ { 4 },{ 2, 2 },{ 2, 2 }, 0, -1, 2,
+        reshape_test_params{ { 4 },{ 2, 2 },{ 2, 2 }, 0, -1, 1,
             MKLDNNPlugin::impl_desc_type::unknown,{ [](MKLDNNPlugin::PrimitiveDescInfo impl) {
             ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
             ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -192,7 +191,7 @@ INSTANTIATE_TEST_CASE_P(
             ASSERT_EQ(InferenceEngine::Layout::C, impl.getConfig().inConfs.at(0).desc.getLayout());
             ASSERT_EQ(InferenceEngine::Layout::NC, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } },
-        reshape_test_params{ { 4 },{ 1, 2, 2 },{ 1, 2, 2 }, 0, -1, 2,
+        reshape_test_params{ { 4 },{ 1, 2, 2 },{ 1, 2, 2 }, 0, -1, 1,
             MKLDNNPlugin::impl_desc_type::unknown,{ [](MKLDNNPlugin::PrimitiveDescInfo impl) {
             ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
             ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -200,7 +199,7 @@ INSTANTIATE_TEST_CASE_P(
             ASSERT_EQ(InferenceEngine::Layout::C, impl.getConfig().inConfs.at(0).desc.getLayout());
             ASSERT_EQ(InferenceEngine::Layout::CHW, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } },
-        reshape_test_params{ { 4 },{ 1, 4, 1, 1 },{ 1, 4, 1, 1 }, 0, -1, 2,
+        reshape_test_params{ { 4 },{ 1, 4, 1, 1 },{ 1, 4, 1, 1 }, 0, -1, 1,
             MKLDNNPlugin::impl_desc_type::unknown,{ [](MKLDNNPlugin::PrimitiveDescInfo impl) {
             ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
             ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -208,7 +207,7 @@ INSTANTIATE_TEST_CASE_P(
             ASSERT_EQ(InferenceEngine::Layout::C, impl.getConfig().inConfs.at(0).desc.getLayout());
             ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } },
-        reshape_test_params{ { 4, 4 },{ 1, 4, 4 },{ 1, 4, 4 }, 0, -1, 2,
+        reshape_test_params{ { 4, 4 },{ 1, 4, 4 },{ 1, 4, 4 }, 0, -1, 1,
             MKLDNNPlugin::impl_desc_type::unknown,{ [](MKLDNNPlugin::PrimitiveDescInfo impl) {
             ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
             ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -216,7 +215,7 @@ INSTANTIATE_TEST_CASE_P(
             ASSERT_EQ(InferenceEngine::Layout::NC, impl.getConfig().inConfs.at(0).desc.getLayout());
             ASSERT_EQ(InferenceEngine::Layout::CHW, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } },
-        reshape_test_params{ { 4, 4 },{ 1, 4, 2, 2 },{ 1, 4, 2, 2 }, 0, -1, 2,
+        reshape_test_params{ { 4, 4 },{ 1, 4, 2, 2 },{ 1, 4, 2, 2 }, 0, -1, 1,
             MKLDNNPlugin::impl_desc_type::unknown,{ [](MKLDNNPlugin::PrimitiveDescInfo impl) {
             ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
             ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -224,7 +223,7 @@ INSTANTIATE_TEST_CASE_P(
             ASSERT_EQ(InferenceEngine::Layout::NC, impl.getConfig().inConfs.at(0).desc.getLayout());
             ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } },
-        reshape_test_params{ { 4, 2, 2 },{ 1, 4, 2, 2 },{ 1, 4, 2, 2 }, 0, -1, 2,
+        reshape_test_params{ { 4, 2, 2 },{ 1, 4, 2, 2 },{ 1, 4, 2, 2 }, 0, -1, 1,
             MKLDNNPlugin::impl_desc_type::unknown,{ [](MKLDNNPlugin::PrimitiveDescInfo impl) {
             ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
             ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -232,7 +231,7 @@ INSTANTIATE_TEST_CASE_P(
             ASSERT_EQ(InferenceEngine::Layout::CHW, impl.getConfig().inConfs.at(0).desc.getLayout());
             ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } },
-        reshape_test_params{ { 2, 2 },{ 4 },{ 4 }, 0, -1, 2,
+        reshape_test_params{ { 2, 2 },{ 4 },{ 4 }, 0, -1, 1,
             MKLDNNPlugin::impl_desc_type::unknown,{ [](MKLDNNPlugin::PrimitiveDescInfo impl) {
             ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
             ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -240,7 +239,7 @@ INSTANTIATE_TEST_CASE_P(
             ASSERT_EQ(InferenceEngine::Layout::NC, impl.getConfig().inConfs.at(0).desc.getLayout());
             ASSERT_EQ(InferenceEngine::Layout::C, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } },
-            reshape_test_params{ { 1, 2, 2 },{ 4 },{ 4 }, 0, -1, 2,
+        reshape_test_params{ { 1, 2, 2 },{ 4 },{ 4 }, 0, -1, 1,
             MKLDNNPlugin::impl_desc_type::unknown,{ [](MKLDNNPlugin::PrimitiveDescInfo impl) {
             ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
             ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -248,7 +247,7 @@ INSTANTIATE_TEST_CASE_P(
             ASSERT_EQ(InferenceEngine::Layout::CHW, impl.getConfig().inConfs.at(0).desc.getLayout());
             ASSERT_EQ(InferenceEngine::Layout::C, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } },
-        reshape_test_params{ { 1, 1, 2, 2 },{ 4 },{ 4 }, 0, -1, 2,
+        reshape_test_params{ { 1, 1, 2, 2 },{ 4 },{ 4 }, 0, -1, 1,
             MKLDNNPlugin::impl_desc_type::unknown,{ [](MKLDNNPlugin::PrimitiveDescInfo impl) {
             ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
             ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -256,7 +255,7 @@ INSTANTIATE_TEST_CASE_P(
             ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
             ASSERT_EQ(InferenceEngine::Layout::C, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } },
-        reshape_test_params{ { 4, 2, 2 },{ 4, 4 },{ 4, 4 }, 0, -1, 2,
+        reshape_test_params{ { 4, 2, 2 },{ 4, 4 },{ 4, 4 }, 0, -1, 1,
             MKLDNNPlugin::impl_desc_type::unknown,{ [](MKLDNNPlugin::PrimitiveDescInfo impl) {
             ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
             ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -264,7 +263,7 @@ INSTANTIATE_TEST_CASE_P(
             ASSERT_EQ(InferenceEngine::Layout::CHW, impl.getConfig().inConfs.at(0).desc.getLayout());
             ASSERT_EQ(InferenceEngine::Layout::NC, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } },
-        reshape_test_params{ { 1, 4, 2, 2 },{ 4, 4 },{ 4, 4 }, 0, -1, 2,
+        reshape_test_params{ { 1, 4, 2, 2 },{ 4, 4 },{ 4, 4 }, 0, -1, 1,
             MKLDNNPlugin::impl_desc_type::unknown,{ [](MKLDNNPlugin::PrimitiveDescInfo impl) {
             ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
             ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -272,7 +271,7 @@ INSTANTIATE_TEST_CASE_P(
             ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
             ASSERT_EQ(InferenceEngine::Layout::NC, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } },
-        reshape_test_params{ { 1, 4, 2, 2 },{ 4, 2, 2 },{ 4, 2, 2 }, 0, -1, 2,
+        reshape_test_params{ { 1, 4, 2, 2 },{ 4, 2, 2 },{ 4, 2, 2 }, 0, -1, 1,
             MKLDNNPlugin::impl_desc_type::unknown,{ [](MKLDNNPlugin::PrimitiveDescInfo impl) {
             ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
             ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -280,20 +279,28 @@ INSTANTIATE_TEST_CASE_P(
             ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
             ASSERT_EQ(InferenceEngine::Layout::CHW, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } },
-        reshape_test_params{ { 1, 4, 2, 2 },{ 4, 2, 2, 1, 1 },{ 4, 2, 2, 1, 1 }, 0, -1, 2,
-            MKLDNNPlugin::impl_desc_type::unknown,{ [](MKLDNNPlugin::PrimitiveDescInfo impl) {
+        reshape_test_params{ { 1, 4, 2, 2 }, { 4, 2, 2, 1, 1 }, { 4, 2, 2, 1, 1 }, 0, -1, 1,
+            MKLDNNPlugin::impl_desc_type::unknown, { [](MKLDNNPlugin::PrimitiveDescInfo impl) {
             ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
             ASSERT_EQ(1, impl.getConfig().inConfs.size());
             ASSERT_EQ(1, impl.getConfig().outConfs.size());
             ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().inConfs.at(0).desc.getLayout());
-            ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().outConfs.at(0).desc.getLayout());
+            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } },
-        reshape_test_params{ { 4, 2, 2, 1, 1 },{ 1, 4, 2, 2 },{ 1, 4, 2, 2 }, 0, -1, 2,
-            MKLDNNPlugin::impl_desc_type::unknown,{ [](MKLDNNPlugin::PrimitiveDescInfo impl) {
+        reshape_test_params{ { 4, 2, 2, 1, 1 }, { 1, 4, 2, 2 }, { 1, 4, 2, 2 }, 0, -1, 1,
+            MKLDNNPlugin::impl_desc_type::unknown, { [](MKLDNNPlugin::PrimitiveDescInfo impl) {
             ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
             ASSERT_EQ(1, impl.getConfig().inConfs.size());
             ASSERT_EQ(1, impl.getConfig().outConfs.size());
-            ASSERT_EQ(InferenceEngine::Layout::BLOCKED, impl.getConfig().inConfs.at(0).desc.getLayout());
+            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().inConfs.at(0).desc.getLayout());
             ASSERT_EQ(InferenceEngine::Layout::NCHW, impl.getConfig().outConfs.at(0).desc.getLayout());
+        } } },
+        reshape_test_params{ { 1, 200 }, { 1, 200, 1, 1, 1 }, { 1, 200, 1, 1, 1 }, 0, -1, 1,
+            MKLDNNPlugin::impl_desc_type::unknown, { [](MKLDNNPlugin::PrimitiveDescInfo impl) {
+            ASSERT_EQ(MKLDNNPlugin::impl_desc_type::unknown, impl.getImplementationType());
+            ASSERT_EQ(1, impl.getConfig().inConfs.size());
+            ASSERT_EQ(1, impl.getConfig().outConfs.size());
+            ASSERT_EQ(InferenceEngine::Layout::NC, impl.getConfig().inConfs.at(0).desc.getLayout());
+            ASSERT_EQ(InferenceEngine::Layout::NCDHW, impl.getConfig().outConfs.at(0).desc.getLayout());
         } } }
 ));
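
Taken together, these reshape expectations encode a rank-to-default-layout convention, with 5D tensors now reported as NCDHW rather than the generic BLOCKED layout. Summarized as a sketch:

    // Sketch: default layout per tensor rank, as asserted above.
    InferenceEngine::Layout default_layout(size_t rank) {
        switch (rank) {
            case 1: return InferenceEngine::C;
            case 2: return InferenceEngine::NC;
            case 3: return InferenceEngine::CHW;
            case 4: return InferenceEngine::NCHW;
            case 5: return InferenceEngine::NCDHW;  // previously BLOCKED
            default: return InferenceEngine::ANY;
        }
    }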
index cf63450..e253a82 100644
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -19,26 +18,11 @@ using namespace std;
 using namespace mkldnn;
 
 struct split_test_params {
-    struct {
-        size_t n;
-        size_t c;
-        size_t h;
-        size_t w;
-    } in;
-
-    struct {
-        size_t n;
-        size_t c;
-        size_t h;
-        size_t w;
-    } out1;
-
-    struct {
-        size_t n;
-        size_t c;
-        size_t h;
-        size_t w;
-    } out2;
+    // Formats: NCHW, NCDHW
+    vector<size_t> dims;
+    std::vector<vector<size_t>> outs;
+
+    int axis;
 
     size_t num_prim_desc;
 
@@ -49,133 +33,120 @@ struct split_test_params {
 };
 
 template <typename data_t>
-void ref_split(InferenceEngine::TBlob<data_t> &src, InferenceEngine::TBlob<data_t> &dst1, InferenceEngine::TBlob<data_t> &dst2) {
+void ref_split(InferenceEngine::TBlob<data_t> &src, std::vector<InferenceEngine::TBlob<data_t>>& dsts, split_test_params& prm) {
     const float * srcData = src.readOnly();
 
-    int MB = dst1.dims()[dst1.dims().size() - 1];
-
-    float * dstData1 = dst1.data();
-    int dstSize1 = dst1.size() / MB;
+    int outerSize = 1;
+    for (int i = 0; i < prm.axis; i++)
+        outerSize *= src.dims()[i];
 
-    float *dstData2 = dst2.data();
-    int dstSize2 = dst2.size() / MB;
+    for (size_t osIdx = 0; osIdx < outerSize; osIdx++) {
+        for (size_t dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
+            float* dstData = dsts[dstIdx].data();
+            int innerSize = dsts[dstIdx].size() / outerSize;
 
-    for (int b = 0; b < MB; b++) {
-        for (size_t j = 0; j < dstSize1; j++, srcData++) {
-            dstData1[b*dstSize1 + j] = *srcData;
-        }
-
-        for (size_t j = 0; j < dstSize2; j++, srcData++) {
-            dstData2[b*dstSize1 + j] = *srcData;
+            for (size_t j = 0; j < innerSize; j++, srcData++) {
+                dstData[osIdx*innerSize + j] = *srcData;
+            }
         }
     }
 }
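
The rewritten reference generalizes the old two-output, batch-only copy: all dimensions before the split axis collapse into outerSize, each destination contributes size/outerSize contiguous elements per outer step, and the source pointer is consumed strictly in order. The same walk over plain vectors, as a self-contained sketch:

    // Sketch: generic split copy, mirroring ref_split above.
    #include <cstddef>
    #include <vector>
    void split_copy(const std::vector<float> &src,
                    std::vector<std::vector<float>> &dsts, size_t outerSize) {
        const float *s = src.data();
        for (size_t os = 0; os < outerSize; ++os) {
            for (auto &dst : dsts) {
                const size_t inner = dst.size() / outerSize;
                for (size_t j = 0; j < inner; ++j, ++s)
                    dst[os * inner + j] = *s;
            }
        }
    }

For example, splitting {2, 20, 2, 5} on axis 1 into {2, 13, 2, 5} and {2, 7, 2, 5} gives outerSize = 2 and inner sizes 130 and 70, so the copy order is 130 values to the first output, 70 to the second, then the same again for the second batch.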
 
 class MKLDNNGraphSplitTests: public TestsCommon,
                               public WithParamInterface<split_test_params> {
-    // TODO: remove power layers from the test
     std::string model_t = R"V0G0N(
-<net name="ConcatOnly" version="2" precision="FP32" batch="1">
+<net name="ConcatOnly" version="3" precision="FP32" batch="1">
     <layers>
         <layer name="in1" type="Input" precision="FP32" id="1">
             <output>
                 <port id="1">
                     <dim>_IN_</dim>
                     <dim>_IC_</dim>
+                    <dim>_ID_</dim>
                     <dim>_IH_</dim>
                     <dim>_IW_</dim>
                 </port>
             </output>
         </layer>
         <layer name="split" id="2" type="Split" precision="FP32">
-            <split_data axis="1" PrimitivesPriority="_IMPLS_"/>
+            <split_data axis="_AXIS_" PrimitivesPriority="_IMPLS_"/>
             <input>
                 <port id="1">
                     <dim>_IN_</dim>
                     <dim>_IC_</dim>
+                    <dim>_ID_</dim>
                     <dim>_IH_</dim>
                     <dim>_IW_</dim>
                 </port>
             </input>
             <output>
-                <port id="2">
-                    <dim>_ON1_</dim>
-                    <dim>_OC1_</dim>
-                    <dim>_OH1_</dim>
-                    <dim>_OW1_</dim>
-                </port>
-                <port id="3">
-                    <dim>_ON2_</dim>
-                    <dim>_OC2_</dim>
-                    <dim>_OH2_</dim>
-                    <dim>_OW2_</dim>
-                </port>
-            </output>
-        </layer>
-        <layer name="power1" id="3" type="Power" precision="FP32">
-            <power_data power="1" scale="1" shift="0"/>
-            <input>
-                <port id="1">
-                    <dim>_ON1_</dim>
-                    <dim>_OC1_</dim>
-                    <dim>_OH1_</dim>
-                    <dim>_OW1_</dim>
-                </port>
-            </input>
-            <output>
-                <port id="2">
-                    <dim>_ON1_</dim>
-                    <dim>_OC1_</dim>
-                    <dim>_OH1_</dim>
-                    <dim>_OW1_</dim>
-                </port>
-            </output>
-        </layer>
-        <layer name="power2" id="4" type="Power" precision="FP32">
-            <power_data power="1" scale="1" shift="0"/>
-            <input>
-                <port id="1">
-                    <dim>_ON2_</dim>
-                    <dim>_OC2_</dim>
-                    <dim>_OH2_</dim>
-                    <dim>_OW2_</dim>
-                </port>
-            </input>
-            <output>
-                <port id="2">
-                    <dim>_ON2_</dim>
-                    <dim>_OC2_</dim>
-                    <dim>_OH2_</dim>
-                    <dim>_OW2_</dim>
-                </port>
+                _OP_
             </output>
         </layer>
     </layers>
     <edges>
         <edge from-layer="1" from-port="1" to-layer="2" to-port="1"/>
-        <edge from-layer="2" from-port="2" to-layer="3" to-port="1"/>
-        <edge from-layer="2" from-port="3" to-layer="4" to-port="1"/>
     </edges>
 </net>
 )V0G0N";
 
+    std::string port_t = R"V0G0N(
+<port id="_ID_">
+    <dim>_N_</dim>
+    <dim>_C_</dim>
+    <dim>_D_</dim>
+    <dim>_H_</dim>
+    <dim>_W_</dim>
+</port>
+)V0G0N";
+
 protected:
     std::string getModel(split_test_params p) {
         std::string model = model_t;
-        REPLACE_WITH_NUM(model, "_IN_", p.in.n);
-        REPLACE_WITH_NUM(model, "_IC_", p.in.c);
-        REPLACE_WITH_NUM(model, "_IW_", p.in.w);
-        REPLACE_WITH_NUM(model, "_IH_", p.in.h);
-
-        REPLACE_WITH_NUM(model, "_ON1_", p.out1.n);
-        REPLACE_WITH_NUM(model, "_OC1_", p.out1.c);
-        REPLACE_WITH_NUM(model, "_OH1_", p.out1.h);
-        REPLACE_WITH_NUM(model, "_OW1_", p.out1.w);
-
-        REPLACE_WITH_NUM(model, "_ON2_", p.out2.n);
-        REPLACE_WITH_NUM(model, "_OC2_", p.out2.c);
-        REPLACE_WITH_NUM(model, "_OH2_", p.out2.h);
-        REPLACE_WITH_NUM(model, "_OW2_", p.out2.w);
+        auto dims_size = p.dims.size();
+
+        switch (dims_size) {
+            case 3:
+                REMOVE_LINE(model, "<dim>_IH_</dim>");
+            case 4:
+                REMOVE_LINE(model, "<dim>_ID_</dim>");
+        }
+        REPLACE_WITH_NUM(model, "_IN_", p.dims[0]);
+        REPLACE_WITH_NUM(model, "_IC_", p.dims[1]);
+        REPLACE_WITH_NUM(model, "_IW_", p.dims[dims_size - 1]);
+        switch (dims_size) {
+            case 5:
+                REPLACE_WITH_NUM(model, "_ID_", p.dims[dims_size - 3]);
+            case 4:
+                REPLACE_WITH_NUM(model, "_IH_", p.dims[dims_size - 2]);
+        }
+
+        std::string outPorts;
+        for (int idx = 0; idx < p.outs.size(); idx++) {
+            std::string outPort = port_t;
+            switch (dims_size) {
+                case 3:
+                    REMOVE_LINE(outPort, "<dim>_H_</dim>");
+                case 4:
+                    REMOVE_LINE(outPort, "<dim>_D_</dim>");
+            }
+            REPLACE_WITH_NUM(outPort, "_ID_", idx);
+            REPLACE_WITH_NUM(outPort, "_N_", p.outs[idx][0]);
+            REPLACE_WITH_NUM(outPort, "_C_", p.outs[idx][1]);
+            REPLACE_WITH_NUM(outPort, "_W_", p.outs[idx][dims_size - 1]);
+            switch (dims_size) {
+                case 5:
+                    REPLACE_WITH_NUM(outPort, "_D_", p.outs[idx][dims_size - 3]);
+                case 4:
+                    REPLACE_WITH_NUM(outPort, "_H_", p.outs[idx][dims_size - 2]);
+            }
+
+            outPorts += outPort;
+        }
+        REPLACE_WITH_STR(model, "_OP_", outPorts);
+
+        REPLACE_WITH_NUM(model, "_AXIS_", p.axis);
+
         std::string impls;
         for (const auto& preferType : p.preferTypes) {
             if (!impls.empty())
@@ -195,7 +166,7 @@ protected:
             std::string model = getModel(p);
 
             InferenceEngine::CNNNetReader net_reader;
-            ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
+            net_reader.ReadNetwork(model.data(), model.length());
 
             MKLDNNGraphTestClass graph;
             graph.CreateGraph(net_reader.getNetwork());
@@ -212,16 +183,25 @@ protected:
             }
             ASSERT_LE(3, nodes.size());
 
-            InferenceEngine::SizeVector dims_src = {p.in.n, p.in.c, p.in.h, p.in.w};
+            InferenceEngine::SizeVector dims_src = p.dims;
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src->allocate();
             fill_data(src->buffer(), src->size());
 
             InferenceEngine::BlobMap srcs;
             srcs.insert(std::pair<std::string, InferenceEngine::Blob::Ptr>("in1", src));
 
-            InferenceEngine::TBlob<float>* srcPtr = dynamic_cast<InferenceEngine::TBlob<float>*>(src.get());
+            auto srcPtr = dynamic_cast<InferenceEngine::TBlob<float>*>(src.get());
 
             if (srcPtr == nullptr)
                 FAIL() << "Cannot cast blob to TBlob<float>.";
@@ -229,33 +209,26 @@ protected:
             InferenceEngine::OutputsDataMap out;
             out = net_reader.getNetwork().getOutputsInfo();
             InferenceEngine::BlobMap outputBlobs;
-            auto it = out.begin();
-
-            std::pair<std::string, InferenceEngine::DataPtr> item = *it;
-
-            InferenceEngine::TBlob<float>::Ptr output1;
-            output1 = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
-            output1->allocate();
-            outputBlobs[item.first] = output1;
-
-            InferenceEngine::TBlob<float> dst_ref1(item.second->getTensorDesc());
-            dst_ref1.allocate();
-
-            item = *(++it);
-            InferenceEngine::TBlob<float>::Ptr output2;
-            output2 = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
-            output2->allocate();
-            outputBlobs[item.first] = output2;
-
-            InferenceEngine::TBlob<float> dst_ref2(item.second->getTensorDesc());
-            dst_ref2.allocate();
+            std::vector<InferenceEngine::TBlob<float>> dst_refs;
+            for (auto& item : out) {
+                InferenceEngine::TBlob<float>::Ptr output;
+                output = InferenceEngine::make_shared_blob<float>(item.second->getTensorDesc());
+                output->allocate();
+                outputBlobs[item.first] = output;
+
+                InferenceEngine::TBlob<float> dst_ref(item.second->getTensorDesc());
+                dst_ref.allocate();
+                dst_refs.push_back(dst_ref);
+            }
 
             graph.Infer(srcs, outputBlobs);
 
-            ref_split(*srcPtr, dst_ref1, dst_ref2);
+            ref_split(*srcPtr, dst_refs, p);
 
-            compare(*output1, dst_ref1);
-            compare(*output2, dst_ref2);
+            int ref_idx = 0;
+            for (auto& output : outputBlobs) {
+                compare(*output.second, dst_refs[ref_idx++], 0.0005f);
+            }
         } catch (const InferenceEngine::details::InferenceEngineException &e) {
             FAIL() << e.what();
         }
@@ -269,9 +242,8 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::Values(
                 split_test_params {
                         {1, 24, 2, 5},
-                        {1, 16, 2, 5},
-                        {1, 8, 2, 5},
-                        3, MKLDNNPlugin::impl_desc_type::unknown, {}, {
+                        {{1, 16, 2, 5}, {1, 8, 2, 5}},
+                        1, 3, MKLDNNPlugin::impl_desc_type::unknown, {}, {
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
                                     ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -300,9 +272,8 @@ INSTANTIATE_TEST_CASE_P(
                 },
                 split_test_params {
                         {1, 20, 2, 5},
-                        {1, 13, 2, 5},
-                        {1, 7, 2, 5},
-                        2, MKLDNNPlugin::impl_desc_type::unknown, {}, {
+                        {{1, 13, 2, 5}, {1, 7, 2, 5}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::unknown, {}, {
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
                                     ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -323,9 +294,8 @@ INSTANTIATE_TEST_CASE_P(
                 },
                 split_test_params {
                         {1, 20, 2, 5},
-                        {1, 10, 2, 5},
-                        {1, 10, 2, 5},
-                        2, MKLDNNPlugin::impl_desc_type::unknown, {}, {
+                        {{1, 10, 2, 5}, {1, 10, 2, 5}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::unknown, {}, {
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
                                     ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -346,9 +316,8 @@ INSTANTIATE_TEST_CASE_P(
                 },
                 split_test_params {
                         {2, 20, 2, 5},
-                        {2, 10, 2, 5},
-                        {2, 10, 2, 5},
-                        2, MKLDNNPlugin::impl_desc_type::unknown, {}, {
+                        {{2, 10, 2, 5}, {2, 10, 2, 5}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::unknown, {}, {
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
                                     ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -369,27 +338,76 @@ INSTANTIATE_TEST_CASE_P(
                 },
                 split_test_params {
                         {1, 24, 2, 5},
-                        {1, 16, 2, 5},
-                        {1, 8, 2, 5},
-                        3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                        {{1, 16, 2, 5}, {1, 8, 2, 5}},
+                        1, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
                 },
                 split_test_params {
                         {1, 20, 2, 5},
-                        {1, 13, 2, 5},
-                        {1, 7, 2, 5},
-                        2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                        {{1, 13, 2, 5}, {1, 7, 2, 5}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
                 },
                 split_test_params {
                         {1, 20, 2, 5},
-                        {1, 10, 2, 5},
-                        {1, 10, 2, 5},
-                        2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                        {{1, 10, 2, 5}, {1, 10, 2, 5}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
                 },
                 split_test_params {
                         {2, 20, 2, 5},
-                        {2, 10, 2, 5},
-                        {2, 10, 2, 5},
-                        2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}}));
+                        {{2, 10, 2, 5}, {2, 10, 2, 5}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {2, 20, 2, 5},
+                        {{2, 15, 2, 5}, {2,  5, 2, 5}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {9, 11, 7, 5},
+                        {{3, 11, 7, 5}, {6, 11, 7, 5}},
+                        0, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {3, 11, 7, 5},
+                        {{3, 11, 4, 5}, {3, 11, 3, 5}},
+                        2, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {3, 11, 7, 5},
+                        {{3, 11, 7, 1}, {3, 11, 7, 4}},
+                        3, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {5, 6, 7, 15},
+                        {{1, 6, 7, 15}, {2, 6, 7, 15}, {1, 6, 7, 15}, {1, 6, 7, 15}},
+                        0, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {5, 6, 7, 15},
+                        {{5, 1, 7, 15}, {5, 2, 7, 15}, {5, 1, 7, 15}, {5, 2, 7, 15}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {5, 6, 7, 15},
+                        {{5, 6, 3, 15}, {5, 6, 1, 15}, {5, 6, 2, 15}, {5, 6, 1, 15}},
+                        2, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {5, 6, 7, 15},
+                        {{5, 6, 7, 5}, {5, 6, 7, 3}, {5, 6, 7, 4}, {5, 6, 7, 3}},
+                        3, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {5, 6, 7, 15},
+                        {{5, 6, 7, 15}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}},
+                split_test_params {
+                        {1, 32, 16, 16, 16},
+                        {{1, 8, 16, 16, 16}, {1, 8, 16, 16, 16}, {1, 8, 16, 16, 16}, {1, 8, 16, 16, 16}},
+                        1, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}},
+                split_test_params {
+                        {1, 32, 16, 16, 16},
+                        {{1, 8, 16, 16, 16}, {1, 8, 16, 16, 16}, {1, 8, 16, 16, 16}, {1, 8, 16, 16, 16}},
+                        1, 3, MKLDNNPlugin::impl_desc_type::unknown, {}}));
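
The expanded parameter list covers splits along every axis, including the batch axis and 5D inputs, plus the degenerate single-output case. Each case's outputs must tile the input along the chosen axis; a quick shape check under that assumption:

    // Sketch: outputs must match the input on every dim except `axis`,
    // and their extents along `axis` must sum to the input's.
    #include <cstddef>
    #include <vector>
    bool split_shapes_ok(const std::vector<size_t> &in,
                         const std::vector<std::vector<size_t>> &outs, size_t axis) {
        size_t sum = 0;
        for (const auto &o : outs) {
            if (o.size() != in.size()) return false;
            for (size_t i = 0; i < in.size(); ++i)
                if (i != axis && o[i] != in[i]) return false;
            sum += o[axis];
        }
        return sum == in[axis];
    }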
 
 class MKLDNNGraphDynBatchSplitTests: public MKLDNNGraphSplitTests {
 protected:
@@ -397,7 +415,7 @@ protected:
         try {
             split_test_params p = ::testing::WithParamInterface<split_test_params>::GetParam();
             std::string model = getModel(p);
-            size_t MB = p.in.n;
+            size_t MB = p.dims[0];
             if (MB < 2)
                 MB = 2;
 
@@ -414,9 +432,18 @@ protected:
             graph.setProperty({{InferenceEngine::PluginConfigParams::KEY_DYN_BATCH_ENABLED, InferenceEngine::PluginConfigParams::YES}});
             graph.CreateGraph(net_reader.getNetwork());
 
-            InferenceEngine::SizeVector dims_src = {MB, p.in.c, p.in.h, p.in.w};
+            InferenceEngine::SizeVector dims_src = p.dims;
+            InferenceEngine::Layout layout = InferenceEngine::ANY;
+            switch (p.dims.size()) {
+                case 4:
+                    layout = InferenceEngine::NCHW;
+                    break;
+                case 5:
+                    layout = InferenceEngine::NCDHW;
+                    break;
+            }
 
-            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, InferenceEngine::NCHW, dims_src);
+            InferenceEngine::Blob::Ptr src = InferenceEngine::make_shared_blob<float, const InferenceEngine::SizeVector>(InferenceEngine::Precision::FP32, layout, dims_src);
             src->allocate();
             fill_data(src->buffer(), src->size());
 
@@ -465,9 +492,8 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::Values(
                 split_test_params {
                         {1, 24, 2, 5},
-                        {1, 16, 2, 5},
-                        {1, 8, 2, 5},
-                        3, MKLDNNPlugin::impl_desc_type::unknown, {}, {
+                        {{1, 16, 2, 5}, {1, 8, 2, 5}},
+                        1, 3, MKLDNNPlugin::impl_desc_type::unknown, {}, {
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
                                     ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -496,9 +522,8 @@ INSTANTIATE_TEST_CASE_P(
                 },
                 split_test_params {
                         {1, 20, 2, 5},
-                        {1, 13, 2, 5},
-                        {1, 7, 2, 5},
-                        2, MKLDNNPlugin::impl_desc_type::unknown, {}, {
+                        {{1, 13, 2, 5}, {1, 7, 2, 5}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::unknown, {}, {
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
                                     ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -519,9 +544,8 @@ INSTANTIATE_TEST_CASE_P(
                 },
                 split_test_params {
                         {1, 20, 2, 5},
-                        {1, 10, 2, 5},
-                        {1, 10, 2, 5},
-                        2, MKLDNNPlugin::impl_desc_type::unknown, {}, {
+                        {{1, 10, 2, 5}, {1, 10, 2, 5}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::unknown, {}, {
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
                                     ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -542,9 +566,8 @@ INSTANTIATE_TEST_CASE_P(
                 },
                 split_test_params {
                         {2, 20, 2, 5},
-                        {2, 10, 2, 5},
-                        {2, 10, 2, 5},
-                        2, MKLDNNPlugin::impl_desc_type::unknown, {}, {
+                        {{2, 10, 2, 5}, {2, 10, 2, 5}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::unknown, {}, {
                                 [](MKLDNNPlugin::PrimitiveDescInfo impl) {
                                     ASSERT_EQ(MKLDNNPlugin::impl_desc_type::ref, impl.getImplementationType());
                                     ASSERT_EQ(1, impl.getConfig().inConfs.size());
@@ -564,25 +587,51 @@ INSTANTIATE_TEST_CASE_P(
                         }
                 },
                 split_test_params {
-                        {1, 24, 2, 5},
-                        {1, 16, 2, 5},
-                        {1, 8, 2, 5},
-                        3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                        {2, 24, 2, 5},
+                        {{2, 16, 2, 5}, {2, 8, 2, 5}},
+                        1, 3, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
                 },
                 split_test_params {
                         {1, 20, 2, 5},
-                        {1, 13, 2, 5},
-                        {1, 7, 2, 5},
-                        2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                        {{1, 13, 2, 5}, {1, 7, 2, 5}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
                 },
                 split_test_params {
                         {1, 20, 2, 5},
-                        {1, 10, 2, 5},
-                        {1, 10, 2, 5},
-                        2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                        {{1, 10, 2, 5}, {1, 10, 2, 5}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
                 },
                 split_test_params {
                         {2, 20, 2, 5},
-                        {2, 10, 2, 5},
-                        {2, 10, 2, 5},
-                        2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}}));
+                        {{2, 10, 2, 5}, {2, 10, 2, 5}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {2, 20, 2, 5},
+                        {{2, 15, 2, 5}, {2,  5, 2, 5}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {3, 11, 7, 5},
+                        {{3, 11, 4, 5}, {3, 11, 3, 5}},
+                        2, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {3, 11, 7, 5},
+                        {{3, 11, 7, 1}, {3, 11, 7, 4}},
+                        3, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {5, 6, 7, 15},
+                        {{5, 1, 7, 15}, {5, 2, 7, 15}, {5, 1, 7, 15}, {5, 2, 7, 15}},
+                        1, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {5, 6, 7, 15},
+                        {{5, 6, 3, 15}, {5, 6, 1, 15}, {5, 6, 2, 15}, {5, 6, 1, 15}},
+                        2, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}
+                },
+                split_test_params {
+                        {5, 6, 7, 15},
+                        {{5, 6, 7, 5}, {5, 6, 7, 3}, {5, 6, 7, 4}, {5, 6, 7, 3}},
+                        3, 2, MKLDNNPlugin::impl_desc_type::ref, {MKLDNNPlugin::impl_desc_type::ref}}));
index 1d3780f..52bcb45 100644
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -511,7 +510,6 @@ TEST_F(MKLDNNGraphStructureTests, TestNoRedundantReordersBeforeConcat) {
     compare(*output, *dstOut);
 
     // Compare for batch2
-    graph = {};
     net_reader.getNetwork().setBatchSize(2);
     graph.CreateGraph(net_reader.getNetwork());
     desc = InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, {2, 3, 7, 7}, InferenceEngine::NCHW);
@@ -812,7 +810,8 @@ TEST_F(MKLDNNGraphStructureTests, TestNoRedundantReordersBeforeDWConvolution) {
     compare(*output, *dstOut);
 }
 
-TEST_F(MKLDNNGraphStructureTests, TestNoRedundantReordersBeforeDWDeconvolution) {
+// TODO: change the hardcoded reference to a dynamically generated one
+TEST_F(MKLDNNGraphStructureTests, DISABLED_TestNoRedundantReordersBeforeDWDeconvolution) {
     std::string model = R"V0G0N(
 <net name="deconv" version="2" batch="1">
     <layers>
@@ -944,7 +943,7 @@ TEST_F(MKLDNNGraphStructureTests, TestNoRedundantReordersBeforeDWDeconvolution)
     outputBlobs["deconv2"] = output2;
 
     graph.Infer(srcs, outputBlobs);
-
     std::vector<float> refDst1 = {-0.042f, -0.563f, -0.150f, 0.396f, 0.224f, 0.229f, -0.335f, -0.390f, -0.213f, 0.959f, 0.520f, -0.507f,
                                   -0.200f, -0.202f, 0.441f, 0.499f, 0.000f, 0.000f, 0.000f, 0.000f, 0.363f, 0.141f, -0.497f, -0.332f, -0.311f,
                                   0.423f, 0.693f, -0.012f, -0.328f, -0.106f, 0.518f, 0.353f, 0.000f, 0.000f, 0.000f, 0.000f, 0.050f, -0.352f,
@@ -1238,7 +1237,7 @@ TEST_F(MKLDNNGraphStructureTests, TestOutputAfterInplacePlusConcat) {
 }
 
 TEST_F(MKLDNNGraphStructureTests, TestResnetPart) {
-    std::string model = R"V0G0N(
+    std::string modelB = R"V0G0N(
 <net name="ResNet-152" version="2" batch="1">
     <layers>
         <layer name="input" type="Input" precision="FP32" id="0">
@@ -1531,7 +1530,8 @@ TEST_F(MKLDNNGraphStructureTests, TestResnetPart) {
             </output>
             <weights offset="401152" size="147456"/>
             <biases offset="548608" size="256"/>
-        </layer>
+        </layer> )V0G0N";
+    std::string modelE = R"V0G0N(
         <layer name="res2b_branch2b_relu" type="ReLU" precision="FP32" id="29">
             <input>
                 <port id="58">
@@ -1706,6 +1706,7 @@ TEST_F(MKLDNNGraphStructureTests, TestResnetPart) {
 </net>
 )V0G0N";
 
+    std::string model = modelB + modelE;
     InferenceEngine::CNNNetReader net_reader;
     ASSERT_NO_THROW(net_reader.ReadNetwork(model.data(), model.length()));
 
@@ -2268,7 +2269,6 @@ TEST_F(MKLDNNGraphStructureTests, TestResultsAfterGroupedConvWithStrides) {
     graph.Infer(srcs, outputBlobs);
 
     // Compare for batch2
-    graph = {};
     net_reader.getNetwork().setBatchSize(2);
     graph.CreateGraph(net_reader.getNetwork());
     desc = InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, {2, 24, 80, 80}, InferenceEngine::NCHW);
@@ -3277,7 +3277,6 @@ TEST_F(MKLDNNGraphStructureTests, TestFailedPartDPN92) {
     }
 
     // Compare for batch2
-    graph = {};
     net_reader.getNetwork().setBatchSize(2);
     graph.CreateGraph(net_reader.getNetwork());
     desc = InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, {2, 32, 14, 14}, InferenceEngine::NCHW);
@@ -4081,7 +4080,6 @@ TEST_F(MKLDNNGraphStructureTests, TestFailedPartPlateRecognitionBarrier0001) {
     }
 
     // Compare for batch2
-    graph = {};
     net_reader.getNetwork().setBatchSize(2);
     graph.CreateGraph(net_reader.getNetwork());
     desc = InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, {2, 128, 1, 88}, InferenceEngine::NCHW);
@@ -4334,10 +4332,10 @@ TEST_F(MKLDNNGraphStructureTests, TestFailedVNect0002) {
     auto& nodes = graph.getNodes();
     for (auto &node : nodes) {
         if ( node->getType() == MKLDNNPlugin::Output &&
-             (node->getName() == "out_slice_heatmaps.1" ||
+             (node->getName() == "out_slice_heatmaps.0" ||
+              node->getName() == "out_slice_heatmaps.1" ||
               node->getName() == "out_slice_heatmaps.2" ||
-              node->getName() == "out_slice_heatmaps.3" ||
-              node->getName() == "out_slice_heatmaps.4" ) ) {
+              node->getName() == "out_slice_heatmaps.3" ) ) {
             outputs_num++;
         }
     }
@@ -4812,9 +4810,9 @@ TEST_F(MKLDNNGraphStructureTests, TestConstantLayerAsOutput) {
 
     net_reader.SetWeights(weights_ptr);
 
-    std::shared_ptr<InferenceEngine::IExtension> cpuExt(new InferenceEngine::Extensions::Cpu::CpuExtensions());
+    InferenceEngine::Extension cpuExt(make_so_name("cpu_extension"));
     MKLDNNPlugin::MKLDNNExtensionManager::Ptr extMgr(new MKLDNNPlugin::MKLDNNExtensionManager());
-    extMgr->AddExtension(cpuExt);
+    extMgr->AddExtension(InferenceEngine::IExtensionPtr(&cpuExt, [](InferenceEngine::IExtension*){}));
 
     MKLDNNGraphTestClass graph;
     graph.CreateGraph(net_reader.getNetwork(), extMgr);
@@ -6081,8 +6079,8 @@ TEST_F(MKLDNNGraphStructureTests, TestCreateGraphWithSplit) {
     ASSERT_EQ(nodes[4].get()->getType(), MKLDNNPlugin::Type::Output);
 
     InferenceEngine::OutputsDataMap outputs = reader.getNetwork().getOutputsInfo();
-    const std::pair<std::string, InferenceEngine::DataPtr> splitOutputItem1 = std::make_pair("Split.1", outputs["Split.1"]);
-    const std::pair<std::string, InferenceEngine::DataPtr> splitOutputItem2 = std::make_pair("Split.2", outputs["Split.2"]);
+    const std::pair<std::string, InferenceEngine::DataPtr> splitOutputItem1 {"Split.0", outputs["Split.0"]};
+    const std::pair<std::string, InferenceEngine::DataPtr> splitOutputItem2 {"Split.1", outputs["Split.1"]};
 
     std::vector<float> splitExpectedOutputData1(batchSize);
     std::vector<float> splitExpectedOutputData2(batchSize);
@@ -6219,7 +6217,7 @@ TEST_F(MKLDNNGraphStructureTests, TestCreateGraphWithFakeOutput) {
 
         InferenceEngine::OutputsDataMap outputs = reader.getNetwork().getOutputsInfo();
         const std::pair<std::string, InferenceEngine::DataPtr> reshapeOutputItem = std::make_pair("Reshape", outputs["Reshape"]);
-        const std::string splitOutputName = std::string("Split.") + (splitFromPortNumber == 1 ? "2" : "1");
+        const std::string splitOutputName = std::string("Split.") + (splitFromPortNumber == 1 ? "1" : "0");
         const std::pair<std::string, InferenceEngine::DataPtr> splitOutputItem = std::make_pair(splitOutputName, outputs[splitOutputName]);
 
         std::vector<float> reshapeExpectedOutputData(batchSize);
@@ -6399,9 +6397,9 @@ TEST_F(MKLDNNGraphStructureTests, TestCreateGraphWithMultipleData) {
     ASSERT_EQ(nodes[3].get()->getType(), MKLDNNPlugin::Type::Reshape);
     ASSERT_EQ(nodes[4].get()->getType(), MKLDNNPlugin::Type::Output);
     ASSERT_EQ(nodes[5].get()->getType(), MKLDNNPlugin::Type::Reorder);
-    ASSERT_EQ(nodes[6].get()->getType(), MKLDNNPlugin::Type::Output);
-    ASSERT_EQ(nodes[7].get()->getType(), MKLDNNPlugin::Type::Reorder);
-    ASSERT_EQ(nodes[8].get()->getType(), MKLDNNPlugin::Type::Reshape);
+    ASSERT_EQ(nodes[6].get()->getType(), MKLDNNPlugin::Type::Reshape);
+    ASSERT_EQ(nodes[7].get()->getType(), MKLDNNPlugin::Type::Output);
+    ASSERT_EQ(nodes[8].get()->getType(), MKLDNNPlugin::Type::Reorder);
     ASSERT_EQ(nodes[9].get()->getType(), MKLDNNPlugin::Type::Output);
     ASSERT_EQ(nodes[10].get()->getType(), MKLDNNPlugin::Type::Reshape);
     ASSERT_EQ(nodes[11].get()->getType(), MKLDNNPlugin::Type::Output);
@@ -6411,7 +6409,7 @@ TEST_F(MKLDNNGraphStructureTests, TestCreateGraphWithMultipleData) {
         std::make_pair("reshape1", outputs.find("reshape1")->second),
         std::make_pair("reshape2", outputs.find("reshape2")->second),
         std::make_pair("reshape3", outputs.find("reshape3")->second),
-        std::make_pair("split.1", outputs.find("split.1")->second)
+        std::make_pair("split.0", outputs.find("split.0")->second)
     };
 
     std::vector<std::vector<float>> expectedOutputData = {
index 620be63..b0d7bfb 100644
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -101,26 +100,81 @@ public:
             // need to retain converted blobs until infer finish
             std::vector<InferenceEngine::Blob::Ptr> convertedInputs;
             for (auto input : inputs) {
-                InferenceEngine::TBlob<float> *in_f = nullptr;
                 switch (input.second->precision()) {
-                    case InferenceEngine::Precision::FP32:
+                    case InferenceEngine::Precision::FP32: {
+                        InferenceEngine::TBlob<float> *in_f = nullptr;
                         in_f = dynamic_cast<InferenceEngine::TBlob<float> *>(input.second.get());
-                        break;
-                    default:
-                        THROW_IE_EXCEPTION << "Unsupported input precision " << input.second->precision();
-                }
+                        if (in_f == nullptr) {
+                            FAIL() << "Input data precision not supported. Expected float.";
+                        }
 
-                switch (input.second->precision()) {
-                    case InferenceEngine::Precision::FP32: break;
-                    default: FAIL() << "Unsupported precision";
-                }
+                        if (in_f->readOnly() == nullptr) {
+                            THROW_IE_EXCEPTION << "Input data was not allocated.";
+                        }
+                    }
+                    break;
+                    case InferenceEngine::Precision::I32: {
+                        InferenceEngine::TBlob<int32_t> *in_f = nullptr;
+                        in_f = dynamic_cast<InferenceEngine::TBlob<int32_t> *>(input.second.get());
+                        if (in_f == nullptr) {
+                            FAIL() << "Input data precision not supported. Expected float.";
+                        }
 
-                if (in_f == nullptr) {
-                    FAIL() << "Input data precision not supported. Expected float.";
-                }
+                        if (in_f->readOnly() == nullptr) {
+                            THROW_IE_EXCEPTION << "Input data was not allocated.";
+                        }
+                    }
+                    break;
+                    case InferenceEngine::Precision::U16: {
+                        InferenceEngine::TBlob<uint16_t> *in_f = nullptr;
+                        in_f = dynamic_cast<InferenceEngine::TBlob<uint16_t> *>(input.second.get());
+                        if (in_f == nullptr) {
+                            FAIL() << "Input data precision not supported. Expected float.";
+                        }
+
+                        if (in_f->readOnly() == nullptr) {
+                            THROW_IE_EXCEPTION << "Input data was not allocated.";
+                        }
+                    }
+                    break;
+                    case InferenceEngine::Precision::I16: {
+                        InferenceEngine::TBlob<int16_t> *in_f = nullptr;
+                        in_f = dynamic_cast<InferenceEngine::TBlob<int16_t> *>(input.second.get());
+                        if (in_f == nullptr) {
+                            FAIL() << "Input data precision not supported. Expected float.";
+                        }
+
+                        if (in_f->readOnly() == nullptr) {
+                            THROW_IE_EXCEPTION << "Input data was not allocated.";
+                        }
+                    }
+                    break;
+                    case InferenceEngine::Precision::U8: {
+                        InferenceEngine::TBlob<uint8_t> *in_f = nullptr;
+                        in_f = dynamic_cast<InferenceEngine::TBlob<uint8_t> *>(input.second.get());
+                        if (in_f == nullptr) {
+                            FAIL() << "Input data precision not supported. Expected float.";
+                        }
 
-                if (in_f->readOnly() == nullptr) {
-                    THROW_IE_EXCEPTION << "Input data was not allocated.";
+                        if (in_f->readOnly() == nullptr) {
+                            THROW_IE_EXCEPTION << "Input data was not allocated.";
+                        }
+                    }
+                    break;
+                    case InferenceEngine::Precision::I8: {
+                        InferenceEngine::TBlob<int8_t> *in_f = nullptr;
+                        in_f = dynamic_cast<InferenceEngine::TBlob<int8_t> *>(input.second.get());
+                        if (in_f == nullptr) {
+                            FAIL() << "Input data precision not supported. Expected float.";
+                        }
+
+                        if (in_f->readOnly() == nullptr) {
+                            THROW_IE_EXCEPTION << "Input data was not allocated.";
+                        }
+                    }
+                    break;
+                    default:
+                        THROW_IE_EXCEPTION << "Unsupported input precision " << input.second->precision();
                 }
 
                 PushInputData(input.first, input.second, batch);
@@ -207,4 +261,4 @@ public:
             }
         }
     }
-};
\ No newline at end of file
+};
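
The six per-precision branches above differ only in the TBlob element type; assuming that is the only variation, a small template could express the check once (a sketch, not what this patch does):

    // Sketch: one templated validator instead of six copy-pasted cases.
    template <typename T>
    void check_input_blob(const InferenceEngine::Blob::Ptr &blob) {
        auto *typed = dynamic_cast<InferenceEngine::TBlob<T> *>(blob.get());
        if (typed == nullptr)
            FAIL() << "Input data precision not supported.";
        if (typed->readOnly() == nullptr)
            THROW_IE_EXCEPTION << "Input data was not allocated.";
    }
    // Usage: case InferenceEngine::Precision::I32: check_input_blob<int32_t>(input.second); break;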
index 631f68f..2971eb7 100644
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -304,57 +303,6 @@ TEST(CNNSpecificGraphCopyTests, copyPreprocess) {
     ASSERT_FLOAT_EQ(pp[2]->meanValue, 122);
 }
 
-
-TEST(CNNSpecificGraphCopyTests, copyV1IR) {
-    CNNNetReader netReader;
-    //define minimal network with Clamp layer
-    const std::string SINGLE_LAYER_MODEL = R"V0G0N(
-    <net name="SingleLayer" version="1" batch="1">
-        <input>
-            <dim>3</dim>
-            <dim>224</dim>
-            <dim>224</dim>
-        </input>
-        <layers>
-            <layer id="1" name="ClampLayer" precision="FP16" type="Clamp">
-                <data max="6" min="0"/>
-                <input>
-                        <port id="0">
-                                <dim>3</dim>
-                                <dim>224</dim>
-                                <dim>224</dim>
-                        </port>
-                </input>
-                <output>
-                        <port id="1">
-                                <dim>3</dim>
-                                <dim>224</dim>
-                                <dim>224</dim>
-                        </port>
-                </output>
-            </layer>
-        </layers>
-        <edges>
-        </edges>
-    </net>
-    )V0G0N";
-    ASSERT_NO_THROW(netReader.ReadNetwork(SINGLE_LAYER_MODEL.data(), SINGLE_LAYER_MODEL.length()));
-    ASSERT_TRUE(netReader.isParseSuccess());
-    auto network = netReader.getNetwork();
-
-    //copy the network
-    struct EmptyStruct {};
-    auto visitor = [&](CNNLayerPtr lp) { return injectData<EmptyStruct>(lp); };
-    auto copied_net_ptr = CNNNetCopy(network, visitor);
-    auto copied_net = CNNNetwork(copied_net_ptr.get());
-
-    //check that Clamp layer was properly copied
-    auto layer = std::dynamic_pointer_cast<ClampLayer>(copied_net.getLayerByName("ClampLayer"));
-    ASSERT_NE(layer, nullptr) << "Could not perform dynamic cast from base pointer to Clamp layer pointer. "
-                                 "Net copy could be incorrect.";
-}
-
-
 TEST(CNNSpecificGraphCopyTests, copyNetworkWithDeconvolution) {
     CNNNetReader netReader;
     //define minimal network with deconvolution layer
index 427098b..9de222c 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -47,23 +46,14 @@ TEST_F(BlobProxyTests, convertByteBlobToFloat) {
     }
 }
 
-TEST_F(BlobProxyTests, shouldThrowOnAllocate) {
-    SizeVector v = {1, 2, 3};
-    auto allocator = createMockAllocator();
-
-    TBlobProxy<float> proxy(Precision::FP32, C, TBlob<float>(Precision::FP32, CHW, v, dynamic_pointer_cast<IAllocator>(allocator)), 2, {2});
-
-    EXPECT_THROW(((Blob&)proxy).allocate(), InferenceEngineException);
-}
-
-TEST_F(BlobProxyTests, shouldThrowOnDeAllocate)
+TEST_F(BlobProxyTests, shouldNotDeAllocate)
 {
     SizeVector v = {1, 2, 3};
     auto allocator = createMockAllocator();
 
     TBlobProxy<float> proxy(Precision::FP32, C, TBlob<float>(Precision::FP32, CHW, v, dynamic_pointer_cast<IAllocator>(allocator)), 2, {2});
 
-    EXPECT_THROW(((Blob&)proxy).deallocate(), InferenceEngineException);
+    EXPECT_FALSE(((Blob&)proxy).deallocate());
 }
 
 
@@ -236,15 +226,6 @@ TEST_F(BlobProxyTests, canReturnConstantData) {
     ASSERT_NE(proxy.cbuffer().as<const void*>(), nullptr);
 }
 
-TEST_F(BlobProxyTests, noAllocDeallocLogic) {
-    TBlob<float>::Ptr b(new TBlob<float>(Precision::FP32, C));
-    b->set({ 1.0f, 2.0f, 3.0f });
-    TBlobProxy<uint8_t> proxy(Precision::U8, C, b, 0, { b->byteSize() });
-    ASSERT_ANY_THROW(((Blob*) &proxy)->allocate());
-    ASSERT_ANY_THROW(((Blob*) &proxy)->deallocate());
-}
-
-
 TEST_F(BlobProxyTests, canIterateOverData) {
     TBlob<uint8_t>::Ptr b(new TBlob<uint8_t >(Precision::FP32, C));
     b->set({ 1, 2, 3 });
index 59015c2..e104c4c 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -35,7 +34,6 @@ protected:
         return shared_ptr<MockAllocator>(new MockAllocator());
     }
 
-
 public:
 
 };
@@ -489,4 +487,4 @@ TEST_F(BlobTests, makeRoiBlobWrongSize) {
     // try to create ROI blob with wrong size
     ROI roi = {0, 1, 1, 4, 4};  // cropped picture with: id = 0, (x,y) = (1,1), sizeX (W) = 4, sizeY (H) = 4
     ASSERT_THROW(make_shared_blob(blob, roi), InferenceEngine::details::InferenceEngineException);
-}
\ No newline at end of file
+}
index 5ac02ae..594ee19 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -170,9 +169,9 @@ TEST_F(InferRequestThreadSafeDefaultTests, callbackTakesOKIfAsyncRequestWasOK) {
             testRequest), [](IInferRequest *p) { p->Release(); });
     testRequest->SetPointerToPublicInterface(asyncRequest);
 
-    testRequest->SetCompletionCallback({[](InferenceEngine::IInferRequest::Ptr request, StatusCode status) {
+    testRequest->SetCompletionCallback([](InferenceEngine::IInferRequest::Ptr request, StatusCode status) {
         ASSERT_EQ((int) StatusCode::OK, status);
-    }});
+    });
     EXPECT_CALL(*mockInferRequestInternal.get(), InferImpl()).Times(1);
 
     testRequest->StartAsync();
index 263859b..8839861 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -93,3 +92,21 @@ TEST_F(DataTests, canSetNotEmptyDimsForBlockingDescBlocked) {
 TEST_F(DataTests, canSetNotEmptyDimsForBlockingDescNCHW) {
     ASSERT_NO_THROW(BlockingDesc(notEmptyDims, NCHW));
 }
+
+TEST_F(DataTests, setPrecision) {
+    Data data(data_name, emptyDims, Precision::FP32, Layout::NCHW);
+
+    EXPECT_EQ(Precision::FP32, data.precision);
+    EXPECT_EQ(Precision::FP32, data.getPrecision());
+    EXPECT_EQ(Precision::FP32, data.getTensorDesc().getPrecision());
+
+    data.setPrecision(Precision::FP16);
+    EXPECT_EQ(Precision::FP16, data.precision);
+    EXPECT_EQ(Precision::FP16, data.getPrecision());
+    EXPECT_EQ(Precision::FP16, data.getTensorDesc().getPrecision());
+
+    data.precision = Precision::Q78;
+    EXPECT_EQ(Precision::Q78, data.precision);
+    EXPECT_EQ(Precision::Q78, data.getPrecision());
+    EXPECT_EQ(Precision::Q78, data.getTensorDesc().getPrecision());
+}
index 9a39c1e..6d18b64 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -353,33 +352,56 @@ TEST_F(LayersTests, poolIRv2BackwardCompatibility) {
     pool._stride[Y_AXIS] = 3u;
     ASSERT_EQ(pool._stride_y, 3u);
 }
+
 TEST_F(LayersTests, canGetPadBeginForConvolution) {
     ConvolutionLayer layer(getDefaultParamsForLayer());
+    PropertyVector<unsigned> ref{{1, 2}};
+    layer._padding = ref;
+
+    auto allPads = getPaddings(layer);
+
+    ASSERT_EQ(allPads.begin, ref);
+}
+
+TEST_F(LayersTests, canGetPadEndForConvolution) {
+    ConvolutionLayer layer(getDefaultParamsForLayer());
+    PropertyVector<unsigned> ref{{1, 2}};
+    layer._pads_end = ref;
+
+    auto allPads = getPaddings(layer);
+
+    ASSERT_EQ(allPads.end, ref);
+}
+
+TEST_F(LayersTests, canGetPad3DBeginForConvolution) {
+    ConvolutionLayer layer(getDefaultParamsForLayer());
     PropertyVector<unsigned> ref;
     ref.insert(X_AXIS, 1);
     ref.insert(Y_AXIS, 2);
+    ref.insert(Z_AXIS, 3);
     layer._padding = ref;
 
-    auto allPads = getConvPaddings(layer);
+    auto allPads = getPaddings(layer);
 
     ASSERT_EQ(allPads.begin, ref);
 }
 
-TEST_F(LayersTests, canGetPadEndForConvolution) {
+TEST_F(LayersTests, canGetPad3DEndForConvolution) {
     ConvolutionLayer layer(getDefaultParamsForLayer());
     PropertyVector<unsigned> ref;
     ref.insert(X_AXIS, 1);
     ref.insert(Y_AXIS, 2);
+    ref.insert(Z_AXIS, 3);
     layer._pads_end = ref;
 
-    auto allPads = getConvPaddings(layer);
+    auto allPads = getPaddings(layer);
 
     ASSERT_EQ(allPads.end, ref);
 }
 
 TEST_F(LayersTests, returnDefaultPadForEmptyConvolution) {
     ConvolutionLayer layer(getDefaultParamsForLayer());
-    auto allPads = getConvPaddings(layer);
+    auto allPads = getPaddings(layer);
     PropertyVector<unsigned> ref_begin(2, 0u);
     PropertyVector<unsigned> ref_end;
     ASSERT_EQ(allPads.begin, ref_begin);
@@ -389,16 +411,21 @@ TEST_F(LayersTests, returnDefaultPadForEmptyConvolution) {
 TEST_F(LayersTests, returnEmptyPadForValidPadConvolution) {
     ConvolutionLayer layer(getDefaultParamsForLayer());
     layer.params["auto_pad"] = "valid";
-    auto allPads = getConvPaddings(layer);
-    PropertyVector<unsigned> ref(2,0);
+    auto allPads = getPaddings(layer);
+    PropertyVector<unsigned> ref(2,0u);
     ASSERT_EQ(allPads.begin, ref);
     ASSERT_EQ(allPads.end, ref);
+
+    PropertyVector<unsigned> ref3D(2,0u);
+    layer._kernel.insert(Z_AXIS, 0u);
+    ASSERT_EQ(allPads.begin, ref3D);
+    ASSERT_EQ(allPads.end, ref3D);
 }
 
 TEST_F(LayersTests, throwOnSamePadForEmptyConvolution) {
     ConvolutionLayer layer(getDefaultParamsForLayer());
     layer.params["auto_pad"] = "same_upper";
-    ASSERT_THROW(getConvPaddings(layer), details::InferenceEngineException);
+    ASSERT_THROW(getPaddings(layer), details::InferenceEngineException);
 }
 
 TEST_F(LayersTests, throwOnInvalidDimsSamePadForConvolution) {
@@ -406,7 +433,7 @@ TEST_F(LayersTests, throwOnInvalidDimsSamePadForConvolution) {
     layer.params["auto_pad"] = "same_upper";
     auto emptyData = std::make_shared<InferenceEngine::Data>("", Precision::UNSPECIFIED);
     layer.insData.push_back(emptyData);
-    ASSERT_THROW(getConvPaddings(layer), details::InferenceEngineException);
+    ASSERT_THROW(getPaddings(layer), details::InferenceEngineException);
 }
 
 TEST_F(LayersTests, throwOn2DSamePadForConvolution) {
@@ -415,7 +442,7 @@ TEST_F(LayersTests, throwOn2DSamePadForConvolution) {
     auto notEmptyData = std::make_shared<InferenceEngine::Data>("", SizeVector{1, 1}, Precision::UNSPECIFIED,
                                                                 Layout::NC);
     layer.insData.push_back(notEmptyData);
-    ASSERT_THROW(getConvPaddings(layer), details::InferenceEngineException);
+    ASSERT_THROW(getPaddings(layer), details::InferenceEngineException);
 }
 
 TEST_F(LayersTests, throwWithNotEnoughParamsSamePadForConvolution) {
@@ -423,7 +450,12 @@ TEST_F(LayersTests, throwWithNotEnoughParamsSamePadForConvolution) {
     layer.params["auto_pad"] = "same_upper";
     auto notEmptyData = std::make_shared<InferenceEngine::Data>("", SizeVector{1, 2, 3, 4}, Precision::UNSPECIFIED);
     layer.insData.push_back(notEmptyData);
-    ASSERT_NO_THROW(getConvPaddings(layer));
+    ASSERT_NO_THROW(getPaddings(layer));
+
+    auto notEmptyData3D = std::make_shared<InferenceEngine::Data>("", SizeVector{1, 2, 3, 4, 5}, Precision::UNSPECIFIED, Layout::NCDHW);
+    layer._kernel.insert(Z_AXIS, 0u);
+    layer.insData[0] = notEmptyData3D;
+    ASSERT_NO_THROW(getPaddings(layer));
 }
 
 // parameters are from real model, like Mobilenet-SSD
@@ -433,19 +465,39 @@ TEST_F(LayersTests, canGetSamePadForConvolutionEvenInput) {
     TensorDesc tensorDesc(Precision::UNSPECIFIED, SizeVector{1, 144, 160, 160}, Layout::NCHW);
     auto notEmptyData = std::make_shared<InferenceEngine::Data>("", tensorDesc);
     layer.insData.push_back(notEmptyData);
-    layer._dilation.insert(X_AXIS, 1);
-    layer._dilation.insert(Y_AXIS, 1);
-    layer._kernel.insert(X_AXIS, 3);
-    layer._kernel.insert(Y_AXIS, 3);
-    layer._stride.insert(X_AXIS, 2);
-    layer._stride.insert(Y_AXIS, 2);
+    layer._dilation = PropertyVector<unsigned>{{1, 1}};
+    layer._kernel = PropertyVector<unsigned>{{3, 3}};
+    layer._stride = PropertyVector<unsigned>{{2, 2}};
 
-    auto pad = getConvPaddings(layer);
+    auto pad = getPaddings(layer);
 
     ASSERT_EQ(pad.begin, PropertyVector<unsigned>(2, 0));
     ASSERT_EQ(pad.end, PropertyVector<unsigned>(2, 1));
 }
 
+// parameters are from real model, like V-Net
+TEST_F(LayersTests, canGetSamePadForConvolutionEvenInput3D) {
+    ConvolutionLayer layer(getDefaultParamsForLayer());
+    layer.params["auto_pad"] = "same_upper";
+    TensorDesc tensorDesc(Precision::UNSPECIFIED, SizeVector{1, 6, 190, 190, 20}, Layout::NCDHW);
+    auto notEmptyData = std::make_shared<InferenceEngine::Data>("", tensorDesc);
+    layer.insData.push_back(notEmptyData);
+    layer._dilation.insert(X_AXIS, 1u);
+    layer._dilation.insert(Y_AXIS, 1u);
+    layer._dilation.insert(Z_AXIS, 1u);
+    layer._kernel.insert(X_AXIS, 5u);
+    layer._kernel.insert(Y_AXIS, 5u);
+    layer._kernel.insert(Z_AXIS, 5u);
+    layer._stride.insert(X_AXIS, 1u);
+    layer._stride.insert(Y_AXIS, 1u);
+    layer._stride.insert(Z_AXIS, 1u);
+
+    auto pad = getPaddings(layer);
+
+    ASSERT_EQ(pad.begin, PropertyVector<unsigned>(3, 2u));
+    ASSERT_EQ(pad.end, PropertyVector<unsigned>(3, 2u));
+}
+
 // parameters are from real model, like Mobilenet-SSD
 TEST_F(LayersTests, canGetSamePadForConvolutionOddInput) {
     ConvolutionLayer layer(getDefaultParamsForLayer());
@@ -453,16 +505,83 @@ TEST_F(LayersTests, canGetSamePadForConvolutionOddInput) {
     TensorDesc tensorDesc(Precision::UNSPECIFIED, SizeVector{1, 144, 75, 75}, Layout::NCHW);
     auto notEmptyData = std::make_shared<InferenceEngine::Data>("", tensorDesc);
     layer.insData.push_back(notEmptyData);
-    layer._dilation.insert(X_AXIS, 1);
-    layer._dilation.insert(Y_AXIS, 1);
-    layer._kernel.insert(X_AXIS, 3);
-    layer._kernel.insert(Y_AXIS, 3);
-    layer._stride.insert(X_AXIS, 2);
-    layer._stride.insert(Y_AXIS, 2);
+    layer._dilation = PropertyVector<unsigned>{{1, 1}};
+    layer._kernel = PropertyVector<unsigned>{{3, 3}};
+    layer._stride = PropertyVector<unsigned>{{2, 2}};
+    PropertyVector<unsigned> ref(2, 1);
+
+    auto pad = getPaddings(layer);
+
+    ASSERT_EQ(pad.begin, ref);
+    ASSERT_EQ(pad.end, ref);
+}
+
+TEST_F(LayersTests, canGetSamePadForDeConvolutionEvenInput) {
+    DeconvolutionLayer layer(getDefaultParamsForLayer());
+    layer.params["auto_pad"] = "same_upper";
+    TensorDesc tensorDesc(Precision::UNSPECIFIED, SizeVector{1, 144, 160, 160}, Layout::NCHW);
+    auto notEmptyData = std::make_shared<InferenceEngine::Data>("", tensorDesc);
+    layer.insData.push_back(notEmptyData);
+    layer._dilation = PropertyVector<unsigned>{{1, 1}};
+    layer._kernel = PropertyVector<unsigned>{{3, 3}};
+    layer._stride = PropertyVector<unsigned>{{2, 2}};
+
+    auto pad = getPaddings(layer);
+
+    ASSERT_EQ(pad.begin, PropertyVector<unsigned>(2, 0));
+    ASSERT_EQ(pad.end, PropertyVector<unsigned>(2, 1));
+}
+
+TEST_F(LayersTests, canGetSamePadForDeConvolutionOddInput) {
+    DeconvolutionLayer layer(getDefaultParamsForLayer());
+    layer.params["auto_pad"] = "same_upper";
+    TensorDesc tensorDesc(Precision::UNSPECIFIED, SizeVector{1, 144, 75, 75}, Layout::NCHW);
+    auto notEmptyData = std::make_shared<InferenceEngine::Data>("", tensorDesc);
+    layer.insData.push_back(notEmptyData);
+    layer._dilation = PropertyVector<unsigned>{{1, 1}};
+    layer._kernel = PropertyVector<unsigned>{{3, 3}};
+    layer._stride = PropertyVector<unsigned>{{2, 2}};
     PropertyVector<unsigned> ref(2, 1);
 
-    auto pad = getConvPaddings(layer);
+    auto pad = getPaddings(layer);
 
     ASSERT_EQ(pad.begin, ref);
     ASSERT_EQ(pad.end, ref);
 }
+
+TEST_F(LayersTests, canGetSamePadForPoolingEvenInput) {
+    PoolingLayer layer(getDefaultParamsForLayer());
+    layer.params["auto_pad"] = "same_upper";
+    TensorDesc tensorDesc(Precision::UNSPECIFIED, SizeVector{1, 144, 160, 160}, Layout::NCHW);
+    auto notEmptyData = std::make_shared<InferenceEngine::Data>("", tensorDesc);
+    layer.insData.push_back(notEmptyData);
+    layer._kernel = PropertyVector<unsigned>{{3, 3}};
+    layer._stride = PropertyVector<unsigned>{{2, 2}};
+
+    auto pad = getPaddings(layer);
+
+    ASSERT_EQ(pad.begin, PropertyVector<unsigned>(2, 0));
+    ASSERT_EQ(pad.end, PropertyVector<unsigned>(2, 1));
+}
+
+TEST_F(LayersTests, canGetSamePadForPoolingOddInput) {
+    PoolingLayer layer(getDefaultParamsForLayer());
+    layer.params["auto_pad"] = "same_upper";
+    TensorDesc tensorDesc(Precision::UNSPECIFIED, SizeVector{1, 144, 75, 75}, Layout::NCHW);
+    auto notEmptyData = std::make_shared<InferenceEngine::Data>("", tensorDesc);
+    layer.insData.push_back(notEmptyData);
+    layer._kernel = PropertyVector<unsigned>{{3, 3}};
+    layer._stride = PropertyVector<unsigned>{{2, 2}};
+    PropertyVector<unsigned> ref(2, 1);
+
+    auto pad = getPaddings(layer);
+
+    ASSERT_EQ(pad.begin, ref);
+    ASSERT_EQ(pad.end, ref);
+}
+
+
+TEST_F(LayersTests, cannotGetPadForUnsupportedLayer) {
+    FullyConnectedLayer layer(getDefaultParamsForLayer());
+    ASSERT_ANY_THROW(getPaddingsImpl(layer));
+}
\ No newline at end of file
index e60a805..b54aa38 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
 using namespace InferenceEngine;
 using namespace ::testing;
 
-class PluginDispatcherTests : public ::testing::Test {};
+class PluginDispatcherTests : public ::testing::Test {
+public:
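+    // IE_BUILD_POSTFIX is assumed to carry the per-configuration binary
+    // suffix (e.g. "d" for debug builds), so the plugin names requested in
+    // tests match the library files actually produced by the build.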
+    std::string nameExt(const std::string& name) { return name + IE_BUILD_POSTFIX; }
+};
 
 TEST_F(PluginDispatcherTests, canLoadMockPlugin) {
     PluginDispatcher dispatcher({ "", "./", "./lib" });
-    ASSERT_NO_THROW(dispatcher.getPluginByName("mock_engine"));
+    ASSERT_NO_THROW(dispatcher.getPluginByName(nameExt("mock_engine")));
 }
 
 TEST_F(PluginDispatcherTests, throwsOnUnknownPlugin) {
     PluginDispatcher dispatcher({ "./", "./lib" });
-    ASSERT_THROW(dispatcher.getPluginByName("unknown_plugin"), InferenceEngine::details::InferenceEngineException);
+    ASSERT_THROW(dispatcher.getPluginByName(nameExt("unknown_plugin")), InferenceEngine::details::InferenceEngineException);
 }
 
 TEST_F(PluginDispatcherTests, throwsOnDeviceWithoutPlugins) {
@@ -42,12 +44,12 @@ TEST_F(PluginDispatcherTests, triesToLoadEveryPluginSuitableForDevice) {
 
     ON_CALL(disp, getPluginByName(_)).WillByDefault(ThrowException());
 #ifdef ENABLE_MKL_DNN
-    EXPECT_CALL(disp, getPluginByName("MKLDNNPlugin")).Times(1);
+    EXPECT_CALL(disp, getPluginByName(nameExt("MKLDNNPlugin"))).Times(1);
 #endif
 #ifdef ENABLE_OPENVX_CVE
-    EXPECT_CALL(disp, getPluginByName("OpenVXPluginCVE")).Times(1);
+    EXPECT_CALL(disp, getPluginByName(nameExt("OpenVXPluginCVE"))).Times(1);
 #elif defined ENABLE_OPENVX
-    EXPECT_CALL(disp, getPluginByName("OpenVXPlugin")).Times(1);
+    EXPECT_CALL(disp, getPluginByName(nameExt("OpenVXPlugin"))).Times(1);
 #endif
     ASSERT_THROW(disp.getSuitablePlugin(TargetDevice::eCPU), InferenceEngine::details::InferenceEngineException);
 }
@@ -56,7 +58,7 @@ TEST_F(PluginDispatcherTests, triesToLoadEveryPluginSuitableForDevice) {
 TEST_F(PluginDispatcherTests, returnsIfLoadSuccessfull) {
     MockDispatcher disp({ "./", "./lib" });
     PluginDispatcher dispatcher({ "", "./", "./lib" });
-    auto ptr = dispatcher.getPluginByName("mock_engine");
+    auto ptr = dispatcher.getPluginByName(nameExt("mock_engine"));
 
     EXPECT_CALL(disp, getPluginByName(_)).WillOnce(Return(ptr));
     ASSERT_NO_THROW(disp.getSuitablePlugin(TargetDevice::eCPU));
index 04f30a8..16bd43b 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -46,7 +45,7 @@ TEST_F(TensorDescTests, CreateEmptyBlob) {
     ASSERT_EQ(blob->getTensorDesc().getLayout(), Layout::NCHW);
 }
 
-TEST_F(TensorDescTests, CreateBlockedBlob) {
+TEST_F(TensorDescTests, CreateBlockedBlobNCHW) {
     TensorDesc desc(Precision::FP32, {1, 4, 2, 1}, {{1, 2, 2, 1, 2}, {0, 1, 2, 3, 1}});
     float data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
     Blob::Ptr blockedBlob = make_shared_blob<float>(desc, data);
@@ -58,6 +57,18 @@ TEST_F(TensorDescTests, CreateBlockedBlob) {
     ASSERT_EQ(Layout::BLOCKED, blockedBlob->layout());
 }
 
+TEST_F(TensorDescTests, CreateBlockedBlobNCDHW) {
+    TensorDesc desc(Precision::FP32, {1, 4, 2, 2, 1}, {{1, 2, 2, 2, 1, 2}, {0, 1, 2, 3, 4, 1}});
+    float data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+    Blob::Ptr blockedBlob = make_shared_blob<float>(desc, data);
+    Blob::Ptr ncdhwBlob = make_shared_blob<float>({Precision::FP32, {1, 4, 2, 2, 1}, Layout::NCDHW}, data);
+    ASSERT_NE(blockedBlob->getTensorDesc().offset(6), ncdhwBlob->getTensorDesc().offset(6));
+    ASSERT_EQ(5, blockedBlob->getTensorDesc().offset(6));
+    ASSERT_EQ(6, ncdhwBlob->getTensorDesc().offset(6));
+    ASSERT_EQ(Layout::NCDHW, ncdhwBlob->layout());
+    ASSERT_EQ(Layout::BLOCKED, blockedBlob->layout());
+}
+
 TEST_F(TensorDescTests, CompareNHWCandNCHWLayouts) {
     TensorDesc descNCHW(Precision::FP32, {1, 3, 4, 2}, Layout::NCHW);
     TensorDesc descNHWC(Precision::FP32, {1, 3, 4, 2}, Layout::NHWC);
@@ -70,3 +81,16 @@ TEST_F(TensorDescTests, CompareNHWCandNCHWLayouts) {
     ASSERT_EQ(descNCHW.getBlockingDesc().getOrder(), nchw);
     ASSERT_EQ(descNHWC.getBlockingDesc().getOrder(), nhwc);
 }
+
+TEST_F(TensorDescTests, CompareNDHWCandNCDHWLayouts) {
+    TensorDesc descNCDHW(Precision::FP32, {1, 3, 4, 4, 2}, Layout::NCDHW);
+    TensorDesc descNDHWC(Precision::FP32, {1, 3, 4, 4, 2}, Layout::NDHWC);
+    SizeVector ncdhw = {0, 1, 2, 3, 4};
+    SizeVector ndhwc = {0, 2, 3, 4, 1};
+
+    ASSERT_NE(descNCDHW, descNDHWC);
+    ASSERT_NE(descNCDHW.getBlockingDesc(), descNDHWC.getBlockingDesc());
+    ASSERT_NE(descNCDHW.getBlockingDesc().getOrder(), descNDHWC.getBlockingDesc().getOrder());
+    ASSERT_EQ(descNCDHW.getBlockingDesc().getOrder(), ncdhw);
+    ASSERT_EQ(descNDHWC.getBlockingDesc().getOrder(), ndhwc);
+}
index 9588151..ad53afb 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 7867075..43337fb 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -41,6 +40,7 @@ class MockICNNNetwork : public InferenceEngine::ICNNNetwork {
     MOCK_QUALIFIED_METHOD1(getInputShapes, const noexcept, void (InferenceEngine::ICNNNetwork::InputShapes&));
     MOCK_QUALIFIED_METHOD2(reshape, noexcept, InferenceEngine::StatusCode (const InferenceEngine::ICNNNetwork::InputShapes &, InferenceEngine::ResponseDesc *));
     MOCK_QUALIFIED_METHOD2(AddExtension, noexcept, InferenceEngine::StatusCode (const InferenceEngine::IShapeInferExtensionPtr &, InferenceEngine::ResponseDesc *));
+    MOCK_QUALIFIED_METHOD3(serialize, const noexcept, InferenceEngine::StatusCode (const std::string &, const std::string &, InferenceEngine::ResponseDesc*));
 };
 
 /**
index fff3ded..12b7c2f 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -20,7 +19,5 @@ struct MockIFormatParser : public InferenceEngine::details::IFormatParser {
     MOCK_METHOD1(Parse, InferenceEngine::details::CNNNetworkImplPtr(pugi::xml_node &));
 
     MOCK_METHOD1(SetWeights, void(const InferenceEngine::TBlob<uint8_t>::Ptr &));
-
-    MOCK_METHOD2(CopyBlobsByName, void(void*, std::string));
 };
 
index 4016d1a..bc71bae 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -42,4 +41,5 @@ public:
     MOCK_QUALIFIED_METHOD1(getInputShapes, const noexcept, void (ICNNNetwork::InputShapes &));
     MOCK_QUALIFIED_METHOD2(reshape, noexcept, StatusCode (const ICNNNetwork::InputShapes &, ResponseDesc *));
     MOCK_QUALIFIED_METHOD2(AddExtension, noexcept, StatusCode (const IShapeInferExtensionPtr &, ResponseDesc *));
+    MOCK_QUALIFIED_METHOD3(serialize, const noexcept, StatusCode (const std::string &, const std::string &, InferenceEngine::ResponseDesc*));
 };
diff --git a/inference-engine/tests/unit/opencv_test_gapi/CMakeLists.txt b/inference-engine/tests/unit/opencv_test_gapi/CMakeLists.txt
new file mode 100644 (file)
index 0000000..73d3af5
--- /dev/null
@@ -0,0 +1,34 @@
+# Copyright (C) 2018 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+if(NOT ENABLE_GAPI_TESTS)
+    message(WARNING "Skipping GAPI unit tests")
+    return()
+endif()
+
+find_package(OpenCV COMPONENTS gapi)
+if(NOT(OpenCV_FOUND))
+    message(WARNING "No suitable OpenCV version detected, opencv_test_gapi skipped")
+    return()
+endif()
+
+file(GLOB SOURCES *.cpp common/*.cpp cpu/*.cpp)
+file(GLOB HEADERS *.hpp common/*.hpp cpu/*.hpp)
+
+set(TARGET opencv_test_gapi)
+add_executable(${TARGET} ${SOURCES} ${HEADERS})
+
+target_include_directories(${TARGET}
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}"
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/common"
+  PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/cpu"
+  PRIVATE "${IE_MAIN_SOURCE_DIR}/thirdparty/fluid/modules/gapi/include/")
+
+target_link_libraries(${TARGET} ${OpenCV_LIBS} inference_engine gtest gtest_main)
+
+if(GAPI_TEST_PERF)
+  target_compile_definitions(${TARGET} PRIVATE -DPERF_TEST=1)
+else()
+  target_compile_definitions(${TARGET} PRIVATE -DPERF_TEST=0)
+endif()
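+
+# A minimal configure sketch (assuming the standard CMake workflow; the
+# ENABLE_GAPI_TESTS and GAPI_TEST_PERF options are the ones checked above):
+#   cmake -DENABLE_GAPI_TESTS=ON -DGAPI_TEST_PERF=ON <path-to-inference-engine>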
diff --git a/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests.hpp b/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests.hpp
new file mode 100644 (file)
index 0000000..7a251f9
--- /dev/null
@@ -0,0 +1,47 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifndef OPENCV_GAPI_CORE_TESTS_HPP
+#define OPENCV_GAPI_CORE_TESTS_HPP
+
+#include "gapi_tests_common.hpp"
+#include "ie_preprocess.hpp"
+
+#include <gtest/gtest.h>
+
+namespace opencv_test
+{
+
+struct ResizeTestGAPI: public testing::TestWithParam<std::tuple<int, int, std::pair<cv::Size, cv::Size>, double, cv::GCompileArgs>> {};
+
+struct Split2TestGAPI: public TestParams<std::tuple<int, cv::Size, cv::GCompileArgs>> {};
+struct Split3TestGAPI: public TestParams<std::tuple<int, cv::Size, cv::GCompileArgs>> {};
+struct Split4TestGAPI: public TestParams<std::tuple<int, cv::Size, cv::GCompileArgs>> {};
+
+struct Merge2TestGAPI: public TestParams<std::tuple<int, cv::Size, cv::GCompileArgs>> {};
+struct Merge3TestGAPI: public TestParams<std::tuple<int, cv::Size, cv::GCompileArgs>> {};
+struct Merge4TestGAPI: public TestParams<std::tuple<int, cv::Size, cv::GCompileArgs>> {};
+
+//------------------------------------------------------------------------------
+
+struct ResizeTestIE: public testing::TestWithParam<std::tuple<int, int, std::pair<cv::Size, cv::Size>, double>> {};
+
+struct SplitTestIE: public TestParams<std::tuple<int, cv::Size>> {};
+struct MergeTestIE: public TestParams<std::tuple<int, cv::Size>> {};
+
+//------------------------------------------------------------------------------
+
+using PreprocParams = std::tuple< InferenceEngine::Precision     // input-output data type
+                                , InferenceEngine::ResizeAlgorithm // resize algorithm, if needed
+                                , InferenceEngine::Layout        // input tensor layout
+                                , InferenceEngine::Layout        // output tensor layout
+                                , int                            // number of channels
+                                , std::pair<cv::Size, cv::Size>
+                                >;
+
+struct PreprocTest: public TestParams<PreprocParams> {};
+
+} // opencv_test
+
+#endif //OPENCV_GAPI_CORE_TESTS_HPP
diff --git a/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests_inl.hpp b/inference-engine/tests/unit/opencv_test_gapi/common/gapi_core_tests_inl.hpp
new file mode 100644 (file)
index 0000000..3daaba5
--- /dev/null
@@ -0,0 +1,876 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifndef OPENCV_GAPI_CORE_TESTS_INL_HPP
+#define OPENCV_GAPI_CORE_TESTS_INL_HPP
+
+#include "gapi_core_tests.hpp"
+
+#include "blob_factory.hpp"
+#include "blob_transform.hpp"
+#include "ie_preprocess.hpp"
+#include "ie_preprocess_data.hpp"
+#include "ie_preprocess_gapi_kernels.hpp"
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/gapi.hpp>
+
+#include <cstdarg>
+#include <cstdio>
+#include <ctime>
+
+#include <chrono>
+
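+// Extract the channel count from an OpenCV type enum (the counterpart of
+// CV_MAT_DEPTH, which pulls the element depth out of the same flags word).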
+#define CV_MAT_CHANNELS(flags) (((flags) >> CV_CN_SHIFT) + 1)
+
+// Can be set externally (via CMake) if built with -DGAPI_TEST_PERF=ON
+#ifndef PERF_TEST
+#define PERF_TEST 0 // 1=test performance, 0=don't
+#endif
+
+namespace opencv_test
+{
+
+#if PERF_TEST
+// performance test: iterate function, measure and print milliseconds per call
+template<typename F> static void test_ms(F func, int iter, const char format[], ...)
+{
+    using std::chrono::high_resolution_clock;
+
+    std::vector<high_resolution_clock::duration> samples; samples.reserve(iter);
+    if (0 == iter)
+        return;
+
+    for (int i=0; i < iter; i++)
+    {
+        auto start = high_resolution_clock::now();
+        func(); // iterate calls
+        samples.push_back(high_resolution_clock::now() - start);
+    }
+
+    std::sort(samples.begin(), samples.end());
+
+    auto median = samples[samples.size() / 2];
+
+    double median_ms = std::chrono::duration_cast<std::chrono::microseconds>(median).count() * 0.001; // convert to milliseconds
+
+    printf("Performance(ms): %lg ", median_ms);
+
+    va_list args;
+    va_start(args, format);
+    vprintf(format, args);
+    va_end(args);
+
+    printf("\n");
+}
+
+static cv::String interpToString(int interp)
+{
+    switch(interp)
+    {
+    case cv::INTER_AREA   : return "INTER_AREA";
+    case cv::INTER_LINEAR : return "INTER_LINEAR";
+    case cv::INTER_NEAREST: return "INTER_NEAREST";
+    }
+    CV_Assert(!"ERROR: unsupported interpolation!");
+    return nullptr;
+}
+
+static cv::String depthToString(int depth)
+{
+    switch(depth)
+    {
+    case CV_8U  : return "CV_8U";
+    case CV_32F : return "CV_32F";
+    }
+    CV_Assert(!"ERROR: unsupported depth!");
+    return nullptr;
+}
+
+static cv::String typeToString(int type)
+{
+    switch(type)
+    {
+    case CV_8UC1  : return "CV_8UC1";
+    case CV_8UC2  : return "CV_8UC2";
+    case CV_8UC3  : return "CV_8UC3";
+    case CV_8UC4  : return "CV_8UC4";
+    case CV_32FC1 : return "CV_32FC1";
+    case CV_32FC2 : return "CV_32FC2";
+    case CV_32FC3 : return "CV_32FC3";
+    case CV_32FC4 : return "CV_32FC4";
+    }
+    CV_Assert(!"ERROR: unsupported type!");
+    return nullptr;
+}
+#endif  // PERF_TEST
+
+TEST_P(ResizeTestGAPI, AccuracyTest)
+{
+    int type = 0, interp = 0;
+    cv::Size sz_in, sz_out;
+    double tolerance = 0.0;
+    cv::GCompileArgs compile_args;
+    std::pair<cv::Size, cv::Size> sizes;
+    std::tie(type, interp, sizes, tolerance, compile_args) = GetParam();
+    std::tie(sz_in, sz_out) = sizes;
+
+    cv::Mat in_mat1 (sz_in, type );
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::randn(in_mat1, mean, stddev);
+
+    cv::Mat out_mat(sz_out, type);
+    cv::Mat out_mat_ocv(sz_out, type);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in, out;
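+    // ScalePlane resizes one plane at a time, so a 3-channel image is split
+    // into planes, each plane is scaled independently, and the results are
+    // merged back together.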
+    switch (CV_MAT_CHANNELS(type))
+    {
+    case 1:
+        out = InferenceEngine::gapi::ScalePlane::on(in, type, sz_in, sz_out, interp);
+        break;
+    case 3:
+        {
+        int depth = CV_MAT_DEPTH(type);
+        int type1 = CV_MAKE_TYPE(depth, 1);
+        cv::GMat in0, in1, in2, out0, out1, out2;
+        std::tie(in0, in1, in2) = InferenceEngine::gapi::Split3::on(in);
+        out0 = InferenceEngine::gapi::ScalePlane::on(in0, type1, sz_in, sz_out, interp);
+        out1 = InferenceEngine::gapi::ScalePlane::on(in1, type1, sz_in, sz_out, interp);
+        out2 = InferenceEngine::gapi::ScalePlane::on(in2, type1, sz_in, sz_out, interp);
+        out = InferenceEngine::gapi::Merge3::on(out0, out1, out2);
+        }
+        break;
+    default: CV_Assert(!"ERROR: unsupported number of channels!");
+    }
+
+    cv::GComputation c(in, out);
+
+    // compile graph, and test once
+
+    auto own_in_mat1 = cv::to_own(in_mat1);
+    auto own_out_mat = cv::to_own(out_mat);
+
+    std::vector<cv::gapi::own::Mat> v_in  = { own_in_mat1 };
+    std::vector<cv::gapi::own::Mat> v_out = { own_out_mat };
+
+    c.apply(v_in, v_out, std::move(compile_args));
+
+#if PERF_TEST
+    // iterate testing, and print performance
+    test_ms([&](){ c.apply(v_in, v_out); },
+            100, "Resize GAPI %s %s %dx%d -> %dx%d",
+            interpToString(interp).c_str(), typeToString(type).c_str(),
+            sz_in.width, sz_in.height, sz_out.width, sz_out.height);
+#endif
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::resize(in_mat1, out_mat_ocv, sz_out, 0, 0, interp);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        cv::Mat absDiff;
+        cv::absdiff(out_mat, out_mat_ocv, absDiff);
+        EXPECT_EQ(0, cv::countNonZero(absDiff > tolerance));
+    }
+}
+
+TEST_P(Split2TestGAPI, AccuracyTest)
+{
+    int depth = std::get<0>(GetParam());
+    cv::Size sz_in = std::get<1>(GetParam());
+    auto compile_args = std::get<2>(GetParam());
+
+    int type1 = CV_MAKE_TYPE(depth, 1);
+    int type2 = CV_MAKE_TYPE(depth, 2);
+    initMatrixRandU(type2, sz_in, type1);
+
+    cv::Mat out_mat2 = cv::Mat(sz_in, type1);
+    cv::Mat out_mat_ocv2 = cv::Mat(sz_in, type1);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, out1, out2;
+    std::tie(out1, out2) = InferenceEngine::gapi::Split2::on(in1);
+    cv::GComputation c(cv::GIn(in1), cv::GOut(out1, out2));
+
+    // compile graph, and test once
+
+    auto own_in_mat1      = cv::to_own(in_mat1);
+    auto own_out_mat_gapi = cv::to_own(out_mat_gapi);
+    auto own_out_mat2     = cv::to_own(out_mat2);
+
+    std::vector<cv::gapi::own::Mat> v_in  = { own_in_mat1 };
+    std::vector<cv::gapi::own::Mat> v_out = { own_out_mat_gapi, own_out_mat2 };
+
+    c.apply(v_in, v_out, std::move(compile_args));
+
+#if PERF_TEST
+    // iterate testing, and print performance
+    test_ms([&](){ c.apply(v_in, v_out); },
+        400, "Split GAPI %s %dx%d", typeToString(type2).c_str(), sz_in.width, sz_in.height);
+#endif
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        std::vector<cv::Mat> out_mats_ocv = {out_mat_ocv, out_mat_ocv2};
+        cv::split(in_mat1, out_mats_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv  != out_mat_gapi));
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv2 != out_mat2));
+    }
+}
+
+TEST_P(Split3TestGAPI, AccuracyTest)
+{
+    int depth = std::get<0>(GetParam());
+    cv::Size sz_in = std::get<1>(GetParam());
+    auto compile_args = std::get<2>(GetParam());
+
+    int type1 = CV_MAKE_TYPE(depth, 1);
+    int type3 = CV_MAKE_TYPE(depth, 3);
+    initMatrixRandU(type3, sz_in, type1);
+
+    cv::Mat out_mat2 = cv::Mat(sz_in, type1);
+    cv::Mat out_mat3 = cv::Mat(sz_in, type1);
+    cv::Mat out_mat_ocv2 = cv::Mat(sz_in, type1);
+    cv::Mat out_mat_ocv3 = cv::Mat(sz_in, type1);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, out1, out2, out3;
+    std::tie(out1, out2, out3) = InferenceEngine::gapi::Split3::on(in1);
+    cv::GComputation c(cv::GIn(in1), cv::GOut(out1, out2, out3));
+
+    // compile graph, and test once
+
+    auto own_in_mat1      = cv::to_own(in_mat1);
+    auto own_out_mat_gapi = cv::to_own(out_mat_gapi);
+    auto own_out_mat2     = cv::to_own(out_mat2);
+    auto own_out_mat3     = cv::to_own(out_mat3);
+
+    std::vector<cv::gapi::own::Mat> v_in  = { own_in_mat1 };
+    std::vector<cv::gapi::own::Mat> v_out = { own_out_mat_gapi, own_out_mat2, own_out_mat3 };
+
+    c.apply(v_in, v_out, std::move(compile_args));
+
+#if PERF_TEST
+    // iterate testing, and print performance
+    test_ms([&](){ c.apply(v_in, v_out); },
+        400, "Split GAPI %s %dx%d", typeToString(type3).c_str(), sz_in.width, sz_in.height);
+#endif
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        std::vector<cv::Mat> out_mats_ocv = {out_mat_ocv, out_mat_ocv2, out_mat_ocv3};
+        cv::split(in_mat1, out_mats_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv  != out_mat_gapi));
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv2 != out_mat2));
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv3 != out_mat3));
+    }
+}
+
+TEST_P(Split4TestGAPI, AccuracyTest)
+{
+    int depth = std::get<0>(GetParam());
+    cv::Size sz_in = std::get<1>(GetParam());
+    auto compile_args = std::get<2>(GetParam());
+
+    int type1 = CV_MAKE_TYPE(depth, 1);
+    int type4 = CV_MAKE_TYPE(depth, 4);
+    initMatrixRandU(type4, sz_in, type1);
+
+    cv::Mat out_mat2 = cv::Mat(sz_in, type1);
+    cv::Mat out_mat3 = cv::Mat(sz_in, type1);
+    cv::Mat out_mat4 = cv::Mat(sz_in, type1);
+    cv::Mat out_mat_ocv2 = cv::Mat(sz_in, type1);
+    cv::Mat out_mat_ocv3 = cv::Mat(sz_in, type1);
+    cv::Mat out_mat_ocv4 = cv::Mat(sz_in, type1);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, out1, out2, out3, out4;
+    std::tie(out1, out2, out3, out4) = InferenceEngine::gapi::Split4::on(in1);
+    cv::GComputation c(cv::GIn(in1), cv::GOut(out1, out2, out3, out4));
+
+    // compile graph, and test once
+
+    auto own_in_mat1      = cv::to_own(in_mat1);
+    auto own_out_mat_gapi = cv::to_own(out_mat_gapi);
+    auto own_out_mat2     = cv::to_own(out_mat2);
+    auto own_out_mat3     = cv::to_own(out_mat3);
+    auto own_out_mat4     = cv::to_own(out_mat4);
+
+    std::vector<cv::gapi::own::Mat> v_in  = { own_in_mat1 };
+    std::vector<cv::gapi::own::Mat> v_out = { own_out_mat_gapi, own_out_mat2,
+                                                  own_out_mat3, own_out_mat4 };
+
+    c.apply(v_in, v_out, std::move(compile_args));
+
+#if PERF_TEST
+    // iterate testing, and print performance
+    test_ms([&](){ c.apply(v_in, v_out); },
+        400, "Split GAPI %s %dx%d", typeToString(type4).c_str(), sz_in.width, sz_in.height);
+#endif
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        std::vector<cv::Mat> out_mats_ocv = {out_mat_ocv, out_mat_ocv2, out_mat_ocv3, out_mat_ocv4};
+        cv::split(in_mat1, out_mats_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv  != out_mat_gapi));
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv2 != out_mat2));
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv3 != out_mat3));
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv4 != out_mat4));
+    }
+}
+
+TEST_P(Merge2TestGAPI, AccuracyTest)
+{
+    int depth = std::get<0>(GetParam());
+    cv::Size sz_in = std::get<1>(GetParam());
+    auto compile_args = std::get<2>(GetParam());
+
+    int type1 = CV_MAKE_TYPE(depth, 1);
+    int type2 = CV_MAKE_TYPE(depth, 2);
+    initMatsRandU(type1, sz_in, type2);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2;
+    auto out = InferenceEngine::gapi::Merge2::on(in1, in2);
+    cv::GComputation c(cv::GIn(in1, in2), cv::GOut(out));
+
+    // compile graph, and test once
+
+    auto own_in_mat1      = cv::to_own(in_mat1);
+    auto own_in_mat2      = cv::to_own(in_mat2);
+    auto own_out_mat_gapi = cv::to_own(out_mat_gapi);
+
+    std::vector<cv::gapi::own::Mat> v_in  = { own_in_mat1, own_in_mat2 };
+    std::vector<cv::gapi::own::Mat> v_out = { own_out_mat_gapi };
+
+    c.apply(v_in, v_out, std::move(compile_args));
+
+#if PERF_TEST
+    // iterate testing, and print performance
+    test_ms([&](){ c.apply(v_in, v_out); },
+        400, "Merge GAPI %s %dx%d", typeToString(type2).c_str(), sz_in.width, sz_in.height);
+#endif
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        std::vector<cv::Mat> in_mats_ocv = {in_mat1, in_mat2};
+        cv::merge(in_mats_ocv, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+    }
+}
+
+TEST_P(Merge3TestGAPI, AccuracyTest)
+{
+    int depth = std::get<0>(GetParam());
+    cv::Size sz_in = std::get<1>(GetParam());
+    auto compile_args = std::get<2>(GetParam());
+
+    int type1 = CV_MAKE_TYPE(depth, 1);
+    int type3 = CV_MAKE_TYPE(depth, 3);
+    initMatsRandU(type1, sz_in, type3);
+
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::Mat in_mat3(sz_in,  type1);
+    cv::randn(in_mat3, mean, stddev);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, in3;
+    auto out = InferenceEngine::gapi::Merge3::on(in1, in2, in3);
+    cv::GComputation c(cv::GIn(in1, in2, in3), cv::GOut(out));
+
+    // compile graph, and test once
+
+    auto own_in_mat1      = cv::to_own(in_mat1);
+    auto own_in_mat2      = cv::to_own(in_mat2);
+    auto own_in_mat3      = cv::to_own(in_mat3);
+    auto own_out_mat_gapi = cv::to_own(out_mat_gapi);
+
+    std::vector<cv::gapi::own::Mat> v_in  = { own_in_mat1, own_in_mat2, own_in_mat3 };
+    std::vector<cv::gapi::own::Mat> v_out = { own_out_mat_gapi };
+
+    c.apply(v_in, v_out, std::move(compile_args));
+
+#if PERF_TEST
+    // iterate testing, and print performance
+    test_ms([&](){ c.apply(v_in, v_out); },
+        400, "Merge GAPI %s %dx%d", typeToString(type3).c_str(), sz_in.width, sz_in.height);
+#endif
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        std::vector<cv::Mat> in_mats_ocv = {in_mat1, in_mat2, in_mat3};
+        cv::merge(in_mats_ocv, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+    }
+}
+
+TEST_P(Merge4TestGAPI, AccuracyTest)
+{
+    int depth = std::get<0>(GetParam());
+    cv::Size sz_in = std::get<1>(GetParam());
+    auto compile_args = std::get<2>(GetParam());
+
+    int type1 = CV_MAKE_TYPE(depth, 1);
+    int type4 = CV_MAKE_TYPE(depth, 4);
+    initMatsRandU(type1, sz_in, type4);
+
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::Mat in_mat3(sz_in,  type1);
+    cv::Mat in_mat4(sz_in,  type1);
+    cv::randn(in_mat3, mean, stddev);
+    cv::randn(in_mat4, mean, stddev);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, in3, in4;
+    auto out = InferenceEngine::gapi::Merge4::on(in1, in2, in3, in4);
+    cv::GComputation c(cv::GIn(in1, in2, in3, in4), cv::GOut(out));
+
+    // compile graph, and test once
+
+    auto own_in_mat1      = cv::to_own(in_mat1);
+    auto own_in_mat2      = cv::to_own(in_mat2);
+    auto own_in_mat3      = cv::to_own(in_mat3);
+    auto own_in_mat4      = cv::to_own(in_mat4);
+    auto own_out_mat_gapi = cv::to_own(out_mat_gapi);
+
+    std::vector<cv::gapi::own::Mat> v_in  = { own_in_mat1, own_in_mat2, own_in_mat3, own_in_mat4 };
+    std::vector<cv::gapi::own::Mat> v_out = { own_out_mat_gapi };
+
+    c.apply(v_in, v_out, std::move(compile_args));
+
+#if PERF_TEST
+    // iterate testing, and print performance
+    test_ms([&](){ c.apply(v_in, v_out); },
+        400, "Merge GAPI %s %dx%d", typeToString(type4).c_str(), sz_in.width, sz_in.height);
+#endif
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        std::vector<cv::Mat> in_mats_ocv = {in_mat1, in_mat2, in_mat3, in_mat4};
+        cv::merge(in_mats_ocv, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+    }
+}
+
+//----------------------------------------------------------------------
+
+TEST_P(ResizeTestIE, AccuracyTest)
+{
+    int type = 0, interp = 0;
+    cv::Size sz_in, sz_out;
+    double tolerance = 0.0;
+    std::pair<cv::Size, cv::Size> sizes;
+    std::tie(type, interp, sizes, tolerance) = GetParam();
+    std::tie(sz_in, sz_out) = sizes;
+
+    cv::Mat in_mat1(sz_in, type );
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::randn(in_mat1, mean, stddev);
+
+    cv::Mat out_mat(sz_out, type);
+    cv::Mat out_mat_ocv(sz_out, type);
+
+    // Inference Engine code ///////////////////////////////////////////////////
+
+    size_t channels = out_mat.channels();
+    CV_Assert(1 == channels || 3 == channels);
+
+    int depth = CV_MAT_DEPTH(type);
+    CV_Assert(CV_8U == depth || CV_32F == depth);
+
+    CV_Assert(cv::INTER_AREA == interp || cv::INTER_LINEAR == interp);
+
+    ASSERT_TRUE(in_mat1.isContinuous() && out_mat.isContinuous());
+
+    using namespace InferenceEngine;
+
+    size_t  in_height = in_mat1.rows,  in_width = in_mat1.cols;
+    size_t out_height = out_mat.rows, out_width = out_mat.cols;
+    InferenceEngine::SizeVector  in_sv = { 1, channels,  in_height,  in_width };
+    InferenceEngine::SizeVector out_sv = { 1, channels, out_height, out_width };
+
+    // HWC blob: channels are interleaved
+    Precision precision = CV_8U == depth ? Precision::U8 : Precision::FP32;
+    TensorDesc  in_desc(precision,  in_sv, Layout::NHWC);
+    TensorDesc out_desc(precision, out_sv, Layout::NHWC);
+
+    Blob::Ptr in_blob, out_blob;
+    in_blob  = make_blob_with_precision(in_desc , in_mat1.data);
+    out_blob = make_blob_with_precision(out_desc, out_mat.data);
+
+    PreProcessData preprocess;
+    preprocess.setRoiBlob(in_blob);
+
+    ResizeAlgorithm algorithm = cv::INTER_AREA == interp ? RESIZE_AREA : RESIZE_BILINEAR;
+
+    // test once to warm-up cache
+    preprocess.execute(out_blob, algorithm);
+
+#if PERF_TEST
+    // iterate testing, and print performance
+    test_ms([&](){ preprocess.execute(out_blob, algorithm); },
+            100, "Resize IE %s %s %dx%d -> %dx%d",
+            interpToString(interp).c_str(), typeToString(type).c_str(),
+            sz_in.width, sz_in.height, sz_out.width, sz_out.height);
+#endif
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::resize(in_mat1, out_mat_ocv, sz_out, 0, 0, interp);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        cv::Mat absDiff;
+        cv::absdiff(out_mat, out_mat_ocv, absDiff);
+        EXPECT_EQ(0, cv::countNonZero(absDiff > tolerance));
+    }
+}
+
+TEST_P(SplitTestIE, AccuracyTest)
+{
+    int type = std::get<0>(GetParam());
+    cv::Size size = std::get<1>(GetParam());
+
+    int depth = CV_MAT_DEPTH(type);
+    CV_Assert(CV_8U == depth || CV_32F == depth);
+
+    int type1 = CV_MAKE_TYPE(depth, 1);
+    int type4 = CV_MAKE_TYPE(depth, 4);
+
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::Mat in_mat(size, type);
+    cv::randn(in_mat, mean, stddev);
+
+    int channels = in_mat.channels();
+    CV_Assert(2 == channels || 3 == channels || 4 == channels);
+
+    size_t elemsize1 = in_mat.elemSize1();
+    size_t total     = in_mat.total();
+
+    cv::Mat out_mat(size, type4);
+    CV_Assert(in_mat.isContinuous() && out_mat.isContinuous());
+
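+    // View each output plane as its own Mat over the contiguous 4-channel
+    // buffer: plane i starts at byte offset i * total * elemsize1.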
+    cv::Mat out_mat0(size, type1, out_mat.data + 0*total*elemsize1);
+    cv::Mat out_mat1(size, type1, out_mat.data + 1*total*elemsize1);
+    cv::Mat out_mat2(size, type1, out_mat.data + 2*total*elemsize1);
+    cv::Mat out_mat3(size, type1, out_mat.data + 3*total*elemsize1);
+
+    cv::Mat out_mats[] = {out_mat0, out_mat1, out_mat2, out_mat3};
+
+    std::vector<cv::Mat> out_mats_ocv(channels);
+
+    // Inference Engine code ///////////////////////////////////////////////////
+
+    using namespace InferenceEngine;
+
+    size_t width  = size.width;
+    size_t height = size.height;
+    InferenceEngine::SizeVector sv = { 1, (size_t)channels, height,  width };
+
+    Precision precision = CV_8U == depth ? Precision::U8 : Precision::FP32;
+    TensorDesc  in_desc(precision, sv, Layout::NHWC); // interleaved
+    TensorDesc out_desc(precision, sv, Layout::NCHW); // color planes
+
+    Blob::Ptr in_blob, out_blob;
+    in_blob  = make_blob_with_precision( in_desc,  in_mat.data);
+    out_blob = make_blob_with_precision(out_desc, out_mat.data);
+
+    // test once
+    blob_copy(in_blob, out_blob);
+
+#if PERF_TEST
+    // iterate testing, and print performance
+    test_ms([&]() { blob_copy(in_blob, out_blob); },
+        400, "Split IE %s %dx%d", typeToString(type).c_str(), size.width, size.height);
+#endif
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+
+    cv::split(in_mat, out_mats_ocv);
+
+    // Comparison //////////////////////////////////////////////////////////////
+
+    for (int i = 0; i < channels; i++)
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mats[i] != out_mats_ocv[i]));
+    }
+}
+
+TEST_P(MergeTestIE, AccuracyTest)
+{
+    int type = std::get<0>(GetParam());
+    cv::Size size = std::get<1>(GetParam());
+
+    int depth = CV_MAT_DEPTH(type);
+    CV_Assert(CV_8U == depth || CV_32F == depth);
+
+    int type1 = CV_MAKE_TYPE(depth, 1);
+    int type4 = CV_MAKE_TYPE(depth, 4);
+
+    cv::Mat out_mat(size, type), out_mat_ocv;
+
+    cv::Mat in_mat(size, type4);
+
+    int channels = out_mat.channels();
+    CV_Assert(2 == channels || 3 == channels || 4 == channels);
+
+    size_t elemsize1 = out_mat.elemSize1();
+    size_t total     = out_mat.total();
+
+    cv::Mat in_mat0(size, type1, in_mat.data + 0*total*elemsize1);
+    cv::Mat in_mat1(size, type1, in_mat.data + 1*total*elemsize1);
+    cv::Mat in_mat2(size, type1, in_mat.data + 2*total*elemsize1);
+    cv::Mat in_mat3(size, type1, in_mat.data + 3*total*elemsize1);
+
+    cv::Mat in_mats[] = { in_mat0, in_mat1, in_mat2, in_mat3 };
+
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    for (int i = 0; i < 4 ; i++)
+    {
+        cv::randn(in_mats[i], mean, stddev);
+    }
+
+    CV_Assert(in_mat.isContinuous() && out_mat.isContinuous());
+
+    // Inference Engine code ///////////////////////////////////////////////////
+
+    using namespace InferenceEngine;
+
+    size_t width  = size.width;
+    size_t height = size.height;
+    InferenceEngine::SizeVector sv = { 1, (size_t)channels, height,  width };
+
+    Precision precision = CV_8U == depth ? Precision::U8 : Precision::FP32;
+    TensorDesc  in_desc(precision, sv, Layout::NCHW); // color planes
+    TensorDesc out_desc(precision, sv, Layout::NHWC); // interleaved
+
+    Blob::Ptr in_blob, out_blob;
+    in_blob  = make_blob_with_precision( in_desc,  in_mat.data);
+    out_blob = make_blob_with_precision(out_desc, out_mat.data);
+
+    // test once
+    blob_copy(in_blob, out_blob);
+
+#if PERF_TEST
+    // iterate testing, and print performance
+    test_ms([&]() { blob_copy(in_blob, out_blob); },
+        400, "Merge IE %s %dx%d", typeToString(type).c_str(), size.width, size.height);
+#endif
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+
+    cv::merge(in_mats, channels, out_mat_ocv);
+
+    // Comparison //////////////////////////////////////////////////////////////
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat != out_mat_ocv));
+}
+
+namespace
+{
+// FIXME: Copy-paste from cropRoi tests
+template <InferenceEngine::Precision::ePrecision PRC>
+InferenceEngine::Blob::Ptr img2Blob(cv::Mat &img, InferenceEngine::Layout layout) {
+    using namespace InferenceEngine;
+    using data_t = typename PrecisionTrait<PRC>::value_type;
+
+    const size_t channels = img.channels();
+    const size_t height = img.size().height;
+    const size_t width = img.size().width;
+
+    CV_Assert(cv::DataType<data_t>::depth == img.depth());
+
+    SizeVector dims = {1, channels, height, width};
+    Blob::Ptr resultBlob = make_shared_blob<data_t>(TensorDesc(PRC, dims, layout));
+    resultBlob->allocate();
+
+    data_t* blobData = resultBlob->buffer().as<data_t*>();
+
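+    // Repack interleaved cv::Mat pixels into the blob:
+    // NCHW index = (c*H + h)*W + w,  NHWC index = (h*W + w)*C + c.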
+    switch (layout) {
+        case Layout::NCHW: {
+            for (size_t c = 0; c < channels; c++) {
+                for (size_t h = 0; h < height; h++) {
+                    for (size_t w = 0; w < width; w++) {
+                        blobData[c * width * height + h * width + w] = img.ptr<data_t>(h,w)[c];
+                    }
+                }
+            }
+        }
+        break;
+        case Layout::NHWC: {
+            for (size_t h = 0; h < height; h++) {
+                for (size_t w = 0; w < width; w++) {
+                    for (size_t c = 0; c < channels; c++) {
+                        blobData[h * width * channels + w * channels + c] = img.ptr<data_t>(h,w)[c];
+                    }
+                }
+            }
+        }
+        break;
+        default:
+            THROW_IE_EXCEPTION << "Inconsistent input layout for image processing: " << layout;
+    }
+    return resultBlob;
+}
+
+template <InferenceEngine::Precision::ePrecision PRC>
+void Blob2Img(const InferenceEngine::Blob::Ptr& blobP, cv::Mat& img, InferenceEngine::Layout layout) {
+    using namespace InferenceEngine;
+    using data_t = typename PrecisionTrait<PRC>::value_type;
+
+    const size_t channels = img.channels();
+    const size_t height = img.size().height;
+    const size_t width = img.size().width;
+
+    CV_Assert(cv::DataType<data_t>::depth == img.depth());
+
+    data_t* blobData = blobP->buffer().as<data_t*>();
+
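+    // Inverse of img2Blob: copy blob elements back into the interleaved Mat
+    // using the same NCHW/NHWC index arithmetic.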
+    switch (layout) {
+        case Layout::NCHW: {
+            for (size_t c = 0; c < channels; c++) {
+                for (size_t h = 0; h < height; h++) {
+                    for (size_t w = 0; w < width; w++) {
+                        img.ptr<data_t>(h,w)[c] = blobData[c * width * height + h * width + w];
+                    }
+                }
+            }
+        }
+        break;
+        case Layout::NHWC: {
+            for (size_t h = 0; h < height; h++) {
+                for (size_t w = 0; w < width; w++) {
+                    for (size_t c = 0; c < channels; c++) {
+                        img.ptr<data_t>(h,w)[c] = blobData[h * width * channels + w * channels + c];
+                    }
+                }
+            }
+        }
+        break;
+        default:
+            THROW_IE_EXCEPTION << "Inconsistent input layout for image processing: " << layout;
+    }
+}
+}  // namespace
+
+TEST_P(PreprocTest, Performance)
+{
+    using namespace InferenceEngine;
+    Precision prec;
+    ResizeAlgorithm interp;
+    Layout in_layout, out_layout;
+    int ocv_chan = -1;
+    std::pair<cv::Size, cv::Size> sizes;
+    std::tie(prec, interp, in_layout, out_layout, ocv_chan, sizes) = GetParam();
+    cv::Size in_size, out_size;
+    std::tie(in_size, out_size) = sizes;
+
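+    // Flow: pack the random input into a blob with the requested layout, run
+    // PreProcessData::execute, unpack the result, and compare it against a
+    // plain cv::resize of the same input (per-value tolerance of 1).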
+    const int ocv_depth = prec == Precision::U8 ? CV_8U :
+        prec == Precision::FP32 ? CV_32F : -1;
+    const int ocv_type = CV_MAKETYPE(ocv_depth, ocv_chan);
+    initMatrixRandU(ocv_type, in_size, ocv_type, false);
+
+    cv::Mat out_mat(out_size, ocv_type);
+
+    Blob::Ptr in_blob, out_blob;
+    switch (prec)
+    {
+    case Precision::U8:
+        in_blob = img2Blob<Precision::U8>(in_mat1, in_layout);
+        out_blob = img2Blob<Precision::U8>(out_mat, out_layout);
+        break;
+
+    case Precision::FP32:
+        in_blob = img2Blob<Precision::FP32>(in_mat1, in_layout);
+        out_blob = img2Blob<Precision::FP32>(out_mat, out_layout);
+        break;
+
+    default:
+        FAIL() << "Unsupported configuration";
+    }
+
+    PreProcessData preprocess;
+    preprocess.setRoiBlob(in_blob);
+
+    // test once to warm-up cache
+    preprocess.execute(out_blob, interp);
+
+    switch (prec)
+    {
+    case Precision::U8:   Blob2Img<Precision::U8>  (out_blob, out_mat, out_layout); break;
+    case Precision::FP32: Blob2Img<Precision::FP32>(out_blob, out_mat, out_layout); break;
+    default: FAIL() << "Unsupported configuration";
+    }
+
+    cv::Mat ocv_out_mat(out_size, ocv_type);
+    auto cv_interp = interp == RESIZE_AREA ? cv::INTER_AREA : cv::INTER_LINEAR;
+    cv::resize(in_mat1, ocv_out_mat, out_size, 0, 0, cv_interp);
+
+    cv::Mat absDiff;
+    cv::absdiff(ocv_out_mat, out_mat, absDiff);
+    EXPECT_EQ(cv::countNonZero(absDiff > 1), 0);
+
+#if PERF_TEST
+    // iterate testing, and print performance
+    const auto type_str = depthToString(ocv_depth);
+    const auto interp_str = interp == RESIZE_AREA ? "AREA"
+        : interp == RESIZE_BILINEAR ? "BILINEAR" : "?";
+    const auto layout_to_str = [](const Layout &l) {
+        switch (l) {
+        case Layout::NCHW: return "NCHW";
+        case Layout::NHWC: return "NHWC";
+        default: return "?";
+        }
+    };
+    const auto in_layout_str = layout_to_str(in_layout);
+    const auto out_layout_str = layout_to_str(out_layout);
+
+    test_ms([&]() { preprocess.execute(out_blob, interp); },
+            300,
+            "Preproc %s %d %s %s %dx%d %s %dx%d",
+            type_str.c_str(),
+            ocv_chan,
+            interp_str,
+            in_layout_str, in_size.width, in_size.height,
+            out_layout_str, out_size.width, out_size.height);
+#endif // PERF_TEST
+
+}
+
+} // opencv_test
+
+#endif //OPENCV_GAPI_CORE_TESTS_INL_HPP
diff --git a/inference-engine/tests/unit/opencv_test_gapi/common/gapi_tests_common.hpp b/inference-engine/tests/unit/opencv_test_gapi/common/gapi_tests_common.hpp
new file mode 100644 (file)
index 0000000..27b43e3
--- /dev/null
@@ -0,0 +1,106 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <iostream>
+
+#include "opencv2/core.hpp"
+#include "opencv2/gapi/cpu/core.hpp"
+
+#include <gtest/gtest.h>
+
+namespace
+{
+    std::ostream& operator<<(std::ostream& o, const cv::GCompileArg& arg)
+    {
+        return o << (arg.tag.empty() ? "empty" : arg.tag);
+    }
+}
+
+namespace opencv_test
+{
+
+class TestFunctional
+{
+public:
+    cv::Mat in_mat1;
+    cv::Mat in_mat2;
+    cv::Mat out_mat_gapi;
+    cv::Mat out_mat_ocv;
+
+    cv::Scalar sc;
+
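+    // Fills both inputs with uniform random data in [0, 255) and picks a
+    // random scalar; output matrices are allocated only when requested.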
+    void initMatsRandU(int type, cv::Size sz_in, int dtype, bool createOutputMatrices = true)
+    {
+        in_mat1 = cv::Mat(sz_in, type);
+        in_mat2 = cv::Mat(sz_in, type);
+
+        auto& rng = cv::theRNG();
+        sc = cv::Scalar(rng(100),rng(100),rng(100),rng(100));
+        cv::randu(in_mat1, cv::Scalar::all(0), cv::Scalar::all(255));
+        cv::randu(in_mat2, cv::Scalar::all(0), cv::Scalar::all(255));
+
+        if (createOutputMatrices && dtype != -1)
+        {
+            out_mat_gapi = cv::Mat (sz_in, dtype);
+            out_mat_ocv = cv::Mat (sz_in, dtype);
+        }
+    }
+
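+    // Single-input variant of initMatsRandU: only in_mat1 is filled.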
+    void initMatrixRandU(int type, cv::Size sz_in, int dtype, bool createOutputMatrices = true)
+    {
+        in_mat1 = cv::Mat(sz_in, type);
+
+        auto& rng = cv::theRNG();
+        sc = cv::Scalar(rng(100),rng(100),rng(100),rng(100));
+
+        cv::randu(in_mat1, cv::Scalar::all(0), cv::Scalar::all(255));
+
+        if (createOutputMatrices && dtype != -1)
+        {
+            out_mat_gapi = cv::Mat (sz_in, dtype);
+            out_mat_ocv = cv::Mat (sz_in, dtype);
+        }
+    }
+
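+    // Input drawn from a normal distribution (mean 127, sigma 40).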
+    void initMatsRandN(int type, cv::Size sz_in, int dtype, bool createOutputMatrices = true)
+    {
+        in_mat1  = cv::Mat(sz_in, type);
+        cv::randn(in_mat1, cv::Scalar::all(127), cv::Scalar::all(40.f));
+
+        if (createOutputMatrices  && dtype != -1)
+        {
+            out_mat_gapi = cv::Mat(sz_in, dtype);
+            out_mat_ocv = cv::Mat(sz_in, dtype);
+        }
+    }
+
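+    // Mask of pixels where at least one channel is non-zero.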
+    static cv::Mat nonZeroPixels(const cv::Mat& mat)
+    {
+        int channels = mat.channels();
+        std::vector<cv::Mat> split(channels);
+        cv::split(mat, split);
+        cv::Mat result;
+        for (int c=0; c < channels; c++)
+        {
+            if (c == 0)
+                result = split[c] != 0;
+            else
+                result = result | (split[c] != 0);
+        }
+        return result;
+    }
+
+    static int countNonZeroPixels(const cv::Mat& mat)
+    {
+        return cv::countNonZero( nonZeroPixels(mat) );
+    }
+
+};
+
+template<class T>
+class TestParams : public TestFunctional, public testing::TestWithParam<T> {};
+
+}  // namespace opencv_test
diff --git a/inference-engine/tests/unit/opencv_test_gapi/cpu/gapi_core_tests_fluid.cpp b/inference-engine/tests/unit/opencv_test_gapi/cpu/gapi_core_tests_fluid.cpp
new file mode 100644 (file)
index 0000000..31714b6
--- /dev/null
@@ -0,0 +1,244 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "gapi_core_tests.hpp"
+
+#include "ie_preprocess_gapi_kernels.hpp"
+
+#include <opencv2/opencv.hpp>
+
+#include <gtest/gtest.h>
+
+namespace opencv_test
+{
+
+#define CORE_FLUID InferenceEngine::gapi::preprocKernels()
+
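+// Frame sizes and resize pairs exercised by the parameterized tests below.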
+#define TEST_SIZES        \
+    cv::Size(3840, 2160), \
+    cv::Size(1920, 1080), \
+    cv::Size(1280,  720), \
+    cv::Size(1280,  960), \
+    cv::Size( 960,  720), \
+    cv::Size( 640,  480), \
+    cv::Size( 320,  200), \
+    cv::Size( 113,   71)
+
+#define TEST_RESIZE_DOWN \
+    std::make_pair(cv::Size(3840, 2160), cv::Size(1920, 1080)), \
+    std::make_pair(cv::Size(3840, 2160), cv::Size(1280,  720)), \
+    std::make_pair(cv::Size(1920, 1080), cv::Size(1280,  720)), \
+    std::make_pair(cv::Size(1920, 1080), cv::Size( 640,  480)), \
+    std::make_pair(cv::Size(1280,  720), cv::Size( 640,  480)), \
+    std::make_pair(cv::Size(1280,  720), cv::Size( 320,  200)), \
+    std::make_pair(cv::Size( 640,  480), cv::Size( 320,  200)), \
+    std::make_pair(cv::Size( 640,  480), cv::Size( 113,   71)), \
+    std::make_pair(cv::Size( 320,  200), cv::Size( 113,   71))
+
+#define TEST_RESIZE_UP \
+    std::make_pair(cv::Size(1920, 1080), cv::Size(3840, 2160)), \
+    std::make_pair(cv::Size(1280,  720), cv::Size(3840, 2160)), \
+    std::make_pair(cv::Size(1280,  720), cv::Size(1920, 1080)), \
+    std::make_pair(cv::Size( 640,  480), cv::Size(1920, 1080)), \
+    std::make_pair(cv::Size( 640,  480), cv::Size(1280,  720)), \
+    std::make_pair(cv::Size( 320,  200), cv::Size(1280,  720)), \
+    std::make_pair(cv::Size( 320,  200), cv::Size( 640,  480)), \
+    std::make_pair(cv::Size( 113,   71), cv::Size( 640,  480)), \
+    std::make_pair(cv::Size( 113,   71), cv::Size( 320,  200))
+
+#define TEST_RESIZE_HORZ \
+    std::make_pair(cv::Size(3840, 2160), cv::Size(1920, 2160)), \
+    std::make_pair(cv::Size(1920, 1080), cv::Size(3840, 1080)), \
+    std::make_pair(cv::Size(1920, 1080), cv::Size(1280, 1080)), \
+    std::make_pair(cv::Size(1280,  720), cv::Size(1920,  720)), \
+    std::make_pair(cv::Size(1280,  720), cv::Size( 640,  720)), \
+    std::make_pair(cv::Size( 640,  480), cv::Size(1280,  480)), \
+    std::make_pair(cv::Size( 640,  480), cv::Size( 320,  480)), \
+    std::make_pair(cv::Size( 320,  200), cv::Size( 640,  200)), \
+    std::make_pair(cv::Size( 320,  200), cv::Size( 113,  200)), \
+    std::make_pair(cv::Size( 113,   71), cv::Size( 320,   71))
+
+#define TEST_RESIZE_VERT \
+    std::make_pair(cv::Size(3840, 2160), cv::Size(3840, 1080)), \
+    std::make_pair(cv::Size(1920, 1080), cv::Size(1920, 2160)), \
+    std::make_pair(cv::Size(1920, 1080), cv::Size(1920,  720)), \
+    std::make_pair(cv::Size(1280,  720), cv::Size(1280, 1080)), \
+    std::make_pair(cv::Size(1280,  720), cv::Size(1280,  480)), \
+    std::make_pair(cv::Size( 640,  480), cv::Size( 640,  720)), \
+    std::make_pair(cv::Size( 640,  480), cv::Size( 640,  200)), \
+    std::make_pair(cv::Size( 320,  200), cv::Size( 320,  480)), \
+    std::make_pair(cv::Size( 320,  200), cv::Size( 320,   71)), \
+    std::make_pair(cv::Size( 113,   71), cv::Size( 113,  200))
+
+#define TEST_RESIZE_COPY \
+    std::make_pair(cv::Size(3840, 2160), cv::Size(3840, 2160)), \
+    std::make_pair(cv::Size(1920, 1080), cv::Size(1920, 1080)), \
+    std::make_pair(cv::Size(1280,  720), cv::Size(1280,  720)), \
+    std::make_pair(cv::Size( 640,  480), cv::Size( 640,  480)), \
+    std::make_pair(cv::Size( 320,  200), cv::Size( 320,  200)), \
+    std::make_pair(cv::Size( 113,   71), cv::Size( 113,   71))
+
+#define TEST_RESIZE_SPECIAL \
+    std::make_pair(cv::Size(300, 300), cv::Size(300, 199)), \
+    std::make_pair(cv::Size(300, 300), cv::Size(199, 300)), \
+    std::make_pair(cv::Size(300, 300), cv::Size(199, 199)), \
+    std::make_pair(cv::Size(199, 199), cv::Size(300, 300)), \
+    std::make_pair(cv::Size(199, 300), cv::Size(300, 300)), \
+    std::make_pair(cv::Size(300, 199), cv::Size(300, 300))
+
+#define TEST_RESIZE_PAIRS \
+    TEST_RESIZE_DOWN, \
+    TEST_RESIZE_UP, \
+    TEST_RESIZE_HORZ, \
+    TEST_RESIZE_VERT, \
+    TEST_RESIZE_COPY, \
+    TEST_RESIZE_SPECIAL
+
+using namespace testing;
+
+INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestGAPI,
+                        Combine(Values(CV_8UC1, CV_8UC3),
+                                Values(cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(TEST_RESIZE_PAIRS),
+                                Values(1), // error not more than 1 unit
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(ResizeTestFluid_F32, ResizeTestGAPI,
+                        Combine(Values(CV_32FC1, CV_32FC3),
+                                Values(cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(TEST_RESIZE_PAIRS),
+                                Values(0.015), // tolerance of ~1.5%
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(Split2TestFluid, Split2TestGAPI,
+                        Combine(Values(CV_8U, CV_32F),
+                                Values(TEST_SIZES),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(Split3TestFluid, Split3TestGAPI,
+                        Combine(Values(CV_8U, CV_32F),
+                                Values(TEST_SIZES),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(Split4TestFluid, Split4TestGAPI,
+                        Combine(Values(CV_8U, CV_32F),
+                                Values(TEST_SIZES),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(Merge2TestFluid, Merge2TestGAPI,
+                        Combine(Values(CV_8U, CV_32F),
+                                Values(TEST_SIZES),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(Merge3TestFluid, Merge3TestGAPI,
+                        Combine(Values(CV_8U, CV_32F),
+                                Values(TEST_SIZES),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(Merge4TestFluid, Merge4TestGAPI,
+                        Combine(Values(CV_8U, CV_32F),
+                                Values(TEST_SIZES),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+//----------------------------------------------------------------------
+
+INSTANTIATE_TEST_CASE_P(ResizeTestFluid_U8, ResizeTestIE,
+                        Combine(Values(CV_8UC1, CV_8UC3),
+                                Values(cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(TEST_RESIZE_PAIRS),
+                                Values(1))); // error not more than 1 unit
+
+INSTANTIATE_TEST_CASE_P(ResizeTestFluid_F32, ResizeTestIE,
+                        Combine(Values(CV_32FC1, CV_32FC3),
+                                Values(cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(TEST_RESIZE_PAIRS),
+                                Values(0.05))); // error within 0.05 units
+
+INSTANTIATE_TEST_CASE_P(SplitTestFluid, SplitTestIE,
+                        Combine(Values(CV_8UC2, CV_8UC3, CV_8UC4,
+                                       CV_32FC2, CV_32FC3, CV_32FC4),
+                                Values(TEST_SIZES)));
+
+INSTANTIATE_TEST_CASE_P(MergeTestFluid, MergeTestIE,
+                        Combine(Values(CV_8UC2, CV_8UC3, CV_8UC4,
+                                       CV_32FC2, CV_32FC3, CV_32FC4),
+                                Values(TEST_SIZES)));
+
+//------------------------------------------------------------------------------
+
+namespace IE = InferenceEngine;
+
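+// Size pairs below mirror the inputs of real detection/segmentation networks;
+// the corresponding model is noted next to each pair.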
+static const auto FRAME_SIZES =
+   Values(std::make_pair(cv::Size(1920,1080),
+                         cv::Size(1024,1024)), // person-vehicle-bike-detection-crossroad-0078
+          std::make_pair(cv::Size(1024, 768),
+                         cv::Size( 992, 544)), // person-detection-retail-0001
+          std::make_pair(cv::Size(1280, 720),
+                         cv::Size( 896, 512)), // road-segmentation-adas-0001
+          std::make_pair(cv::Size(3840, 2160),
+                         cv::Size(2048, 1024)), // semantic-segmentation-adas-0001
+          std::make_pair(cv::Size(1270, 720),
+                         cv::Size(2048, 1024)), // semantic-segmentation-adas-0001 (UPSCALE)
+          std::make_pair(cv::Size( 640, 480),
+                         cv::Size( 544, 320)));  // 320 - face-person-detection-retail-0002,
+                                                 // 320 - person-detection-retail-10013
+                                                 // 300 - face-detection-retail-0004
+
+static const auto PATCH_SIZES =
+    Values(std::make_pair(cv::Size(200,400),
+                          cv::Size(128,384)),  // person-reidentification-retail-0076
+           std::make_pair(cv::Size( 96,256),
+                          cv::Size(128,384)),  // person-reidentification-retail-0076 (UPSCALE)
+           std::make_pair(cv::Size(340,340),
+                          cv::Size(320,256)),  // vehicle-license-plate-detection-barrier-0007
+           std::make_pair(cv::Size(256,256),
+                          cv::Size( 72,72)),   // vehicle-attributes-recognition-barrier-0039
+           std::make_pair(cv::Size(96,96),
+                          cv::Size(64,64)),    // 60 - head-pose-estimation-adas-0001
+                                               // 62 - age-gender-recognition-retail-0013
+                                               // 64 - emotions-recognition-retail-0003
+           std::make_pair(cv::Size(128,48),
+                          cv::Size( 94,24)),   // license-plate-recognition-barrier-0001
+           std::make_pair(cv::Size(120,200),
+                          cv::Size(80, 160))); // 80 - person-attributes-recognition-crossroad-0031
+                                               // 64 - person-reidentification-retail-0079
+
+INSTANTIATE_TEST_CASE_P(ReorderResize_Frame, PreprocTest,
+                        Combine(Values(IE::Precision::U8, IE::Precision::FP32),
+                                Values(IE::ResizeAlgorithm::RESIZE_BILINEAR), // AREA is not there yet
+                                Values(IE::Layout::NHWC),
+                                Values(IE::Layout::NCHW),
+                                Values(1, 3),
+                                FRAME_SIZES));
+
+INSTANTIATE_TEST_CASE_P(Scale3ch_Frame, PreprocTest,
+                        Combine(Values(IE::Precision::U8, IE::Precision::FP32),
+                                Values(IE::ResizeAlgorithm::RESIZE_BILINEAR), // AREA is not there yet
+                                Values(IE::Layout::NHWC),
+                                Values(IE::Layout::NHWC),
+                                Values(3),
+                                FRAME_SIZES));
+
+INSTANTIATE_TEST_CASE_P(ReorderResize_Patch, PreprocTest,
+                        Combine(Values(IE::Precision::U8, IE::Precision::FP32),
+                                Values(IE::ResizeAlgorithm::RESIZE_BILINEAR), // AREA is not there yet
+                                Values(IE::Layout::NHWC),
+                                Values(IE::Layout::NCHW),
+                                Values(1, 3),
+                                PATCH_SIZES));
+
+INSTANTIATE_TEST_CASE_P(Everything, PreprocTest,
+                        Combine(Values(IE::Precision::U8, IE::Precision::FP32),
+                                Values(IE::ResizeAlgorithm::RESIZE_BILINEAR, IE::ResizeAlgorithm::RESIZE_AREA),
+                                Values(IE::Layout::NHWC, IE::Layout::NCHW),
+                                Values(IE::Layout::NHWC, IE::Layout::NCHW),
+                                Values(1, 2, 3, 4),
+                                Values(std::make_pair(cv::Size(1920, 1080), cv::Size(1024,1024)),
+                                       std::make_pair(cv::Size(1280, 720), cv::Size(544,320)),
+                                       std::make_pair(cv::Size(640, 480), cv::Size(896, 512)),
+                                       std::make_pair(cv::Size(200, 400), cv::Size(128, 384)),
+                                       std::make_pair(cv::Size(256, 256), cv::Size(72, 72)),
+                                       std::make_pair(cv::Size(96, 256), cv::Size(128, 384)))));
+
+}  // namespace opencv_test
index 04bb17d..b8661bd 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -42,7 +41,8 @@ protected:
             "BatchNormalization",
             "Input",
             "Memory",
-            "Const"
+            "Const",
+            "Gemm"
     };
 
     void TearDown() override {
index d67d43a..9f57e35 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -7,7 +6,7 @@
 #include <inference_engine/shape_infer/built-in/ie_built_in_holder.hpp>
 #include <xml_net_builder.hpp>
 #include <inference_engine/cnn_network_impl.hpp>
-#include <inference_engine/v2_format_parser.h>
+#include <inference_engine/ie_format_parser.h>
 #include <xml_helper.hpp>
 #include <inference_engine/shape_infer/ie_reshaper.hpp>
 #include "built_in_shape_infer_general_test.hpp"
@@ -18,7 +17,7 @@ using namespace ShapeInfer;
 class BuiltInShapeInferImplTestBatch : public BuiltInShapeInferImplTest {};
 
 TEST_P(BuiltInShapeInferImplTestBatch, batch) {
-    auto cnnNetworkImplPtr = buildSingleLayerNetwork(type, inOutShapes, &layerParams.data, layerDataName);
+    auto cnnNetworkImplPtr = buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams.data, layerDataName);
     auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr);
 
     if (canInfer) {
index bf3d1eb..07aaf7f 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -7,7 +6,7 @@
 #include <inference_engine/shape_infer/built-in/ie_built_in_holder.hpp>
 #include <xml_net_builder.hpp>
 #include <inference_engine/cnn_network_impl.hpp>
-#include <inference_engine/v2_format_parser.h>
+#include <inference_engine/ie_format_parser.h>
 #include <xml_helper.hpp>
 #include <inference_engine/shape_infer/ie_reshaper.hpp>
 #include "built_in_shape_infer_general_test.hpp"
@@ -98,7 +97,7 @@ TEST_P(BuiltInShapeInferConvImplTest, impl) {
     ASSERT_NE(nullptr, impl);
     if (!group) group = 1;
     SizeVector weightsDim{kernel.x * kernel.y * out_channels * inOutShapes.inDims[0][1] / group};
-    blobs["weights"] = make_shared_blob(Precision::UNSPECIFIED, weightsDim);
+    blobs["weights"] = make_shared_blob(Precision::fromType<size_t>(), weightsDim);
     ASSERT_NO_THROW(sts = impl->inferShapes(inOutShapes.inDims, getMapParams(), blobs, outShapes, &resp));
     ASSERT_EQ(int(OK), sts) << resp.msg;
     ASSERT_EQ(inOutShapes.outDims, outShapes);
@@ -106,7 +105,7 @@ TEST_P(BuiltInShapeInferConvImplTest, impl) {
 
 TEST_P(BuiltInShapeInferConvImplTest, batch) {
     auto layerParams = getMapParams();
-    auto cnnNetworkImplPtr = buildSingleLayerNetwork(type, inOutShapes, &layerParams, dataName, 2);
+    auto cnnNetworkImplPtr = buildSingleLayerNetwork<2>(type, inOutShapes, &layerParams, dataName);
     auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr);
     sts = cnnNetworkImplPtr->setBatchSizeReshape(BATCH, &resp);
     ASSERT_EQ((int) OK, sts) << resp.msg;
@@ -116,7 +115,7 @@ TEST_P(BuiltInShapeInferConvImplTest, batch) {
 
 TEST_P(BuiltInShapeInferConvImplTest, reshaper) {
     auto layerParams = getMapParams();
-    auto cnnNetworkImplPtr = buildSingleLayerNetwork(type, inOutShapes, &layerParams, dataName, 2);
+    auto cnnNetworkImplPtr = buildSingleLayerNetwork<2>(type, inOutShapes, &layerParams, dataName);
     auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr);
     auto inputShapes = setInputShapes(*cnnNetworkImplPtr, newInOutShapes.inDims);
     reshaper->run(inputShapes);
@@ -129,7 +128,7 @@ TEST_P(BuiltInShapeInferConvImplTest, impl_IRv3) {
     ASSERT_NE(nullptr, impl);
     if (!group) group = 1;
     SizeVector weightsDim{kernel.x * kernel.y * out_channels * inOutShapes.inDims[0][1] / group};
-    blobs["weights"] = make_shared_blob(Precision::UNSPECIFIED, weightsDim);
+    blobs["weights"] = make_shared_blob(Precision::fromType<size_t>(), weightsDim);
     ASSERT_NO_THROW(sts = impl->inferShapes(inOutShapes.inDims, getMapParams_IRv3(), blobs, outShapes, &resp));
     ASSERT_EQ(int(OK), sts) << resp.msg;
     ASSERT_EQ(inOutShapes.outDims, outShapes);
@@ -137,7 +136,7 @@ TEST_P(BuiltInShapeInferConvImplTest, impl_IRv3) {
 
 TEST_P(BuiltInShapeInferConvImplTest, batch_IRv3) {
     auto layerParams = getMapParams_IRv3();
-    auto cnnNetworkImplPtr = buildSingleLayerNetwork(type, inOutShapes, &layerParams, dataName);
+    auto cnnNetworkImplPtr = buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams, dataName);
     auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr);
     sts = cnnNetworkImplPtr->setBatchSizeReshape(BATCH, &resp);
     ASSERT_EQ((int) OK, sts) << resp.msg;
@@ -147,7 +146,7 @@ TEST_P(BuiltInShapeInferConvImplTest, batch_IRv3) {
 
 TEST_P(BuiltInShapeInferConvImplTest, reshaper_IRv3) {
     auto layerParams = getMapParams_IRv3();
-    auto cnnNetworkImplPtr = buildSingleLayerNetwork(type, inOutShapes, &layerParams, dataName);
+    auto cnnNetworkImplPtr = buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams, dataName);
     auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr);
     auto inputShapes = setInputShapes(*cnnNetworkImplPtr, newInOutShapes.inDims);
     reshaper->run(inputShapes);
index bb2de35..2b66d59 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -7,7 +6,7 @@
 #include <inference_engine/shape_infer/built-in/ie_built_in_holder.hpp>
 #include <xml_net_builder.hpp>
 #include <inference_engine/cnn_network_impl.hpp>
-#include <inference_engine/v2_format_parser.h>
+#include <inference_engine/ie_format_parser.h>
 #include <xml_helper.hpp>
 #include <inference_engine/shape_infer/ie_reshaper.hpp>
 #include "built_in_shape_infer_general_test.hpp"
@@ -19,7 +18,7 @@ class BuiltInShapeInferImplFakeTest : public BuiltInShapeInferImplTest {
 };
 
 TEST_P(BuiltInShapeInferImplFakeTest, reshaper) {
-    auto cnnNetworkImplPtr = buildSingleLayerNetwork(type, inOutShapes, &layerParams.data, layerDataName);
+    auto cnnNetworkImplPtr = buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams.data, layerDataName);
     auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr);
     auto inputShapes = setInputShapes(*cnnNetworkImplPtr, newInOutShapes.inDims);
 
index c7fc06e..a7d3a64 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -31,7 +30,7 @@ TEST_P(BuiltInShapeInferImplTest, impl) {
 }
 
 TEST_P(BuiltInShapeInferImplTest, reshaper) {
-    auto cnnNetworkImplPtr = buildSingleLayerNetwork(type, inOutShapes, &layerParams.data, layerDataName);
+    auto cnnNetworkImplPtr = buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams.data, layerDataName);
     auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr);
     auto inputShapes = setInputShapes(*cnnNetworkImplPtr.get(), newInOutShapes.inDims);
 
@@ -141,20 +140,15 @@ INSTANTIATE_TEST_CASE_P(
                                                    {{2, 5742, 6}}}),
                                       NewInOutShapes({{{2, 34458}},
                                                       {{2, 5743, 6}}}),
-                                      MapParams(MapStrStr(std::map<std::string, std::string>{{"dim",      "0,-1,6"},
-                                                                                             {"in2out",   "0-0"},
-                                                                                             {"num_axes", "-1"},
-                                                                                             {"axis",     "0"}})),
+                                      MapParams(MapStrStr(std::map<std::string, std::string>{{"dim", "0,-1,6"}})),
                                       LayerDataName("data"),
                                       CanInfer(true)),
                 ::testing::make_tuple(LayerType("Reshape"),
-                                      InOutShapes({{{2, 1, 4, 5}},
-                                                   {{40}}}),
-                                      NewInOutShapes({{{4, 1, 4, 5}},
-                                                      {{80}}}),
-                                      MapParams(MapStrStr(std::map<std::string, std::string>{{"axis",     "0"},
-                                                                                             {"num_axes", "-1"},
-                                                                                             {"in2out",   "0-0,1-0,2-0,3-0"}})),
+                                      InOutShapes({{{1, 1, 300, 4}},
+                                                   {{300, 4}}}),
+                                      NewInOutShapes({{{1, 1, 500, 4}},
+                                                      {{500, 4}}}),
+                                      MapParams(MapStrStr(std::map<std::string, std::string>{{"dim", "-1,4"}})),
                                       LayerDataName("data"),
                                       CanInfer(true)),
                 ::testing::make_tuple(LayerType("Flatten"),
@@ -162,9 +156,7 @@ INSTANTIATE_TEST_CASE_P(
                                                    {{40}}}),
                                       NewInOutShapes({{{4, 1, 4, 5}},
                                                       {{80}}}),
-                                      MapParams(MapStrStr(std::map<std::string, std::string>{{"axis",     "0"},
-                                                                                             {"end_axis", "-1"},
-                                                                                             {"in2out",   "0-0,1-0,2-0,3-0"}})),
+                                      MapParams(MapStrStr()),
                                       LayerDataName("data"),
                                       CanInfer(true)),
                 ::testing::make_tuple(LayerType("PriorBoxClustered"),
@@ -464,6 +456,215 @@ INSTANTIATE_TEST_CASE_P(
                                       MapParams(MapStrStr({{"out_max_val", "0"},
                                                            {"top_k",       "100"}})),
                                       LayerDataName("data"),
+                                      CanInfer(true)),
+                ::testing::make_tuple(LayerType("Gemm"),
+                                      InOutShapes({{{15, 10}, {10, 20}, {15, 20}},
+                                                   {{15, 20}}}),
+                                      NewInOutShapes({{{20, 15}, {15, 25}, {20, 25}},
+                                                      {{20, 25}}}),
+                                      MapParams(MapStrStr({{"alpha",       "1"},
+                                                           {"beta",        "1"},
+                                                           {"transpose_a", "false"},
+                                                           {"transpose_b", "false"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(true)),
+                ::testing::make_tuple(LayerType("Gemm"),
+                                      InOutShapes({{{15, 10}, {10, 20}, {15, 20}},
+                                                   {{15, 20}}}),
+                                      NewInOutShapes({{{20, 15}, {10, 25}, {20, 25}},
+                                                      {{20, 25}}}),
+                                      MapParams(MapStrStr({{"alpha",       "1"},
+                                                           {"beta",        "1"},
+                                                           {"transpose_a", "false"},
+                                                           {"transpose_b", "false"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(false)),
+                ::testing::make_tuple(LayerType("Gemm"),
+                                      InOutShapes({{{15, 10}, {10, 20}, {15, 20}},
+                                                   {{15, 20}}}),
+                                      NewInOutShapes({{{20, 15}, {15, 25}, {15, 25}},
+                                                      {{20, 25}}}),
+                                      MapParams(MapStrStr({{"alpha",       "1"},
+                                                           {"beta",        "1"},
+                                                           {"transpose_a", "false"},
+                                                           {"transpose_b", "false"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(false)),
+                ::testing::make_tuple(LayerType("Gemm"),
+                                      InOutShapes({{{15, 10}, {10, 20}},
+                                                   {{15, 20}}}),
+                                      NewInOutShapes({{{20, 15}, {15, 25}},
+                                                      {{20, 25}}}),
+                                      MapParams(MapStrStr({{"alpha",       "1"},
+                                                           {"beta",        "1"},
+                                                           {"transpose_a", "false"},
+                                                           {"transpose_b", "false"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(true)),
+                ::testing::make_tuple(LayerType("Gemm"),
+                                      InOutShapes({{{15, 10}, {10, 20}},
+                                                   {{15, 20}}}),
+                                      NewInOutShapes({{{20, 15}, {10, 25}},
+                                                      {{20, 25}}}),
+                                      MapParams(MapStrStr({{"alpha",       "1"},
+                                                           {"beta",        "1"},
+                                                           {"transpose_a", "false"},
+                                                           {"transpose_b", "false"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(false)),
+                ::testing::make_tuple(LayerType("Gemm"),
+                                      InOutShapes({{{3, 3, 15, 10}, {3, 3, 10, 20}, {3, 3, 15, 20}},
+                                                   {{3, 3, 15, 20}}}),
+                                      NewInOutShapes({{{4, 1, 20, 15}, {4, 1, 15, 25}, {4, 1, 20, 25}},
+                                                      {{4, 1, 20, 25}}}),
+                                      MapParams(MapStrStr({{"alpha",       "1"},
+                                                           {"beta",        "1"},
+                                                           {"transpose_a", "false"},
+                                                           {"transpose_b", "false"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(true)),
+                ::testing::make_tuple(LayerType("Gemm"),
+                                      InOutShapes({{{3, 3, 15, 10}, {3, 1, 10, 20}, {3, 1, 15, 20}},
+                                                   {{3, 3, 15, 20}}}),
+                                      NewInOutShapes({{{4, 2, 20, 15}, {4, 2, 15, 25}, {4, 1, 20, 25}},
+                                                      {{4, 2, 20, 25}}}),
+                                      MapParams(MapStrStr({{"alpha",       "1"},
+                                                           {"beta",        "1"},
+                                                           {"transpose_a", "false"},
+                                                           {"transpose_b", "false"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(true)),
+                ::testing::make_tuple(LayerType("Pad"),
+                                      InOutShapes({{{3, 3, 15, 10}},
+                                                   {{9, 11, 25, 22}}}),
+                                      NewInOutShapes({{{4, 2, 20, 15}},
+                                                      {{10, 10, 30, 27}}}),
+                                      MapParams(MapStrStr({{"pads_begin",  "1,2,3,4"},
+                                                           {"pads_end",    "5,6,7,8"},
+                                                           {"pad_mode",    "edge"},
+                                                           {"pad_value",   "1.0f"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(true)),
+                ::testing::make_tuple(LayerType("Pad"),
+                                      InOutShapes({{{10, 10, 15, 10}},
+                                                   {{16, 18, 25, 22}}}),
+                                      NewInOutShapes({{{20, 30, 40, 50}},
+                                                      {{26, 38, 40, 50}}}),
+                                      MapParams(MapStrStr({{"pads_begin",  "1,2,0,0"},
+                                                           {"pads_end",    "5,6,0,0"},
+                                                           {"pad_mode",    "reflect"},
+                                                           {"pad_value",   "1.0f"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(true)),
+                ::testing::make_tuple(LayerType("Pad"),
+                                      InOutShapes({{{10, 10, 15, 10}},
+                                                   {{16, 18, 25, 22}}}),
+                                      NewInOutShapes({{{4, 2, 20, 15}},
+                                                      {{10, 10, 30, 27}}}),
+                                      MapParams(MapStrStr({{"pads_begin",  "1,2,3,4"},
+                                                           {"pads_end",    "5,6,7,8"},
+                                                           {"pad_mode",    "reflect"},
+                                                           {"pad_value",   "1.0f"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(false))
+        )
+);
+
+INSTANTIATE_TEST_CASE_P(
+        BuiltInGeneralImpls2, BuiltInShapeInferImplTest,
+        ::testing::Values(
+                ::testing::make_tuple(LayerType("Gather"),
+                                      InOutShapes({{{7, 16}, {1, 25}},
+                                                   {{1, 25, 16}}}),
+                                      NewInOutShapes({{{7, 16}, {12, 25}},
+                                                      {{12, 25, 16}}}),
+                                      MapParams(MapStrStr(std::map<std::string, std::string>{{"axis", "0"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(true)),
+                ::testing::make_tuple(LayerType("Gather"),
+                                      InOutShapes({{{7, 16}, {1, 25}},
+                                                   {{7, 1, 25}}}),
+                                      NewInOutShapes({{{7, 16}, {12, 25}},
+                                                      {{7, 12, 25}}}),
+                                      MapParams(MapStrStr(std::map<std::string, std::string>{{"axis", "1"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(true)),
+                ::testing::make_tuple(LayerType("Gather"),
+                                      InOutShapes({{{7, 16}, {1, 25}},
+                                                   {{7, 1, 25}}}),
+                                      NewInOutShapes({{{7, 16}, {12, 25}},
+                                                      {{7, 12, 25}}}),
+                                      MapParams(MapStrStr(std::map<std::string, std::string>{{"axis", "-1"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(true)),
+                ::testing::make_tuple(LayerType("Reshape"),
+                                      InOutShapes({{{1, 2}},
+                                                   {{1, 1}}}),
+                                      NewInOutShapes({{{1, 2}},
+                                                      {{1, 1}}}),
+                                      MapParams(MapStrStr(std::map<std::string, std::string>{{"dim", "1,1"}})),  // dim doesn't match input
+                                      LayerDataName("data"),
+                                      CanInfer(false)),
+                ::testing::make_tuple(LayerType("Flatten"),
+                                      InOutShapes({{{2, 1, 4, 5}},
+                                                   {{40}}}),
+                                      NewInOutShapes({{{4, 1, 4, 5}},
+                                                      {{80}}}),
+                                      MapParams(MapStrStr(std::map<std::string, std::string>{{"axis", "0"},
+                                                                                             {"end_axis", "-1"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(true)),
+                ::testing::make_tuple(LayerType("Flatten"),
+                                      InOutShapes({{{2, 2, 4, 5}},
+                                                   {{2, 8, 5}}}),
+                                      NewInOutShapes({{{4, 2, 4, 5}},
+                                                      {{4, 8, 5}}}),
+                                      MapParams(MapStrStr(std::map<std::string, std::string>{{"axis", "1"},
+                                                                                             {"end_axis", "2"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(true)),
+                ::testing::make_tuple(LayerType("Flatten"),
+                                      InOutShapes({{{2, 2, 4, 5}},
+                                                   {{2, 40}}}),
+                                      NewInOutShapes({{{4, 2, 4, 5}},
+                                                      {{4, 40}}}),
+                                      MapParams(MapStrStr(std::map<std::string, std::string>{{"axis", "1"}})),
+                                      LayerDataName("data"),
+                                      CanInfer(true)),
+                ::testing::make_tuple(LayerType("Flatten"),
+                                      InOutShapes({{{2, 2, 4, 5}},
+                                                   {{4, 4, 5}}}),
+                                      NewInOutShapes({{{4, 2, 4, 5}},
+                                                      {{8, 4, 5}}}),
+                                      MapParams(MapStrStr(std::map<std::string, std::string>{{"end_axis", "1"}})),
+                                      LayerDataName("data"),
                                       CanInfer(true))
         )
 );
+
+class LayerValidatorNegativeTests : public BuiltInShapeInferImplTest {
+};
+
+TEST_P(LayerValidatorNegativeTests, reshaper) {
+    ASSERT_THROW(buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams.data, layerDataName),
+                 InferenceEngine::details::InferenceEngineException);
+}
+
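+// Constructing a network with an invalid Reshape "dim" attribute must throw
+// during layer validation.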
+// TODO: test using MR!1690
+INSTANTIATE_TEST_CASE_P(
+        Reshape, LayerValidatorNegativeTests,
+        ::testing::Combine(
+                ::testing::Values(LayerType("Reshape")),
+                ::testing::Values(InOutShapes({{{1,   1, 300, 4}},
+                                               {{300, 4}}})),
+                ::testing::Values(NewInOutShapes({{{1,   1, 500, 4}},
+                                                  {{500, 4}}})),
+                ::testing::Values(
+                        MapParams(MapStrStr(
+                                std::map<std::string, std::string>{{"dim", "0,-2,6"}})),  // can't be less than -1
+                        MapParams(MapStrStr(
+                                std::map<std::string, std::string>{{"dim", "0,-1,-1"}}))),  // single -1 is expected
+                ::testing::Values(LayerDataName("data")),
+                ::testing::Values(CanInfer())
+        )
+);
index ca02f5a..5eac622 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -8,56 +7,34 @@
 #include <gtest/gtest.h>
 #include <inference_engine/shape_infer/built-in/ie_built_in_holder.hpp>
 #include <utility>
-#include <inference_engine/v2_format_parser.h>
+#include <inference_engine/ie_format_parser.h>
 #include <xml_helper.hpp>
 #include <xml_net_builder.hpp>
-
-#define PRETTY_PARAM(name, type)                                                            \
-    class name                                                                              \
-    {                                                                                       \
-    public:                                                                                 \
-        typedef type param_type;                                                            \
-        explicit name ( param_type arg = param_type ()) : val_(arg) {}                      \
-        operator param_type () const {return val_;}                                         \
-    private:                                                                                \
-        param_type val_;                                                                    \
-    };                                                                                      \
-    static inline void PrintTo(name param, ::std::ostream* os)                              \
-    {                                                                                       \
-        *os << #name ": " << ::testing::PrintToString((name::param_type)(param));           \
-    }
-
-struct MapStrStr {
-    std::map<std::string, std::string> data{};
-
-    explicit MapStrStr(std::map<std::string, std::string> _data) : data(std::move(_data)) {}
-
-    MapStrStr() {}
-};
+#include <single_layer_common.hpp>
 
 class BaseTestCreator {
 protected:
     std::string _type;
 public:
-    explicit BaseTestCreator(const std::string& type) : _type(type) {}
+    explicit BaseTestCreator(const std::string &type) : _type(type) {}
 
-    virtual InferenceEngine::CNNLayerPtr create(const std::string& type)  = 0;
+    virtual InferenceEngine::CNNLayerPtr create(const std::string &type) = 0;
 
-    virtual bool shouldCreate(const std::string& type) = 0;
+    virtual bool shouldCreate(const std::string &type) = 0;
 };
 
 template<class LT>
 class LayerTestCreator : public BaseTestCreator {
 public:
-    explicit LayerTestCreator(const std::string& type) : BaseTestCreator(type) {}
+    explicit LayerTestCreator(const std::string &type) : BaseTestCreator(type) {}
 
-    InferenceEngine::CNNLayerPtr create(const std::string& type) override {
+    InferenceEngine::CNNLayerPtr create(const std::string &type) override {
         InferenceEngine::LayerParams params;
         params.type = type;
         return std::make_shared<LT>(params);
     }
 
-    bool shouldCreate(const std::string& type) override {
+    bool shouldCreate(const std::string &type) override {
         return type == _type;
     }
 };
@@ -66,12 +43,12 @@ struct param_size {
     unsigned x;
     unsigned y;
 
-    friend std::ostream& operator<<(std::ostream& os, param_size const& paramSize) {
+    friend std::ostream &operator<<(std::ostream &os, param_size const &paramSize) {
         os << "x=" << std::to_string(paramSize.x) << ", y=" << std::to_string(paramSize.y);
         return os;
     };
 
-    std::string toSeparetedRow(const char* separator) {
+    std::string toSeparetedRow(const char *separator) {
         std::string res = std::to_string(y) + separator + std::to_string(x);
         return res;
     }
@@ -123,7 +100,7 @@ protected:
         holder = std::make_shared<InferenceEngine::ShapeInfer::BuiltInShapeInferHolder>();
     }
 
-    InferenceEngine::IShapeInferImpl::Ptr getShapeInferImpl(const std::string& type) {
+    InferenceEngine::IShapeInferImpl::Ptr getShapeInferImpl(const std::string &type) {
         InferenceEngine::IShapeInferImpl::Ptr impl;
         sts = holder->getShapeInferImpl(impl, type.c_str(), &resp);
         if (sts != InferenceEngine::StatusCode::OK) THROW_IE_EXCEPTION << resp.msg;
@@ -139,7 +116,7 @@ protected:
 template<class T>
 class BuiltInShapeInferTestWithParam : public BuiltInShapeInferCommon,
                                        public testing::WithParamInterface<T> {
-    const std::vector<std::shared_ptr<BaseTestCreator>>& getCreators() const {
+    const std::vector<std::shared_ptr<BaseTestCreator>> &getCreators() const {
         // there should be unique_ptr but it cant be used with initializer lists
         static std::vector<std::shared_ptr<BaseTestCreator> > creators = {
                 std::make_shared<LayerTestCreator<InferenceEngine::PowerLayer>>("Power"),
@@ -166,20 +143,23 @@ class BuiltInShapeInferTestWithParam : public BuiltInShapeInferCommon,
                 std::make_shared<LayerTestCreator<InferenceEngine::ReshapeLayer>>("Reshape"),
                 std::make_shared<LayerTestCreator<InferenceEngine::TileLayer>>("Tile"),
                 std::make_shared<LayerTestCreator<InferenceEngine::BatchNormalizationLayer>>("BatchNormalization"),
+                std::make_shared<LayerTestCreator<InferenceEngine::GemmLayer>>("Gemm"),
+                std::make_shared<LayerTestCreator<InferenceEngine::PadLayer>>("Pad"),
+                std::make_shared<LayerTestCreator<InferenceEngine::GatherLayer>>("Gather")
         };
         return creators;
     }
 
 protected:
     InferenceEngine::DataPtr
-    getNotEmptyData(std::string const& name = "", const InferenceEngine::SizeVector& dims = {}) {
+    getNotEmptyData(std::string const &name = "", const InferenceEngine::SizeVector &dims = {}) {
         InferenceEngine::TensorDesc desc(InferenceEngine::Precision::UNSPECIFIED, dims,
                                          InferenceEngine::TensorDesc::getLayoutByDims(dims));
         return std::make_shared<InferenceEngine::Data>(name, desc);
     }
 
-    InferenceEngine::CNNLayer::Ptr createLayer(const std::string& type) const {
-        for (auto& creator : getCreators()) {
+    InferenceEngine::CNNLayer::Ptr createLayer(const std::string &type) const {
+        for (auto &creator : getCreators()) {
             if (!creator->shouldCreate(type))
                 continue;
             return creator->create(type);
@@ -188,63 +168,35 @@ protected:
         return genericCreator.create(type);
     }
 
-    void initLayer(const InferenceEngine::CNNLayerPtr& layer, const testing::InOutData& inOutData) {
-        for (const auto& in:inOutData.inDims) {
+    void initLayer(const InferenceEngine::CNNLayerPtr &layer, const testing::InOutData &inOutData) {
+        for (const auto &in:inOutData.inDims) {
             auto data = getNotEmptyData("", in);
             _savedData.push_back(data);
             layer->insData.push_back(data);
         }
-        for (const auto& out:inOutData.outDims) {
+        for (const auto &out:inOutData.outDims) {
             layer->outData.push_back(getNotEmptyData("", out));
         }
     }
 
-    static testing::InOutData getFakeData(const testing::InOutData& inOutShapes) {
+    static testing::InOutData getFakeData(const testing::InOutData &inOutShapes) {
         testing::InOutData initial = inOutShapes;
-        for (auto& dims : initial.inDims) {
+        for (auto &dims : initial.inDims) {
             std::fill(dims.begin(), dims.end(), 1);
         }
-        for (auto& dims : initial.outDims) {
+        for (auto &dims : initial.outDims) {
             std::fill(dims.begin(), dims.end(), 1);
         }
         return initial;
     }
 
-    static InferenceEngine::details::CNNNetworkImplPtr buildSingleLayerNetwork(
-            const std::string& layerType,
-            const testing::InOutData& inOutShapes,
-            std::map<std::string, std::string>* params,
-            const std::string& layerDataName,
-            int ir_version = 3) {
-        testing::XMLHelper xmlHelper(new InferenceEngine::details::V2FormatParser(ir_version));
-        std::string precision = InferenceEngine::Precision(InferenceEngine::Precision::FP32).name();
-        auto netBuilder = testing::V2NetBuilder::buildNetworkWithOneInput("Mock", inOutShapes.inDims[0], precision);
-        size_t inputsNumber = inOutShapes.inDims.size();
-        for (int i = 1; i < inputsNumber; i++) {
-            netBuilder.addInputLayer(precision, inOutShapes.inDims[i]);
-        }
-        netBuilder.addLayer(layerType, precision, params, inOutShapes, 0, 0, layerDataName);
-        std::string testContent;
-        if (inputsNumber > 1) {
-            auto edgeBuilder = netBuilder.havingEdges();
-            for (size_t i = 0; i < inputsNumber; i++) {
-                edgeBuilder.connect(i, inputsNumber);
-            }
-            testContent = edgeBuilder.finish();
-        } else {
-            testContent = netBuilder.finish();
-        }
-        xmlHelper.loadContent(testContent);
-        return xmlHelper.parseWithReturningNetwork();
-    }
-
     static InferenceEngine::ICNNNetwork::InputShapes
-    setInputShapes(const InferenceEngine::ICNNNetwork& cnnNetwork,
-                   const std::vector<InferenceEngine::SizeVector>& shapesToSet) {
+    setInputShapes(const InferenceEngine::ICNNNetwork &cnnNetwork,
+                   const std::vector<InferenceEngine::SizeVector> &shapesToSet) {
         InferenceEngine::ICNNNetwork::InputShapes inputShapes;
         InferenceEngine::InputsDataMap inputs;
         cnnNetwork.getInputsInfo(inputs);
-        for (const auto& pair : inputs) {
+        for (const auto &pair : inputs) {
             auto info = pair.second;
             if (info) {
                 auto data = info->getInputData();
@@ -254,14 +206,14 @@ protected:
             }
         }
         int i = 0;
-        for (auto& pair : inputShapes) {
+        for (auto &pair : inputShapes) {
             pair.second = shapesToSet[i++];
         }
         return inputShapes;
     }
 
-    static void checkNetworkInOut(const InferenceEngine::ICNNNetwork& network,
-                                  const testing::InOutData& inOutData) {
+    static void checkNetworkInOut(const InferenceEngine::ICNNNetwork &network,
+                                  const testing::InOutData &inOutData) {
         InferenceEngine::InputsDataMap inputsDataMap;
         InferenceEngine::OutputsDataMap outputsDataMap;
         network.getInputsInfo(inputsDataMap);
@@ -276,6 +228,16 @@ protected:
         }
     }
 
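+    // Builds a single-layer network via FormatParser for the requested IR
+    // version; the shared buildSingleLayerNetworkCommon helper does the
+    // XML plumbing.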
+    template<int Version = 3>
+    static InferenceEngine::details::CNNNetworkImplPtr
+    buildSingleLayerNetwork(const std::string &layerType,
+                            const testing::InOutData &inOutShapes,
+                            std::map<std::string, std::string> *params,
+                            const std::string &layerDataName = "data") {
+        auto *parser = new InferenceEngine::details::FormatParser(Version);
+        return buildSingleLayerNetworkCommon<Version>(parser, layerType, inOutShapes, params, layerDataName);
+    }
+
 protected:
     std::vector<InferenceEngine::SizeVector> outShapes;
     std::map<std::string, std::string> params;
index 8ea27c3..487ff84 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -7,7 +6,7 @@
 #include <inference_engine/shape_infer/built-in/ie_built_in_holder.hpp>
 #include <xml_net_builder.hpp>
 #include <inference_engine/cnn_network_impl.hpp>
-#include <inference_engine/v2_format_parser.h>
+#include <inference_engine/ie_format_parser.h>
 #include <xml_helper.hpp>
 #include <inference_engine/shape_infer/ie_reshaper.hpp>
 #include "built_in_shape_infer_general_test.hpp"
@@ -78,7 +77,6 @@ protected:
 };
 
 TEST_P(BuiltInShapeInferPoolImplTest, body) {
-    InferenceEngine::details::BaseCreator::version_ = 2;
     auto impl = getShapeInferImpl(type);
     ASSERT_NE(nullptr, impl);
     ASSERT_NO_THROW(sts = impl->inferShapes(inOutShapes.inDims, getMapParams(), blobs, outShapes, &resp));
@@ -88,7 +86,7 @@ TEST_P(BuiltInShapeInferPoolImplTest, body) {
 
 TEST_P(BuiltInShapeInferPoolImplTest, reshaper) {
     auto layerParams = getMapParams();
-    auto cnnNetworkImplPtr = buildSingleLayerNetwork(type, inOutShapes, &layerParams, "pooling_data", 2);
+    auto cnnNetworkImplPtr = buildSingleLayerNetwork<2>(type, inOutShapes, &layerParams, "pooling_data");
     auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr);
     auto inputShapes = setInputShapes(*cnnNetworkImplPtr, newInOutShapes.inDims);
     reshaper->run(inputShapes);
@@ -97,7 +95,7 @@ TEST_P(BuiltInShapeInferPoolImplTest, reshaper) {
 
 TEST_P(BuiltInShapeInferPoolImplTest, batch) {
     auto layerParams = getMapParams();
-    auto cnnNetworkImplPtr = buildSingleLayerNetwork(type, inOutShapes, &layerParams, "pooling_data", 2);
+    auto cnnNetworkImplPtr = buildSingleLayerNetwork<2>(type, inOutShapes, &layerParams, "pooling_data");
     auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr);
     sts = cnnNetworkImplPtr->setBatchSize(BATCH, &resp);
     ASSERT_EQ((int)OK, sts) << resp.msg;
@@ -106,7 +104,6 @@ TEST_P(BuiltInShapeInferPoolImplTest, batch) {
 }
 
 TEST_P(BuiltInShapeInferPoolImplTest, body_IRv3) {
-    InferenceEngine::details::BaseCreator::version_ = 3;
     auto impl = getShapeInferImpl(type);
     ASSERT_NE(nullptr, impl);
     ASSERT_NO_THROW(sts = impl->inferShapes(inOutShapes.inDims, getMapParams_IRv3(), blobs, outShapes, &resp));
@@ -116,7 +113,7 @@ TEST_P(BuiltInShapeInferPoolImplTest, body_IRv3) {
 
 TEST_P(BuiltInShapeInferPoolImplTest, reshaper_IRv3) {
     auto layerParams = getMapParams_IRv3();
-    auto cnnNetworkImplPtr = buildSingleLayerNetwork(type, inOutShapes, &layerParams, "pooling_data");
+    auto cnnNetworkImplPtr = buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams, "pooling_data");
     auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr);
     auto inputShapes = setInputShapes(*cnnNetworkImplPtr, newInOutShapes.inDims);
     reshaper->run(inputShapes);
@@ -125,7 +122,7 @@ TEST_P(BuiltInShapeInferPoolImplTest, reshaper_IRv3) {
 
 TEST_P(BuiltInShapeInferPoolImplTest, batch_IRv3) {
     auto layerParams = getMapParams_IRv3();
-    auto cnnNetworkImplPtr = buildSingleLayerNetwork(type, inOutShapes, &layerParams, "pooling_data");
+    auto cnnNetworkImplPtr = buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams, "pooling_data");
     auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr);
     sts = cnnNetworkImplPtr->setBatchSize(BATCH, &resp);
     ASSERT_EQ((int)OK, sts) << resp.msg;
index a4aaca7..4551dd7 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -11,7 +10,8 @@
 #include <cpp/ie_cnn_net_reader.h>
 #include <test_model_path.hpp>
 #include <inference_engine/debug.h>
-#include <extension/ext_list.hpp>
+#include <ie_extension.h>
+#include <tests_common.hpp>
 #include "built_in_shape_infer_general_test.hpp"
 
 using namespace InferenceEngine;
@@ -20,9 +20,12 @@ using namespace ShapeInfer;
 
 class CPUExtShapeInferTests : public BuiltInShapeInferImplTest {
 protected:
+    InferenceEngine::ShapeInferExtension shapeInferExt;
+    CPUExtShapeInferTests() : shapeInferExt(TestsCommon::make_so_name("cpu_extension")) {}
+
     void SetUp() override {
         BuiltInShapeInferImplTest::SetUp();
-        holder = std::make_shared<InferenceEngine::Extensions::Cpu::CpuExtensions>();
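+        // Expose the member extension through a non-owning shared_ptr: the
+        // no-op deleter prevents double deletion of shapeInferExt.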
+        holder = std::shared_ptr<IShapeInferExtension>(&shapeInferExt, [](IShapeInferExtension*){});
     }
 };
 
@@ -40,7 +43,7 @@ TEST_P(CPUExtShapeInferTests, impl) {
 }
 
 TEST_P(CPUExtShapeInferTests, reshaper) {
-    auto cnnNetworkImplPtr = buildSingleLayerNetwork(type, inOutShapes, &layerParams.data, layerDataName);
+    auto cnnNetworkImplPtr = buildSingleLayerNetwork<3>(type, inOutShapes, &layerParams.data, layerDataName);
     auto reshaper = std::make_shared<Reshaper>(*cnnNetworkImplPtr);
     auto inputShapes = setInputShapes(*cnnNetworkImplPtr.get(), newInOutShapes.inDims);
     reshaper->AddExtension(holder);
index 277a990..86364ea 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -11,6 +10,7 @@
 #include <mock_icnn_network.hpp>
 #include <../graph_tools/graph_test_base.hpp>
 #include <shape_infer/mock_reshaper_launcher.hpp>
+#include <shape_infer/ie_reshaper.hpp>
 
 using namespace InferenceEngine;
 using namespace InferenceEngine::details;
@@ -95,7 +95,7 @@ public:
     ResponseDesc resp;
     static const std::string TEST_NAME;
     MockIShapeInferImpl::Ptr impl;
-    Reshaper::Ptr reshaper;
+    ReshaperPtr reshaper;
 };
 
 const std::string ReshaperTest::TEST_NAME = "TEST_NAME";
index da84d72..34ff736 100644 (file)
@@ -1,5 +1,4 @@
 // Copyright (C) 2018 Intel Corporation
-//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -9,7 +8,7 @@
 #include "xml_net_builder.hpp"
 #include "xml_helper.hpp"
 #include "pugixml.hpp"
-#include "inference_engine/v2_format_parser.h"
+#include "inference_engine/ie_format_parser.h"
 #include <fstream>
 #include <stdio.h>
 #include "details/ie_exception.hpp"
@@ -22,7 +21,7 @@ class V2TopologyVerificationTests : public ::testing::Test {
 protected:
     virtual void TearDown() {}
     virtual void SetUp() {
-        xmlHelper.reset(new XMLHelper(new details::V2FormatParser(2)));
+        xmlHelper.reset(new XMLHelper(new details::FormatParser(2)));
     }
 public:
     unique_ptr<CNNNetwork> cnnNetwork;
index f6eb27b..8277d6c 100644 (file)
@@ -1,6 +1,12 @@
 # Copyright (C) 2018 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
+
+if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-warning-option -Wno-inconsistent-missing-override -Wno-pass-failed")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unknown-warning-option -Wno-inconsistent-missing-override -Wno-pass-failed")
+endif()
+
 add_subdirectory(pugixml)
 add_subdirectory(stb_lib)
 add_subdirectory(ade)
@@ -24,3 +30,5 @@ endif()
 if(ENABLE_MKL_DNN)
     include(mkldnn.cmake)
 endif()
+
+add_subdirectory("${IE_MAIN_SOURCE_DIR}/thirdparty/fluid/modules/gapi")
diff --git a/inference-engine/thirdparty/MKL.cmake b/inference-engine/thirdparty/MKL.cmake
deleted file mode 100644 (file)
index 470c0a8..0000000
+++ /dev/null
@@ -1,191 +0,0 @@
-# Copyright (C) 2018 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-#
-
-if(MKL_cmake_included)
-    return()
-endif()
-set(MKL_cmake_included true)
-
-function(detect_mkl LIBNAME)
-    if(HAVE_MKL)
-        return()
-    endif()
-
-    message(STATUS "Detecting Intel(R) MKL: trying ${LIBNAME}")
-
-    find_path(MKLINC mkl_cblas.h
-        HINTS ${MKLROOT}/include $ENV{MKLROOT}/include)
-    if(NOT MKLINC)
-        file(GLOB_RECURSE MKLINC
-                ${CMAKE_CURRENT_SOURCE_DIR}/external/*/mkl_cblas.h)
-        if(MKLINC)
-            # if user has multiple version under external/ then guess last
-            # one alphabetically is "latest" and warn
-            list(LENGTH MKLINC MKLINCLEN)
-            if(MKLINCLEN GREATER 1)
-                list(SORT MKLINC)
-                list(REVERSE MKLINC)
-                # message(STATUS "MKLINC found ${MKLINCLEN} files:")
-                # foreach(LOCN IN LISTS MKLINC)
-                #     message(STATUS "       ${LOCN}")
-                # endforeach()
-                list(GET MKLINC 0 MKLINCLST)
-                set(MKLINC "${MKLINCLST}")
-                # message(WARNING "MKLINC guessing... ${MKLINC}.  "
-                #     "Please check that above dir has the desired mkl_cblas.h")
-            endif()
-            get_filename_component(MKLINC ${MKLINC} PATH)
-        endif()
-    endif()
-    if(NOT MKLINC)
-        return()
-    endif()
-
-    get_filename_component(__mklinc_root "${MKLINC}" PATH)
-    find_library(MKLLIB NAMES ${LIBNAME}
-        HINTS   ${MKLROOT}/lib ${MKLROOT}/lib/intel64
-                $ENV{MKLROOT}/lib $ENV{MKLROOT}/lib/intel64
-                ${__mklinc_root}/lib ${__mklinc_root}/lib/intel64)
-    if(NOT MKLLIB)
-        return()
-    endif()
-
-    if(WIN32)
-        set(MKLREDIST ${MKLINC}/../../redist/)
-        find_file(MKLDLL NAMES ${LIBNAME}.dll
-            HINTS
-                ${MKLREDIST}/mkl
-                ${MKLREDIST}/intel64/mkl
-                ${__mklinc_root}/lib)
-        if(NOT MKLDLL)
-            return()
-        endif()
-    endif()
-    if(THREADING STREQUAL "OMP")
-        if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-            get_filename_component(MKLLIBPATH ${MKLLIB} PATH)
-            find_library(MKLIOMP5LIB
-                NAMES "iomp5" "iomp5md" "libiomp5" "libiomp5md"
-                HINTS   ${MKLLIBPATH}
-                        ${MKLLIBPATH}/../../lib
-                        ${MKLLIBPATH}/../../../lib/intel64
-                        ${MKLLIBPATH}/../../compiler/lib
-                        ${MKLLIBPATH}/../../../compiler/lib/intel64
-                        ${OMP}
-                        ${OMP}/lib)
-            if(NOT MKLIOMP5LIB)
-                return()
-            endif()
-            if(WIN32)
-                find_file(MKLIOMP5DLL
-                    NAMES "libiomp5.dll" "libiomp5md.dll"
-                    HINTS ${MKLREDIST}/../compiler ${__mklinc_root}/lib)
-                if(NOT MKLIOMP5DLL)
-                    return()
-                endif()
-            endif()
-        else()
-            set(MKLIOMP5LIB)
-            set(MKLIOMP5DLL)
-        endif()
-    endif()
-
-    get_filename_component(MKLLIBPATH "${MKLLIB}" PATH)
-    string(FIND "${MKLLIBPATH}" ${CMAKE_CURRENT_SOURCE_DIR}/external __idx)
-    if(${__idx} EQUAL 0)
-        if(WIN32)
-            if(MINGW)
-                # We need to install *.dll into bin/ instead of lib/.
-                install(PROGRAMS ${MKLDLL} DESTINATION bin)
-            else()
-                install(PROGRAMS ${MKLDLL} DESTINATION lib)
-            endif()
-        else()
-            install(PROGRAMS ${MKLLIB} DESTINATION lib)
-        endif()
-        if(THREADING STREQUAL "OMP" AND MKLIOMP5LIB)
-            if(WIN32)
-                if(MINGW)
-                    # We need to install *.dll into bin/ instead of lib/.
-                    install(PROGRAMS ${MKLIOMP5DLL} DESTINATION bin)
-                else()
-                    install(PROGRAMS ${MKLIOMP5DLL} DESTINATION lib)
-                endif()
-            else()
-                install(PROGRAMS ${MKLIOMP5LIB} DESTINATION lib)
-            endif()
-        endif()
-    endif()
-
-    if(WIN32)
-        # Add paths to DLL to %PATH% on Windows
-        get_filename_component(MKLDLLPATH "${MKLDLL}" PATH)
-        set(CTESTCONFIG_PATH "${CTESTCONFIG_PATH}\;${MKLDLLPATH}")
-        set(CTESTCONFIG_PATH "${CTESTCONFIG_PATH}" PARENT_SCOPE)
-    endif()
-
-    # TODO: cache the value
-    set(HAVE_MKL TRUE PARENT_SCOPE)
-    set(MKLINC ${MKLINC} PARENT_SCOPE)
-    set(MKLLIB "${MKLLIB}" PARENT_SCOPE)
-
-    if(WIN32)
-        set(MKLDLL "${MKLDLL}" PARENT_SCOPE)
-    endif()
-    if(THREADING STREQUAL "OMP")
-        if(MKLIOMP5LIB)
-            set(MKLIOMP5LIB "${MKLIOMP5LIB}" PARENT_SCOPE)
-        endif()
-        if(WIN32 AND MKLIOMP5DLL)
-            set(MKLIOMP5DLL "${MKLIOMP5DLL}" PARENT_SCOPE)
-        endif()
-    endif()
-endfunction()
-
-if(WIN32)
-    detect_mkl("mklml")
-else()
-    if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-               detect_mkl("mklml_intel")
-    else()
-        detect_mkl("mklml_gnu")
-    endif()
-endif()
-
-
-if(HAVE_MKL)
-    add_definitions(-DUSE_MKL -DUSE_CBLAS)
-    include_directories(AFTER ${MKLINC})
-    list(APPEND mkldnn_LINKER_LIBS ${MKLLIB})
-
-    set(MSG "Intel(R) MKL:")
-    message(STATUS "${MSG} include ${MKLINC}")
-    message(STATUS "${MSG} lib ${MKLLIB}")
-    if(THREADING STREQUAL "OMP")
-        if(MKLIOMP5LIB)
-            message(STATUS "${MSG} OpenMP lib ${MKLIOMP5LIB}")
-        else()
-            message(STATUS "${MSG} OpenMP lib provided by compiler")
-        endif()
-        if(WIN32)
-            message(STATUS "${MSG} dll ${MKLDLL}")
-            if(MKLIOMP5DLL)
-                message(STATUS "${MSG} OpenMP dll ${MKLIOMP5DLL}")
-            else()
-                message(STATUS "${MSG} OpenMP dll provided by compiler")
-            endif()
-        endif()
-    endif()
-else()
-    if(DEFINED ENV{FAIL_WITHOUT_MKL} OR DEFINED FAIL_WITHOUT_MKL)
-        set(SEVERITY "FATAL_ERROR")
-    else()
-        set(SEVERITY "WARNING")
-    endif()
-    message(${SEVERITY}
-        "Intel(R) MKL not found. Some performance features may not be "
-        "available. Please run scripts/prepare_mkl.sh to download a minimal "
-        "set of libraries or get a full version from "
-        "https://software.intel.com/en-us/intel-mkl")
-endif()
index 0ba3b01..562e301 160000 (submodule)
@@ -1 +1 @@
-Subproject commit 0ba3b01dae7262f7828dc6fa65ef3a89fb371cde
+Subproject commit 562e301ccc8327e4016ccc3f1bc3a8592f50ea21
index 5df2144..a7b90fb 100644 (file)
@@ -34,12 +34,16 @@ extern "C" {
 ///        ( @CLDNN_PRIMITIVE_DESC{border} ​).
 typedef enum /*:int32_t*/
 {
-    /// @brief All points in the border are set to zero.
-    cldnn_border_zero,
+    /// @brief All points in the border are set to a constant value.
+    cldnn_border_constant,
     /// @brief Border is constructed as an mirror of image (edge is also mirrored).
     /// @details Size of border in any dimension cannot be larger than size of
     ///          input in the same dimension.
     cldnn_border_mirror,
+    /// @brief Border is constructed as a replication of the edge.
+    /// @details Size of border in any dimension cannot be larger than size of
+    ///          input in the same dimension.
+    cldnn_border_edge,
     /// @brief Border is constructed as an mirror of image (edge is NOT mirrored).
     /// @details Size of border in any dimension cannot be larger than size of
     ///          input in the same dimension decreased by @c 1.
@@ -66,6 +70,8 @@ cldnn_tensor left_top_sizes;
 cldnn_tensor right_bottom_sizes;
 /// @brief Type of border that needs to be added to the input.
 cldnn_border_type border_type;
+/// @brief Border value that is used in constant mode.
+float border_value;
 CLDNN_END_PRIMITIVE_DESC(border)
 
 
index 79c0777..6171b6c 100644 (file)
@@ -31,12 +31,16 @@ namespace cldnn
 /// @brief Type of border that will be added to the input by border layer / primitive.
 enum class border_type : std::int32_t
 {
-    /// @brief All points in the border are set to zero.
-    zero = cldnn_border_zero,
+    /// @brief All points in the border are set to a constant value.
+    constant = cldnn_border_constant,
     /// @brief Border is constructed as an mirror of image (edge is also mirrored).
     /// @details Size of border in any dimension cannot be larger than size of
     ///          input in the same dimension.
     mirror = cldnn_border_mirror,
+    /// @brief Border is constructed as a replication of the edge.
+    /// @details Size of border in any dimension cannot be larger than size of
+    ///          input in the same dimension.
+    edge = cldnn_border_edge,
     /// @brief Border is constructed as an mirror of image (edge is NOT mirrored).
     /// @details Size of border in any dimension cannot be larger than size of
     ///          input in the same dimension decreased by @c 1.
@@ -71,41 +75,22 @@ struct border : public primitive_base<border, CLDNN_PRIMITIVE_DESC(border)>
     /// @param right_bottom_sizes Sizes of border that needs to be added from right
     ///                           (in X dimension) and from bottom (in Y dimension).
     /// @param type               Type of added border.
+    /// @param border_value       Value used to fill the padded elements in constant mode.
     /// @param output_padding     Optional padding for output from primitive.
     border(
         const primitive_id& id,
         const primitive_id& input,
-        const tensor& left_top_sizes     = {0, 0, 0, 0},
-        const tensor& right_bottom_sizes = {0, 0, 0, 0},
-        const border_type type           = border_type::zero,
-        const padding& output_padding    = padding()
+        const tensor& left_top_sizes,
+        const tensor& right_bottom_sizes,
+        const border_type type,
+        const float border_value = 0.0f,
+        const padding& output_padding = padding()
     )
         : primitive_base(id, {input}, output_padding),
           left_top_sizes(left_top_sizes),
           right_bottom_sizes(right_bottom_sizes),
-          type(type)
-    {
-    }
-
-    /// @brief Constructs border primitive / layer.
-    ///
-    /// @param id                 An identifier of new primitive.
-    /// @param input              An identifier of primitive which is an input for newly created
-    ///                           border primitive.
-    /// @param x_y_sizes          Sizes of border that needs to be added from left and right
-    ///                           (in X dimension) and from top and bottom (in Y dimension).
-    ///                           Created border is simmetric (the same size of border applied
-    ///                           from both sides of input).
-    /// @param type               Type of added border.
-    /// @param output_padding     Optional padding for output from primitive.
-    border(
-        const primitive_id& id,
-        const primitive_id& input,
-        const tensor& x_y_sizes,
-        const border_type type        = border_type::zero,
-        const padding& output_padding = padding()
-    )
-        : border(id, input, x_y_sizes, x_y_sizes, type, output_padding)
+          type(type),
+          border_value(border_value)
     {
     }
 
@@ -114,7 +99,8 @@ struct border : public primitive_base<border, CLDNN_PRIMITIVE_DESC(border)>
         : primitive_base(dto),
           left_top_sizes(dto->left_top_sizes),
           right_bottom_sizes(dto->right_bottom_sizes),
-          type(static_cast<border_type>(dto->border_type))
+          type(static_cast<border_type>(dto->border_type)),
+          border_value(dto->border_value)
     {
     }
 
@@ -124,13 +110,15 @@ struct border : public primitive_base<border, CLDNN_PRIMITIVE_DESC(border)>
     tensor right_bottom_sizes;
     /// @brief Type of border that needs to be added to the input.
     border_type type;
-
+    /// @brief Border value that is used in constant mode.
+    float border_value;
 protected:
     void update_dto(dto& dto) const override
     {
         dto.left_top_sizes     = left_top_sizes;
         dto.right_bottom_sizes = right_bottom_sizes;
         dto.border_type        = static_cast<cldnn_border_type>(type);
+        dto.border_value       = border_value;
     }
 };
 /// @}
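For illustration, a minimal sketch of how the reworked constructor above
might be called (the primitive ids and sizes are invented; only the
signature comes from the header change):

    // Pad "input" with a one-pixel frame filled with -1.0f (constant mode).
    // Before this change, the same call would have used border_type::zero.
    topology.add(
        border("pad", "input",
               {0, 0, 1, 1},           // left/top sizes (b, f, x, y)
               {0, 0, 1, 1},           // right/bottom sizes
               border_type::constant,  // border fill mode
               -1.0f));                // border_value, new in this revision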
index 46a8877..c244209 100644 (file)
@@ -339,7 +339,8 @@ namespace kernel_selector
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     enum class BorderType
     {
-        ZERO,
+        CONSTANT,
+        EDGE,
         MIRROR,
         MIRROR_101,
     };
index 3346c4b..b9f4d08 100644 (file)
@@ -27,6 +27,7 @@ namespace kernel_selector
         jit.AddConstants({
             MakeJitConstant("LT_SIZES",              params.lt_sizes),
             MakeJitConstant("RB_SIZES",              params.rb_sizes),
+            MakeJitConstant("BORDER_VALUE",          params.border_value),
             MakeJitConstant(toString(params.b_type), "")
         });
 
index 43c10c7..fbce8a6 100644 (file)
@@ -27,7 +27,8 @@ namespace kernel_selector
     {
         DimTensor<> lt_sizes;
         DimTensor<> rb_sizes;
-        BorderType b_type = BorderType::ZERO;
+        BorderType b_type;
+        float border_value;
 
 
         border_params()
index 9029d7a..9e42901 100644 (file)
@@ -40,6 +40,8 @@ namespace kernel_selector
         k.EnableOutputLayout(DataLayout::byxf);
 
         k.EnableBatching();
+        k.EnableTensorOffset();
+        k.EnableTensorPitches();
 
         return k;
     }
index 37c206d..599dc73 100644 (file)
@@ -124,7 +124,7 @@ namespace kernel_selector
         jit.AddConstant(MakeJitConstant("OUTER_SIZE", outer_size));
         if (inner_size == 1)
         {
-            jit.AddConstant(MakeJitConstant("OUTPUT_SIZE", out.LogicalSize()));
+            jit.AddConstant(MakeJitConstant("OUTPUT_ELEMENTS", out.LogicalSize()));
             jit.AddConstant(MakeJitConstant("DENSE", 1));
         }
         return jit;
index 8f00411..0f6a214 100644 (file)
@@ -51,8 +51,8 @@ KERNEL(border_gpu_ref)(
     const uint out_f  = out_fb % OUTPUT_FEATURE_NUM;
     const uint out_b  = out_fb / OUTPUT_FEATURE_NUM;
 
-#ifdef BORDER_TYPE_ZERO
-    UNIT_TYPE in_val = UNIT_VAL_ZERO;
+#ifdef BORDER_TYPE_CONSTANT
+    UNIT_TYPE in_val = TO_UNIT_TYPE(BORDER_VALUE);
     if (out_x >= blt_sx & out_x < in_lx &
         out_y >= blt_sy & out_y < in_ly &
         out_f >= blt_sf & out_f < in_lf &
@@ -66,6 +66,14 @@ KERNEL(border_gpu_ref)(
         const uint in_pos = GET_DATA_INDEX(INPUT0, in_b, in_f, in_y, in_x);
         in_val = input[in_pos];
     }
+#elif defined BORDER_TYPE_EDGE
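+    // Edge mode: clamp each coordinate into the valid input range, replicating the first/last element outside it.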
+    const uint in_x = (out_x >= blt_sx & out_x < in_lx) ? out_x - blt_sx : (out_x < blt_sx ? 0 : in_sx - 1);
+    const uint in_y = (out_y >= blt_sy & out_y < in_ly) ? out_y - blt_sy : (out_y < blt_sy ? 0 : in_sy - 1);
+    const uint in_f = (out_f >= blt_sf & out_f < in_lf) ? out_f - blt_sf : (out_f < blt_sf ? 0 : in_sf - 1);
+    const uint in_b = (out_b >= blt_sb & out_b < in_lb) ? out_b - blt_sb : (out_b < blt_sb ? 0 : in_sb - 1);
+
+    const uint in_pos = GET_DATA_INDEX(INPUT0, in_b, in_f, in_y, in_x);
+    UNIT_TYPE in_val = input[in_pos];
 #elif defined BORDER_TYPE_MIRROR
     const uint in_x = (out_x >= blt_sx & out_x < in_lx) ? out_x - blt_sx : (out_x < blt_sx ? blt_sx - 1 - out_x : in_sx + in_lx - 1 - out_x);
     const uint in_y = (out_y >= blt_sy & out_y < in_ly) ? out_y - blt_sy : (out_y < blt_sy ? blt_sy - 1 - out_y : in_sy + in_ly - 1 - out_y);
@@ -88,4 +96,4 @@ KERNEL(border_gpu_ref)(
 
     const uint out_pos = GET_DATA_INDEX(OUTPUT, out_b, out_f, out_y, out_x);
     output[out_pos] = in_val;
-}
\ No newline at end of file
+}
index 6df286d..0c006bc 100644 (file)
@@ -60,9 +60,9 @@
 
 KERNEL(roi_pooling_gpu)
 (
-    const __global UNIT_TYPE * src_data,
-    __global UNIT_TYPE * dst_data,
-    const __global UNIT_TYPE * src_rois
+    const __global INPUT0_TYPE * src_data,
+    __global OUTPUT_TYPE * dst_data,
+    const __global INPUT1_TYPE * src_rois
 )
 {
     const size_t i = get_global_id(0);
@@ -76,7 +76,7 @@ KERNEL(roi_pooling_gpu)
     //       with SPATIAL_SCALE: It makes sense since the resolution of
     //       the pooled data is limited by its dimensions. (Is this clear?)
 
-    const __global UNIT_TYPE * roi_ptr = &src_rois[PITCH_ROI_R * r];
+    const __global INPUT1_TYPE * roi_ptr = &src_rois[PITCH_ROI_R * r];
 
 #if BILINEAR_POOLING
     const uint output_offset = OUTPUT_OFFSET + x*OUTPUT_X_PITCH + y*OUTPUT_Y_PITCH + c*OUTPUT_FEATURE_PITCH + r*OUTPUT_ROI_PITCH;
@@ -93,7 +93,7 @@ KERNEL(roi_pooling_gpu)
     COORD_T in_x = x*width_scale  + roi_start_w*(COORD_T)(SRC_W - 1);
 
     if (in_y < 0 || in_y > (COORD_T)(SRC_H - 1) || in_x < 0 || in_x > (COORD_T)(SRC_W - 1) || roi_ptr[0] == -1) {
-        dst_data[output_offset] = ACTIVATION((UNIT_TYPE)0, NL_M, NL_N);
+        dst_data[output_offset] = ACTIVATION((OUTPUT_TYPE)0, NL_M, NL_N);
         return;
     }
 
@@ -102,7 +102,7 @@ KERNEL(roi_pooling_gpu)
     int left_x_index   = (int)(floor(in_x));
     int right_x_index  = (int)(min(ceil(in_x), (COORD_T)SRC_W - 1));
 
-    const __global UNIT_TYPE* data = src_data + INPUT0_OFFSET + INPUT0_FEATURE_PITCH*c;
+    const __global INPUT0_TYPE* data = src_data + INPUT0_OFFSET + INPUT0_FEATURE_PITCH*c;
 
     ACCUM_T top_left     = (ACCUM_T)data[top_y_index*INPUT0_Y_PITCH + left_x_index*INPUT0_X_PITCH];
     ACCUM_T top_right    = (ACCUM_T)data[top_y_index*INPUT0_Y_PITCH + right_x_index*INPUT0_X_PITCH];
@@ -114,7 +114,7 @@ KERNEL(roi_pooling_gpu)
 
     ACCUM_T res = top + (bottom - top) * (in_y - top_y_index);
 
-    dst_data[output_offset] = ACTIVATION((UNIT_TYPE)res, NL_M, NL_N);
+    dst_data[output_offset] = ACTIVATION((OUTPUT_TYPE)res, NL_M, NL_N);
 #else
 
 #if USE_OLD_SCALE_AND_ROUNDING
@@ -187,10 +187,10 @@ KERNEL(roi_pooling_gpu)
     const uint work_c = group_x + GROUP_SIZE * (group_y + GROUP_SIZE * c);
 #endif
 
-    const __global UNIT_TYPE* data = src_data + INPUT0_OFFSET + INPUT0_FEATURE_PITCH*work_c;
+    const __global INPUT0_TYPE* data = src_data + INPUT0_OFFSET + INPUT0_FEATURE_PITCH*work_c;
 
 #if MAX_POOLING
-    ACCUM_T res = x_begin < x_after && y_begin < y_after ? UNIT_VAL_MIN : 0;
+    ACCUM_T res = x_begin < x_after && y_begin < y_after ? -FLT_MAX : 0;
 #else
     ACCUM_T res = 0;
 #endif
@@ -198,7 +198,7 @@ KERNEL(roi_pooling_gpu)
     for (int yy = y_begin; yy < y_after; ++yy)
     for (int xx = x_begin; xx < x_after; ++xx)
     {
-        UNIT_TYPE val = data[xx*INPUT0_X_PITCH + yy*INPUT0_Y_PITCH];
+        INPUT0_TYPE val = data[xx*INPUT0_X_PITCH + yy*INPUT0_Y_PITCH];
 #if MAX_POOLING
         res = MAX(res, (ACCUM_T)val);
 #else
@@ -215,6 +215,6 @@ KERNEL(roi_pooling_gpu)
 #endif
 
     const uint output_offset = OUTPUT_OFFSET + x*OUTPUT_X_PITCH + y*OUTPUT_Y_PITCH + c*OUTPUT_FEATURE_PITCH + r*OUTPUT_ROI_PITCH;
-    dst_data[output_offset] = ACTIVATION((UNIT_TYPE)res, NL_M, NL_N);
+    dst_data[output_offset] = ACTIVATION((OUTPUT_TYPE)res, NL_M, NL_N);
 #endif
 }
index b0f5616..b837bdd 100644 (file)
@@ -33,7 +33,7 @@ KERNEL (tile_ref)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output)
     {
         UNIT_TYPE save_val = intel_sub_group_shuffle(val, (t*16 + lid)/TILES);
         int offset = group_id*16*TILES + t*16 + lid;
-        if (offset < OUTPUT_SIZE)
+        if (offset < OUTPUT_ELEMENTS)
             output[offset] = save_val;
     }
 #else
index c4f964a..f441136 100644 (file)
@@ -368,7 +368,8 @@ namespace kernel_selector
     {
         switch (type)
         {
-        case BorderType::ZERO:       return "BORDER_TYPE_ZERO";
+        case BorderType::CONSTANT:   return "BORDER_TYPE_CONSTANT";
+        case BorderType::EDGE:       return "BORDER_TYPE_EDGE";
         case BorderType::MIRROR:     return "BORDER_TYPE_MIRROR";
         case BorderType::MIRROR_101: return "BORDER_TYPE_MIRROR_101";
         default:                     return "";
index 66f1c3b..b07a1f9 100644 (file)
@@ -46,11 +46,13 @@ std::string border_inst::to_string(border_node const& node)
 
     const auto& left_top_sizes     = desc->left_top_sizes;
     const auto& right_bottom_sizes = desc->right_bottom_sizes;
+    const auto& border_value       = std::to_string(desc->border_value);
 
     const char* border_type_str = "unknown";
     switch (desc->type)
     {
-    case border_type::zero:       border_type_str = "zero";       break;
+    case border_type::constant:   border_type_str = "constant";   break;
+    case border_type::edge:       border_type_str = "edge";       break;
     case border_type::mirror:     border_type_str = "mirror";     break;
     case border_type::mirror_101: border_type_str = "mirror-101"; break;
     }
@@ -61,6 +63,7 @@ std::string border_inst::to_string(border_node const& node)
     border_info.add("left/top sizes",     left_top_sizes.to_string());
     border_info.add("right/bottom sizes", right_bottom_sizes.to_string());
     border_info.add("border type",        border_type_str);
+    border_info.add("border value",       border_value);
 
     node_info->add("border info", border_info);
 
@@ -110,4 +113,4 @@ border_inst::typed_primitive_inst(network_impl& network, border_node const& node
                                               "Not enough data in input to create mirror-101 border of specified size");
     }
 }
-}
\ No newline at end of file
+}
index 6275c20..6a56244 100644 (file)
@@ -39,9 +39,12 @@ struct border_gpu : typed_primitive_gpu_impl<border>
 
         b_params.lt_sizes = convert_dim_vector(desc->left_top_sizes);
         b_params.rb_sizes = convert_dim_vector(desc->right_bottom_sizes);
+        b_params.border_value = desc->border_value;
+
         switch (desc->type)
         {
-        case border_type::zero:       b_params.b_type = kernel_selector::border_type::ZERO;       break;
+        case border_type::constant:   b_params.b_type = kernel_selector::border_type::CONSTANT;   break;
+        case border_type::edge:       b_params.b_type = kernel_selector::border_type::EDGE;       break;
         case border_type::mirror:     b_params.b_type = kernel_selector::border_type::MIRROR;     break;
         case border_type::mirror_101: b_params.b_type = kernel_selector::border_type::MIRROR_101; break;
         default:
index 910421d..d339492 100644 (file)
@@ -129,11 +129,15 @@ namespace cldnn
             }
             auto mem = alloc_memory(layout);
             first_level_cache->second.emplace_back(memory_record({ { id, network_id } }, mem, network_id));
+            // We don't want to store any resources with no parents, so the memory pool has to hold a weak reference to _engine.
+            _engine->release();
             return mem;            
         }
         auto mem = alloc_memory(layout);
         std::list<memory_record> list = { memory_record({ { id, network_id } },mem, network_id) };
         _padded_pool.emplace(layout, std::move(list));
+        // We don't want to store any resources with no parents, so the memory pool has to hold a weak reference to _engine.
+        _engine->release();
         return mem;
     }
 
index 6d10688..005e883 100644 (file)
@@ -2515,6 +2515,9 @@ void program_impl::prepare_buffer_fusing()
             if (remove_bf8_xy_opt)
             {
                 auto users_user_layout = node.get_users().front()->get_users().front()->get_output_layout();
+                // If users_user_layout is still bf8_xy16 (stacked convolutions), leave the reorder in place.
+                if (users_user_layout.format == format::bf8_xy16)
+                    return;
                 auto input_layout = input.get_output_layout();
                 auto target_layout = layout(input_layout.data_type, users_user_layout.format, input_layout.size, input_layout.data_padding);
                 input.set_output_layout(target_layout, false);
index f60193b..0d45548 100644 (file)
@@ -50,7 +50,7 @@ layout roi_pooling_inst::calc_output_layout(roi_pooling_node const& node)
         fm /= gss;
     }
 
-    return layout(rois_layout.data_type, format::bfyx, { num_rois, fm, desc->pooled_width, desc->pooled_height });
+    return layout(data_layout.data_type, format::bfyx, { num_rois, fm, desc->pooled_width, desc->pooled_height });
 }
 
 std::string roi_pooling_inst::to_string(roi_pooling_node const& node)
index 37b225b..7a25399 100644 (file)
@@ -49,7 +49,7 @@ static std::vector<T> generate_rnd_real_input(
 }
 
 
-TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_zero) {
+TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_constant) {
     //  Input (XY) : 4x3
     //  Output (XY): 10x7
 
@@ -84,7 +84,7 @@ TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_zero) {
         border("output", "input",
                {blt_size_b, blt_size_f, blt_size_x, blt_size_y},
                {brb_size_b, brb_size_f, brb_size_x, brb_size_y},
-               border_type::zero)
+               border_type::constant, 0.0f)
     );
 
     std::vector<float> input_data = {
@@ -125,6 +125,82 @@ TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_zero) {
     }
 }
 
+TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_constant_non_constant) {
+    //  Input (XY) : 4x3
+    //  Output (XY): 10x7
+
+    constexpr auto in_size_b = 1;
+    constexpr auto in_size_f = 1;
+    constexpr auto in_size_y = 3;
+    constexpr auto in_size_x = 4;
+
+    constexpr auto blt_size_b = 0;
+    constexpr auto blt_size_f = 0;
+    constexpr auto blt_size_y = 1;
+    constexpr auto blt_size_x = 2;
+
+    constexpr auto brb_size_b = 0;
+    constexpr auto brb_size_f = 0;
+    constexpr auto brb_size_y = 3;
+    constexpr auto brb_size_x = 4;
+
+    constexpr auto out_size_b = in_size_b + blt_size_b + brb_size_b;
+    constexpr auto out_size_f = in_size_f + blt_size_f + brb_size_f;
+    constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
+    constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
+
+    engine engine;
+    auto input = memory::allocate(engine, {data_types::f32, format::yxfb, {in_size_b, in_size_f, in_size_x, in_size_y}});
+
+    topology topology;
+    topology.add(
+        input_layout("input", input.get_layout())
+    );
+    topology.add(
+        border("output", "input",
+               {blt_size_b, blt_size_f, blt_size_x, blt_size_y},
+               {brb_size_b, brb_size_f, brb_size_x, brb_size_y},
+               border_type::constant, 1.0f)
+    );
+
+    std::vector<float> input_data = {
+          1, -2,  3,  -4,
+          5,  6,  7,   8,
+        -10, 12, 13, -13,
+    };
+    std::vector<float> out_data = {
+        1, 1,   1,  1,  1,   1, 1, 1, 1, 1,
+        1, 1,   1, -2,  3,  -4, 1, 1, 1, 1,
+        1, 1,   5,  6,  7,   8, 1, 1, 1, 1,
+        1, 1, -10, 12, 13, -13, 1, 1, 1, 1,
+        1, 1,   1,  1,  1,   1, 1, 1, 1, 1,
+        1, 1,   1,  1,  1,   1, 1, 1, 1, 1,
+        1, 1,   1,  1,  1,   1, 1, 1, 1, 1,
+    };
+    set_values(input, input_data);
+
+    network network(engine, topology);
+    network.set_input_data("input", input);
+    auto outputs = network.execute();
+
+    auto output = outputs.at("output").get_memory();
+    auto output_ptr = output.pointer<float>();
+
+    ASSERT_EQ(out_data.size(), static_cast<std::size_t>(out_size_b * out_size_f * out_size_y * out_size_x));
+
+    for (auto b = 0; b < out_size_b; ++b) {             // B
+        for (auto f = 0; f < out_size_f; ++f) {         // F
+            for (auto y = 0; y < out_size_y; ++y) {     // Y
+                for (auto x = 0; x < out_size_x; ++x) { // X
+                    auto output_off = ((y * out_size_x + x) * out_size_f + f) * out_size_b + b; // YXFB
+
+                    EXPECT_EQ(output_ptr[output_off], out_data[output_off]);
+                }
+            }
+        }
+    }
+}
+
 TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_mirror) {
     //  Input (XY) : 4x3
     //  Output (XY): 10x7
@@ -278,7 +354,86 @@ TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_mirror_101) {
         }
     }
 }
-TEST(border_gpu, basic_bfyx_2x1x2x3_1x2x3x4_border_zero) {
+
+TEST(border_gpu, basic_yxfb_0x0x1x2_0x0x3x4_border_edge) {
+    //  Input (XY) : 5x4
+    //  Output (XY): 11x8
+
+    constexpr auto in_size_b = 1;
+    constexpr auto in_size_f = 1;
+    constexpr auto in_size_y = 4;
+    constexpr auto in_size_x = 5;
+
+    constexpr auto blt_size_b = 0;
+    constexpr auto blt_size_f = 0;
+    constexpr auto blt_size_y = 1;
+    constexpr auto blt_size_x = 2;
+
+    constexpr auto brb_size_b = 0;
+    constexpr auto brb_size_f = 0;
+    constexpr auto brb_size_y = 3;
+    constexpr auto brb_size_x = 4;
+
+    constexpr auto out_size_b = in_size_b + blt_size_b + brb_size_b;
+    constexpr auto out_size_f = in_size_f + blt_size_f + brb_size_f;
+    constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
+    constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
+
+    engine engine;
+    auto input = memory::allocate(engine, {data_types::f32, format::yxfb, {in_size_b, in_size_f, in_size_x, in_size_y}});
+
+    topology topology;
+    topology.add(
+        input_layout("input", input.get_layout())
+    );
+    topology.add(
+        border("output", "input",
+               {blt_size_b, blt_size_f, blt_size_x, blt_size_y},
+               {brb_size_b, brb_size_f, brb_size_x, brb_size_y},
+               border_type::edge)
+    );
+
+    std::vector<float> input_data = {
+          1, -2,  3,  -4,  4,
+          5,  6,  7,   8, -8,
+        -10, 12, 13, -13, 10,
+        -20, 22, 23, -23, 20,
+    };
+    std::vector<float> out_data = {
+          1,   1,   1, -2,  3,  -4,  4,  4,  4,  4,  4,
+          1,   1,   1, -2,  3,  -4,  4,  4,  4,  4,  4,
+          5,   5,   5,  6,  7,   8, -8, -8, -8, -8, -8,
+        -10, -10, -10, 12, 13, -13, 10, 10, 10, 10, 10,
+        -20, -20, -20, 22, 23, -23, 20, 20, 20, 20, 20,
+        -20, -20, -20, 22, 23, -23, 20, 20, 20, 20, 20,
+        -20, -20, -20, 22, 23, -23, 20, 20, 20, 20, 20,
+        -20, -20, -20, 22, 23, -23, 20, 20, 20, 20, 20
+    };
+    set_values(input, input_data);
+
+    network network(engine, topology);
+    network.set_input_data("input", input);
+    auto outputs = network.execute();
+
+    auto output = outputs.at("output").get_memory();
+    auto output_ptr = output.pointer<float>();
+
+    ASSERT_EQ(out_data.size(), static_cast<std::size_t>(out_size_b * out_size_f * out_size_y * out_size_x));
+
+    for (auto b = 0; b < out_size_b; ++b) {             // B
+        for (auto f = 0; f < out_size_f; ++f) {         // F
+            for (auto y = 0; y < out_size_y; ++y) {     // Y
+                for (auto x = 0; x < out_size_x; ++x) { // X
+                    auto output_off = ((y * out_size_x + x) * out_size_f + f) * out_size_b + b; // YXFB
+
+                    EXPECT_EQ(output_ptr[output_off], out_data[output_off]);
+                }
+            }
+        }
+    }
+}
+
+TEST(border_gpu, basic_bfyx_2x1x2x3_1x2x3x4_border_constant) {
     constexpr auto in_size_b = 2;
     constexpr auto in_size_f = 3;
     constexpr auto in_size_y = 5;
@@ -310,7 +465,8 @@ TEST(border_gpu, basic_bfyx_2x1x2x3_1x2x3x4_border_zero) {
         border("output", "input",
                {blt_size_b, blt_size_f, blt_size_x, blt_size_y},
                {brb_size_b, brb_size_f, brb_size_x, brb_size_y},
-               border_type::zero)
+               border_type::constant,
+               0.0f)
     );
 
     std::vector<float> input_data = generate_rnd_real_input<float>(in_size_b, in_size_f, in_size_y, in_size_x, -8.0f, 8.0f);
@@ -478,3 +634,69 @@ TEST(border_gpu, basic_bfyx_2x1x2x3_1x2x3x4_border_mirror_101) {
         }
     }
 }
+
+TEST(border_gpu, basic_bfyx_2x1x2x3_1x2x3x4_border_edge) {
+    constexpr auto in_size_b = 3;
+    constexpr auto in_size_f = 4;
+    constexpr auto in_size_y = 6;
+    constexpr auto in_size_x = 5;
+
+    constexpr auto blt_size_b = 2;
+    constexpr auto blt_size_f = 1;
+    constexpr auto blt_size_y = 2;
+    constexpr auto blt_size_x = 3;
+
+    constexpr auto brb_size_b = 1;
+    constexpr auto brb_size_f = 2;
+    constexpr auto brb_size_y = 3;
+    constexpr auto brb_size_x = 4;
+
+    constexpr auto out_size_b = in_size_b + blt_size_b + brb_size_b;
+    constexpr auto out_size_f = in_size_f + blt_size_f + brb_size_f;
+    constexpr auto out_size_y = in_size_y + blt_size_y + brb_size_y;
+    constexpr auto out_size_x = in_size_x + blt_size_x + brb_size_x;
+
+    engine engine;
+    auto input = memory::allocate(engine, {data_types::f32, format::bfyx, {in_size_b, in_size_f, in_size_x, in_size_y}});
+
+    topology topology;
+    topology.add(
+        input_layout("input", input.get_layout())
+    );
+    topology.add(
+        border("output", "input",
+               {blt_size_b, blt_size_f, blt_size_x, blt_size_y},
+               {brb_size_b, brb_size_f, brb_size_x, brb_size_y},
+               border_type::edge)
+    );
+
+    std::vector<float> input_data = generate_rnd_real_input<float>(in_size_b, in_size_f, in_size_y, in_size_x, -8.0f, 8.0f);
+    set_values(input, input_data);
+
+    network network(engine, topology);
+    network.set_input_data("input", input);
+    auto outputs = network.execute();
+
+    auto output = outputs.at("output").get_memory();
+    auto output_ptr = output.pointer<float>();
+
+    for (auto b = 0; b < out_size_b; ++b) {             // B
+        for (auto f = 0; f < out_size_f; ++f) {         // F
+            for (auto y = 0; y < out_size_y; ++y) {     // Y
+                for (auto x = 0; x < out_size_x; ++x) { // X
+                    auto output_off = ((b * out_size_f + f) * out_size_y + y) * out_size_x + x; // BFYX
+
+                    auto in_b = (b >= blt_size_b && b < out_size_b - brb_size_b) ? b - blt_size_b : (b < blt_size_b ? 0 : in_size_b - 1);
+                    auto in_f = (f >= blt_size_f && f < out_size_f - brb_size_f) ? f - blt_size_f : (f < blt_size_f ? 0 : in_size_f - 1);
+                    auto in_y = (y >= blt_size_y && y < out_size_y - brb_size_y) ? y - blt_size_y : (y < blt_size_y ? 0 : in_size_y - 1);
+                    auto in_x = (x >= blt_size_x && x < out_size_x - brb_size_x) ? x - blt_size_x : (x < blt_size_x ? 0 : in_size_x - 1);
+
+                    auto input_off  = ((in_b * in_size_f + in_f) * in_size_y + in_y) * in_size_x + in_x; // BFYX
+
+
+                    EXPECT_EQ(output_ptr[output_off], input_data[input_off]);
+                }
+            }
+        }
+    }
+}
diff --git a/inference-engine/thirdparty/fluid/README.md b/inference-engine/thirdparty/fluid/README.md
new file mode 100644 (file)
index 0000000..3b31555
--- /dev/null
@@ -0,0 +1,45 @@
+# OpenCV G-API (Fluid), standalone edition
+
+This subtree hosts the sources of G-API - a new OpenCV module for
+efficient image processing. G-API serves as a preprocessing vehicle
+for Inference Engine. At the moment, only the Fluid (CPU) backend is used.
+
+The sources are taken from OpenCV's [main repository](https://github.com/opencv).
+
+There are supplementary scripts which ease and verify the update
+process.
+
+## Usage
+
+Updating to the latest `master`:
+
+    ./update.sh
+
+Updating to a particular revision:
+
+    ./update.sh COMMIT_HASH
+
+During an update, this script checks if the source tree was modified
+after the latest update. If it was, the update fails -- we want to avoid
+any divergence in the sources, so _no changes_ should ever be committed
+to this copy of G-API.
+
+One can check manually whether the sources have diverged from their
+last "valid" copy by running
+
+    ./check.sh
+
+An error message and a non-zero exit code indicate a possible
+inconsistency in this source copy.
+
+Once updated, all changes are automatically staged.
+
+## Files
+
+In addition to the source tree, the above two scripts maintain two
+files:
+- `revision.txt` -- the OpenCV revision used to produce this source
+  copy; if the code was taken from `master`, a timestamp is stored
+  instead.
+- `checksum.txt` -- the latest valid copy's checksum. Don't update this
+  file manually.
diff --git a/inference-engine/thirdparty/fluid/check.sh b/inference-engine/thirdparty/fluid/check.sh
new file mode 100644 (file)
index 0000000..e222af6
--- /dev/null
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+if [ ! -d modules ] && [ ! -f checksum.txt ]; then
+    exit 0
+fi
+
+THIS_HASH=$(./checksum.sh)
+OLD_HASH=$(cat checksum.txt)
+
+if [ "$THIS_HASH" != "$OLD_HASH" ]; then
+    echo "Invalid checksum -- have any changes been made to the source tree here?"
+    exit 1
+fi
+
+echo "Check done."
diff --git a/inference-engine/thirdparty/fluid/checksum.sh b/inference-engine/thirdparty/fluid/checksum.sh
new file mode 100644 (file)
index 0000000..3c82361
--- /dev/null
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+find modules/ -type f -exec md5sum {} \; | sort -k 2 | md5sum | cut -d' ' -f 1
diff --git a/inference-engine/thirdparty/fluid/checksum.txt b/inference-engine/thirdparty/fluid/checksum.txt
new file mode 100644 (file)
index 0000000..d912ec0
--- /dev/null
@@ -0,0 +1 @@
+5d28798fbe1b11d9c9d6fcd28c02f07e
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/CMakeLists.txt b/inference-engine/thirdparty/fluid/modules/gapi/CMakeLists.txt
new file mode 100644 (file)
index 0000000..ec05b38
--- /dev/null
@@ -0,0 +1,112 @@
+# FIXME: Rework standalone build in a more generic manner
+# (Restructure directories, add common pass, etc)
+if (NOT DEFINED OPENCV_INITIAL_PASS)
+    include("cmake/standalone.cmake")
+    return()
+endif()
+
+# FIXME: Remove CXX11 check after complete switch to OpenCV 4 branch
+# (CI, bundle, workloads, etc)
+if (NOT HAVE_CXX11 OR NOT TARGET ade)
+  # can't build G-API because of the above reasons
+  ocv_module_disable(gapi)
+  return()
+endif()
+
+set(the_description "OpenCV G-API Core Module")
+ocv_add_module(gapi opencv_imgproc)
+
+file(GLOB gapi_ext_hdrs
+    "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/*.hpp"
+    "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/*.hpp"
+    "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/*.h"
+    "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/util/*.hpp"
+    "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/cpu/*.hpp"
+    "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/gpu/*.hpp"
+    "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/fluid/*.hpp"
+    "${CMAKE_CURRENT_LIST_DIR}/include/opencv2/${name}/own/*.hpp"
+    )
+
+set(gapi_srcs
+    # Front-end part
+    src/api/gapi_priv.cpp
+    src/api/gmat.cpp
+    src/api/garray.cpp
+    src/api/gscalar.cpp
+    src/api/gkernel.cpp
+    src/api/gbackend.cpp
+    src/api/gproto.cpp
+    src/api/gnode.cpp
+    src/api/gcall.cpp
+    src/api/gcomputation.cpp
+    src/api/operators.cpp
+    src/api/kernels_core.cpp
+    src/api/kernels_imgproc.cpp
+
+    # Compiler part
+    src/compiler/gmodel.cpp
+    src/compiler/gmodelbuilder.cpp
+    src/compiler/gislandmodel.cpp
+    src/compiler/gcompiler.cpp
+    src/compiler/gcompiled.cpp
+    src/compiler/passes/helpers.cpp
+    src/compiler/passes/dump_dot.cpp
+    src/compiler/passes/islands.cpp
+    src/compiler/passes/meta.cpp
+    src/compiler/passes/kernels.cpp
+    src/compiler/passes/exec.cpp
+
+    # Executor
+    src/executor/gexecutor.cpp
+
+    # CPU Backend (currently built-in)
+    src/backends/cpu/gcpubackend.cpp
+    src/backends/cpu/gcpukernel.cpp
+    src/backends/cpu/gcpuimgproc.cpp
+    src/backends/cpu/gcpucore.cpp
+
+    # Fluid Backend (also built-in, FIXME:move away)
+    src/backends/fluid/gfluidbuffer.cpp
+    src/backends/fluid/gfluidbackend.cpp
+    src/backends/fluid/gfluidimgproc.cpp
+    src/backends/fluid/gfluidimgproc_func.dispatch.cpp
+    src/backends/fluid/gfluidcore.cpp
+
+    # GPU Backend (currently built-in)
+    src/backends/gpu/ggpubackend.cpp
+    src/backends/gpu/ggpukernel.cpp
+    src/backends/gpu/ggpuimgproc.cpp
+    src/backends/gpu/ggpucore.cpp
+
+    # Compound
+    src/backends/common/gcompoundbackend.cpp
+    src/backends/common/gcompoundkernel.cpp
+    )
+
+ocv_add_dispatched_file(backends/fluid/gfluidimgproc_func SSE4_1 AVX2)
+
+ocv_list_add_prefix(gapi_srcs "${CMAKE_CURRENT_LIST_DIR}/")
+
+# For IDE users
+ocv_source_group("Src"     FILES ${gapi_srcs})
+ocv_source_group("Include" FILES ${gapi_ext_hdrs})
+
+ocv_set_module_sources(HEADERS ${gapi_ext_hdrs} SOURCES ${gapi_srcs})
+ocv_module_include_directories("${CMAKE_CURRENT_LIST_DIR}/src")
+
+# Note `ade` is not a module name but link dependency for ${the_module}
+# (which is opencv_gapi)
+ocv_create_module(ade)
+
+ocv_add_accuracy_tests()
+# FIXME: the test binary is linked with ADE directly since ADE symbols
+# are not exported from libopencv_gapi.so in any form - thus
+# there are two copies of ADE code in memory when tests run (!)
+# src/ is added to the include dirs for INTERNAL tests only.
+if(TARGET opencv_test_gapi)
+  target_include_directories(opencv_test_gapi PRIVATE "${CMAKE_CURRENT_LIST_DIR}/src")
+  target_link_libraries(opencv_test_gapi PRIVATE ade)
+endif()
+
+ocv_add_perf_tests()
+ocv_add_samples()
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/cmake/DownloadADE.cmake b/inference-engine/thirdparty/fluid/modules/gapi/cmake/DownloadADE.cmake
new file mode 100644 (file)
index 0000000..8b6f117
--- /dev/null
@@ -0,0 +1,36 @@
+if(ANDROID)
+  # FIXME: Android build will be enabled separately
+  return()
+endif()
+
+set(ade_src_dir "${OpenCV_BINARY_DIR}/3rdparty/ade")
+set(ade_filename "v0.1.1d.zip")
+set(ade_subdir "ade-0.1.1d")
+set(ade_md5 "37479d90e3a5d47f132f512b22cbe206")
+ocv_download(FILENAME ${ade_filename}
+             HASH ${ade_md5}
+             URL
+               "${OPENCV_ADE_URL}"
+               "$ENV{OPENCV_ADE_URL}"
+               "https://github.com/opencv/ade/archive/"
+             DESTINATION_DIR ${ade_src_dir}
+             ID ADE
+             STATUS res
+             UNPACK RELATIVE_URL)
+
+if (NOT res)
+    return()
+endif()
+
+set(ADE_root "${ade_src_dir}/${ade_subdir}/sources/ade")
+file(GLOB_RECURSE ADE_sources "${ADE_root}/source/*.cpp")
+file(GLOB_RECURSE ADE_include "${ADE_root}/include/ade/*.hpp")
+add_library(ade STATIC ${ADE_include} ${ADE_sources})
+target_include_directories(ade PUBLIC $<BUILD_INTERFACE:${ADE_root}/include>)
+set_target_properties(ade PROPERTIES POSITION_INDEPENDENT_CODE True)
+
+if(NOT BUILD_SHARED_LIBS)
+  ocv_install_target(ade EXPORT OpenCVModules ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT dev)
+endif()
+
+ocv_install_3rdparty_licenses(ade "${ade_src_dir}/${ade_subdir}/LICENSE")
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/cmake/init.cmake b/inference-engine/thirdparty/fluid/modules/gapi/cmake/init.cmake
new file mode 100644 (file)
index 0000000..9f6ebef
--- /dev/null
@@ -0,0 +1,11 @@
+if (ade_DIR)
+  # if ade_DIR is set, use ADE-supplied CMake script
+  # to set up variables to the prebuilt ADE
+  find_package(ade 0.1.0)
+endif()
+
+if(NOT TARGET ade)
+  # if ade_DIR is not set, try to use the automatically
+  # downloaded one (if there is any)
+  include("${CMAKE_CURRENT_LIST_DIR}/DownloadADE.cmake")
+endif()
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/cmake/standalone.cmake b/inference-engine/thirdparty/fluid/modules/gapi/cmake/standalone.cmake
new file mode 100644 (file)
index 0000000..dd6e8cc
--- /dev/null
@@ -0,0 +1,34 @@
+if (NOT TARGET ade)
+  find_package(ade 0.1.0 REQUIRED)
+endif()
+
+set(FLUID_TARGET fluid)
+set(FLUID_ROOT "${CMAKE_CURRENT_LIST_DIR}/../")
+
+file(GLOB FLUID_includes "${FLUID_ROOT}/include/opencv2/*.hpp"
+                         "${FLUID_ROOT}/include/opencv2/gapi/g*.hpp"
+                         "${FLUID_ROOT}/include/opencv2/gapi/util/*.hpp"
+                         "${FLUID_ROOT}/include/opencv2/gapi/own/*.hpp"
+                         "${FLUID_ROOT}/include/opencv2/gapi/fluid/*.hpp")
+file(GLOB FLUID_sources  "${FLUID_ROOT}/src/api/g*.cpp"
+                         "${FLUID_ROOT}/src/compiler/*.cpp"
+                         "${FLUID_ROOT}/src/compiler/passes/*.cpp"
+                         "${FLUID_ROOT}/src/executor/*.cpp"
+                         "${FLUID_ROOT}/src/backends/fluid/*.cpp"
+                         "${FLUID_ROOT}/src/backends/common/*.cpp")
+
+add_library(${FLUID_TARGET} STATIC ${FLUID_includes} ${FLUID_sources})
+
+target_include_directories(${FLUID_TARGET}
+  PUBLIC          $<BUILD_INTERFACE:${FLUID_ROOT}/include>
+  PRIVATE         ${FLUID_ROOT}/src)
+
+target_compile_definitions(${FLUID_TARGET} PUBLIC -DGAPI_STANDALONE
+# This preprocessor definition resolves symbol clash when
+# standalone fluid meets gapi ocv module in one application
+                                           PUBLIC cv=fluidcv)
+
+set_target_properties(${FLUID_TARGET} PROPERTIES POSITION_INDEPENDENT_CODE True)
+set_property(TARGET ${FLUID_TARGET} PROPERTY CXX_STANDARD 11)
+
+target_link_libraries(${FLUID_TARGET} PRIVATE ade)
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/doc/00-root.markdown b/inference-engine/thirdparty/fluid/modules/gapi/doc/00-root.markdown
new file mode 100644 (file)
index 0000000..bbc90ab
--- /dev/null
@@ -0,0 +1,113 @@
+# Graph API {#gapi}
+
+# Introduction {#gapi_root_intro}
+
+OpenCV Graph API (or G-API) is a new OpenCV module targeted to make
+regular image processing fast and portable. These two goals are
+achieved by introducing a new graph-based model of execution.
+
+G-API is a special module in OpenCV -- in contrast with the majority
+of other main modules, this one acts as a framework rather than some
+specific CV algorithm. G-API provides means to define CV operations,
+construct graphs (in form of expressions) using it, and finally
+implement and run the operations for a particular backend.
+
+@note G-API is a new module currently in active development. Its API
+is volatile at the moment, and there may be minor but
+compatibility-breaking changes in the future.
+
+# Contents
+
+G-API documentation is organized into the following chapters:
+
+- @subpage gapi_purposes
+
+  The motivation behind G-API and its goals.
+
+- @subpage gapi_hld
+
+  General overview of G-API architecture and its major internal
+  components.
+
+- @subpage gapi_kernel_api
+
+  Learn how to introduce new operations in G-API and implement them for
+  various backends.
+
+- @subpage gapi_impl
+
+  Low-level implementation details of G-API, for those who want to
+  contribute.
+
+- API Reference: functions and classes
+
+    - @subpage gapi_core
+
+      Core G-API operations - arithmetic, boolean, and other matrix
+      operations;
+
+    - @subpage gapi_imgproc
+
+      Image processing functions: color space conversions, various
+      filters, etc.
+
+# API Example {#gapi_example}
+
+A very basic example of G-API pipeline is shown below:
+
+@include modules/gapi/samples/api_example.cpp
+
+<!-- TODO align this code with text using marks and itemized list -->
+
+G-API is a separate OpenCV module so its header files have to be
+included explicitly. The first four lines of `main()` create and
+initialize OpenCV's standard video capture object, which fetches
+video frames from either an attached camera or a specified file.
+
+The G-API pipeline is constructed next. In fact, it is a series of
+G-API operation calls on cv::GMat data. The important aspect of G-API
+is that this code block is just a declaration of actions, not the
+actions themselves. No processing happens at this point; G-API only
+tracks which operations form the pipeline and how they are connected.
+G-API _Data objects_ (here, cv::GMat) are used to connect operations
+to each other. `in` is an _empty_ cv::GMat signalling the beginning of
+the computation.
+
+After G-API code is written, it is captured into a call graph with
+instantiation of cv::GComputation object. This object takes
+input/output data references (in this example, `in` and `out`
+cv::GMat objects, respectively) as parameters and reconstructs the
+call graph based on all the data flow between `in` and `out`.
+
+cv::GComputation is a thin object in the sense that it just captures
+which operations form a computation. However, it can be used to execute
+computations -- in the following processing loop, every captured frame (a
+cv::Mat `input_frame`) is passed to cv::GComputation::apply().
+
+![Example pipeline running on sample video 'vtest.avi'](pics/demo.jpg)
+
+cv::GComputation::apply() is a polymorphic method which accepts a
+variadic number of arguments. Since this computation is defined on one
+input and one output, a special overload of cv::GComputation::apply()
+is used to pass input data and get output data.
+
+Internally, cv::GComputation::apply() compiles the captured graph for
+the given input parameters and executes the compiled graph on data
+immediately.
+
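+Since the included sample is not reproduced in this document, here is a
+minimal sketch of such a pipeline (the particular operations, headers,
+and file names are illustrative, not taken from the sample):
+
+    #include <opencv2/gapi.hpp>
+    #include <opencv2/gapi/imgproc.hpp>
+    #include <opencv2/imgcodecs.hpp>
+
+    int main() {
+        cv::GMat in;                                  // empty GMat: the graph input
+        cv::GMat gray = cv::gapi::RGB2Gray(in);       // declarations only --
+        cv::GMat out = cv::gapi::blur(gray, cv::Size(3, 3)); // nothing runs yet
+        cv::GComputation pipeline(in, out);           // capture the call graph
+
+        cv::Mat input_frame = cv::imread("input.png"), output_frame;
+        pipeline.apply(input_frame, output_frame);    // compile and execute
+        return 0;
+    }
+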
+A number of important concepts can be outlined with this example:
+* Graph declaration and graph execution are distinct steps;
+* The graph is built implicitly from a sequence of G-API expressions;
+* G-API supports function-like calls -- e.g. cv::gapi::resize(), and
+  operators, e.g. operator|(), which is used to compute bitwise OR;
+* G-API syntax aims to look pure: every operation call within a graph
+  yields a new result, thus forming a directed acyclic graph (DAG);
+* Graph declaration is not bound to any data -- real data objects
+  (cv::Mat) come into the picture after the graph is already declared.
+
+<!-- FIXME: The above operator|() link links to MatExpr not GAPI -->
+
+See [tutorials and porting examples](@ref tutorial_table_of_content_gapi)
+to learn more on various G-API features and concepts.
+
+<!-- TODO Add chapter on declaration, compilation, execution -->
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/doc/01-background.markdown b/inference-engine/thirdparty/fluid/modules/gapi/doc/01-background.markdown
new file mode 100644 (file)
index 0000000..65983cd
--- /dev/null
@@ -0,0 +1,76 @@
+# Why Graph API? {#gapi_purposes}
+
+# Motivation behind G-API {#gapi_intro_why}
+
+The G-API module brings a graph-based model of execution to OpenCV.
+This chapter briefly describes how this new model can help software
+developers in two aspects: optimizing and porting image processing
+algorithms.
+
+## Optimizing with Graph API {#gapi_intro_opt}
+
+Traditionally, OpenCV has provided a lot of stand-alone image processing
+functions (see modules `core` and `imgproc`). Many of these functions
+are well-optimized (e.g. vectorized for specific CPUs, parallelized, etc.),
+but the out-of-the-box optimization scope has been limited to a single
+function only -- optimizing a whole algorithm built atop these functions
+remained the programmer's responsibility.
+
+OpenCV 3.0 introduced the _Transparent API_ (or _T-API_), which made it
+possible to offload OpenCV function calls transparently to OpenCL
+devices and to save on host/device data transfers with cv::UMat -- a
+great step forward. However, T-API is a dynamic API -- user code still
+remains unconstrained and OpenCL kernels are enqueued in arbitrary
+order, which eliminates further pipeline-level optimization potential.
+
+G-API brings an implicit graph model to OpenCV 4.0. The graph model
+captures all operations and their data dependencies in a pipeline, and
+so provides the G-API framework with the extra information needed for
+pipeline-level optimizations.
+
+The cornerstone of graph-based optimizations is _Tiling_. Tiling
+allows breaking the processing into smaller parts and reorganizing
+operations to enable data parallelism, improve data locality, and
+reduce memory footprint. Data locality is an especially important
+aspect of software optimization due to the different costs of memory
+access on modern computer architectures -- the more data is reused in
+the first-level cache, the more efficient the pipeline is.
+
+Certainly, the aforementioned techniques can be applied manually, but
+this requires extra skill and knowledge of the target platform, and it
+changes the algorithm implementation irrevocably -- making it more
+specific, less flexible, and harder to extend and maintain.
+
+G-API takes this responsibility and complexity away from the user and
+does the majority of the work by itself, keeping the algorithm code
+clean of device and optimization details. This approach has its own
+limitations, though: the graph model is a _constrained_ model, and not
+every algorithm can be represented as a graph, so the G-API scope is
+limited to regular image processing -- various filters, arithmetic,
+binary operations, and well-defined geometrical transformations.
+
+## Porting with Graph API {#gapi_intro_port}
+
+The essence of G-API is declaring a sequence of operations to run, and
+then executing that sequence. G-API is a constrained API, so it puts a
+number of limitations on which operations can form a pipeline and
+which data these operations may exchange with each other.
+
+This formalization in fact helps to make an algorithm portable. G-API
+clearly separates operation _interfaces_ from its _implementations_.
+
+One operation (_kernel_) may have multiple implementations even for a
+single device (e.g., OpenCV-based "reference" implementation and a
+tiled optimized implementation, both running on CPU). Graphs (or
+_Computations_ in G-API terms) are built only using operation
+interfaces, not implementations -- thus the same graph can be executed
+on different devices (and, of course, using different optimization
+techniques) with little-to-no changes in the graph itself.
+
+G-API supports plugins (_Backends_) which aggregate the logic and
+intelligence about the best way to execute on a particular platform.
+Once a pipeline is built with G-API, it can be parametrized to use any
+of the backends (or a combination of them), so a graph can be ported
+easily to a new platform.
+
+@sa @ref gapi_hld
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/doc/10-hld-overview.md b/inference-engine/thirdparty/fluid/modules/gapi/doc/10-hld-overview.md
new file mode 100644 (file)
index 0000000..1dc5b50
--- /dev/null
@@ -0,0 +1,159 @@
+# High-level design overview {#gapi_hld}
+
+# G-API High-level design overview
+
+[TOC]
+
+G-API is a heterogeneous framework and provides a single API to program
+image processing pipelines on a number of supported backends.
+
+The key design idea is to keep pipeline code itself platform-neutral
+while specifying which kernels to use and which devices to utilize
+using extra parameters at graph compile (configuration) time. This
+requirement has led to the following architecture:
+
+<!-- FIXME: Render from dot directly -->
+
+![G-API framework architecture](pics/gapi_scheme.png)
+
+There are three layers in this architecture:
+* **API Layer** -- this is the top layer, which implements the G-API
+  public interface, its building blocks and semantics.
+  When a user constructs a pipeline with G-API, they interact with this
+  layer directly, and the entities the user operates on (like cv::GMat
+  or cv::GComputation) are provided by this layer.
+* **Graph Compiler Layer** -- this is the intermediate layer, which
+  unrolls user computation into a graph and then applies a number of
+  transformations to it (e.g. optimizations). This layer is built atop
+  the [ADE Framework](@ref gapi_detail_ade).
+* **Backends Layer** -- this is the lowest-level layer, which lists a
+  number of _Backends_. In contrast with the above two layers,
+  backends are highly coupled with low-level platform details, with
+  every backend standing for a particular platform. A backend operates on a
+  processed graph (coming from the graph compiler) and executes this
+  graph optimally for a specific platform or device.
+
+# API layer {#gapi_api_layer}
+
+The API layer is what the user interacts with when defining and using a
+pipeline (a Computation in G-API terms). The API layer defines a set of
+G-API _dynamic_ objects which can be used as inputs, outputs, and
+intermediate data objects within a graph:
+* cv::GMat
+* cv::GScalar
+* cv::GArray (template class)
+
+The API layer specifies a list of Operations which are defined on these
+data objects -- so-called kernels. See the G-API [core](@ref gapi_core)
+and [imgproc](@ref gapi_imgproc) namespaces for details on which
+operations G-API provides by default.
+
+G-API is not limited to these operations only -- users can define
+their own kernels easily using a special macro G_TYPED_KERNEL().
+
+The API layer is also responsible for marshalling and storing operation
+parameters on pipeline creation. In addition to the aforementioned
+G-API dynamic objects, operations may also accept arbitrary
+parameters (more on this [below](@ref gapi_detail_params)), so the API
+layer captures their values and stores them internally until the moment of
+execution.
+
+Finally, cv::GComputation and cv::GCompiled are the remaining
+important components of the API layer. The former wraps a series of G-API
+expressions into an object (graph), and the latter is a product of
+graph _compilation_ (see [this chapter](@ref gapi_detail_compiler) for
+details).
+
+# Graph compiler layer {#gapi_compiler}
+
+Every G-API computation is compiled before it executes. The compilation
+process is triggered in two ways:
+* _implicitly_, when cv::GComputation::apply() is used. In this case,
+  graph compilation is immediately followed by execution.
+* _explicitly_, when cv::GComputation::compile() is used. In this case,
+  a cv::GCompiled object is returned, which can then be invoked as a
+  C++ functor.
+
+The first way is recommended for cases when the input data format is not
+known in advance -- e.g. when the data comes from an arbitrary input file.
+The second way is recommended for deployment (production) scenarios
+where input data characteristics are usually predefined.
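+
+A sketch of both paths (assuming a computation built from the standard
+core operations):
+
+```cpp
+void demo()
+{
+    cv::GMat in;
+    cv::GComputation inv(in, cv::gapi::bitwise_not(cv::gapi::mulC(in, 2.0)));
+
+    cv::Mat frame(480, 640, CV_8UC1), out;
+
+    // Implicit: compile and run in one call (recompiles if metadata changes)
+    inv.apply(frame, out);
+
+    // Explicit: fix the input metadata once, then invoke the result as a functor
+    cv::GCompiled cc = inv.compile(cv::descr_of(frame));
+    cc(frame, out);
+}
+```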
+
+The graph compilation process is built atop the ADE Framework. Initially, a
+bipartite graph is generated from the expressions captured by the API layer.
+This graph contains nodes of two types: _Data_ and _Operations_. The graph
+always starts and ends with Data node(s), with Operation nodes
+in-between. Every Operation node has inputs and outputs, both of which are
+Data nodes.
+
+After the initial graph is generated, it is processed by a
+number of graph transformations, called _passes_. The ADE Framework acts
+as a compiler pass management engine, and the passes are written
+specifically for G-API.
+
+There are different passes which check graph validity, refine details
+on operations and data, organize nodes into clusters ("Islands") based
+on affinity or user-specified regioning[TBD], and more. Backends are
+also able to inject backend-specific passes into the compilation
+process; see more on this in the [dedicated chapter](@ref gapi_detail_meta).
+
+The result of graph compilation is a compiled object, represented by the
+class cv::GCompiled. A new cv::GCompiled object is always created, regardless
+of whether the compilation request was explicit or implicit (see
+above). Actual graph execution happens within cv::GCompiled and is
+determined by the backends which participated in the graph compilation.
+
+@sa cv::GComputation::apply(), cv::GComputation::compile(), cv::GCompiled
+
+# Backends layer {#gapi_backends}
+
+The above diagram lists two backends, _OpenCV_ and _Fluid_. _OpenCV_
+is the so-called "reference" backend, which implements G-API operations
+using plain old OpenCV functions. This backend is useful for
+prototyping on a familiar development system. _Fluid_ is a plugin for
+cache-efficient execution on CPU -- it implements a different
+execution policy and operates with its own, special kernels. The Fluid
+backend achieves a smaller memory footprint and better memory
+locality when running on CPU.
+
+There may be more backends available, e.g. Halide, OpenCL, etc. --
+G-API provides a uniform internal API to develop backends, so any
+enthusiast or company is free to scale G-API to a new platform or
+accelerator. In terms of OpenCV infrastructure, every new backend is a
+new distinct OpenCV module, which extends G-API when built as a part
+of OpenCV.
+
+# Graph execution {#gapi_compiled}
+
+The way a graph is executed is defined by the backends selected for
+compilation. In fact, every backend builds its own execution script as
+the final stage of the graph compilation process, when an executable
+(compiled) object is being generated. For example, in the OpenCV backend,
+this script is just a topologically sorted sequence of OpenCV
+functions to call; for the Fluid backend, it is a similar thing -- a
+topologically sorted list of _Agents_ processing lines of input on
+every iteration.
+
+Graph execution is triggered in two ways:
+* via cv::GComputation::apply(), with the graph compiled in-place exactly
+  for the given input data;
+* via cv::GCompiled::operator()(), when the graph has been precompiled.
+
+Both methods are polymorphic and take a variadic number of arguments,
+with validity checks performed at run time. If the number, shapes, or
+formats of the passed data objects differ from those expected, a run-time
+exception is thrown. G-API also provides _typed_ wrappers to move
+these checks to compile time -- see cv::GComputationT<>.
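+
+A sketch of such a typed wrapper (assuming the cv::GComputationT<>
+interface pulled in by opencv2/gapi.hpp):
+
+```cpp
+void typed_demo()
+{
+    // The in/out arity is now part of the C++ type, so passing a wrong
+    // number of arguments fails at compile time, not at run time
+    cv::GComputationT<cv::GMat (cv::GMat, cv::GMat)> addT(
+        [](cv::GMat a, cv::GMat b) { return a + b; });
+
+    cv::Mat a(8, 8, CV_8UC1), b(8, 8, CV_8UC1), sum;
+    addT.apply(a, b, sum);  // exactly two inputs and one output
+}
+```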
+
+G-API graph execution is declared stateless -- this means that a
+compiled functor (cv::GCompiled) acts like a pure C++ function and
+provides the same result for the same set of input arguments.
+
+Both execution methods take \f$N+M\f$ parameters, where \f$N\f$ is the
+number of inputs, and \f$M\f$ is the number of outputs, on which a
+cv::GComputation is defined. Note that while G-API types (cv::GMat,
+etc.) are used in the definition, the execution methods accept OpenCV's
+traditional data types (like cv::Mat) which hold actual data -- see the
+table in [parameter marshalling](@ref gapi_detail_params).
+
+@sa @ref gapi_impl, @ref gapi_kernel_api
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/doc/20-kernel-api.markdown b/inference-engine/thirdparty/fluid/modules/gapi/doc/20-kernel-api.markdown
new file mode 100644 (file)
index 0000000..93b852f
--- /dev/null
@@ -0,0 +1,170 @@
+# Kernel API {#gapi_kernel_api}
+
+[TOC]
+
+# G-API Kernel API
+
+The core idea behind G-API is portability -- a pipeline built with
+G-API must be portable (or at least able to be made portable). It means
+that either it works out-of-the-box when compiled for a new platform,
+_or_ G-API provides the necessary tools to make it run there, with
+little-to-no changes in the algorithm itself.
+
+This idea is achieved by separating a kernel interface from its
+implementation. Once a pipeline is built using kernel interfaces, it
+becomes implementation-neutral -- the implementation details
+(i.e. which kernels to use) are passed at a separate stage (graph
+compilation).
+
+A kernel-implementation hierarchy may look like this:
+
+![Kernel API/implementation hierarchy example](pics/kernel_hierarchy.png)
+
+The pipeline itself can then be expressed only in terms of `A`, `B`, and
+so on, and choosing which implementation to use for execution becomes
+an external parameter.
+
+# Defining a kernel {#gapi_defining_kernel}
+
+G-API provides a macro to define a new kernel interface --
+G_TYPED_KERNEL():
+
+@snippet modules/gapi/samples/kernel_api_snippets.cpp filter2d_api
+
+This macro is a shortcut for a new type definition. It takes three
+arguments to register a new type, and requires the type body to be present
+(see [below](@ref gapi_kernel_supp_info)). The macro arguments are:
+1. Kernel interface name -- also serves as the name of the new type defined
+   with this macro;
+2. Kernel signature -- an `std::function<>`-like signature which defines
+   the API of the kernel;
+3. Kernel's unique name -- used to identify the kernel when its type
+   information is stripped within the system.
+
+A kernel declaration may be seen as a function declaration -- in both cases
+the new entity must then be used according to the way it was defined.
+
+The kernel signature defines the kernel's usage syntax -- which parameters
+it takes during graph construction. Implementations can also use this
+signature to derive backend-specific callback signatures from it (see the
+next chapter).
+
+A kernel may accept values of any type, and G-API _dynamic_ types are
+handled in a special way. All other types are opaque to G-API and
+passed to the kernel in `outMeta()` or in execution callbacks as-is.
+
+A kernel's return value can _only_ be of a G-API dynamic type -- cv::GMat,
+cv::GScalar, or cv::GArray<T>. If an operation has more than one output,
+the outputs should be wrapped into an `std::tuple<>` (which can contain only
+the mentioned G-API types). Operations with an arbitrary number of outputs
+are not supported.
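+
+For instance, a two-output operation may be declared with the companion
+macro G_TYPED_KERNEL_M() -- a sketch with an illustrative kernel name and
+semantics:
+
+```cpp
+#include <tuple>
+#include <opencv2/gapi.hpp>
+
+using GMat2 = std::tuple<cv::GMat, cv::GMat>;
+
+// A hypothetical operation producing two images out of one
+G_TYPED_KERNEL_M(GSplitHalves, <GMat2(cv::GMat)>, "sample.custom.split_halves") {
+    static std::tuple<cv::GMatDesc, cv::GMatDesc> outMeta(cv::GMatDesc in) {
+        const auto half = in.withSize(cv::Size(in.size.width / 2, in.size.height));
+        return std::make_tuple(half, half);  // both outputs share the metadata
+    }
+};
+```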
+
+Once a kernel is defined, it can be used in pipelines with the special,
+G-API-supplied method "::on()". This method has the same signature as
+defined in the kernel, so this code:
+
+@snippet modules/gapi/samples/kernel_api_snippets.cpp filter2d_on
+
+is a perfectly legal construction. This example is somewhat verbose,
+though, so usually a kernel declaration comes with a C++ function
+wrapper ("factory method") which enables optional parameters, a more
+compact syntax, Doxygen comments, etc.:
+
+@snippet modules/gapi/samples/kernel_api_snippets.cpp filter2d_wrap
+
+so now it can be used like:
+
+@snippet modules/gapi/samples/kernel_api_snippets.cpp filter2d_wrap_call
+
+# Extra information {#gapi_kernel_supp_info}
+
+In the current version, a kernel declaration body (everything within the
+curly braces) must contain a static function `outMeta()`. This function
+establishes a functional dependency between the operation's input and
+output metadata.
+
+_Metadata_ is information about the data a kernel operates on. Since
+non-G-API types are opaque to G-API, G-API cares only about `G*` data
+descriptors (i.e. the dimensions and format of cv::GMat, etc.).
+
+`outMeta()` is also an example of how a kernel's signature can be
+transformed into a derived callback -- note that in this example,
+`outMeta()` mirrors the kernel signature (defined
+within the macro) but with different types -- where the kernel expects
+cv::GMat, `outMeta()` takes and returns cv::GMatDesc (a G-API metadata
+structure for cv::GMat).
+
+The point of `outMeta()` is to propagate metadata information within
+the computation from inputs to outputs and to infer the metadata of internal
+(intermediate, temporary) data objects. This information is required
+for further pipeline optimizations, memory allocation, and other
+operations done by the G-API framework during graph compilation.
+
+<!-- TODO add examples -->
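+
+As a sketch, `outMeta()` of a hypothetical downscale-by-two kernel derives
+the output descriptor from the input one -- no pixel data is involved:
+
+```cpp
+G_TYPED_KERNEL(GHalve, <cv::GMat(cv::GMat)>, "sample.custom.halve") {
+    static cv::GMatDesc outMeta(cv::GMatDesc in) {
+        // Same depth and channels; only the size is recalculated
+        return in.withSize(cv::Size(in.size.width / 2, in.size.height / 2));
+    }
+};
+```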
+
+# Implementing a kernel {#gapi_kernel_implementing}
+
+Once a kernel is declared, its interface can be used to implement
+versions of this kernel in different backends. This concept naturally
+follows the object-oriented "Interface/Implementation" idiom: an
+interface can be implemented multiple times, and different
+implementations of a kernel should be substitutable with each other
+without breaking the algorithm (pipeline) logic (the Liskov
+Substitution Principle).
+
+Every backend defines its own way to implement a kernel interface.
+This way is regular, though -- whatever the plugin is, its kernel
+implementation must be "derived" from the kernel interface type.
+
+Kernel implementations are then organized into _kernel
+packages_. Kernel packages are passed to cv::GComputation::compile()
+as compile arguments, with some hints to G-API on how to select the proper
+kernels (see more on this in "Heterogeneity"[TBD]).
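+
+A sketch of forming and passing a package (assuming the GCPUFilter2D
+implementation shown just below):
+
+```cpp
+void compileWithPackage(cv::GComputation &comp, const cv::Mat &input)
+{
+    auto pkg = cv::gapi::kernels<GCPUFilter2D>();     // a one-kernel package
+    cv::GCompiled cc = comp.compile(cv::descr_of(input),  // passed as a
+                                    cv::compile_args(pkg));// compile argument
+    (void)cc; // ready to run with the GCPUFilter2D implementation
+}
+```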
+
+For example, the aforementioned `Filter2D` is implemented in the
+"reference" CPU (OpenCV) plugin this way (*NOTE* -- this is a
+simplified form with improper border handling):
+
+@snippet modules/gapi/samples/kernel_api_snippets.cpp filter2d_ocv
+
+Note how the CPU (OpenCV) plugin has transformed the original kernel
+signature:
+- The input cv::GMat has been substituted with cv::Mat, holding the actual
+  input data for the underlying OpenCV function call;
+- The output cv::GMat has been transformed into an extra output parameter,
+  thus `GCPUFilter2D::run()` takes one argument more than the original
+  kernel signature.
+
+The basic intuition for a kernel developer here is _not to care_ where
+those cv::Mat objects come from in place of the original cv::GMat -- and
+just to follow the signature conventions defined by the plugin. G-API
+will call this method during execution and supply all the necessary
+information (and forward the original opaque data as-is).
+
+# Compound kernels
+
+Sometimes a kernel is a single entity only at the API level. It is convenient
+for users, but on a particular implementation side it would be better to
+have multiple kernels (a subgraph) doing the job instead. An example
+is goodFeaturesToTrack() -- while in the OpenCV backend it may remain a
+single kernel, with Fluid it becomes compound -- Fluid can handle the Harris
+response calculation but can't do sparse non-maxima suppression and
+point extraction to an STL vector:
+
+<!-- PIC -->
+
+A compound kernel _implementation_ can be defined using a generic
+macro GAPI_COMPOUND_KERNEL():
+
+@snippet modules/gapi/samples/kernel_api_snippets.cpp compound
+
+<!-- TODO: ADD on how Compound kernels may simplify dispatching -->
+<!-- TODO: Add details on when expand() is called! -->
+
+It is important to distinguish a compound kernel from a G-API high-order
+function, i.e. a C++ function which looks like a kernel but in fact
+generates a subgraph. The core difference is that a compound kernel is
+an _implementation detail_ and a kernel implementation may be either
+compound or not (depending on backend capabilities), while a
+high-order function is a "macro" in terms of G-API and so cannot act as
+an interface which then needs to be implemented by a backend.
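+
+For contrast, a sketch of such a high-order function (the name and the
+operations here are illustrative): it looks like a single call on the API
+surface but merely emits two existing operations into the graph, so no
+backend can substitute it as a whole:
+
+```cpp
+static cv::GMat scaleAndShift(const cv::GMat &in)
+{
+    // Expands into two graph nodes right here, at construction time
+    return cv::gapi::subC(cv::gapi::mulC(in, 2.0), cv::GScalar(30.0));
+}
+```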
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/doc/30-implementation.markdown b/inference-engine/thirdparty/fluid/modules/gapi/doc/30-implementation.markdown
new file mode 100644 (file)
index 0000000..b1ad3ba
--- /dev/null
@@ -0,0 +1,27 @@
+# Implementation details {#gapi_impl}
+
+[TOC]
+
+# G-API Implementation details {#gapi_impl_header}
+
+## Api layer details {#gapi_detail_api}
+
+### Expression unrolling {#gapi_detail_expr}
+
+### Parameter marshalling {#gapi_detail_params}
+
+### Operations representation {#gapi_detail_operations}
+
+## Graph compiler details {#gapi_detail_compiler}
+
+### ADE basics {#gapi_detail_ade}
+
+### Graph model representation {#gapi_detail_gmodel}
+
+### G-API metadata and passes {#gapi_detail_meta}
+
+## Backends details {#gapi_detail_backends}
+
+### Backend scope of work {#gapi_backend_scope}
+
+### Graph transformation {#gapi_backend_pass}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/doc/dot/kernel_hierarchy.dot b/inference-engine/thirdparty/fluid/modules/gapi/doc/dot/kernel_hierarchy.dot
new file mode 100644 (file)
index 0000000..0eb92bc
--- /dev/null
@@ -0,0 +1,17 @@
+digraph {
+  rankdir=BT;
+  node [shape=record];
+
+  ki_a [label="{<f0> interface\nA}"];
+  ki_b [label="{<f0> interface\nB}"];
+
+  {rank=same; ki_a ki_b};
+
+  "CPU::A"     -> ki_a [dir="forward"];
+  "OpenCL::A"  -> ki_a [dir="forward"];
+  "Halide::A"  -> ki_a [dir="forward"];
+
+  "CPU::B"     -> ki_b [dir="forward"];
+  "OpenCL::B"  -> ki_b [dir="forward"];
+  "Halide::B"  -> ki_b [dir="forward"];
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/doc/pics/demo.jpg b/inference-engine/thirdparty/fluid/modules/gapi/doc/pics/demo.jpg
new file mode 100644 (file)
index 0000000..742d135
Binary files /dev/null and b/inference-engine/thirdparty/fluid/modules/gapi/doc/pics/demo.jpg differ
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/doc/pics/gapi_scheme.png b/inference-engine/thirdparty/fluid/modules/gapi/doc/pics/gapi_scheme.png
new file mode 100644 (file)
index 0000000..24271e3
Binary files /dev/null and b/inference-engine/thirdparty/fluid/modules/gapi/doc/pics/gapi_scheme.png differ
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/doc/pics/kernel_hierarchy.png b/inference-engine/thirdparty/fluid/modules/gapi/doc/pics/kernel_hierarchy.png
new file mode 100644 (file)
index 0000000..631f4a1
Binary files /dev/null and b/inference-engine/thirdparty/fluid/modules/gapi/doc/pics/kernel_hierarchy.png differ
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi.hpp
new file mode 100644 (file)
index 0000000..a043a83
--- /dev/null
@@ -0,0 +1,33 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_HPP
+#define OPENCV_GAPI_HPP
+
+#include <memory>
+
+/** \defgroup gapi G-API framework
+@{
+    @defgroup gapi_main_classes G-API Main Classes
+    @defgroup gapi_data_objects G-API Data Objects
+    @{
+      @defgroup gapi_meta_args G-API Metadata Descriptors
+    @}
+    @defgroup gapi_std_backends G-API Standard backends
+    @defgroup gapi_compile_args G-API Graph Compilation Arguments
+@}
+ */
+
+#include "opencv2/gapi/gmat.hpp"
+#include "opencv2/gapi/garray.hpp"
+#include "opencv2/gapi/gcomputation.hpp"
+#include "opencv2/gapi/gcompiled.hpp"
+#include "opencv2/gapi/gtyped.hpp"
+#include "opencv2/gapi/gkernel.hpp"
+#include "opencv2/gapi/operators.hpp"
+
+#endif // OPENCV_GAPI_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/core.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/core.hpp
new file mode 100644 (file)
index 0000000..9af3620
--- /dev/null
@@ -0,0 +1,1600 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_CORE_HPP
+#define OPENCV_GAPI_CORE_HPP
+
+#include <utility> // std::tuple
+
+#include <opencv2/imgproc.hpp>
+
+#include "opencv2/gapi/gmat.hpp"
+#include "opencv2/gapi/gscalar.hpp"
+#include "opencv2/gapi/gkernel.hpp"
+
+/** \defgroup gapi_core G-API core (basic) functionality
+@{
+    @defgroup gapi_math Graph API: Math operations
+    @defgroup gapi_pixelwise Graph API: Pixelwise operations
+    @defgroup gapi_matrixop Graph API: Operations on matrices
+    @defgroup gapi_transform Graph API: Geometric, depth and LUT-like image transformations
+@}
+ */
+namespace cv { namespace gapi {
+namespace core {
+    using GMat2 = std::tuple<GMat,GMat>;
+    using GMat3 = std::tuple<GMat,GMat,GMat>; // FIXME: how to avoid this?
+    using GMat4 = std::tuple<GMat,GMat,GMat,GMat>;
+    using GMatScalar = std::tuple<GMat, GScalar>;
+
+    G_TYPED_KERNEL(GAdd, <GMat(GMat, GMat, int)>, "org.opencv.core.math.add") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc b, int ddepth) {
+            if (ddepth == -1)
+            {
+                // OpenCV: When the input arrays in add/subtract/multiply/divide
+                // functions have different depths, the output array depth must be
+                // explicitly specified!
+                // See artim_op() @ arithm.cpp
+                GAPI_Assert(a.chan == b.chan);
+                GAPI_Assert(a.depth == b.depth);
+                return a;
+            }
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GAddC, <GMat(GMat, GScalar, int)>, "org.opencv.core.math.addC") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc, int ddepth) {
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GSub, <GMat(GMat, GMat, int)>, "org.opencv.core.math.sub") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc b, int ddepth) {
+            if (ddepth == -1)
+            {
+                // This macro should select a larger data depth from a and b
+                // considering the number of channels in the same
+                // FIXME!!! Clarify if it is valid for sub()
+                GAPI_Assert(a.chan == b.chan);
+                ddepth = std::max(a.depth, b.depth);
+            }
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GSubC, <GMat(GMat, GScalar, int)>, "org.opencv.core.math.subC") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc, int ddepth) {
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GSubRC,<GMat(GScalar, GMat, int)>, "org.opencv.core.math.subRC") {
+        static GMatDesc outMeta(GScalarDesc, GMatDesc b, int ddepth) {
+            return b.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GMul, <GMat(GMat, GMat, double, int)>, "org.opencv.core.math.mul") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc, double, int ddepth) {
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GMulCOld, <GMat(GMat, double, int)>, "org.opencv.core.math.mulCOld") {
+        static GMatDesc outMeta(GMatDesc a, double, int ddepth) {
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GMulC, <GMat(GMat, GScalar, int)>, "org.opencv.core.math.mulC"){
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc, int ddepth) {
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GMulS, <GMat(GMat, GScalar)>, "org.opencv.core.math.muls") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a;
+        }
+    }; // FIXME: Merge with MulC
+
+    G_TYPED_KERNEL(GDiv, <GMat(GMat, GMat, double, int)>, "org.opencv.core.math.div") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc b, double, int ddepth) {
+            if (ddepth == -1)
+            {
+                GAPI_Assert(a.depth == b.depth);
+                return b;
+            }
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GDivC, <GMat(GMat, GScalar, double, int)>, "org.opencv.core.math.divC") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc, double, int ddepth) {
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GDivRC, <GMat(GScalar, GMat, double, int)>, "org.opencv.core.math.divRC") {
+        static GMatDesc outMeta(GScalarDesc, GMatDesc b, double, int ddepth) {
+            return b.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GMean, <GScalar(GMat)>, "org.opencv.core.math.mean") {
+        static GScalarDesc outMeta(GMatDesc) {
+            return empty_scalar_desc();
+        }
+    };
+
+    G_TYPED_KERNEL_M(GPolarToCart, <GMat2(GMat, GMat, bool)>, "org.opencv.core.math.polarToCart") {
+        static std::tuple<GMatDesc, GMatDesc> outMeta(GMatDesc, GMatDesc a, bool) {
+            return std::make_tuple(a, a);
+        }
+    };
+
+    G_TYPED_KERNEL_M(GCartToPolar, <GMat2(GMat, GMat, bool)>, "org.opencv.core.math.cartToPolar") {
+        static std::tuple<GMatDesc, GMatDesc> outMeta(GMatDesc x, GMatDesc, bool) {
+            return std::make_tuple(x, x);
+        }
+    };
+
+    G_TYPED_KERNEL(GPhase, <GMat(GMat, GMat, bool)>, "org.opencv.core.math.phase") {
+        static GMatDesc outMeta(const GMatDesc &inx, const GMatDesc &, bool) {
+            return inx;
+        }
+    };
+
+    G_TYPED_KERNEL(GMask, <GMat(GMat,GMat)>, "org.opencv.core.pixelwise.mask") {
+        static GMatDesc outMeta(GMatDesc in, GMatDesc) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpGT, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.compare.cmpGT") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpGE, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.compare.cmpGE") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpLE, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.compare.cmpLE") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpLT, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.compare.cmpLT") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpEQ, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.compare.cmpEQ") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpNE, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.compare.cmpNE") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpGTScalar, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.compare.cmpGTScalar"){
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpGEScalar, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.compare.cmpGEScalar"){
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpLEScalar, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.compare.cmpLEScalar"){
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpLTScalar, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.compare.cmpLTScalar"){
+    static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpEQScalar, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.compare.cmpEQScalar"){
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GCmpNEScalar, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.compare.cmpNEScalar"){
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a.withDepth(CV_8U);
+        }
+    };
+
+    G_TYPED_KERNEL(GAnd, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.bitwise_and") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GAndS, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.bitwise_andS") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GOr, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.bitwise_or") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GOrS, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.bitwise_orS") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GXor, <GMat(GMat, GMat)>, "org.opencv.core.pixelwise.bitwise_xor") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GXorS, <GMat(GMat, GScalar)>, "org.opencv.core.pixelwise.bitwise_xorS") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GNot, <GMat(GMat)>, "org.opencv.core.pixelwise.bitwise_not") {
+        static GMatDesc outMeta(GMatDesc a) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GSelect, <GMat(GMat, GMat, GMat)>, "org.opencv.core.pixelwise.select") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc, GMatDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GMin, <GMat(GMat, GMat)>, "org.opencv.core.matrixop.min") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GMax, <GMat(GMat, GMat)>, "org.opencv.core.matrixop.max") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GAbsDiff, <GMat(GMat, GMat)>, "org.opencv.core.matrixop.absdiff") {
+        static GMatDesc outMeta(GMatDesc a, GMatDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GAbsDiffC, <GMat(GMat, GScalar)>, "org.opencv.core.matrixop.absdiffC") {
+        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+            return a;
+        }
+    };
+
+    G_TYPED_KERNEL(GSum, <GScalar(GMat)>, "org.opencv.core.matrixop.sum") {
+        static GScalarDesc outMeta(GMatDesc) {
+            return empty_scalar_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GAddW, <GMat(GMat, double, GMat, double, double, int)>, "org.opencv.core.matrixop.addweighted") {
+        static GMatDesc outMeta(GMatDesc a, double, GMatDesc b, double, double, int ddepth) {
+            if (ddepth == -1)
+            {
+                // OpenCV: When the input arrays in add/subtract/multiply/divide
+                // functions have different depths, the output array depth must be
+                // explicitly specified!
+                // See artim_op() @ arithm.cpp
+                GAPI_Assert(a.chan == b.chan);
+                GAPI_Assert(a.depth == b.depth);
+                return a;
+            }
+            return a.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GNormL1, <GScalar(GMat)>, "org.opencv.core.matrixop.norml1") {
+        static GScalarDesc outMeta(GMatDesc) {
+            return empty_scalar_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GNormL2, <GScalar(GMat)>, "org.opencv.core.matrixop.norml2") {
+        static GScalarDesc outMeta(GMatDesc) {
+            return empty_scalar_desc();
+        }
+    };
+
+    G_TYPED_KERNEL(GNormInf, <GScalar(GMat)>, "org.opencv.core.matrixop.norminf") {
+        static GScalarDesc outMeta(GMatDesc) {
+            return empty_scalar_desc();
+        }
+    };
+
+    G_TYPED_KERNEL_M(GIntegral, <GMat2(GMat, int, int)>, "org.opencv.core.matrixop.integral") {
+        static std::tuple<GMatDesc, GMatDesc> outMeta(GMatDesc in, int sd, int sqd) {
+            return std::make_tuple(in.withSizeDelta(1,1).withDepth(sd),
+                                   in.withSizeDelta(1,1).withDepth(sqd));
+        }
+    };
+
+    G_TYPED_KERNEL(GThreshold, <GMat(GMat, GScalar, GScalar, int)>, "org.opencv.core.matrixop.threshold") {
+        static GMatDesc outMeta(GMatDesc in, GScalarDesc, GScalarDesc, int) {
+            return in;
+        }
+    };
+
+
+    G_TYPED_KERNEL_M(GThresholdOT, <GMatScalar(GMat, GScalar, int)>, "org.opencv.core.matrixop.thresholdOT") {
+        static std::tuple<GMatDesc,GScalarDesc> outMeta(GMatDesc in, GScalarDesc, int) {
+            return std::make_tuple(in, empty_scalar_desc());
+        }
+    };
+
+    G_TYPED_KERNEL(GInRange, <GMat(GMat, GScalar, GScalar)>, "org.opencv.core.matrixop.inrange") {
+        static GMatDesc outMeta(GMatDesc in, GScalarDesc, GScalarDesc) {
+            return in.withType(CV_8U, 1);
+        }
+    };
+
+    G_TYPED_KERNEL_M(GSplit3, <GMat3(GMat)>, "org.opencv.core.transform.split3") {
+        static std::tuple<GMatDesc, GMatDesc, GMatDesc> outMeta(GMatDesc in) {
+            const auto out_depth = in.depth;
+            const auto out_desc  = in.withType(out_depth, 1);
+            return std::make_tuple(out_desc, out_desc, out_desc);
+        }
+    };
+
+    G_TYPED_KERNEL_M(GSplit4, <GMat4(GMat)>,"org.opencv.core.transform.split4") {
+        static std::tuple<GMatDesc, GMatDesc, GMatDesc, GMatDesc> outMeta(GMatDesc in) {
+            const auto out_depth = in.depth;
+            const auto out_desc = in.withType(out_depth, 1);
+            return std::make_tuple(out_desc, out_desc, out_desc, out_desc);
+        }
+    };
+
+    G_TYPED_KERNEL(GResize, <GMat(GMat,Size,double,double,int)>, "org.opencv.core.transform.resize") {
+        static GMatDesc outMeta(GMatDesc in, Size sz, double fx, double fy, int) {
+            if (sz.width != 0 && sz.height != 0)
+            {
+                return in.withSize(sz);
+            }
+            else
+            {
+                GAPI_Assert(fx != 0. && fy != 0.);
+                return in.withSize
+                    (Size(static_cast<int>(std::round(in.size.width  * fx)),
+                          static_cast<int>(std::round(in.size.height * fy))));
+            }
+        }
+    };
+
+    G_TYPED_KERNEL(GMerge3, <GMat(GMat,GMat,GMat)>, "org.opencv.core.transform.merge3") {
+        static GMatDesc outMeta(GMatDesc in, GMatDesc, GMatDesc) {
+            // Preserve depth and add channel component
+            return in.withType(in.depth, 3);
+        }
+    };
+
+    G_TYPED_KERNEL(GMerge4, <GMat(GMat,GMat,GMat,GMat)>, "org.opencv.core.transform.merge4") {
+        static GMatDesc outMeta(GMatDesc in, GMatDesc, GMatDesc, GMatDesc) {
+            // Preserve depth and add channel component
+            return in.withType(in.depth, 4);
+        }
+    };
+
+    G_TYPED_KERNEL(GRemap, <GMat(GMat, Mat, Mat, int, int, Scalar)>, "org.opencv.core.transform.remap") {
+        static GMatDesc outMeta(GMatDesc in, Mat m1, Mat, int, int, Scalar) {
+            return in.withSize(m1.size());
+        }
+    };
+
+    G_TYPED_KERNEL(GFlip, <GMat(GMat, int)>, "org.opencv.core.transform.flip") {
+        static GMatDesc outMeta(GMatDesc in, int) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GCrop, <GMat(GMat, Rect)>, "org.opencv.core.transform.crop") {
+        static GMatDesc outMeta(GMatDesc in, Rect rc) {
+            return in.withSize(Size(rc.width, rc.height));
+        }
+    };
+
+    G_TYPED_KERNEL(GConcatHor, <GMat(GMat, GMat)>, "org.opencv.imgproc.transform.concatHor") {
+        static GMatDesc outMeta(GMatDesc l, GMatDesc r) {
+            return l.withSizeDelta(+r.size.width, 0);
+        }
+    };
+
+    G_TYPED_KERNEL(GConcatVert, <GMat(GMat, GMat)>, "org.opencv.imgproc.transform.concatVert") {
+        static GMatDesc outMeta(GMatDesc t, GMatDesc b) {
+            return t.withSizeDelta(0, +b.size.height);
+        }
+    };
+
+    G_TYPED_KERNEL(GLUT, <GMat(GMat, Mat)>, "org.opencv.core.transform.LUT") {
+        static GMatDesc outMeta(GMatDesc in, Mat) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GConvertTo, <GMat(GMat, int, double, double)>, "org.opencv.core.transform.convertTo") {
+        static GMatDesc outMeta(GMatDesc in, int rdepth, double, double) {
+            return rdepth < 0 ? in : in.withDepth(rdepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GSqrt, <GMat(GMat)>, "org.opencv.core.math.sqrt") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in;
+        }
+    };
+}
+
+//! @addtogroup gapi_math
+//! @{
+
+/** @brief Calculates the per-element sum of two matrices.
+
+The function add calculates sum of two matrices of the same size and the same number of channels:
+\f[\texttt{dst}(I) =  \texttt{saturate} ( \texttt{src1}(I) +  \texttt{src2}(I)) \quad \texttt{if mask}(I) \ne0\f]
+
+The function can be replaced with matrix expressions:
+    \f[\texttt{dst} =  \texttt{src1} + \texttt{src2}\f]
+
+The input matrices and the output matrix can all have the same or different depths. For example, you
+can add a 16-bit unsigned matrix to an 8-bit signed matrix and store the sum as a 32-bit
+floating-point matrix. Depth of the output matrix is determined by the ddepth parameter.
+If src1.depth() == src2.depth(), ddepth can be set to the default -1. In this case, the output matrix will have
+the same depth as the input matrices.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.add"
+@param src1 first input matrix.
+@param src2 second input matrix.
+@param ddepth optional depth of the output matrix.
+@sa sub, addWeighted
+*/
+GAPI_EXPORTS GMat add(const GMat& src1, const GMat& src2, int ddepth = -1);
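+
+// An illustrative usage sketch (graph construction only, no pixel data yet):
+//   cv::GMat a, b;
+//   cv::GMat same  = cv::gapi::add(a, b);          // keep the input depth
+//   cv::GMat wider = cv::gapi::add(a, b, CV_32F);  // widen the result explicitly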
+
+/** @brief Calculates the per-element sum of matrix and given scalar.
+
+The function addC adds a given scalar value to each element of given matrix.
+The function can be replaced with matrix expressions:
+
+    \f[\texttt{dst} =  \texttt{src1} + \texttt{c}\f]
+
+Depth of the output matrix is determined by the ddepth parameter.
+If ddepth is set to default -1, the depth of output matrix will be the same as the depth of input matrix.
+The matrices can be single or multi channel. Output matrix must have the same size and number of channels as the input matrix.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.addC"
+@param src1 first input matrix.
+@param c scalar value to be added.
+@param ddepth optional depth of the output matrix.
+@sa sub, addWeighted
+*/
+GAPI_EXPORTS GMat addC(const GMat& src1, const GScalar& c, int ddepth = -1);
+//! @overload
+GAPI_EXPORTS GMat addC(const GScalar& c, const GMat& src1, int ddepth = -1);
+
+/** @brief Calculates the per-element difference between two matrices.
+
+The function sub calculates difference between two matrices, when both matrices have the same size and the same number of
+channels:
+    \f[\texttt{dst}(I) =   \texttt{src1}(I) -  \texttt{src2}(I)\f]
+
+The function can be replaced with matrix expressions:
+\f[\texttt{dst} =   \texttt{src1} -  \texttt{src2}\f]
+
+The input matrices and the output matrix can all have the same or different depths. For example, you
+can subtract two 8-bit unsigned matrices and store the result as a 16-bit signed matrix.
+Depth of the output matrix is determined by the ddepth parameter.
+If src1.depth() == src2.depth(), ddepth can be set to the default -1. In this case, the output matrix will have
+the same depth as the input matrices. The matrices can be single or multi channel.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.sub"
+@param src1 first input matrix.
+@param src2 second input matrix.
+@param ddepth optional depth of the output matrix.
+@sa  add, addC
+  */
+GAPI_EXPORTS GMat sub(const GMat& src1, const GMat& src2, int ddepth = -1);
+
+/** @brief Calculates the per-element difference between matrix and given scalar.
+
+The function can be replaced with matrix expressions:
+    \f[\texttt{dst} =  \texttt{src} - \texttt{c}\f]
+
+Depth of the output matrix is determined by the ddepth parameter.
+If ddepth is set to default -1, the depth of output matrix will be the same as the depth of input matrix.
+The matrices can be single or multi channel. Output matrix must have the same size as src.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.subC"
+@param src first input matrix.
+@param c scalar value to be subtracted.
+@param ddepth optional depth of the output matrix.
+@sa  add, addC, subRC
+  */
+GAPI_EXPORTS GMat subC(const GMat& src, const GScalar& c, int ddepth = -1);
+
+/** @brief Calculates the per-element difference between given scalar and the matrix.
+
+The function can be replaced with matrix expressions:
+    \f[\texttt{dst} =  \texttt{val} - \texttt{src}\f]
+
+Depth of the output matrix is determined by the ddepth parameter.
+If ddepth is set to default -1, the depth of output matrix will be the same as the depth of input matrix.
+The matrices can be single or multi channel. Output matrix must have the same size as src.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.subRC"
+@param c scalar value to subtract from.
+@param src input matrix to be subtracted.
+@param ddepth optional depth of the output matrix.
+@sa  add, addC, subC
+  */
+GAPI_EXPORTS GMat subRC(const GScalar& c, const GMat& src, int ddepth = -1);
+
+/** @brief Calculates the per-element scaled product of two matrices.
+
+The function mul calculates the per-element product of two matrices:
+
+\f[\texttt{dst} (I)= \texttt{saturate} ( \texttt{scale} \cdot \texttt{src1} (I)  \cdot \texttt{src2} (I))\f]
+
+If src1.depth() == src2.depth(), ddepth can be set to the default -1. In this case, the output matrix will have
+the same depth as the input matrices. The matrices can be single or multi channel.
+Output matrix must have the same size as input matrices.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.mul"
+@param src1 first input matrix.
+@param src2 second input matrix of the same size and the same depth as src1.
+@param scale optional scale factor.
+@param ddepth optional depth of the output matrix.
+@sa add, sub, div, addWeighted
+*/
+GAPI_EXPORTS GMat mul(const GMat& src1, const GMat& src2, double scale = 1.0, int ddepth = -1);
+
+/** @brief Multiplies matrix by scalar.
+
+The function mulC multiplies each element of matrix src by given scalar value:
+
+\f[\texttt{dst} (I)= \texttt{saturate} (  \texttt{src1} (I)  \cdot \texttt{multiplier} )\f]
+
+The matrices can be single or multi channel. Output matrix must have the same size as src.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.mulC"
+@param src input matrix.
+@param multiplier factor to be multiplied.
+@param ddepth optional depth of the output matrix. If -1, the depth of output matrix will be the same as input matrix depth.
+@sa add, sub, div, addWeighted
+*/
+GAPI_EXPORTS GMat mulC(const GMat& src, double multiplier, int ddepth = -1);
+//! @overload
+GAPI_EXPORTS GMat mulC(const GMat& src, const GScalar& multiplier, int ddepth = -1);   // FIXME: merge with mulc
+//! @overload
+GAPI_EXPORTS GMat mulC(const GScalar& multiplier, const GMat& src, int ddepth = -1);   // FIXME: merge with mulc
+
+/** @brief Performs per-element division of two matrices.
+
+The function divides one matrix by another:
+\f[\texttt{dst(I) = saturate(src1(I)*scale/src2(I))}\f]
+
+When src2(I) is zero, dst(I) will also be zero. Different channels of
+multi-channel matrices are processed independently.
+The matrices can be single or multi channel. Output matrix must have the same size and depth as src.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.div"
+@param src1 first input matrix.
+@param src2 second input matrix of the same size and depth as src1.
+@param scale scalar factor.
+@param ddepth optional depth of the output matrix; you can only pass -1 when src1.depth() == src2.depth().
+@sa  mul, add, sub
+*/
+GAPI_EXPORTS GMat div(const GMat& src1, const GMat& src2, double scale, int ddepth = -1);
+
+/** @brief Divides matrix by scalar.
+
+The function divC divides each element of matrix src by given scalar value:
+
+\f[\texttt{dst(I) = saturate(src(I)*scale/divisor)}\f]
+
+When divisor is zero, dst(I) will also be zero. Different channels of
+multi-channel matrices are processed independently.
+The matrices can be single or multi channel. Output matrix must have the same size and depth as src.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.divC"
+@param src input matrix.
+@param divisor number to be divided by.
+@param ddepth optional depth of the output matrix. If -1, the depth of output matrix will be the same as input matrix depth.
+@param scale scale factor.
+@sa add, sub, div, addWeighted
+*/
+GAPI_EXPORTS GMat divC(const GMat& src, const GScalar& divisor, double scale, int ddepth = -1);
+
+/** @brief Divides scalar by matrix.
+
+The function divRC divides a given scalar by each element of matrix src and keeps the division result in a new matrix of the same size and type as src:
+
+\f[\texttt{dst(I) = saturate(divident*scale/src(I))}\f]
+
+When src(I) is zero, dst(I) will also be zero. Different channels of
+multi-channel matrices are processed independently.
+The matrices can be single or multi channel. Output matrix must have the same size and depth as src.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.divRC"
+@param src input matrix.
+@param divident number to be divided.
+@param ddepth optional depth of the output matrix. If -1, the depth of output matrix will be the same as input matrix depth.
+@param scale scale factor
+@sa add, sub, div, addWeighted
+*/
+GAPI_EXPORTS GMat divRC(const GScalar& divident, const GMat& src, double scale, int ddepth = -1);
+
+/** @brief Applies a mask to a matrix.
+
+The function mask sets the value from the given matrix if the corresponding pixel value in the mask matrix is set to true,
+and sets the matrix value to 0 otherwise.
+
+Supported src matrix data types are @ref CV_8UC1, @ref CV_16SC1, @ref CV_16UC1. Supported mask data type is @ref CV_8UC1.
+
+@note Function textual ID is "org.opencv.core.math.mask"
+@param src input matrix.
+@param mask input mask matrix.
+*/
+GAPI_EXPORTS GMat mask(const GMat& src, const GMat& mask);
+
+/** @brief Calculates an average (mean) of matrix elements.
+
+The function mean calculates the mean value M of matrix elements,
+independently for each channel, and returns it.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.math.mean"
+@param src input matrix.
+*/
+GAPI_EXPORTS GScalar mean(const GMat& src);
+
+/** @brief Calculates x and y coordinates of 2D vectors from their magnitude and angle.
+
+The function polarToCart calculates the Cartesian coordinates of each 2D
+vector represented by the corresponding elements of magnitude and angle:
+\f[\begin{array}{l} \texttt{x} (I) =  \texttt{magnitude} (I) \cos ( \texttt{angle} (I)) \\ \texttt{y} (I) =  \texttt{magnitude} (I) \sin ( \texttt{angle} (I)) \\ \end{array}\f]
+
+The relative accuracy of the estimated coordinates is about 1e-6.
+
+First output is a matrix of x-coordinates of 2D vectors.
+Second output is a matrix of y-coordinates of 2D vectors.
+Both outputs must have the same size and depth as the input matrices.
+
+@note Function textual ID is "org.opencv.core.math.polarToCart"
+
+@param magnitude input floating-point @ref CV_32FC1 matrix (1xN) of magnitudes of 2D vectors;
+@param angle input floating-point @ref CV_32FC1 matrix (1xN) of angles of 2D vectors.
+@param angleInDegrees when true, the input angles are measured in
+degrees, otherwise, they are measured in radians.
+@sa cartToPolar, exp, log, pow, sqrt
+*/
+GAPI_EXPORTS std::tuple<GMat, GMat> polarToCart(const GMat& magnitude, const GMat& angle,
+                                              bool angleInDegrees = false);
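+
+// An illustrative sketch -- the two outputs are unpacked with std::tie
+// during graph construction:
+//   cv::GMat mag, ang, x, y;
+//   std::tie(x, y) = cv::gapi::polarToCart(mag, ang);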
+
+/** @brief Calculates the magnitude and angle of 2D vectors.
+
+The function cartToPolar calculates either the magnitude, angle, or both
+for every 2D vector (x(I),y(I)):
+\f[\begin{array}{l} \texttt{magnitude} (I)= \sqrt{\texttt{x}(I)^2+\texttt{y}(I)^2} , \\ \texttt{angle} (I)= \texttt{atan2} ( \texttt{y} (I), \texttt{x} (I))[ \cdot180 / \pi ] \end{array}\f]
+
+The angles are calculated with accuracy about 0.3 degrees. For the point
+(0,0), the angle is set to 0.
+
+First output is a matrix of magnitudes of the same size and depth as input x.
+Second output is a matrix of angles that has the same size and depth as
+x; the angles are measured in radians (from 0 to 2\*Pi) or in degrees (0 to 360 degrees).
+
+@note Function textual ID is "org.opencv.core.math.cartToPolar"
+
+@param x matrix of @ref CV_32FC1 x-coordinates.
+@param y array of @ref CV_32FC1 y-coordinates.
+@param angleInDegrees a flag, indicating whether the angles are measured
+in radians (which is by default), or in degrees.
+@sa polarToCart
+*/
+GAPI_EXPORTS std::tuple<GMat, GMat> cartToPolar(const GMat& x, const GMat& y,
+                                              bool angleInDegrees = false);
+
+/** @brief Calculates the rotation angle of 2D vectors.
+
+The function cv::phase calculates the rotation angle of each 2D vector that
+is formed from the corresponding elements of x and y :
+\f[\texttt{angle} (I) =  \texttt{atan2} ( \texttt{y} (I), \texttt{x} (I))\f]
+
+The angle estimation accuracy is about 0.3 degrees. When x(I)=y(I)=0 ,
+the corresponding angle(I) is set to 0.
+@param x input floating-point array of x-coordinates of 2D vectors.
+@param y input array of y-coordinates of 2D vectors; it must have the
+same size and the same type as x.
+@param angleInDegrees when true, the function calculates the angle in
+degrees, otherwise, they are measured in radians.
+@return array of vector angles; it has the same size and same type as x.
+*/
+GAPI_EXPORTS GMat phase(const GMat& x, const GMat &y, bool angleInDegrees = false);
+
+/** @brief Calculates a square root of array elements.
+
+The function cv::gapi::sqrt calculates a square root of each input array element.
+In case of multi-channel arrays, each channel is processed
+independently. The accuracy is approximately the same as of the built-in
+std::sqrt .
+@param src input floating-point array.
+@return output array of the same size and type as src.
+*/
+GAPI_EXPORTS GMat sqrt(const GMat &src);
+
+//! @} gapi_math
+//!
+//! @addtogroup gapi_pixelwise
+//! @{
+
+/** @brief Performs the per-element comparison of two matrices checking if elements from the first matrix are greater compared to elements in the second.
+
+The function compares elements of two matrices src1 and src2 of the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  > \texttt{src2} (I)\f]
+
+When the comparison result is true, the corresponding element of output
+array is set to 255. The comparison operations can be replaced with the
+equivalent matrix expressions:
+\f[\texttt{dst} =   \texttt{src1} > \texttt{src2}\f]
+
+Output matrix of depth @ref CV_8U must have the same size and the same number of channels as
+    the input matrices/matrix.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpGT"
+@param src1 first input matrix.
+@param src2 second input matrix/scalar of the same depth as first input matrix.
+@sa min, max, threshold, cmpLE, cmpGE, cmpLT
+*/
+GAPI_EXPORTS GMat cmpGT(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpGTScalar"
+*/
+GAPI_EXPORTS GMat cmpGT(const GMat& src1, const GScalar& src2);
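+
+// An illustrative sketch -- the CV_8U mask produced by a comparison can
+// feed mask-like operations downstream:
+//   cv::GMat in;
+//   cv::GMat above_mean = cv::gapi::cmpGT(in, cv::gapi::mean(in));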
+
+/** @brief Performs the per-element comparison of two matrices checking if elements from the first matrix are less than elements in the second.
+
+The function compares elements of two matrices src1 and src2 of the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  < \texttt{src2} (I)\f]
+
+When the comparison result is true, the corresponding element of output
+array is set to 255. The comparison operations can be replaced with the
+equivalent matrix expressions:
+    \f[\texttt{dst} =   \texttt{src1} < \texttt{src2}\f]
+
+Output matrix of depth @ref CV_8U must have the same size and the same number of channels as
+    the input matrices/matrix.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpLT"
+@param src1 first input matrix.
+@param src2 second input matrix/scalar of the same depth as first input matrix.
+@sa min, max, threshold, cmpLE, cmpGE, cmpGT
+*/
+GAPI_EXPORTS GMat cmpLT(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpLTScalar"
+*/
+GAPI_EXPORTS GMat cmpLT(const GMat& src1, const GScalar& src2);
+
+/** @brief Performs the per-element comparison of two matrices checking if elements from the first matrix are greater or equal compared to elements in the second.
+
+The function compares elements of two matrices src1 and src2 of the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  >= \texttt{src2} (I)\f]
+
+When the comparison result is true, the corresponding element of output
+array is set to 255. The comparison operations can be replaced with the
+equivalent matrix expressions:
+    \f[\texttt{dst} =   \texttt{src1} >= \texttt{src2}\f]
+
+Output matrix of depth @ref CV_8U must have the same size and the same number of channels as
+    the input matrices.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpGE"
+@param src1 first input matrix.
+@param src2 second input matrix/scalar of the same depth as first input matrix.
+@sa min, max, threshold, cmpLE, cmpGT, cmpLT
+*/
+GAPI_EXPORTS GMat cmpGE(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpLGEcalar"
+*/
+GAPI_EXPORTS GMat cmpGE(const GMat& src1, const GScalar& src2);
+
+/** @brief Performs the per-element comparison of two matrices checking if elements from the first matrix are less or equal compared to elements in the second.
+
+The function compares elements of two matrices src1 and src2 of the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  <=  \texttt{src2} (I)\f]
+
+When the comparison result is true, the corresponding element of output
+array is set to 255. The comparison operations can be replaced with the
+equivalent matrix expressions:
+    \f[\texttt{dst} =   \texttt{src1} <= \texttt{src2}\f]
+
+Output matrix of depth @ref CV_8U must have the same size and the same number of channels as
+    the input matrices.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpLE"
+@param src1 first input matrix.
+@param src2 second input matrix/scalar of the same depth as first input matrix.
+@sa min, max, threshold, cmpGT, cmpGE, cmpLT
+*/
+GAPI_EXPORTS GMat cmpLE(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpLEScalar"
+*/
+GAPI_EXPORTS GMat cmpLE(const GMat& src1, const GScalar& src2);
+
+/** @brief Performs the per-element comparison of two matrices checking if elements from the first matrix are equal to elements in the second.
+
+The function compares elements of two matrices src1 and src2 of the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  ==  \texttt{src2} (I)\f]
+
+When the comparison result is true, the corresponding element of output
+array is set to 255. The comparison operations can be replaced with the
+equivalent matrix expressions:
+    \f[\texttt{dst} =   \texttt{src1} == \texttt{src2}\f]
+
+Output matrix of depth @ref CV_8U must have the same size and the same number of channels as
+    the input matrices.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpEQ"
+@param src1 first input matrix.
+@param src2 second input matrix/scalar of the same depth as first input matrix.
+@sa min, max, threshold, cmpNE
+*/
+GAPI_EXPORTS GMat cmpEQ(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpEQScalar"
+*/
+GAPI_EXPORTS GMat cmpEQ(const GMat& src1, const GScalar& src2);
+
+/** @brief Performs the per-element comparison of two matrices, checking if elements from the first matrix are not equal to elements in the second.
+
+The function compares elements of two matrices src1 and src2 of the same size:
+    \f[\texttt{dst} (I) =  \texttt{src1} (I)  !=  \texttt{src2} (I)\f]
+
+When the comparison result is true, the corresponding element of output
+array is set to 255. The comparison operations can be replaced with the
+equivalent matrix expressions:
+    \f[\texttt{dst} =   \texttt{src1} != \texttt{src2}\f]
+
+Output matrix of depth @ref CV_8U must have the same size and the same number of channels as
+    the input matrices.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpNE"
+@param src1 first input matrix.
+@param src2 second input matrix/scalar of the same depth as first input matrix.
+@sa min, max, threshold, cmpEQ
+*/
+GAPI_EXPORTS GMat cmpNE(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.cmpNEScalar"
+*/
+GAPI_EXPORTS GMat cmpNE(const GMat& src1, const GScalar& src2);
+
+/** @brief computes bitwise conjunction of two matrices (src1 & src2)
+Calculates the per-element bit-wise logical conjunction of two matrices of the same size.
+
+In case of floating-point matrices, their machine-specific bit
+representations (usually IEEE754-compliant) are used for the operation.
+In case of multi-channel matrices, each channel is processed
+independently. Output matrix must have the same size and depth as the input
+matrices.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.bitwise_and"
+
+@param src1 first input matrix.
+@param src2 second input matrix.
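+
+An illustrative masking sketch (assumes mask holds 0/255 values and has the same type as img):
+@code{.cpp}
+    cv::GMat img, mask;
+    cv::GMat masked = cv::gapi::bitwise_and(img, mask); // zero out pixels outside the mask
+@endcode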
+*/
+GAPI_EXPORTS GMat bitwise_and(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.bitwise_andS"
+@param src1 first input matrix.
+@param src2 scalar, whose value will be bitwise ANDed with each element of src1.
+*/
+GAPI_EXPORTS GMat bitwise_and(const GMat& src1, const GScalar& src2);
+
+/** @brief computes bitwise disjunction of two matrices (src1 | src2)
+Calculates the per-element bit-wise logical disjunction of two matrices of the same size.
+
+In case of floating-point matrices, their machine-specific bit
+representations (usually IEEE754-compliant) are used for the operation.
+In case of multi-channel matrices, each channel is processed
+independently. Output matrix must have the same size and depth as the input
+matrices.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.bitwise_or"
+
+@param src1 first input matrix.
+@param src2 second input matrix.
+*/
+GAPI_EXPORTS GMat bitwise_or(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.bitwise_orS"
+@param src1 first input matrix.
+@param src2 scalar, whose value will be bitwise ORed with each element of src1.
+*/
+GAPI_EXPORTS GMat bitwise_or(const GMat& src1, const GScalar& src2);
+
+
+/** @brief computes bitwise logical "exclusive or" of the two matrixes (src1 ^ src2)
+Calculates the per-element bit-wise logical "exclusive or" of two matrices of the same size.
+
+In case of floating-point matrices, their machine-specific bit
+representations (usually IEEE754-compliant) are used for the operation.
+In case of multi-channel matrices, each channel is processed
+independently. Output matrix must have the same size and depth as the input
+matrices.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.bitwise_xor"
+
+@param src1 first input matrix.
+@param src2 second input matrix.
+*/
+GAPI_EXPORTS GMat bitwise_xor(const GMat& src1, const GMat& src2);
+/** @overload
+@note Function textual ID is "org.opencv.core.pixelwise.compare.bitwise_xorS"
+@param src1 first input matrix.
+@param src2 scalar, whose value will be bitwise XORed with each element of src1.
+*/
+GAPI_EXPORTS GMat bitwise_xor(const GMat& src1, const GScalar& src2);
+
+
+/** @brief Inverts every bit of an array.
+The function bitwise_not calculates per-element bit-wise inversion of the input
+matrix:
+\f[\texttt{dst} (I) =  \neg \texttt{src} (I)\f]
+
+In case of floating-point matrices, their machine-specific bit
+representations (usually IEEE754-compliant) are used for the operation.
+In case of multi-channel matrices, each channel is processed
+independently. Output matrix must have the same size and depth as the input
+matrix.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.bitwise_not"
+
+@param src input matrix.
+*/
+GAPI_EXPORTS GMat bitwise_not(const GMat& src);
+
+/** @brief Selects values from either the first or the second input matrix by the given mask.
+The function sets each element of the output matrix to the value from the first input matrix if the corresponding
+element of the mask matrix is 255, or to the value from the second input matrix if the mask element is 0.
+
+The input mask matrix must be of @ref CV_8UC1 type; the two other input matrices and the output matrix should be of the same type.
+The size should be the same for all input and output matrices.
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.pixelwise.select"
+
+@param src1 first input matrix.
+@param src2 second input matrix.
+@param mask mask input matrix.
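+
+A minimal sketch (illustrative only) of a conditional copy expressed in a graph:
+@code{.cpp}
+    cv::GMat a, b, m;
+    cv::GMat out = cv::gapi::select(a, b, m); // out(I) = (m(I) == 255) ? a(I) : b(I)
+@endcode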
+*/
+GAPI_EXPORTS GMat select(const GMat& src1, const GMat& src2, const GMat& mask);
+
+//! @} gapi_pixelwise
+
+
+//! @addtogroup gapi_matrixop
+//! @{
+/** @brief Calculates per-element minimum of two matrices.
+
+The function min calculates the per-element minimum of two matrices of the same size, number of channels and depth:
+\f[\texttt{dst} (I)= \min ( \texttt{src1} (I), \texttt{src2} (I))\f]
+    where I is a multi-dimensional index of matrix elements. In case of
+    multi-channel matrices, each channel is processed independently.
+Output matrix must be of the same size and depth as src1.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.min"
+@param src1 first input matrix.
+@param src2 second input matrix of the same size and depth as src1.
+@sa max, compareEqual, compareLess, compareLessEqual
+*/
+GAPI_EXPORTS GMat min(const GMat& src1, const GMat& src2);
+
+/** @brief Calculates per-element maximum of two matrices.
+
+The function max calculates the per-element maximum of two matrices of the same size, number of channels and depth:
+\f[\texttt{dst} (I)= \max ( \texttt{src1} (I), \texttt{src2} (I))\f]
+    where I is a multi-dimensional index of matrix elements. In case of
+    multi-channel matrices, each channel is processed independently.
+Output matrix must be of the same size and depth as src1.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.max"
+@param src1 first input matrix.
+@param src2 second input matrix of the same size and depth as src1.
+@sa min, compare, compareEqual, compareGreater, compareGreaterEqual
+*/
+GAPI_EXPORTS GMat max(const GMat& src1, const GMat& src2);
+
+/** @brief Calculates the per-element absolute difference between two matrices.
+
+The function absDiff calculates absolute difference between two matrices of the same size and depth:
+    \f[\texttt{dst}(I) =  \texttt{saturate} (| \texttt{src1}(I) -  \texttt{src2}(I)|)\f]
+    where I is a multi-dimensional index of matrix elements. In case of
+    multi-channel matrices, each channel is processed independently.
+Output matrix must have the same size and depth as input matrices.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.absdiff"
+@param src1 first input matrix.
+@param src2 second input matrix.
+@sa abs
+*/
+GAPI_EXPORTS GMat absDiff(const GMat& src1, const GMat& src2);
+
+/** @brief Calculates the per-element absolute difference between matrix elements and a given scalar.
+
+The function absDiffC calculates the absolute difference between matrix elements and a given scalar value:
+    \f[\texttt{dst}(I) =  \texttt{saturate} (| \texttt{src1}(I) -  \texttt{matC}(I)|)\f]
+    where matC is constructed from the given scalar c and has the same size and depth as the input matrix src.
+
+Output matrix must be of the same size and depth as src.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.absdiffC"
+@param src input matrix.
+@param c scalar to be subtracted.
+@sa min, max
+*/
+GAPI_EXPORTS GMat absDiffC(const GMat& src, const GScalar& c);
+
+/** @brief Calculates sum of all matrix elements.
+
+The function sum calculates the sum of all matrix elements, independently for each channel.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.sum"
+@param src input matrix.
+@sa min, max
+*/
+GAPI_EXPORTS GScalar sum(const GMat& src);
+
+/** @brief Calculates the weighted sum of two matrices.
+
+The function addWeighted calculates the weighted sum of two matrices as follows:
+\f[\texttt{dst} (I)= \texttt{saturate} ( \texttt{src1} (I)* \texttt{alpha} +  \texttt{src2} (I)* \texttt{beta} +  \texttt{gamma} )\f]
+where I is a multi-dimensional index of array elements. In case of multi-channel matrices, each
+channel is processed independently.
+
+The function can be replaced with a matrix expression:
+    \f[\texttt{dst}(I) =  \texttt{alpha} * \texttt{src1}(I) + \texttt{beta} * \texttt{src2}(I) + \texttt{gamma} \f]
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.addweighted"
+@param src1 first input matrix.
+@param alpha weight of the first matrix elements.
+@param src2 second input matrix of the same size and channel number as src1.
+@param beta weight of the second matrix elements.
+@param gamma scalar added to each sum.
+@param ddepth optional depth of the output matrix.
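+
+An illustrative blending sketch (the 0.7/0.3 weights are arbitrary):
+@code{.cpp}
+    cv::GMat a, b;
+    cv::GMat blended = cv::gapi::addWeighted(a, 0.7, b, 0.3, 0.0); // 0.7*a(I) + 0.3*b(I)
+@endcode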
+@sa  add, sub
+*/
+GAPI_EXPORTS GMat addWeighted(const GMat& src1, double alpha, const GMat& src2, double beta, double gamma, int ddepth = -1);
+
+/** @brief Calculates the  absolute L1 norm of a matrix.
+
+This version of normL1 calculates the absolute L1 norm of src.
+
+As an example, for one array consider the function \f$r(x)= \begin{pmatrix} x \\ 1-x \end{pmatrix}, x \in [-1;1]\f$.
+The \f$ L_{1} \f$ norm for the sample value \f$r(-1) = \begin{pmatrix} -1 \\ 2 \end{pmatrix}\f$
+is calculated as follows
+\f{align*}
+    \| r(-1) \|_{L_1} &= |-1| + |2| = 3 \\
+\f}
+and for \f$r(0.5) = \begin{pmatrix} 0.5 \\ 0.5 \end{pmatrix}\f$ the calculation is
+\f{align*}
+    \| r(0.5) \|_{L_1} &= |0.5| + |0.5| = 1 \\
+\f}
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.norml1"
+@param src input matrix.
+@sa normL2, normInf
+*/
+GAPI_EXPORTS GScalar normL1(const GMat& src);
+
+/** @brief Calculates the absolute L2 norm of a matrix.
+
+This version of normL2 calculates the absolute L2 norm of src.
+
+As an example, for one array consider the function \f$r(x)= \begin{pmatrix} x \\ 1-x \end{pmatrix}, x \in [-1;1]\f$.
+The \f$ L_{2} \f$  norm for the sample value \f$r(-1) = \begin{pmatrix} -1 \\ 2 \end{pmatrix}\f$
+is calculated as follows
+\f{align*}
+    \| r(-1) \|_{L_2} &= \sqrt{(-1)^{2} + (2)^{2}} = \sqrt{5} \\
+\f}
+and for \f$r(0.5) = \begin{pmatrix} 0.5 \\ 0.5 \end{pmatrix}\f$ the calculation is
+\f{align*}
+    \| r(0.5) \|_{L_2} &= \sqrt{(0.5)^{2} + (0.5)^{2}} = \sqrt{0.5} \\
+\f}
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+@note Function textual ID is "org.opencv.core.matrixop.norml2"
+@param src input matrix.
+@sa normL1, normInf
+*/
+GAPI_EXPORTS GScalar normL2(const GMat& src);
+
+/** @brief Calculates the absolute infinity norm of a matrix.
+
+This version of normInf calculates the absolute infinity norm of src.
+
+As an example, for one array consider the function \f$r(x)= \begin{pmatrix} x \\ 1-x \end{pmatrix}, x \in [-1;1]\f$.
+The \f$ L_{\infty} \f$ norm for the sample value \f$r(-1) = \begin{pmatrix} -1 \\ 2 \end{pmatrix}\f$
+is calculated as follows
+\f{align*}
+    \| r(-1) \|_{L_\infty} &= \max(|-1|,|2|) = 2
+\f}
+and for \f$r(0.5) = \begin{pmatrix} 0.5 \\ 0.5 \end{pmatrix}\f$ the calculation is
+\f{align*}
+    \| r(0.5) \|_{L_\infty} &= \max(|0.5|,|0.5|) = 0.5.
+\f}
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.norminf"
+@param src input matrix.
+@sa normL1, normL2
+*/
+GAPI_EXPORTS GScalar normInf(const GMat& src);
+
+/** @brief Calculates the integral of an image.
+
+The function calculates one or more integral images for the source image as follows:
+
+\f[\texttt{sum} (X,Y) =  \sum _{x<X,y<Y}  \texttt{image} (x,y)\f]
+
+\f[\texttt{sqsum} (X,Y) =  \sum _{x<X,y<Y}  \texttt{image} (x,y)^2\f]
+
+The function returns the integral image as a \f$(W+1)\times (H+1)\f$, 32-bit integer or floating-point (32f or 64f) array, and
+the integral image for squared pixel values as a \f$(W+1)\times (H+1)\f$, double-precision floating-point (64f) array.
+
+@note Function textual ID is "org.opencv.core.matrixop.integral"
+
+@param src input image.
+@param sdepth desired depth of the integral and the tilted integral images, CV_32S, CV_32F, or
+CV_64F.
+@param sqdepth desired depth of the integral image of squared pixel values, CV_32F or CV_64F.
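+
+A sketch of retrieving both outputs (the returned tuple is unpacked with std::tie; the depth values are examples):
+@code{.cpp}
+    cv::GMat in, s, sq;
+    std::tie(s, sq) = cv::gapi::integral(in, CV_32S, CV_64F);
+@endcode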
+ */
+GAPI_EXPORTS std::tuple<GMat, GMat> integral(const GMat& src, int sdepth = -1, int sqdepth = -1);
+
+/** @brief Applies a fixed-level threshold to each matrix element.
+
+The function applies fixed-level thresholding to a single- or multiple-channel matrix.
+The function is typically used to get a bi-level (binary) image out of a grayscale image (the cmp* functions could also be used for
+this purpose) or for removing noise, that is, filtering out pixels with too small or too large
+values. There are several types of thresholding supported by the function; they are determined by
+the depth parameter.
+
+Also, the special values cv::THRESH_OTSU or cv::THRESH_TRIANGLE may be combined with one of the
+above values. In these cases, the function determines the optimal threshold value using Otsu's
+or the Triangle algorithm and uses it instead of the specified thresh. The function returns the
+computed threshold value in addition to the thresholded matrix.
+Otsu's and the Triangle methods are implemented only for 8-bit matrices.
+
+The input image should be single-channel when the cv::THRESH_OTSU or cv::THRESH_TRIANGLE flag is used.
+Output matrix must be of the same size and depth as src.
+
+@note Function textual ID is "org.opencv.core.matrixop.threshold"
+
+@param src input matrix (@ref CV_8UC1, @ref CV_8UC3, or @ref CV_32FC1).
+@param thresh threshold value.
+@param maxval maximum value to use with the cv::THRESH_BINARY and cv::THRESH_BINARY_INV thresholding
+depths.
+@param depth thresholding depth (see the cv::ThresholdTypes).
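+
+An illustrative binarization sketch (the threshold and maximum values are arbitrary):
+@code{.cpp}
+    cv::GMat in;
+    cv::GMat bin = cv::gapi::threshold(in, cv::GScalar(127), cv::GScalar(255), cv::THRESH_BINARY);
+@endcode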
+
+@sa min, max, cmpGT, cmpLE, cmpGE, cmpLT
+ */
+GAPI_EXPORTS GMat threshold(const GMat& src, const GScalar& thresh, const GScalar& maxval, int depth);
+/** @overload
+This function is applicable for all threshold depths except CV_THRESH_OTSU and CV_THRESH_TRIANGLE.
+@note Function textual ID is "org.opencv.core.matrixop.thresholdOT"
+*/
+GAPI_EXPORTS std::tuple<GMat, GScalar> threshold(const GMat& src, const GScalar& maxval, int depth);
+
+/** @brief Applies a range-level threshold to each matrix element.
+
+The function applies range-level thresholding to a single- or multiple-channel matrix.
+It sets the output pixel value to 0xFF if the corresponding pixel value of the input matrix is in the specified range, or to 0 otherwise.
+
+Input and output matrices must be CV_8UC1.
+
+@note Function textual ID is "org.opencv.core.matrixop.inRange"
+
+@param src input matrix (CV_8UC1).
+@param threshLow lower boundary value.
+@param threshUp upper boundary value.
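+
+A sketch of extracting a brightness band (the boundary values are arbitrary):
+@code{.cpp}
+    cv::GMat in;
+    cv::GMat band = cv::gapi::inRange(in, cv::GScalar(50), cv::GScalar(200));
+@endcode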
+
+@sa threshold
+ */
+GAPI_EXPORTS GMat inRange(const GMat& src, const GScalar& threshLow, const GScalar& threshUp);
+
+//! @} gapi_matrixop
+
+//! @addtogroup gapi_transform
+//! @{
+/** @brief Resizes an image.
+
+The function resizes the image src down to or up to the specified size.
+
+Output image size will have the size dsize (when dsize is non-zero) or the size computed from
+src.size(), fx, and fy; the depth of output is the same as of src.
+
+If you want to resize src so that it fits the pre-created dst,
+you may call the function as follows:
+@code
+    // explicitly specify dsize=dst.size(); fx and fy will be computed from that.
+    resize(src, dst, dst.size(), 0, 0, interpolation);
+@endcode
+If you want to decimate the image by factor of 2 in each direction, you can call the function this
+way:
+@code
+    // specify fx and fy and let the function compute the destination image size.
+    resize(src, dst, Size(), 0.5, 0.5, interpolation);
+@endcode
+To shrink an image, it will generally look best with cv::INTER_AREA interpolation, whereas to
+enlarge an image, it will generally look best with cv::INTER_CUBIC (slow) or cv::INTER_LINEAR
+(faster but still looks OK).
+
+@note Function textual ID is "org.opencv.core.transform.resize"
+
+@param src input image.
+@param dsize output image size; if it equals zero, it is computed as:
+ \f[\texttt{dsize = Size(round(fx*src.cols), round(fy*src.rows))}\f]
+ Either dsize or both fx and fy must be non-zero.
+@param fx scale factor along the horizontal axis; when it equals 0, it is computed as
+\f[\texttt{(double)dsize.width/src.cols}\f]
+@param fy scale factor along the vertical axis; when it equals 0, it is computed as
+\f[\texttt{(double)dsize.height/src.rows}\f]
+@param interpolation interpolation method, see cv::InterpolationFlags
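+
+Unlike the cv::resize-style snippets above, in G-API the result is returned as a new GMat; an illustrative sketch:
+@code{.cpp}
+    cv::GMat in;
+    cv::GMat half = cv::gapi::resize(in, cv::Size(), 0.5, 0.5, cv::INTER_AREA); // decimate by 2x
+@endcode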
+
+@sa  warpAffine, warpPerspective, remap
+ */
+GAPI_EXPORTS GMat resize(const GMat& src, const Size& dsize, double fx = 0, double fy = 0, int interpolation = INTER_LINEAR);
+
+/** @brief Creates one 3-channel (4-channel) matrix out of 3(4) single-channel ones.
+
+The function merges several matrices to make a single multi-channel matrix. That is, each
+element of the output matrix will be a concatenation of the elements of the input matrices, where
+the elements of the i-th input matrix become the i-th channel of the output.
+Input matrices must be of @ref CV_8UC1 type; the output matrix is of @ref CV_8UC3 (@ref CV_8UC4) type.
+
+The function split3/split4 does the reverse operation.
+
+@note Function textual ID for merge3 is "org.opencv.core.transform.merge3"
+@note Function textual ID for merge4 is "org.opencv.core.transform.merge4"
+
+@param src1 first input matrix to be merged
+@param src2 second input matrix to be merged
+@param src3 third input matrix to be merged
+@param src4 fourth input matrix to be merged
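+
+An illustrative sketch of assembling a 3-channel image from single-channel planes:
+@code{.cpp}
+    cv::GMat b, g, r;
+    cv::GMat bgr = cv::gapi::merge3(b, g, r);
+@endcode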
+@sa  split4, split3
+*/
+GAPI_EXPORTS GMat merge4(const GMat& src1, const GMat& src2, const GMat& src3, const GMat& src4);
+GAPI_EXPORTS GMat merge3(const GMat& src1, const GMat& src2, const GMat& src3);
+
+/** @brief Divides a 3-channel (4-channel) matrix into 3(4) single-channel matrices.
+
+The function splits a 3-channel (4-channel) matrix into 3(4) single-channel matrices:
+\f[\texttt{mv} [c](I) =  \texttt{src} (I)_c\f]
+
+All output matrices must be in @ref CV_8UC1.
+
+@note Function textual ID for split3 is "org.opencv.core.transform.split3"
+@note Function textual ID for split4 is "org.opencv.core.transform.split4"
+
+@param src input @ref CV_8UC4 (@ref CV_8UC3) matrix.
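+
+A sketch of unpacking the returned tuple with std::tie:
+@code{.cpp}
+    cv::GMat bgr, b, g, r;
+    std::tie(b, g, r) = cv::gapi::split3(bgr);
+@endcode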
+@sa merge3, merge4
+*/
+GAPI_EXPORTS std::tuple<GMat, GMat, GMat,GMat> split4(const GMat& src);
+GAPI_EXPORTS std::tuple<GMat, GMat, GMat> split3(const GMat& src);
+
+/** @brief Applies a generic geometrical transformation to an image.
+
+The function remap transforms the source image using the specified map:
+
+\f[\texttt{dst} (x,y) =  \texttt{src} (map_x(x,y),map_y(x,y))\f]
+
+where values of pixels with non-integer coordinates are computed using one of available
+interpolation methods. \f$map_x\f$ and \f$map_y\f$ can be encoded as separate floating-point maps
+in \f$map_1\f$ and \f$map_2\f$ respectively, or interleaved floating-point maps of \f$(x,y)\f$ in
+\f$map_1\f$, or fixed-point maps created by using convertMaps. The reason you might want to
+convert from floating to fixed-point representations of a map is that they can yield much faster
+(\~2x) remapping operations. In the converted case, \f$map_1\f$ contains pairs (cvFloor(x),
+cvFloor(y)) and \f$map_2\f$ contains indices in a table of interpolation coefficients.
+Output image must be of the same size and depth as the input one.
+
+@note Function textual ID is "org.opencv.core.transform.remap"
+
+@param src Source image.
+@param map1 The first map of either (x,y) points or just x values having the type CV_16SC2,
+CV_32FC1, or CV_32FC2.
+@param map2 The second map of y values having the type CV_16UC1, CV_32FC1, or none (empty map
+if map1 is (x,y) points), respectively.
+@param interpolation Interpolation method (see cv::InterpolationFlags). The method INTER_AREA is
+not supported by this function.
+@param borderMode Pixel extrapolation method (see cv::BorderTypes). When
+borderMode=BORDER_TRANSPARENT, it means that the pixels in the destination image that
+corresponds to the "outliers" in the source image are not modified by the function.
+@param borderValue Value used in case of a constant border. By default, it is 0.
+@note
+Due to current implementation limitations, the sizes of the input and output images should be less than 32767x32767.
+ */
+GAPI_EXPORTS GMat remap(const GMat& src, const Mat& map1, const Mat& map2,
+                      int interpolation, int borderMode = BORDER_CONSTANT,
+                      const Scalar& borderValue = Scalar());
+
+/** @brief Flips a 2D matrix around vertical, horizontal, or both axes.
+
+The function flips the matrix in one of three different ways (row
+and column indices are 0-based):
+\f[\texttt{dst} _{ij} =
+\left\{
+\begin{array}{l l}
+\texttt{src} _{\texttt{src.rows}-i-1,j} & if\;  \texttt{flipCode} = 0 \\
+\texttt{src} _{i, \texttt{src.cols} -j-1} & if\;  \texttt{flipCode} > 0 \\
+\texttt{src} _{ \texttt{src.rows} -i-1, \texttt{src.cols} -j-1} & if\; \texttt{flipCode} < 0 \\
+\end{array}
+\right.\f]
+The example scenarios of using the function are the following:
+*   Vertical flipping of the image (flipCode == 0) to switch between
+    top-left and bottom-left image origin. This is a typical operation
+    in video processing on Microsoft Windows\* OS.
+*   Horizontal flipping of the image with the subsequent horizontal
+    shift and absolute difference calculation to check for a
+    vertical-axis symmetry (flipCode \> 0).
+*   Simultaneous horizontal and vertical flipping of the image with
+    the subsequent shift and absolute difference calculation to check
+    for a central symmetry (flipCode \< 0).
+*   Reversing the order of point arrays (flipCode \> 0 or
+    flipCode == 0).
+Output image must be of the same depth as the input one; its size should be correct for the given flipCode.
+
+@note Function textual ID is "org.opencv.core.transform.flip"
+
+@param src input matrix.
+@param flipCode a flag to specify how to flip the array; 0 means
+flipping around the x-axis and positive value (for example, 1) means
+flipping around y-axis. Negative value (for example, -1) means flipping
+around both axes.
+@sa remap
+*/
+GAPI_EXPORTS GMat flip(const GMat& src, int flipCode);
+
+/** @brief Crops a 2D matrix.
+
+The function crops the matrix by given cv::Rect.
+
+Output matrix must be of the same depth as the input one; its size is specified by the given rect size.
+
+@note Function textual ID is "org.opencv.core.transform.crop"
+
+@param src input matrix.
+@param rect a rect to crop the matrix to.
+@sa resize
+*/
+GAPI_EXPORTS GMat crop(const GMat& src, const Rect& rect);
+
+/** @brief Applies horizontal concatenation to given matrices.
+
+The function horizontally concatenates two GMat matrices (with the same number of rows).
+@code{.cpp}
+    GMat A = { 1, 4,
+               2, 5,
+               3, 6 };
+    GMat B = { 7, 10,
+               8, 11,
+               9, 12 };
+
+    GMat C = gapi::concatHor(A, B);
+    //C:
+    //[1, 4, 7, 10;
+    // 2, 5, 8, 11;
+    // 3, 6, 9, 12]
+@endcode
+The output matrix must have the same number of rows and the same depth as src1 and src2, and its number of columns must be the sum of the columns of src1 and src2.
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.imgproc.transform.concatHor"
+
+@param src1 first input matrix to be considered for horizontal concatenation.
+@param src2 second input matrix to be considered for horizontal concatenation.
+@sa concatVert
+*/
+GAPI_EXPORTS GMat concatHor(const GMat& src1, const GMat& src2);
+
+/** @overload
+The function horizontally concatenates a given number of GMat matrices (with the same number of rows).
+The output matrix must have the same number of rows and the same depth as the input matrices, and its number of columns must be the sum of the columns of the input matrices.
+
+@param v vector of input matrices to be concatenated horizontally.
+*/
+GAPI_EXPORTS GMat concatHor(const std::vector<GMat> &v);
+
+/** @brief Applies vertical concatenation to given matrices.
+
+The function vertically concatenates two GMat matrices (with the same number of cols).
+ @code{.cpp}
+    GMat A = { 1, 7,
+               2, 8,
+               3, 9 };
+    GMat B = { 4, 10,
+               5, 11,
+               6, 12 };
+
+    GMat C = gapi::concatVert(A, B);
+    //C:
+    //[1, 7;
+    // 2, 8;
+    // 3, 9;
+    // 4, 10;
+    // 5, 11;
+    // 6, 12]
+ @endcode
+
+The output matrix must have the same number of columns and the same depth as src1 and src2, and its number of rows must be the sum of the rows of src1 and src2.
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+
+@note Function textual ID is "org.opencv.imgproc.transform.concatVert"
+
+@param src1 first input matrix to be considered for vertical concatenation.
+@param src2 second input matrix to be considered for vertical concatenation.
+@sa concatHor
+*/
+GAPI_EXPORTS GMat concatVert(const GMat& src1, const GMat& src2);
+
+/** @overload
+The function vertically concatenates a given number of GMat matrices (with the same number of columns).
+The output matrix must have the same number of columns and the same depth as the input matrices, and its number of rows must be the sum of the rows of the input matrices.
+
+@param v vector of input matrices to be concatenated vertically.
+*/
+GAPI_EXPORTS GMat concatVert(const std::vector<GMat> &v);
+
+
+/** @brief Performs a look-up table transform of a matrix.
+
+The function LUT fills the output matrix with values from the look-up table. Indices of the entries
+are taken from the input matrix. That is, the function processes each element of src as follows:
+\f[\texttt{dst} (I)  \leftarrow \texttt{lut(src(I))}\f]
+
+The supported matrix data type is @ref CV_8UC1.
+Output is a matrix of the same size and number of channels as src, and the same depth as lut.
+
+@note Function textual ID is "org.opencv.core.transform.LUT"
+
+@param src input matrix of 8-bit elements.
+@param lut look-up table of 256 elements; in case of multi-channel input array, the table should
+either have a single channel (in this case the same table is used for all channels) or the same
+number of channels as in the input matrix.
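+
+An illustrative sketch of applying an inversion table (the table contents are arbitrary):
+@code{.cpp}
+    cv::Mat lut(1, 256, CV_8UC1);
+    for (int i = 0; i < 256; i++) lut.at<uchar>(i) = static_cast<uchar>(255 - i);
+    cv::GMat in;
+    cv::GMat inverted = cv::gapi::LUT(in, lut);
+@endcode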
+*/
+GAPI_EXPORTS GMat LUT(const GMat& src, const Mat& lut);
+
+/** @brief Performs a 3D look-up table transform of a multi-channel matrix.
+
+The function LUT3D fills the output matrix with values from the look-up table. Indices of the entries
+are taken from the input matrix. Interpolation is applied to map values from the 0-255 range to the 0-16 range of the 3D LUT table.
+The function processes each element of src as follows:
+@code{.cpp}
+    dst[i][j][k] = lut3D[~src_r][~src_g][~src_b];
+@endcode
+where ~ means approximation.
+Output is a matrix of @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.core.transform.LUT3D"
+
+@param src input matrix of @ref CV_8UC3.
+@param lut3D look-up table of 17x17x17 3-channel elements.
+@param interpolation The interpolation method to be used.
+*/
+GAPI_EXPORTS GMat LUT3D(const GMat& src, const GMat& lut3D, int interpolation = INTER_NEAREST);
+
+/** @brief Converts a matrix to another data depth with optional scaling.
+
+The method converts source pixel values to the target data depth. saturate_cast\<\> is applied at
+the end to avoid possible overflows:
+
+\f[m(x,y) = saturate \_ cast<rType>( \alpha (*this)(x,y) +  \beta )\f]
+Output matrix must be of the same size as input one.
+
+@note Function textual ID is "org.opencv.core.transform.convertTo"
+@param src input matrix to be converted from.
+@param rdepth desired output matrix depth (the number of channels stays the same as in the input);
+if rdepth is negative, the output matrix will have the same depth as the input.
+@param alpha optional scale factor.
+@param beta optional delta added to the scaled values.
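+
+An illustrative sketch of converting 8-bit input to a [0, 1]-scaled floating-point matrix:
+@code{.cpp}
+    cv::GMat in;
+    cv::GMat f = cv::gapi::convertTo(in, CV_32F, 1.0 / 255);
+@endcode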
+ */
+GAPI_EXPORTS GMat convertTo(const GMat& src, int rdepth, double alpha=1, double beta=0);
+//! @} gapi_transform
+
+} //namespace gapi
+} //namespace cv
+
+#endif //OPENCV_GAPI_CORE_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/core.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/core.hpp
new file mode 100644 (file)
index 0000000..ec76fe5
--- /dev/null
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_CPU_CORE_API_HPP
+#define OPENCV_GAPI_CPU_CORE_API_HPP
+
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+#include "opencv2/gapi/own/exports.hpp" // GAPI_EXPORTS
+
+namespace cv {
+namespace gapi {
+namespace core {
+namespace cpu {
+
+GAPI_EXPORTS GKernelPackage kernels();
+
+} // namespace cpu
+} // namespace core
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_CPU_CORE_API_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/gcpukernel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/gcpukernel.hpp
new file mode 100644 (file)
index 0000000..facaab6
--- /dev/null
@@ -0,0 +1,266 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCPUKERNEL_HPP
+#define OPENCV_GAPI_GCPUKERNEL_HPP
+
+#include <functional>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <opencv2/core/mat.hpp>
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+#include <opencv2/gapi/garg.hpp>
+#include <opencv2/gapi/own/convert.hpp> //to_ocv
+#include <opencv2/gapi/util/compiler_hints.hpp> //suppress_unused_warning
+
+// FIXME: namespace scheme for backends?
+namespace cv {
+
+namespace gimpl
+{
+    // Forward-declare an internal class
+    class GCPUExecutable;
+} // namespace gimpl
+
+namespace gapi
+{
+namespace cpu
+{
+    /**
+     * \addtogroup gapi_std_backends
+     * @{
+     *
+     * @brief G-API backends available in this OpenCV version
+     *
+     * G-API backends play a cornerstone role in the G-API execution
+     * stack. Every backend is hardware-oriented and thus can run its
+     * kernels efficiently on the target platform.
+     *
+     * Backends are usually "black boxes" for G-API users -- on the API
+     * side, all backends are represented as different objects of the
+     * same class cv::gapi::GBackend. Users can work with backends
+     * mainly by specifying which kernels to use or where to look up
+     * kernels first.
+     *
+     * @sa @ref gapi_hld, cv::gapi::lookup_order()
+     */
+
+    /**
+     * @brief Get a reference to CPU (OpenCV) backend.
+     *
+     * This is the default backend in G-API at the moment, providing
+     * broader functional coverage but losing some graph model
+     * advantages. Provided mostly for reference and prototyping
+     * purposes.
+     *
+     * @sa gapi_std_backends
+     */
+    GAPI_EXPORTS cv::gapi::GBackend backend();
+    /** @} */
+} // namespace cpu
+} // namespace gapi
+
+// Represents arguments which are passed to a wrapped CPU function
+// FIXME: put into detail?
+class GAPI_EXPORTS GCPUContext
+{
+public:
+    // Generic accessor API
+    template<typename T>
+    const T& inArg(int input) { return m_args.at(input).get<T>(); }
+
+    // Syntax sugar
+    const cv::gapi::own::Mat&   inMat(int input);
+    cv::gapi::own::Mat&         outMatR(int output); // FIXME: Avoid cv::gapi::own::Mat m = ctx.outMatR()
+
+    const cv::gapi::own::Scalar& inVal(int input);
+    cv::gapi::own::Scalar& outValR(int output); // FIXME: Avoid cv::gapi::own::Scalar s = ctx.outValR()
+    template<typename T> std::vector<T>& outVecR(int output) // FIXME: the same issue
+    {
+        return outVecRef(output).wref<T>();
+    }
+
+protected:
+    detail::VectorRef& outVecRef(int output);
+
+    std::vector<GArg> m_args;
+
+    //FIXME: avoid converting arguments from the internal representation to the OpenCV one on each
+    //call to an OCV kernel. (This can be achieved by two one-time conversions in GCPUExecutable::run:
+    //once on entry for input and output arguments, and once before return for output arguments only.)
+    std::unordered_map<std::size_t, GRunArgP> m_results;
+
+    friend class gimpl::GCPUExecutable;
+};
+
+class GAPI_EXPORTS GCPUKernel
+{
+public:
+    // This function is the kernel's execution entry point (it does the processing work)
+    using F = std::function<void(GCPUContext &)>;
+
+    GCPUKernel();
+    explicit GCPUKernel(const F& f);
+
+    void apply(GCPUContext &ctx);
+
+protected:
+    F m_f;
+};
+
+// FIXME: This is an ugly ad-hoc implementation. TODO: refactor
+
+namespace detail
+{
+template<class T> struct get_in;
+template<> struct get_in<cv::GMat>
+{
+    static cv::Mat    get(GCPUContext &ctx, int idx) { return to_ocv(ctx.inMat(idx)); }
+};
+template<> struct get_in<cv::GScalar>
+{
+    static cv::Scalar get(GCPUContext &ctx, int idx) { return to_ocv(ctx.inVal(idx)); }
+};
+template<typename U> struct get_in<cv::GArray<U> >
+{
+    static const std::vector<U>& get(GCPUContext &ctx, int idx) { return ctx.inArg<VectorRef>(idx).rref<U>(); }
+};
+template<class T> struct get_in
+{
+    static T get(GCPUContext &ctx, int idx) { return ctx.inArg<T>(idx); }
+};
+
+struct tracked_cv_mat{
+    tracked_cv_mat(cv::gapi::own::Mat& m) : r{to_ocv(m)}, original_data{m.data} {}
+    cv::Mat r;
+    uchar* original_data;
+
+    operator cv::Mat& (){ return r;}
+    void validate() const{
+        if (r.data != original_data)
+        {
+            util::throw_error
+                (std::logic_error
+                 ("OpenCV kernel output parameter was reallocated. \n"
+                  "Incorrect meta data was provided ?"));
+        }
+    }
+};
+
+struct scalar_wrapper
+{
+    scalar_wrapper(cv::gapi::own::Scalar& s) : m_s{cv::gapi::own::to_ocv(s)}, m_org_s(s) {};
+    operator cv::Scalar& () { return m_s; }
+    void writeBack() const  { m_org_s = to_own(m_s); }
+
+    cv::Scalar m_s;
+    cv::gapi::own::Scalar& m_org_s;
+};
+
+template<typename... Outputs>
+void postprocess(Outputs&... outs)
+{
+    struct
+    {
+        void operator()(tracked_cv_mat* bm) { bm->validate();  }
+        void operator()(scalar_wrapper* sw) { sw->writeBack(); }
+        void operator()(...)                {                  }
+
+    } validate;
+    //dummy array to unfold parameter pack
+    int dummy[] = { 0, (validate(&outs), 0)... };
+    cv::util::suppress_unused_warning(dummy);
+}
+
+template<class T> struct get_out;
+template<> struct get_out<cv::GMat>
+{
+    static tracked_cv_mat get(GCPUContext &ctx, int idx)
+    {
+        auto& r = ctx.outMatR(idx);
+        return {r};
+    }
+};
+template<> struct get_out<cv::GScalar>
+{
+    static scalar_wrapper get(GCPUContext &ctx, int idx)
+    {
+        auto& s = ctx.outValR(idx);
+        return {s};
+    }
+};
+template<typename U> struct get_out<cv::GArray<U>>
+{
+    static std::vector<U>& get(GCPUContext &ctx, int idx)
+    {
+        return ctx.outVecR<U>(idx);
+    }
+};
+
+template<typename, typename, typename>
+struct OCVCallHelper;
+
+// FIXME: probably can be simplified with std::apply or analogue.
+template<typename Impl, typename... Ins, typename... Outs>
+struct OCVCallHelper<Impl, std::tuple<Ins...>, std::tuple<Outs...> >
+{
+    template<typename... Inputs>
+    struct call_and_postprocess
+    {
+        template<typename... Outputs>
+        static void call(Inputs&&... ins, Outputs&&... outs)
+        {
+            //not using std::forward on outs is deliberate in order to
+            //cause a compilation error by trying to bind rvalue references to lvalue references
+            Impl::run(std::forward<Inputs>(ins)..., outs...);
+
+            postprocess(outs...);
+        }
+    };
+
+    template<int... IIs, int... OIs>
+    static void call_impl(GCPUContext &ctx, detail::Seq<IIs...>, detail::Seq<OIs...>)
+    {
+        //Make sure that OpenCV kernels do not reallocate memory for output parameters
+        //by comparing their state (data ptr) before and after the call.
+        //This is done by converting each output Mat into a tracked_cv_mat object and binding
+        //them to the parameters of an ad-hoc function.
+        //own::Scalar is converted to cv::Scalar before the kernel is called, and the result is
+        //converted back from cv::Scalar to own::Scalar after the call to write the results back.
+        call_and_postprocess<decltype(get_in<Ins>::get(ctx, IIs))...>::call(get_in<Ins>::get(ctx, IIs)..., get_out<Outs>::get(ctx, OIs)...);
+    }
+
+    static void call(GCPUContext &ctx)
+    {
+        call_impl(ctx,
+                  typename detail::MkSeq<sizeof...(Ins)>::type(),
+                  typename detail::MkSeq<sizeof...(Outs)>::type());
+    }
+};
+
+} // namespace detail
+
+template<class Impl, class K>
+class GCPUKernelImpl: public detail::OCVCallHelper<Impl, typename K::InArgs, typename K::OutArgs>
+{
+    using P = detail::OCVCallHelper<Impl, typename K::InArgs, typename K::OutArgs>;
+
+public:
+    using API = K;
+
+    static cv::gapi::GBackend backend()  { return cv::gapi::cpu::backend(); }
+    static cv::GCPUKernel     kernel()   { return GCPUKernel(&P::call);     }
+};
+
+#define GAPI_OCV_KERNEL(Name, API) struct Name: public cv::GCPUKernelImpl<Name, API>
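+
+// A hedged usage sketch (assumes a kernel API GMyAdd with two GMat inputs and one GMat output,
+// declared elsewhere via G_TYPED_KERNEL). Note that the output Mat comes preallocated and must
+// not be reallocated -- tracked_cv_mat::validate() above would throw otherwise -- so the body
+// writes into it with cv::add() instead of assigning a new matrix:
+//
+//     GAPI_OCV_KERNEL(GOCVMyAdd, GMyAdd)
+//     {
+//         static void run(const cv::Mat &a, const cv::Mat &b, cv::Mat &out)
+//         {
+//             cv::add(a, b, out);
+//         }
+//     };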
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GCPUKERNEL_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/imgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/cpu/imgproc.hpp
new file mode 100644 (file)
index 0000000..0b96db0
--- /dev/null
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_CPU_IMGPROC_API_HPP
+#define OPENCV_GAPI_CPU_IMGPROC_API_HPP
+
+#include <opencv2/core/cvdef.h>     // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+
+namespace cv {
+namespace gapi {
+namespace imgproc {
+namespace cpu {
+
+GAPI_EXPORTS GKernelPackage kernels();
+
+} // namespace cpu
+} // namespace imgproc
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_CPU_IMGPROC_API_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/core.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/core.hpp
new file mode 100644 (file)
index 0000000..8c21f57
--- /dev/null
@@ -0,0 +1,20 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_FLUID_CORE_HPP
+#define OPENCV_GAPI_FLUID_CORE_HPP
+
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
+
+namespace cv { namespace gapi { namespace core { namespace fluid {
+
+GAPI_EXPORTS GKernelPackage kernels();
+
+}}}}
+
+#endif // OPENCV_GAPI_FLUID_CORE_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/gfluidbuffer.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/gfluidbuffer.hpp
new file mode 100644 (file)
index 0000000..8965ec7
--- /dev/null
@@ -0,0 +1,150 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_FLUID_BUFFER_HPP
+#define OPENCV_GAPI_FLUID_BUFFER_HPP
+
+#include <list>
+#include <numeric> // accumulate
+#include <ostream> // ostream
+#include <cstdint> // uint8_t
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/own/mat.hpp>
+#include <opencv2/gapi/gmat.hpp>
+
+#include "opencv2/gapi/util/optional.hpp"
+#include "opencv2/gapi/own/scalar.hpp"
+#include "opencv2/gapi/own/mat.hpp"
+
+namespace cv {
+namespace gapi {
+namespace fluid {
+
+struct Border
+{
+#if !defined(GAPI_STANDALONE)
+    // This constructor is required to support existing kernels which are part of G-API
+    Border(int _type, cv::Scalar _val) : type(_type), value(to_own(_val)) {};
+#endif // !defined(GAPI_STANDALONE)
+    Border(int _type, cv::gapi::own::Scalar _val) : type(_type), value(_val) {};
+    int type;
+    cv::gapi::own::Scalar value;
+};
+
+using BorderOpt = util::optional<Border>;
+
+bool operator == (const Border& b1, const Border& b2);
+
+class GAPI_EXPORTS Buffer;
+
+class GAPI_EXPORTS View
+{
+public:
+    struct Cache
+    {
+        std::vector<const uint8_t*> m_linePtrs;
+        GMatDesc m_desc;
+        int m_border_size = 0;
+
+        inline const uint8_t* linePtr(int index) const
+        {
+            return m_linePtrs[index + m_border_size];
+        }
+    };
+
+    View() = default;
+
+    const inline uint8_t* InLineB(int index) const // -(w-1)/2...0...+(w-1)/2 for Filters
+    {
+        return m_cache->linePtr(index);
+    }
+
+    template<typename T> const inline T* InLine(int i) const
+    {
+        const uint8_t* ptr = this->InLineB(i);
+        return reinterpret_cast<const T*>(ptr);
+    }
+
+    inline operator bool() const { return m_priv != nullptr; }
+    bool ready() const;
+    inline int length() const { return m_cache->m_desc.size.width; }
+    int y() const;
+
+    inline const GMatDesc& meta() const { return m_cache->m_desc; }
+
+    class GAPI_EXPORTS Priv;      // internal use only
+    Priv& priv();               // internal use only
+    const Priv& priv() const;   // internal use only
+
+    View(Priv* p);
+
+private:
+    std::shared_ptr<Priv> m_priv;
+    const Cache* m_cache;
+};
+
+class GAPI_EXPORTS Buffer
+{
+public:
+    struct Cache
+    {
+        std::vector<uint8_t*> m_linePtrs;
+        GMatDesc m_desc;
+    };
+
+    // Default constructor (executable creation stage,
+    // all following initialization performed in Priv::init())
+    Buffer();
+    // Scratch constructor (user kernels)
+    Buffer(const cv::GMatDesc &desc);
+
+    // Constructor for intermediate buffers (for tests)
+    Buffer(const cv::GMatDesc &desc,
+           int max_line_consumption, int border_size,
+           int skew,
+           int wlpi,
+           BorderOpt border);
+    // Constructor for in/out buffers (for tests)
+    Buffer(const cv::gapi::own::Mat &data, bool is_input);
+
+    inline uint8_t* OutLineB(int index = 0)
+    {
+        return m_cache->m_linePtrs[index];
+    }
+
+    template<typename T> inline T* OutLine(int index = 0)
+    {
+        uint8_t* ptr = this->OutLineB(index);
+        return reinterpret_cast<T*>(ptr);
+    }
+
+    int y() const;
+
+    int linesReady() const;
+    void debug(std::ostream &os) const;
+    inline int length() const { return m_cache->m_desc.size.width; }
+    int lpi() const;  // LPI for WRITER
+
+    inline const GMatDesc& meta() const { return m_cache->m_desc; }
+
+    View mkView(int borderSize, bool ownStorage);
+
+    class GAPI_EXPORTS Priv;      // internal use only
+    Priv& priv();               // internal use only
+    const Priv& priv() const;   // internal use only
+
+private:
+    std::shared_ptr<Priv> m_priv;
+    const Cache* m_cache;
+};
+
+} // namespace cv::gapi::fluid
+} // namespace cv::gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_FLUID_BUFFER_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/gfluidkernel.hpp
new file mode 100644 (file)
index 0000000..c71c5aa
--- /dev/null
@@ -0,0 +1,302 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_FLUID_KERNEL_HPP
+#define OPENCV_GAPI_FLUID_KERNEL_HPP
+
+#include <vector>
+#include <functional>
+#include <map>
+#include <unordered_map>
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+#include <opencv2/gapi/garg.hpp>
+#include <opencv2/gapi/own/types.hpp>
+
+#include <opencv2/gapi/fluid/gfluidbuffer.hpp>
+
+// FIXME: namespace scheme for backends?
+namespace cv {
+
+namespace gapi
+{
+namespace fluid
+{
+    /**
+     * \addtogroup gapi_std_backends G-API Standard backends
+     * @{
+     */
+    /**
+     * @brief Get a reference to Fluid backend.
+     *
+     * @sa gapi_std_backends
+     */
+    GAPI_EXPORTS cv::gapi::GBackend backend();
+    /** @} */
+} // namespace fluid
+} // namespace gapi
+
+
+class GAPI_EXPORTS GFluidKernel
+{
+public:
+    enum class Kind
+    {
+        Filter,
+        Resize
+    };
+
+    // This function is a generic "doWork" callback
+    using F = std::function<void(const cv::GArgs&, const std::vector<gapi::fluid::Buffer*> &)>;
+
+    // This function is a generic "initScratch" callback
+    using IS = std::function<void(const cv::GMetaArgs &, const cv::GArgs&, gapi::fluid::Buffer &)>;
+
+    // This function is a generic "resetScratch" callback
+    using RS = std::function<void(gapi::fluid::Buffer &)>;
+
+    // This function describes kernel metadata inference rule.
+    using M = std::function<GMetaArgs(const GMetaArgs &, const GArgs &)>;
+
+    // This function is a generic "getBorder" callback (extracts border-related data from kernel's input parameters)
+    using B = std::function<gapi::fluid::BorderOpt(const GMetaArgs&, const GArgs&)>;
+
+    // FIXME: move implementations out of header file
+    GFluidKernel() {}
+    GFluidKernel(int w, Kind k, int l, bool scratch, const F& f, const IS &is, const RS &rs, const B& b)
+        : m_window(w)
+        , m_kind(k)
+        , m_lpi(l)
+        , m_scratch(scratch)
+        , m_f(f)
+        , m_is(is)
+        , m_rs(rs)
+        , m_b(b) {}
+
+    int m_window = -1;
+    Kind m_kind;
+    const int  m_lpi     = -1;
+    const bool m_scratch = false;
+
+    const F    m_f;
+    const IS   m_is;
+    const RS   m_rs;
+    const B    m_b;
+};
+
+// FIXME!!!
+// This is the temporary and experimental API
+// which should be replaced by runtime roi-based scheduling
+struct GFluidOutputRois
+{
+    std::vector<cv::gapi::own::Rect> rois;
+};
+
+namespace detail
+{
+template<> struct CompileArgTag<GFluidOutputRois>
+{
+    static const char* tag() { return "gapi.fluid.outputRois"; }
+};
+} // namespace detail
+
+namespace detail
+{
+template<class T> struct fluid_get_in;
+template<> struct fluid_get_in<cv::GMat>
+{
+    static const cv::gapi::fluid::View& get(const cv::GArgs &in_args, int idx)
+    {
+        return in_args[idx].unsafe_get<cv::gapi::fluid::View>();
+    }
+};
+
+template<> struct fluid_get_in<cv::GScalar>
+{
+    // FIXME: change to return by reference when moved to own::Scalar
+#if !defined(GAPI_STANDALONE)
+    static const cv::Scalar get(const cv::GArgs &in_args, int idx)
+    {
+        return cv::gapi::own::to_ocv(in_args[idx].unsafe_get<cv::gapi::own::Scalar>());
+    }
+#else
+    static const cv::gapi::own::Scalar get(const cv::GArgs &in_args, int idx)
+    {
+        return in_args[idx].get<cv::gapi::own::Scalar>();
+    }
+#endif // !defined(GAPI_STANDALONE)
+};
+template<class T> struct fluid_get_in
+{
+    static const T& get(const cv::GArgs &in_args, int idx)
+    {
+        return in_args[idx].unsafe_get<T>();
+    }
+};
+
+template<bool, typename Impl, typename... Ins>
+struct scratch_helper;
+
+template<typename Impl, typename... Ins>
+struct scratch_helper<true, Impl, Ins...>
+{
+    // Init
+    template<int... IIs>
+    static void help_init_impl(const cv::GMetaArgs &metas,
+                               const cv::GArgs     &in_args,
+                               gapi::fluid::Buffer &scratch_buf,
+                               detail::Seq<IIs...>)
+    {
+        Impl::initScratch(get_in_meta<Ins>(metas, in_args, IIs)..., scratch_buf);
+    }
+
+    static void help_init(const cv::GMetaArgs &metas,
+                          const cv::GArgs     &in_args,
+                          gapi::fluid::Buffer &b)
+    {
+        help_init_impl(metas, in_args, b, typename detail::MkSeq<sizeof...(Ins)>::type());
+    }
+
+    // Reset
+    static void help_reset(gapi::fluid::Buffer &b)
+    {
+        Impl::resetScratch(b);
+    }
+};
+
+template<typename Impl, typename... Ins>
+struct scratch_helper<false, Impl, Ins...>
+{
+    static void help_init(const cv::GMetaArgs &,
+                          const cv::GArgs     &,
+                          gapi::fluid::Buffer &)
+    {
+        GAPI_Assert(false);
+    }
+    static void help_reset(gapi::fluid::Buffer &)
+    {
+        GAPI_Assert(false);
+    }
+};
+
+template<typename T> struct is_gmat_type
+{
+    static const constexpr bool value = std::is_same<cv::GMat, T>::value;
+};
+
+template<bool CallCustomGetBorder, typename Impl, typename... Ins>
+struct get_border_helper;
+
+template<typename Impl, typename... Ins>
+struct get_border_helper<true, Impl, Ins...>
+{
+    template<int... IIs>
+    static gapi::fluid::BorderOpt get_border_impl(const GMetaArgs &metas,
+                                                  const cv::GArgs &in_args,
+                                                  cv::detail::Seq<IIs...>)
+    {
+        return util::make_optional(Impl::getBorder(cv::detail::get_in_meta<Ins>(metas, in_args, IIs)...));
+    }
+
+    static gapi::fluid::BorderOpt help(const GMetaArgs &metas,
+                                       const cv::GArgs &in_args)
+    {
+        return get_border_impl(metas, in_args, typename detail::MkSeq<sizeof...(Ins)>::type());
+    }
+};
+
+template<typename Impl, typename... Ins>
+struct get_border_helper<false, Impl, Ins...>
+{
+    static gapi::fluid::BorderOpt help(const cv::GMetaArgs &,
+                                       const cv::GArgs     &)
+    {
+        return {};
+    }
+};
+
+template<typename, typename, typename, bool UseScratch>
+struct FluidCallHelper;
+
+template<typename Impl, typename... Ins, typename... Outs, bool UseScratch>
+struct FluidCallHelper<Impl, std::tuple<Ins...>, std::tuple<Outs...>, UseScratch>
+{
+    static_assert(all_satisfy<is_gmat_type, Outs...>::value, "return type must be GMat");
+
+    // Execution dispatcher ////////////////////////////////////////////////////
+    template<int... IIs, int... OIs>
+    static void call_impl(const cv::GArgs &in_args,
+                          const std::vector<gapi::fluid::Buffer*> &out_bufs,
+                          detail::Seq<IIs...>,
+                          detail::Seq<OIs...>)
+    {
+        Impl::run(fluid_get_in<Ins>::get(in_args, IIs)..., *out_bufs[OIs]...);
+    }
+
+    static void call(const cv::GArgs &in_args,
+                     const std::vector<gapi::fluid::Buffer*> &out_bufs)
+    {
+        constexpr int numOuts = (sizeof...(Outs)) + (UseScratch ? 1 : 0);
+        call_impl(in_args, out_bufs,
+                  typename detail::MkSeq<sizeof...(Ins)>::type(),
+                  typename detail::MkSeq<numOuts>::type());
+    }
+
+    // Scratch buffer initialization dispatcher ////////////////////////////////
+    static void init_scratch(const GMetaArgs &metas,
+                             const cv::GArgs &in_args,
+                             gapi::fluid::Buffer &b)
+    {
+        scratch_helper<UseScratch, Impl, Ins...>::help_init(metas, in_args, b);
+    }
+
+    // Scratch buffer reset dispatcher /////////////////////////////////////////
+    static void reset_scratch(gapi::fluid::Buffer &scratch_buf)
+    {
+        scratch_helper<UseScratch, Impl, Ins...>::help_reset(scratch_buf);
+    }
+
+    static gapi::fluid::BorderOpt getBorder(const GMetaArgs &metas, const cv::GArgs &in_args)
+    {
+        // User must provide "init" callback if Window != 1
+        // TODO: move to constexpr if when we enable C++17
+        constexpr bool callCustomGetBorder = (Impl::Window != 1);
+        return get_border_helper<callCustomGetBorder, Impl, Ins...>::help(metas, in_args);
+    }
+};
+} // namespace detail
+
+
+template<class Impl, class K, bool UseScratch>
+class GFluidKernelImpl
+{
+    static const int LPI = 1;
+    static const auto Kind = GFluidKernel::Kind::Filter;
+    using P = detail::FluidCallHelper<Impl, typename K::InArgs, typename K::OutArgs, UseScratch>;
+
+public:
+    using API = K;
+
+    static GFluidKernel kernel()
+    {
+        // FIXME: call() and getOutMeta() needs to be renamed so it is clear these
+        // functions are internal wrappers, not user API
+        return GFluidKernel(Impl::Window, Impl::Kind, Impl::LPI,
+                            UseScratch,
+                            &P::call, &P::init_scratch, &P::reset_scratch, &P::getBorder);
+    }
+
+    static cv::gapi::GBackend backend() { return cv::gapi::fluid::backend(); }
+};
+
+#define GAPI_FLUID_KERNEL(Name, API, Scratch) struct Name: public cv::GFluidKernelImpl<Name, API, Scratch>
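+
+// A hedged usage sketch (assumes a single-channel 8-bit kernel API GMyCopy declared elsewhere via
+// G_TYPED_KERNEL; Window == 1 means one input line is consumed to produce one output line):
+//
+//     GAPI_FLUID_KERNEL(GFluidMyCopy, GMyCopy, false)
+//     {
+//         static const int Window = 1;
+//         static void run(const cv::gapi::fluid::View &in, cv::gapi::fluid::Buffer &out)
+//         {
+//             const uint8_t *in_line  = in.InLineB(0);
+//             uint8_t       *out_line = out.OutLineB(0);
+//             for (int x = 0; x < out.length(); x++) out_line[x] = in_line[x];
+//         }
+//     };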
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_FLUID_KERNEL_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/imgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/fluid/imgproc.hpp
new file mode 100644 (file)
index 0000000..dedfa9d
--- /dev/null
@@ -0,0 +1,20 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_FLUID_IMGPROC_HPP
+#define OPENCV_GAPI_FLUID_IMGPROC_HPP
+
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+#include <opencv2/gapi/own/exports.hpp> // GAPI_EXPORTS
+
+namespace cv { namespace gapi { namespace imgproc { namespace fluid {
+
+GAPI_EXPORTS GKernelPackage kernels();
+
+}}}}
+
+#endif // OPENCV_GAPI_FLUID_IMGPROC_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/garg.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/garg.hpp
new file mode 100644 (file)
index 0000000..f8a3170
--- /dev/null
@@ -0,0 +1,126 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GARG_HPP
+#define OPENCV_GAPI_GARG_HPP
+
+#include <vector>
+#include <type_traits>
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include "opencv2/gapi/own/mat.hpp"
+
+#include "opencv2/gapi/util/any.hpp"
+#include "opencv2/gapi/util/variant.hpp"
+
+#include "opencv2/gapi/gmat.hpp"
+#include "opencv2/gapi/gscalar.hpp"
+#include "opencv2/gapi/garray.hpp"
+#include "opencv2/gapi/gtype_traits.hpp"
+#include "opencv2/gapi/gmetaarg.hpp"
+#include "opencv2/gapi/own/scalar.hpp"
+
+namespace cv {
+
+class GArg;
+
+namespace detail {
+    template<typename T>
+    using is_garg = std::is_same<GArg, typename std::decay<T>::type>;
+}
+
+// Parameter holder class for a node
+// Depending on platform capabilities, can either support arbitrary types
+// (as `boost::any`) or a limited number of types (as `boost::variant`).
+// FIXME: put into "details" as a user shouldn't use it in their code
+class GAPI_EXPORTS GArg
+{
+public:
+    GArg() {}
+
+    template<typename T, typename std::enable_if<!detail::is_garg<T>::value, int>::type = 0>
+    explicit GArg(const T &t)
+        : kind(detail::GTypeTraits<T>::kind)
+        , value(detail::wrap_gapi_helper<T>::wrap(t))
+    {
+    }
+
+    template<typename T, typename std::enable_if<!detail::is_garg<T>::value, int>::type = 0>
+    explicit GArg(T &&t)
+        : kind(detail::GTypeTraits<typename std::decay<T>::type>::kind)
+        , value(detail::wrap_gapi_helper<T>::wrap(t))
+    {
+    }
+
+    template<typename T> inline T& get()
+    {
+        return util::any_cast<typename std::remove_reference<T>::type>(value);
+    }
+
+    template<typename T> inline const T& get() const
+    {
+        return util::any_cast<typename std::remove_reference<T>::type>(value);
+    }
+
+    template<typename T> inline T& unsafe_get()
+    {
+        return util::unsafe_any_cast<typename std::remove_reference<T>::type>(value);
+    }
+
+    template<typename T> inline const T& unsafe_get() const
+    {
+        return util::unsafe_any_cast<typename std::remove_reference<T>::type>(value);
+    }
+
+    detail::ArgKind kind = detail::ArgKind::OPAQUE;
+
+protected:
+    util::any value;
+};
+
+using GArgs = std::vector<GArg>;
+
+// FIXME: Express as M<GProtoArg...>::type
+// FIXME: Move to a separate file!
+using GRunArg  = util::variant<
+#if !defined(GAPI_STANDALONE)
+    cv::Mat,
+    cv::Scalar,
+    cv::UMat,
+#endif // !defined(GAPI_STANDALONE)
+    cv::gapi::own::Mat,
+    cv::gapi::own::Scalar,
+    cv::detail::VectorRef
+    >;
+using GRunArgs = std::vector<GRunArg>;
+
+using GRunArgP = util::variant<
+#if !defined(GAPI_STANDALONE)
+    cv::Mat*,
+    cv::Scalar*,
+    cv::UMat*,
+#endif // !defined(GAPI_STANDALONE)
+    cv::gapi::own::Mat*,
+    cv::gapi::own::Scalar*,
+    cv::detail::VectorRef
+    >;
+using GRunArgsP = std::vector<GRunArgP>;
+
+
+template<typename... Ts> inline GRunArgs gin(const Ts&... args)
+{
+    return GRunArgs{ GRunArg(detail::wrap_host_helper<Ts>::wrap_in(args))... };
+}
+
+template<typename... Ts> inline GRunArgsP gout(Ts&... args)
+{
+    return GRunArgsP{ GRunArgP(detail::wrap_host_helper<Ts>::wrap_out(args))... };
+}
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GARG_HPP
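
A sketch of typical call-site usage of gin()/gout(); here `cc` is assumed to be
a valid cv::GCompiled object (see gcompiled.hpp later in this patch) with a
one-input, one-output protocol:

    cv::Mat input = cv::Mat::eye(64, 64, CV_8UC1);
    cv::Mat output;                       // may stay empty; G-API initializes it

    // Inputs are wrapped by value and outputs by pointer into variant vectors:
    cv::GRunArgs  ins  = cv::gin(input);
    cv::GRunArgsP outs = cv::gout(output);

    cc(std::move(ins), std::move(outs));  // generic arg-to-arg execution
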
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/garray.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/garray.hpp
new file mode 100644 (file)
index 0000000..87d0015
--- /dev/null
@@ -0,0 +1,251 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GARRAY_HPP
+#define OPENCV_GAPI_GARRAY_HPP
+
+#include <functional>
+#include <ostream>
+#include <vector>
+#include <memory>
+
+#include <opencv2/gapi/own/exports.hpp>
+#include <opencv2/gapi/opencv_includes.hpp>
+
+#include <opencv2/gapi/util/variant.hpp>
+#include <opencv2/gapi/util/throw.hpp>
+#include "opencv2/gapi/own/assert.hpp"
+
+namespace cv
+{
+// Forward declaration; GNode and GOrigin are internal
+// (user-inaccessible) classes.
+class GNode;
+struct GOrigin;
+
+template<typename T> class GArray;
+
+/**
+ * \addtogroup gapi_meta_args
+ * @{
+ */
+struct GArrayDesc
+{
+    // FIXME: Body
+    // FIXME: Also implement proper operator== then
+    bool operator== (const GArrayDesc&) const { return true; }
+};
+template<typename U> GArrayDesc descr_of(const std::vector<U> &) { return {};}
+static inline GArrayDesc empty_array_desc() {return {}; }
+/** @} */
+
+std::ostream& operator<<(std::ostream& os, const cv::GArrayDesc &desc);
+
+namespace detail
+{
+    // ConstructVec is a callback which stores information about T and is used by
+    // G-API runtime to construct arrays in host memory (T remains opaque for G-API).
+    // ConstructVec is carried into G-API internals by GArrayU.
+    // Currently it is suitable for Host (CPU) plugins only; real offload may require
+    // more information for manual memory allocation on-device.
+    class VectorRef;
+    using ConstructVec = std::function<void(VectorRef&)>;
+
+
+    // This class strips type information from GArray<T> and makes it usable
+    // in the G-API graph compiler (expression unrolling, graph generation, etc).
+    // Part of GProtoArg.
+    class GAPI_EXPORTS GArrayU
+    {
+    public:
+        GArrayU(const GNode &n, std::size_t out); // Operation result constructor
+
+        GOrigin& priv();                          // Internal use only
+        const GOrigin& priv() const;              // Internal use only
+
+    protected:
+        GArrayU();                                // Default constructor
+        template<class> friend class cv::GArray;  //  (available to GArray<T> only)
+
+        void setConstructFcn(ConstructVec &&cv);  // Store T-aware constructor
+
+        std::shared_ptr<GOrigin> m_priv;
+    };
+
+    // This class represents a typed STL vector reference.
+    // Depending on origins, this reference may be either "just a" reference to
+    // an object created externally, OR actually own the underlying object
+    // (be a value holder).
+    class BasicVectorRef
+    {
+    public:
+        std::size_t    m_elemSize = 0ul;
+        cv::GArrayDesc m_desc;
+        virtual ~BasicVectorRef() {}
+    };
+
+    template<typename T> class VectorRefT: public BasicVectorRef
+    {
+        using empty_t  = util::monostate;
+        using ro_ext_t = const std::vector<T> *;
+        using rw_ext_t =       std::vector<T> *;
+        using rw_own_t =       std::vector<T>  ;
+        util::variant<empty_t, ro_ext_t, rw_ext_t, rw_own_t> m_ref;
+
+        inline bool isEmpty() const { return util::holds_alternative<empty_t>(m_ref);  }
+        inline bool isROExt() const { return util::holds_alternative<ro_ext_t>(m_ref); }
+        inline bool isRWExt() const { return util::holds_alternative<rw_ext_t>(m_ref); }
+        inline bool isRWOwn() const { return util::holds_alternative<rw_own_t>(m_ref); }
+
+        void init(const std::vector<T>* vec = nullptr)
+        {
+            m_elemSize = sizeof(T);
+            if (vec) m_desc = cv::descr_of(*vec);
+        }
+
+    public:
+        VectorRefT() { init(); }
+        virtual ~VectorRefT() {}
+
+        explicit VectorRefT(const std::vector<T>& vec) : m_ref(&vec)      { init(&vec); }
+        explicit VectorRefT(std::vector<T>& vec)  : m_ref(&vec)           { init(&vec); }
+        explicit VectorRefT(std::vector<T>&& vec) : m_ref(std::move(vec)) { init(&vec); }
+
+        // Reset a VectorRefT. Called only for objects instantiated
+        // internally in G-API (e.g. temporary GArray<T>'s within a
+        // computation).  Reset here means both initialization
+        // (creating an object) and reset (discarding its existing
+        // content before the next execution).  Must never be called
+        // for external VectorRefTs.
+        void reset()
+        {
+            if (isEmpty())
+            {
+                std::vector<T> empty_vector;
+                m_desc = cv::descr_of(empty_vector);
+                m_ref  = std::move(empty_vector);
+                GAPI_Assert(isRWOwn());
+            }
+            else if (isRWOwn())
+            {
+                util::get<rw_own_t>(m_ref).clear();
+            }
+            else GAPI_Assert(false); // shouldn't be called in *EXT modes
+        }
+
+        // Obtain a WRITE reference to underlying object
+        // Used by CPU kernel API wrappers when a kernel execution frame
+        // is created
+        std::vector<T>& wref()
+        {
+            GAPI_Assert(isRWExt() || isRWOwn());
+            if (isRWExt()) return *util::get<rw_ext_t>(m_ref);
+            if (isRWOwn()) return  util::get<rw_own_t>(m_ref);
+            util::throw_error(std::logic_error("Impossible happened"));
+        }
+
+        // Obtain a READ reference to underlying object
+        // Used by CPU kernel API wrappers when a kernel execution frame
+        // is created
+        const std::vector<T>& rref() const
+        {
+            // ANY vector can be accessed for reading, even if it declared for
+            // output. Example -- a GComputation from [in] to [out1,out2]
+            // where [out2] is a result of operation applied to [out1]:
+            //
+            //            GComputation boundary
+            //            . . . . . . .
+            //            .           .
+            //     [in] ----> foo() ----> [out1]
+            //            .           .    :
+            //            .           . . .:. . .
+            //            .                V    .
+            //            .              bar() ---> [out2]
+            //            . . . . . . . . . . . .
+            //
+            if (isROExt()) return *util::get<ro_ext_t>(m_ref);
+            if (isRWExt()) return *util::get<rw_ext_t>(m_ref);
+            if (isRWOwn()) return  util::get<rw_own_t>(m_ref);
+            util::throw_error(std::logic_error("Impossible happened"));
+        }
+    };
+
+    // This class strips type information from VectorRefT<> and makes it usable
+    // in the G-API executables (carrying run-time data/information to kernels).
+    // Part of GRunArg.
+    // Its methods are typed proxies to VectorRefT<T>.
+    // VectorRef maintains "reference" semantics so two copies of VectorRef refer
+    // to the same underlying object.
+    // FIXME: Put a good explanation on why cv::OutputArray doesn't fit this role
+    class VectorRef
+    {
+        std::shared_ptr<BasicVectorRef> m_ref;
+
+        template<typename T> inline void check() const
+        {
+            GAPI_DbgAssert(dynamic_cast<VectorRefT<T>*>(m_ref.get()) != nullptr);
+            GAPI_Assert(sizeof(T) == m_ref->m_elemSize);
+        }
+
+    public:
+        VectorRef() = default;
+        template<typename T> explicit VectorRef(const std::vector<T>& vec) : m_ref(new VectorRefT<T>(vec)) {}
+        template<typename T> explicit VectorRef(std::vector<T>& vec)       : m_ref(new VectorRefT<T>(vec)) {}
+        template<typename T> explicit VectorRef(std::vector<T>&& vec)      : m_ref(new VectorRefT<T>(vec)) {}
+
+        template<typename T> void reset()
+        {
+            if (!m_ref) m_ref.reset(new VectorRefT<T>());
+
+            check<T>();
+            static_cast<VectorRefT<T>&>(*m_ref).reset();
+        }
+
+        template<typename T> std::vector<T>& wref()
+        {
+            check<T>();
+            return static_cast<VectorRefT<T>&>(*m_ref).wref();
+        }
+
+        template<typename T> const std::vector<T>& rref() const
+        {
+            check<T>();
+            return static_cast<VectorRefT<T>&>(*m_ref).rref();
+        }
+
+        cv::GArrayDesc descr_of() const
+        {
+            return m_ref->m_desc;
+        }
+    };
+} // namespace detail
+
+/** \addtogroup gapi_data_objects
+ * @{
+ */
+
+template<typename T> class GArray
+{
+public:
+    GArray() { putDetails(); }             // Empty constructor
+    explicit GArray(detail::GArrayU &&ref) // GArrayU-based constructor
+        : m_ref(ref) { putDetails(); }     //   (used by GCall, not for users)
+
+    detail::GArrayU strip() const { return m_ref; }
+
+private:
+    static void VCTor(detail::VectorRef& vref) { vref.reset<T>(); }
+    void putDetails() {m_ref.setConstructFcn(&VCTor); }
+
+    detail::GArrayU m_ref;
+};
+
+/** @} */
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GARRAY_HPP
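
On the user side, GArray<T> appears only in operation signatures, while
VectorRef and friends stay inside the framework. A sketch of an operation
producing a vector (the operation, its identifier, and its semantics are
illustrative; G_TYPED_KERNEL comes from gkernel.hpp, added later in this patch):

    // A hypothetical operation returning points detected in an image:
    G_TYPED_KERNEL(GFindCorners, <cv::GArray<cv::Point>(cv::GMat)>,
                   "sample.custom.find_corners")
    {
        static cv::GArrayDesc outMeta(cv::GMatDesc)
        {
            return cv::empty_array_desc();  // GArrayDesc carries no details yet
        }
    };

    // At run time the host-side container is a plain std::vector<cv::Point>;
    // it travels to and from kernels through the VectorRef machinery above.
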
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcall.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcall.hpp
new file mode 100644 (file)
index 0000000..baf4f44
--- /dev/null
@@ -0,0 +1,63 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCALL_HPP
+#define OPENCV_GAPI_GCALL_HPP
+
+#include "opencv2/gapi/garg.hpp"      // GArg
+#include "opencv2/gapi/gmat.hpp"      // GMat
+#include "opencv2/gapi/gscalar.hpp"   // GScalar
+#include "opencv2/gapi/garray.hpp"    // GArray<T>
+
+namespace cv {
+
+struct GKernel;
+
+// The whole idea of this class is to represent an operation
+// which is applied to arguments. This is part of public API,
+// since it is what users should use to define kernel interfaces.
+
+class GAPI_EXPORTS GCall final
+{
+public:
+    class Priv;
+
+    explicit GCall(const GKernel &k);
+    ~GCall();
+
+    template<typename... Ts>
+    GCall& pass(Ts&&... args)
+    {
+        setArgs({cv::GArg(std::move(args))...});
+        return *this;
+    }
+
+    // A generic yield method - obtain a link to operator's particular GMat output
+    GMat    yield      (int output = 0);
+    GScalar yieldScalar(int output = 0);
+
+    template<class T> GArray<T> yieldArray(int output = 0)
+    {
+        return GArray<T>(yieldArray(output));
+    }
+
+    // Internal use only
+    Priv& priv();
+    const Priv& priv() const;
+
+protected:
+    std::shared_ptr<Priv> m_priv;
+
+    void setArgs(std::vector<GArg> &&args);
+
+    // Public version returns a typed array, this one is implementation detail
+    detail::GArrayU yieldArray(int output = 0);
+};
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GCALL_HPP
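
GCall is what the typed-kernel wrappers use under the hood. A hand-written
equivalent of what GKernelType<K>::on() (gkernel.hpp, later in this patch)
generates for a single-output operation might look like this sketch, where
MyKernel and its identifier are illustrative:

    static cv::GMat my_op(cv::GMat in)
    {
        // GKernel aggregates the operation id, its outMeta callback, and
        // the shapes of its outputs (see gkernel.hpp):
        cv::GCall call(cv::GKernel{ "sample.custom.op",
                                    &MyKernel::getOutMeta,
                                    { cv::GShape::GMAT } });
        call.pass(in);         // bind the input argument(s)
        return call.yield(0);  // obtain a handle to output #0
    }
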
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcommon.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcommon.hpp
new file mode 100644 (file)
index 0000000..6a3f51f
--- /dev/null
@@ -0,0 +1,166 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCOMMON_HPP
+#define OPENCV_GAPI_GCOMMON_HPP
+
+#include <functional>   // std::hash
+#include <vector>       // std::vector
+#include <type_traits>  // decay
+
+#include <opencv2/gapi/opencv_includes.hpp>
+
+#include "opencv2/gapi/util/any.hpp"
+#include "opencv2/gapi/own/exports.hpp"
+#include "opencv2/gapi/own/assert.hpp"
+
+namespace cv {
+
+namespace detail
+{
+    // This is a trait-like structure to mark backend-specific compile arguments
+    // with tags
+    template<typename T> struct CompileArgTag;
+    template<typename T> struct CompileArgTag
+    {
+        static const char* tag() { return ""; };
+    };
+}
+
+// This definition is here because it is reused by both public(?) and internal
+// modules. Keeping it here wouldn't expose public details (e.g., API-level)
+// to components which are internal and operate on a lower-level entities
+// (e.g., compiler, backends).
+// FIXME: merge with ArgKind?
+// FIXME: replace with variant[format desc]?
+enum class GShape: int
+{
+    GMAT,
+    GSCALAR,
+    GARRAY,
+};
+
+struct GCompileArg;
+
+namespace detail {
+    template<typename T>
+    using is_compile_arg = std::is_same<GCompileArg, typename std::decay<T>::type>;
+}
+// CompileArg is a unified interface over backend-specific compilation
+// information
+// FIXME: Move to a separate file?
+/** \addtogroup gapi_compile_args
+ * @{
+ *
+ * @brief Compilation arguments: a set of data structures which can be
+ * passed to control compilation process
+ *
+ * G-API comes with a number of graph compilation options which can be
+ * passed to cv::GComputation::apply() or
+ * cv::GComputation::compile(). Known compilation options are listed
+ * in this page, while extra backends may introduce their own
+ * compilation options (G-API transparently accepts _everything_ which
+ * can be passed to cv::compile_args(), it depends on underlying
+ * backends if an option would be interpreted or not).
+ *
+ * For example, if an example computation is executed like this:
+ *
+ * @snippet modules/gapi/samples/api_ref_snippets.cpp graph_decl_apply
+ *
+ * Extra parameter specifying which kernels to compile with can be
+ * passed like this:
+ *
+ * @snippet modules/gapi/samples/api_ref_snippets.cpp apply_with_param
+ */
+
+/**
+ * @brief Represents an arbitrary compilation argument.
+ *
+ * Any value can be wrapped into cv::GCompileArg, but only known ones
+ * (to G-API or its backends) can be interpreted correctly.
+ *
+ * Normally objects of this class shouldn't be created manually, use
+ * cv::compile_args() function which automatically wraps everything
+ * passed in (a variadic template parameter pack) into a vector of
+ * cv::GCompileArg objects.
+ */
+struct GAPI_EXPORTS GCompileArg
+{
+public:
+    std::string tag;
+
+    // FIXME: use decay in GArg/other trait-based wrapper before leg is shot!
+    template<typename T, typename std::enable_if<!detail::is_compile_arg<T>::value, int>::type = 0>
+    explicit GCompileArg(T &&t)
+        : tag(detail::CompileArgTag<typename std::decay<T>::type>::tag())
+        , arg(t)
+    {
+    }
+
+    template<typename T> T& get()
+    {
+        return util::any_cast<T>(arg);
+    }
+
+    template<typename T> const T& get() const
+    {
+        return util::any_cast<T>(arg);
+    }
+
+private:
+    util::any arg;
+};
+
+using GCompileArgs = std::vector<GCompileArg>;
+
+/**
+ * Wraps a list of arguments (a parameter pack) into a vector of
+ * compilation arguments (cv::GCompileArg).
+ */
+template<typename... Ts> GCompileArgs compile_args(Ts&&... args)
+{
+    return GCompileArgs{ GCompileArg(args)... };
+}
+
+/**
+ * @brief Ask G-API to dump compiled graph in Graphviz format under
+ * the given file name.
+ *
+ * Specifies a graph dump path (path to .dot file to be generated).
+ * G-API will dump a .dot file under specified path during a
+ * compilation process if this flag is passed.
+ */
+struct graph_dump_path
+{
+    std::string m_dump_path;
+};
+/** @} */
+
+namespace detail
+{
+    template<> struct CompileArgTag<cv::graph_dump_path>
+    {
+        static const char* tag() { return "gapi.graph_dump_path"; }
+    };
+}
+
+} // namespace cv
+
+// std::hash overload for GShape
+namespace std
+{
+template<> struct hash<cv::GShape>
+{
+    size_t operator() (cv::GShape sh) const
+    {
+        return std::hash<int>()(static_cast<int>(sh));
+    }
+};
+} // namespace std
+
+
+#endif // OPENCV_GAPI_GCOMMON_HPP
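
A short sketch of how compilation arguments are produced at a call site and
then consumed on the backend side; the dump path value is illustrative:

    // Producing: wrap everything into GCompileArgs via compile_args()
    cv::GCompileArgs args = cv::compile_args(cv::graph_dump_path{"graph.dot"});

    // Consuming: look an argument up by its tag, extract the typed payload
    for (auto &arg : args)
        if (arg.tag == "gapi.graph_dump_path")
        {
            const std::string &path = arg.get<cv::graph_dump_path>().m_dump_path;
            // ... dump the compiled graph to `path`
        }
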
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcompiled.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcompiled.hpp
new file mode 100644 (file)
index 0000000..ad491b7
--- /dev/null
@@ -0,0 +1,217 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCOMPILED_HPP
+#define OPENCV_GAPI_GCOMPILED_HPP
+
+#include <vector>
+
+#include "opencv2/gapi/opencv_includes.hpp"
+#include "opencv2/gapi/own/assert.hpp"
+#include "opencv2/gapi/garg.hpp"
+
+namespace cv {
+
+// This class represents a compiled computation.
+// In theory (and ideally), it can be used w/o the rest of APIs.
+// In theory (and ideally), it can be serialized/deserialized.
+// It can enable scenarios like deployment to an autonomous device, FuSa, etc.
+//
+// Currently GCompiled assumes all GMats you used to pass data to G-API
+// are valid and not destroyed while you use a GCompiled object.
+//
+// FIXME: In future, there should be a way to name I/O objects and specify it
+// to GCompiled externally (for example, when it is loaded on the target system).
+
+/**
+ * \addtogroup gapi_main_classes
+ * @{
+ */
+/**
+ * @brief Represents a compiled computation (graph). Can only be used
+ * with image / data formats & resolutions it was compiled for, with
+ * some exceptions.
+ *
+ * This class represents a product of graph compilation (calling
+ * cv::GComputation::compile()). Objects of this class actually do
+ * data processing, and graph execution is encapsulated into objects
+ * of this class. The execution model itself depends on kernels and
+ * backends which were used during the compilation, see @ref
+ * gapi_compile_args for details.
+ *
+ * In a general case, GCompiled objects can be applied to data only in
+ * the formats/resolutions they were compiled for (see @ref
+ * gapi_meta_args). However, if the underlying backends allow, a
+ * compiled object can be _reshaped_ to handle data (images) of
+ * different resolution, though formats and types must remain the same.
+ *
+ * GCompiled is very similar to `std::function<>` in its semantics --
+ * running it looks like a function call in the user code.
+ *
+ * At the moment, GCompiled objects are not reentrant -- generally,
+ * the objects are stateful since graph execution itself is a stateful
+ * process and this state is now maintained in GCompiled's own memory
+ * (not on the process stack).
+ *
+ * At the same time, two different GCompiled objects produced from the
+ * single cv::GComputation are completely independent and can be used
+ * concurrently.
+ */
+class GAPI_EXPORTS GCompiled
+{
+public:
+    /// @private
+    class GAPI_EXPORTS Priv;
+
+    /**
+     * @brief Constructs an empty object
+     */
+    GCompiled();
+
+    /**
+     * @brief Run the compiled computation, a generic version.
+     *
+     * @param ins vector of inputs to process.
+     * @param outs vector of outputs to produce.
+     *
+     * Input/output vectors must have the same number of elements as
+     * defined in the cv::GComputation protocol (at the moment of its
+     * construction). Shapes of elements also must conform to protocol
+     * (e.g. cv::Mat needs to be passed where cv::GMat has been
+     * declared as input, and so on). A run-time exception is generated
+     * otherwise.
+     *
+     * Objects in output vector may remain empty (like cv::Mat) --
+     * G-API will automatically initialize output objects to proper formats.
+     *
+     * @note Don't construct GRunArgs/GRunArgsP objects manually, use
+     * cv::gin()/cv::gout() wrappers instead.
+     */
+    void operator() (GRunArgs &&ins, GRunArgsP &&outs);          // Generic arg-to-arg
+#if !defined(GAPI_STANDALONE)
+
+    /**
+     * @brief Execute a unary computation
+     *
+     * @overload
+     * @param in input cv::Mat for unary computation
+     * @param out output cv::Mat for unary computation
+     */
+    void operator() (cv::Mat in, cv::Mat &out);                  // Unary overload
+
+    /**
+     * @brief Execute a unary computation
+     *
+     * @overload
+     * @param in input cv::Mat for unary computation
+     * @param out output cv::Scalar for unary computation
+     */
+    void operator() (cv::Mat in, cv::Scalar &out);               // Unary overload (scalar)
+
+    /**
+     * @brief Execute a binary computation
+     *
+     * @overload
+     * @param in1 first input cv::Mat for binary computation
+     * @param in2 second input cv::Mat for binary computation
+     * @param out output cv::Mat for binary computation
+     */
+    void operator() (cv::Mat in1, cv::Mat in2, cv::Mat &out);    // Binary overload
+
+    /**
+     * @brief Execute a binary computation
+     *
+     * @overload
+     * @param in1 first input cv::Mat for binary computation
+     * @param in2 second input cv::Mat for binary computation
+     * @param out output cv::Scalar for binary computation
+     */
+    void operator() (cv::Mat in1, cv::Mat in2, cv::Scalar &out); // Binary overload (scalar)
+
+    /**
+     * @brief Execute a computation with arbitrary number of
+     * inputs/outputs.
+     *
+     * @overload
+     * @param ins vector of input cv::Mat objects to process by the
+     * computation.
+     * @param outs vector of output cv::Mat objects to produce by the
+     * computation.
+     *
+     * Numbers of elements in ins/outs vectors must match numbers of
+     * inputs/outputs which were used to define the source GComputation.
+     */
+    void operator() (const std::vector<cv::Mat> &ins,            // Compatibility overload
+                     const std::vector<cv::Mat> &outs);
+#endif  // !defined(GAPI_STANDALONE)
+    /// @private
+    Priv& priv();
+
+    /**
+     * @brief Check if compiled object is valid (non-empty)
+     *
+     * @return true if the object is runnable (valid), false otherwise
+     */
+    explicit operator bool () const;
+
+    /**
+     * @brief Vector of metadata this graph was compiled for.
+     *
+     * @return If the object has not been reshaped, the return value is
+     * the same vector which was passed to cv::GComputation::compile()
+     * to produce this compiled object. Otherwise, it is the latest
+     * metadata vector passed to reshape() (if that call was
+     * successful).
+     */
+    const GMetaArgs& metas() const; // Meta passed to compile()
+
+    /**
+     * @brief Vector of metadata descriptions of graph outputs
+     *
+     * @return vector with formats/resolutions of graph's output
+     * objects, auto-inferred from input metadata vector by
+     * operations which form this computation.
+     *
+     * @note GCompiled objects produced from the same
+     * cv::GComputation graph with different input metas may return
+     * different values in this vector.
+     */
+    const GMetaArgs& outMetas() const;
+
+    /**
+     * @brief Check if the underlying backends support reshape or not.
+     *
+     * @return true if supported, false otherwise.
+     */
+    bool canReshape() const;
+
+    /**
+     * @brief Reshape a compiled graph to support new image
+     * resolutions.
+     *
+     * Throws an exception if an error occurs.
+     *
+     * @param inMetas new metadata to reshape on. Vector size and
+     * metadata shapes must match the computation's protocol.
+     * @param args compilation arguments to use.
+     */
+    // FIXME: Why it requires compile args?
+    void reshape(const GMetaArgs& inMetas, const GCompileArgs& args);
+
+protected:
+    /// @private
+    std::shared_ptr<Priv> m_priv;
+};
+/** @} */
+
+}
+
+#endif // OPENCV_GAPI_GCOMPILED_HPP
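
A usage sketch; `f` is assumed to be a cv::GComputation (gcomputation.hpp,
later in this patch) with a one-GMat-in, one-GMat-out protocol, and the
metadata values are illustrative:

    // Compile once for a fixed input format...
    cv::GCompiled cc = f.compile(cv::GMatDesc{CV_8U, 1, {640, 480}});

    cv::Mat in = cv::Mat::zeros(480, 640, CV_8UC1);
    cv::Mat out;
    if (cc)           // object is valid (non-empty)
        cc(in, out);  // ...then run it like a function, many times

    // If the underlying backends permit, adapt it to a new resolution:
    if (cc.canReshape())
        cc.reshape(cv::GMetaArgs{cv::GMetaArg(cv::GMatDesc{CV_8U, 1, {1280, 720}})},
                   cv::compile_args());
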
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcompoundkernel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcompoundkernel.hpp
new file mode 100644 (file)
index 0000000..c5ac8a7
--- /dev/null
@@ -0,0 +1,123 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCOMPOUNDKERNEL_HPP
+#define OPENCV_GAPI_GCOMPOUNDKERNEL_HPP
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+#include <opencv2/gapi/garg.hpp>
+
+namespace cv {
+namespace gapi
+{
+namespace compound
+{
+    // FIXME User does not need to know about this function
+    // Needed so that a user may define compound kernels (in the same way as CPU kernels)
+    GAPI_EXPORTS cv::gapi::GBackend backend();
+} // namespace compound
+} // namespace gapi
+
+namespace detail
+{
+
+struct GCompoundContext
+{
+    explicit GCompoundContext(const GArgs& in_args);
+    template<typename T>
+    const T& inArg(int input) { return m_args.at(input).get<T>(); }
+
+    GArgs m_args;
+    GArgs m_results;
+};
+
+class GAPI_EXPORTS GCompoundKernel
+{
+// A compound kernel must use all of its inputs
+public:
+    using F = std::function<void(GCompoundContext& ctx)>;
+
+    explicit GCompoundKernel(const F& f);
+    void apply(GCompoundContext& ctx);
+
+protected:
+    F m_f;
+};
+
+template<typename T> struct get_compound_in
+{
+    static T get(GCompoundContext &ctx, int idx) { return ctx.inArg<T>(idx); }
+};
+
+template<typename U> struct get_compound_in<cv::GArray<U>>
+{
+    static cv::GArray<U> get(GCompoundContext &ctx, int idx)
+    {
+        auto array = cv::GArray<U>();
+        ctx.m_args[idx] = GArg(array);
+        return array;
+    }
+};
+
+// A kernel may return one object (GMat, GScalar) or a tuple of objects.
+// This helper is needed to cast the return value to the same form (a tuple)
+template<typename>
+struct tuple_wrap_helper;
+
+template<typename T> struct tuple_wrap_helper
+{
+    static std::tuple<T> get(T&& obj) { return std::make_tuple(std::move(obj)); }
+};
+
+template<typename... Objs>
+struct tuple_wrap_helper<std::tuple<Objs...>>
+{
+    static std::tuple<Objs...> get(std::tuple<Objs...>&& objs) { return objs; }
+};
+
+template<typename, typename, typename>
+struct GCompoundCallHelper;
+
+template<typename Impl, typename... Ins, typename... Outs>
+struct GCompoundCallHelper<Impl, std::tuple<Ins...>, std::tuple<Outs...> >
+{
+    template<int... IIs, int... OIs>
+    static void expand_impl(GCompoundContext &ctx, detail::Seq<IIs...>, detail::Seq<OIs...>)
+    {
+        auto result = Impl::expand(get_compound_in<Ins>::get(ctx, IIs)...);
+        auto tuple_return = tuple_wrap_helper<decltype(result)>::get(std::move(result));
+        ctx.m_results = { cv::GArg(std::get<OIs>(tuple_return))... };
+    }
+
+    static void expand(GCompoundContext &ctx)
+    {
+        expand_impl(ctx,
+                    typename detail::MkSeq<sizeof...(Ins)>::type(),
+                    typename detail::MkSeq<sizeof...(Outs)>::type());
+    }
+};
+
+template<class Impl, class K>
+class GCompoundKernelImpl: public cv::detail::GCompoundCallHelper<Impl, typename K::InArgs, typename K::OutArgs>
+{
+    using P = cv::detail::GCompoundCallHelper<Impl, typename K::InArgs, typename K::OutArgs>;
+
+public:
+    using API = K;
+
+    static cv::gapi::GBackend backend() { return cv::gapi::compound::backend(); }
+    static GCompoundKernel    kernel()  { return GCompoundKernel(&P::expand);   }
+};
+
+} // namespace detail
+#define GAPI_COMPOUND_KERNEL(Name, API) struct Name: public cv::detail::GCompoundKernelImpl<Name, API>
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GCOMPOUNDKERNEL_HPP
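
A sketch of a compound kernel: the operation carries no executable code of its
own and is instead expanded into other G-API operations at graph construction
time. The names and identifier are illustrative, and cv::gapi::add is assumed
to be available from the standard core kernel set:

    G_TYPED_KERNEL(GAddTwice, <cv::GMat(cv::GMat, cv::GMat)>,
                   "sample.custom.add_twice")
    {
        static cv::GMatDesc outMeta(cv::GMatDesc a, cv::GMatDesc) { return a; }
    };

    GAPI_COMPOUND_KERNEL(GAddTwiceImpl, GAddTwice)
    {
        static cv::GMat expand(cv::GMat a, cv::GMat b)
        {
            // (a + b) + b, expressed through existing operations
            return cv::gapi::add(cv::gapi::add(a, b), b);
        }
    };
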
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcomputation.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gcomputation.hpp
new file mode 100644 (file)
index 0000000..e89b9ae
--- /dev/null
@@ -0,0 +1,456 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCOMPUTATION_HPP
+#define OPENCV_GAPI_GCOMPUTATION_HPP
+
+#include <functional>
+
+#include "opencv2/gapi/util/util.hpp"
+#include "opencv2/gapi/gcommon.hpp"
+#include "opencv2/gapi/gproto.hpp"
+#include "opencv2/gapi/garg.hpp"
+#include "opencv2/gapi/gcompiled.hpp"
+
+namespace cv {
+
+namespace detail
+{
+    // FIXME: move to algorithm, cover with separate tests
+    // FIXME: replace with O(1) version (both memory and compilation time)
+    template<typename...>
+    struct last_type;
+
+    template<typename T>
+    struct last_type<T> { using type = T;};
+
+    template<typename T, typename... Ts>
+    struct last_type<T, Ts...> { using type = typename last_type<Ts...>::type; };
+
+    template<typename... Ts>
+    using last_type_t = typename last_type<Ts...>::type;
+}
+
+/**
+ * \addtogroup gapi_main_classes
+ * @{
+ */
+/**
+ * @brief GComputation class represents a captured computation
+ * graph. GComputation objects form boundaries for the expression code
+ * a user writes with G-API, allowing it to be compiled and executed.
+ *
+ * G-API computations are defined with input/output data
+ * objects. G-API will track automatically which operations connect
+ * specified outputs to the inputs, forming up a call graph to be
+ * executed. The example below expresses calculation of the Sobel operator
+ * for edge detection (\f$G = \sqrt{G_x^2 + G_y^2}\f$):
+ *
+ * @snippet modules/gapi/samples/api_ref_snippets.cpp graph_def
+ *
+ * The full pipeline can now be captured with this object declaration:
+ *
+ * @snippet modules/gapi/samples/api_ref_snippets.cpp graph_cap_full
+ *
+ * Input/output data objects on which a call graph should be
+ * reconstructed are passed using special wrappers cv::GIn and
+ * cv::GOut. G-API will track automatically which operations form a
+ * path from inputs to outputs and build the execution graph appropriately.
+ *
+ * Note that cv::GComputation doesn't take ownership of the data objects
+ * it is defined on. Moreover, multiple GComputation objects may be
+ * defined on the same expressions, e.g. a smaller pipeline which
+ * expects that image gradients are already pre-calculated may be
+ * defined like this:
+ *
+ * @snippet modules/gapi/samples/api_ref_snippets.cpp graph_cap_sub
+ *
+ * The resulting graph would expect two inputs and produce one
+ * output. In this case, it doesn't matter if gx/gy data objects are
+ * results of cv::gapi::Sobel operators -- G-API will stop unrolling
+ * expressions and building the underlying graph once it reaches these
+ * data objects.
+ *
+ * The way a GComputation is defined is important, as its definition
+ * specifies graph _protocol_ -- the way how the graph should be
+ * used. Protocol is defined by number of inputs, number of outputs,
+ * and shapes of inputs and outputs.
+ *
+ * In the above example, sobelEdge expects one Mat on input and
+ * produces one Mat; while sobelEdgeSub expects two Mats on input and
+ * produces one Mat. GComputation's protocol defines how other
+ * computation methods should be used -- cv::GComputation::compile() and
+ * cv::GComputation::apply(). For example, if a graph is defined on
+ * two GMat inputs, two cv::Mat objects have to be passed to apply()
+ * for execution. GComputation checks protocol correctness at run time,
+ * so passing a different number of objects in apply() or passing
+ * cv::Scalar instead of cv::Mat there would compile well as a C++
+ * source but raise an exception at run time. G-API also comes with a
+ * typed wrapper cv::GComputationT<> which introduces this type-checking in
+ * compile-time.
+ *
+ * cv::GComputation itself is a thin object which just captures what
+ * the graph is. The compiled graph (which actually process data) is
+ * represented by class GCompiled. Use compile() method to generate a
+ * compiled graph with given compile options. cv::GComputation can
+ * also be used to process data with implicit graph compilation
+ * on-the-fly, see apply() for details.
+ *
+ * GComputation is a reference-counted object -- once defined, all its
+ * copies will refer to the same instance.
+ *
+ * @sa GCompiled
+ */
+class GAPI_EXPORTS GComputation
+{
+public:
+    class Priv;
+    typedef std::function<GComputation()> Generator;
+
+    // Various constructors enable different ways to define a computation: /////
+    // 1. Generic constructors
+    /**
+     * @brief Define a computation using a generator function.
+     *
+     * Graph can be defined in-place directly at the moment of its
+     * construction with a lambda:
+     *
+     * @snippet modules/gapi/samples/api_ref_snippets.cpp graph_gen
+     *
+     * This may be useful since all temporary objects (cv::GMats) and
+     * namespaces can be localized to the scope of the lambda, without
+     * contaminating the parent scope with probably unnecessary objects
+     * and information.
+     *
+     * @param gen generator function which returns a cv::GComputation,
+     * see Generator.
+     */
+    GComputation(const Generator& gen);                // Generator
+                                                       // overload
+
+    /**
+     * @brief Generic GComputation constructor.
+     *
+     * Constructs a new graph with a given protocol, specified as a
+     * flow of operations connecting input/output objects. Throws if
+     * the passed boundaries are invalid, e.g. if there's no
+     * functional dependency (path) between given outputs and inputs.
+     *
+     * @param ins Input data vector.
+     * @param outs Output data vector.
+     *
+     * @note Don't construct GProtoInputArgs/GProtoOutputArgs objects
+     * directly, use cv::GIn()/cv::GOut() wrapper functions instead.
+     *
+     * @sa @ref gapi_data_objects
+     */
+    GComputation(GProtoInputArgs &&ins,
+                 GProtoOutputArgs &&outs);             // Arg-to-arg overload
+
+    // 2. Syntax sugar and compatibility overloads
+    /**
+     * @brief Defines a unary (one input -- one output) computation
+     *
+     * @overload
+     * @param in input GMat of the defined unary computation
+     * @param out output GMat of the defined unary computation
+     */
+    GComputation(GMat in, GMat out);                   // Unary overload
+
+    /**
+     * @brief Defines a unary (one input -- one output) computation
+     *
+     * @overload
+     * @param in input GMat of the defined unary computation
+     * @param out output GScalar of the defined unary computation
+     */
+    GComputation(GMat in, GScalar out);                // Unary overload (scalar)
+
+    /**
+     * @brief Defines a binary (two inputs -- one output) computation
+     *
+     * @overload
+     * @param in1 first input GMat of the defined binary computation
+     * @param in2 second input GMat of the defined binary computation
+     * @param out output GMat of the defined binary computation
+     */
+    GComputation(GMat in1, GMat in2, GMat out);        // Binary overload
+
+    /**
+     * @brief Defines a binary (two inputs -- one output) computation
+     *
+     * @overload
+     * @param in1 first input GMat of the defined binary computation
+     * @param in2 second input GMat of the defined binary computation
+     * @param out output GScalar of the defined binary computation
+     */
+    GComputation(GMat in1, GMat in2, GScalar out);     // Binary
+                                                       // overload
+                                                       // (scalar)
+
+    /**
+     * @brief Defines a computation with arbitrary input/output number.
+     *
+     * @overload
+     * @param ins vector of inputs GMats for this computation
+     * @param outs vector of outputs GMats for this computation
+     *
+     * Use this overload for cases when number of computation
+     * inputs/outputs is not known at compile time -- e.g. when a graph
+     * is programmatically generated to build an image pyramid with
+     * the given number of levels, etc.
+     */
+    GComputation(const std::vector<GMat> &ins,         // Compatibility overload
+                 const std::vector<GMat> &outs);
+
+    // Various versions of apply(): ////////////////////////////////////////////
+    // 1. Generic apply()
+    /**
+     * @brief Compile graph on-the-fly and immediately execute it on
+     * the inputs data vectors.
+     *
+     * Number of input/output data objects must match GComputation's
+     * protocol, also types of host data objects (cv::Mat, cv::Scalar)
+     * must match the shapes of data objects from protocol (cv::GMat,
+     * cv::GScalar). If there's a mismatch, a run-time exception will
+     * be generated.
+     *
+     * Internally, a cv::GCompiled object is created for the given
+     * input format configuration, which then is executed on the input
+     * data immediately. cv::GComputation caches compiled objects
+     * produced within apply() -- if this method is called next
+     * time with the same input parameters (image formats, image
+     * resolution, etc), the underlying compiled graph will be reused
+     * without recompilation. If new metadata doesn't match the cached
+     * one, the underlying compiled graph is regenerated.
+     *
+     * @note compile() always triggers a compilation process and
+     * produces a new GCompiled object regardless if a similar one has
+     * been cached via apply() or not.
+     *
+     * @param ins vector of input data to process. Don't create
+     * GRunArgs object manually, use cv::gin() wrapper instead.
+     * @param outs vector of output data to fill results in. cv::Mat
+     * objects may be empty in this vector, G-API will automatically
+     * initialize it with the required format & dimensions. Don't
+     * create GRunArgsP object manually, use cv::gout() wrapper instead.
+     * @param args a list of compilation arguments to pass to the
+     * underlying compilation process. Don't create GCompileArgs
+     * object manually, use cv::compile_args() wrapper instead.
+     *
+     * @sa @ref gapi_data_objects, @ref gapi_compile_args
+     */
+    void apply(GRunArgs &&ins, GRunArgsP &&outs, GCompileArgs &&args = {});       // Arg-to-arg overload
+
+    /// @private -- Exclude this function from OpenCV documentation
+    void apply(const std::vector<cv::gapi::own::Mat>& ins,                        // Compatibility overload
+               const std::vector<cv::gapi::own::Mat>& outs,
+               GCompileArgs &&args = {});
+
+    // 2. Syntax sugar and compatibility overloads
+#if !defined(GAPI_STANDALONE)
+    /**
+     * @brief Execute a unary computation (with compilation on the fly)
+     *
+     * @overload
+     * @param in input cv::Mat for unary computation
+     * @param out output cv::Mat for unary computation
+     * @param args compilation arguments for underlying compilation
+     * process.
+     */
+    void apply(cv::Mat in, cv::Mat &out, GCompileArgs &&args = {});               // Unary overload
+
+    /**
+     * @brief Execute a unary computation (with compilation on the fly)
+     *
+     * @overload
+     * @param in input cv::Mat for unary computation
+     * @param out output cv::Scalar for unary computation
+     * @param args compilation arguments for underlying compilation
+     * process.
+     */
+    void apply(cv::Mat in, cv::Scalar &out, GCompileArgs &&args = {});            // Unary overload (scalar)
+
+    /**
+     * @brief Execute a binary computation (with compilation on the fly)
+     *
+     * @overload
+     * @param in1 first input cv::Mat for binary computation
+     * @param in2 second input cv::Mat for binary computation
+     * @param out output cv::Mat for binary computation
+     * @param args compilation arguments for underlying compilation
+     * process.
+     */
+    void apply(cv::Mat in1, cv::Mat in2, cv::Mat &out, GCompileArgs &&args = {}); // Binary overload
+
+    /**
+     * @brief Execute a binary computation (with compilation on the fly)
+     *
+     * @overload
+     * @param in1 first input cv::Mat for binary computation
+     * @param in2 second input cv::Mat for binary computation
+     * @param out output cv::Scalar for binary computation
+     * @param args compilation arguments for underlying compilation
+     * process.
+     */
+    void apply(cv::Mat in1, cv::Mat in2, cv::Scalar &out, GCompileArgs &&args = {}); // Binary overload (scalar)
+
+    /**
+     * @brief Execute a computation with arbitrary number of
+     * inputs/outputs (with compilation on-the-fly).
+     *
+     * @overload
+     * @param ins vector of input cv::Mat objects to process by the
+     * computation.
+     * @param outs vector of output cv::Mat objects to produce by the
+     * computation.
+     * @param args compilation arguments for underlying compilation
+     * process.
+     *
+     * Numbers of elements in ins/outs vectors must match numbers of
+     * inputs/outputs which were used to define this GComputation.
+     */
+    void apply(const std::vector<cv::Mat>& ins,         // Compatibility overload
+               const std::vector<cv::Mat>& outs,
+               GCompileArgs &&args = {});
+#endif // !defined(GAPI_STANDALONE)
+    // Various versions of compile(): //////////////////////////////////////////
+    // 1. Generic compile() - requires metas to be passed as vector
+    /**
+     * @brief Compile the computation for specific input format(s).
+     *
+     * This method triggers compilation process and produces a new
+     * GCompiled object which then can process data of the given
+     * format. Passing data with different format to the compiled
+     * computation will generate a run-time exception.
+     *
+     * @param in_metas vector of input metadata configuration. Grab
+     * metadata from real data objects (like cv::Mat or cv::Scalar)
+     * using cv::descr_of(), or create it on your own.
+     * @param args compilation arguments for this compilation
+     * process. Compilation arguments directly affect what kind of
+     * executable object would be produced, e.g. which kernels (and
+     * thus, devices) would be used to execute computation.
+     *
+     * @return GCompiled, an executable computation compiled
+     * specifically for the given input parameters.
+     *
+     * @sa @ref gapi_compile_args
+     */
+    GCompiled compile(GMetaArgs &&in_metas, GCompileArgs &&args = {});
+
+    // 2. Syntax sugar - variadic list of metas, no extra compile args
+    // FIXME: SFINAE looks ugly in the generated documentation
+    /**
+     * @overload
+     *
+     * Takes a variadic parameter pack with metadata
+     * descriptors for which a compiled object needs to be produced.
+     *
+     * @return GCompiled, an executable computation compiled
+     * specifically for the given input parameters.
+     */
+    template<typename... Ts>
+    auto compile(const Ts&... metas) ->
+        typename std::enable_if<detail::are_meta_descrs<Ts...>::value, GCompiled>::type
+    {
+        return compile(GMetaArgs{GMetaArg(metas)...}, GCompileArgs());
+    }
+
+    // 3. Syntax sugar - variadic list of metas, extra compile args
+    // (it seems optional parameters don't work well when a variadic
+    // template parameter pack comes first)
+    //
+    // Ideally it should look like:
+    //
+    //     template<typename... Ts>
+    //     GCompiled compile(const Ts&... metas, GCompileArgs &&args)
+    //
+    // But not all compilers can handle this (and it seems they shouldn't be able to).
+    // FIXME: SFINAE looks ugly in the generated documentation
+    /**
+     * @overload
+     *
+     * Takes a variadic parameter pack with metadata
+     * descriptors for which a compiled object needs to be produced,
+     * followed by GCompileArgs object representing compilation
+     * arguments for this process.
+     *
+     * @return GCompiled, an executable computation compiled
+     * specifically for the given input parameters.
+     */
+    template<typename... Ts>
+    auto compile(const Ts&... meta_and_compile_args) ->
+        typename std::enable_if<detail::are_meta_descrs_but_last<Ts...>::value
+                                && std::is_same<GCompileArgs, detail::last_type_t<Ts...> >::value,
+                                GCompiled>::type
+    {
+        // FIXME: wrapping meta_and_compile_args into a tuple to unwrap them inside a helper function is overkill
+        return compile(std::make_tuple(meta_and_compile_args...),
+                       typename detail::MkSeq<sizeof...(Ts)-1>::type());
+    }
+
+    // Internal use only
+    /// @private
+    Priv& priv();
+    /// @private
+    const Priv& priv() const;
+
+protected:
+
+    // 4. Helper method for (3)
+    /// @private
+    template<typename... Ts, int... IIs>
+    GCompiled compile(const std::tuple<Ts...> &meta_and_compile_args, detail::Seq<IIs...>)
+    {
+        GMetaArgs meta_args = {GMetaArg(std::get<IIs>(meta_and_compile_args))...};
+        GCompileArgs comp_args = std::get<sizeof...(Ts)-1>(meta_and_compile_args);
+        return compile(std::move(meta_args), std::move(comp_args));
+    }
+    /// @private
+    std::shared_ptr<Priv> m_priv;
+};
+/** @} */
+
+namespace gapi
+{
+    // FIXME: all these standalone functions need to be added to some
+    // common documentation section
+    /**
+     * @brief Define a tagged island (subgraph) within a computation.
+     *
+     * Declare an Island tagged with `name` and defined from `ins` to `outs`
+     * (exclusively, as ins/outs are data objects, and regioning is done on
+     * operations level).
+     * Throws if any operation between `ins` and `outs` are already assigned
+     * to another island.
+     *
+     * Islands allow partitioning a graph into subgraphs, fine-tuning
+     * the way it is scheduled by the underlying executor.
+     *
+     * @param name name of the Island to create
+     * @param ins vector of input data objects where the subgraph
+     * begins
+     * @param outs vector of output data objects where the subgraph
+     * ends.
+     *
+     * The way an island is defined is similar to how
+     * cv::GComputation is defined on input/output data objects.
+     * Same rules apply here as well -- if there's no functional
+     * dependency between inputs and outputs, or not enough input
+     * data objects were specified to properly calculate all
+     * outputs, an exception is thrown.
+     *
+     * Use cv::GIn() / cv::GOut() to specify input/output vectors.
+     */
+    void GAPI_EXPORTS island(const std::string &name,
+                             GProtoInputArgs  &&ins,
+                             GProtoOutputArgs &&outs);
+} // namespace gapi
+
+} // namespace cv
+#endif // OPENCV_GAPI_GCOMPUTATION_HPP
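
A compact end-to-end sketch tying GComputation together with the pieces above
(cv::gapi::add is assumed to be available from the standard core kernel set):

    // Define a graph: per-element sum of two images
    cv::GMat a, b;
    cv::GComputation f(cv::GIn(a, b), cv::GOut(cv::gapi::add(a, b)));

    // Run it: apply() compiles on the fly and caches the compiled object
    cv::Mat in1 = cv::Mat::ones(64, 64, CV_8UC1);
    cv::Mat in2 = cv::Mat::ones(64, 64, CV_8UC1);
    cv::Mat out;
    f.apply(in1, in2, out);  // out becomes a 64x64 matrix filled with 2
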
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gkernel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gkernel.hpp
new file mode 100644 (file)
index 0000000..adc7da3
--- /dev/null
@@ -0,0 +1,563 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GKERNEL_HPP
+#define OPENCV_GAPI_GKERNEL_HPP
+
+#include <functional>
+#include <iostream>
+#include <string>  // string
+#include <type_traits> // false_type, true_type
+#include <unordered_map> // map (for GKernelPackage)
+#include <utility> // tuple
+#include <vector>  // lookup order
+
+#include <opencv2/gapi/gcommon.hpp> // CompileArgTag
+#include <opencv2/gapi/util/util.hpp> // Seq
+#include <opencv2/gapi/gcall.hpp>
+#include <opencv2/gapi/garg.hpp>      // GArg
+#include <opencv2/gapi/gmetaarg.hpp>  // GMetaArg
+#include <opencv2/gapi/gtype_traits.hpp> // GTypeTraits
+#include <opencv2/gapi/util/compiler_hints.hpp> //suppress_unused_warning
+
+
+namespace cv {
+
+using GShapes = std::vector<GShape>;
+
+// GKernel describes kernel API to the system
+// FIXME: add attributes of a kernel (e.g. number and types
+// of inputs, etc)
+struct GAPI_EXPORTS GKernel
+{
+    using M = std::function<GMetaArgs(const GMetaArgs &, const GArgs &)>;
+
+    const std::string name;       // kernel ID, defined by its API (signature)
+    const M           outMeta;    // generic adaptor to API::outMeta(...)
+    const GShapes     outShapes; // types (shapes) of kernel's outputs
+};
+
+// GKernelImpl describes particular kernel implementation to the system
+struct GAPI_EXPORTS GKernelImpl
+{
+    util::any         opaque;    // backend-specific opaque info
+};
+
+template<typename, typename> class GKernelTypeM;
+
+namespace detail
+{
+    ////////////////////////////////////////////////////////////////////////////
+    // yield() is used in graph construction time as a generic method to obtain
+    // lazy "return value" of G-API operations
+    //
+    namespace
+    {
+
+        template<typename T> struct Yield;
+        template<> struct Yield<cv::GMat>
+        {
+            static inline cv::GMat yield(cv::GCall &call, int i) { return call.yield(i); }
+        };
+        template<> struct Yield<cv::GScalar>
+        {
+            static inline cv::GScalar yield(cv::GCall &call, int i) { return call.yieldScalar(i); }
+        };
+        template<typename U> struct Yield<cv::GArray<U> >
+        {
+            static inline cv::GArray<U> yield(cv::GCall &call, int i) { return call.yieldArray<U>(i); }
+        };
+    } // anonymous namespace
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Helper classes which bring outputMeta() marshalling to kernel
+    // implementations
+    //
+    // 1. MetaType establishes G#Type -> G#Meta mapping between G-API dynamic
+    //    types and its metadata descriptor types.
+    //    This mapping is used to transform types to call outMeta() callback.
+    template<typename T> struct MetaType;
+    template<> struct MetaType<cv::GMat>    { using type = GMatDesc; };
+    template<> struct MetaType<cv::GScalar> { using type = GScalarDesc; };
+    template<typename U> struct MetaType<cv::GArray<U> > { using type = GArrayDesc; };
+    template<typename T> struct MetaType    { using type = T; }; // opaque args passed as-is
+
+    // 2. Hacky test based on MetaType to check if we operate on G-* type or not
+    template<typename T> using is_nongapi_type = std::is_same<T, typename MetaType<T>::type>;
+
+    // 3. Two ways to transform input arguments to its meta - for G-* and non-G* types:
+    template<typename T>
+    typename std::enable_if<!is_nongapi_type<T>::value, typename MetaType<T>::type>
+    ::type get_in_meta(const GMetaArgs &in_meta, const GArgs &, int idx)
+    {
+        return util::get<typename MetaType<T>::type>(in_meta.at(idx));
+    }
+
+    template<typename T>
+    typename std::enable_if<is_nongapi_type<T>::value, T>
+    ::type get_in_meta(const GMetaArgs &, const GArgs &in_args, int idx)
+    {
+        return in_args.at(idx).template get<T>();
+    }
+
+    // 4. The MetaHelper itself: an entity which generates outMeta() call
+    //    based on kernel signature, with arguments properly substituted.
+    // 4.1 - case for multiple return values
+    // FIXME: probably can be simplified with std::apply or analogue.
+    template<typename, typename, typename>
+    struct MetaHelper;
+
+    template<typename K, typename... Ins, typename... Outs>
+    struct MetaHelper<K, std::tuple<Ins...>, std::tuple<Outs...> >
+    {
+        template<int... IIs, int... OIs>
+        static GMetaArgs getOutMeta_impl(const GMetaArgs &in_meta,
+                                         const GArgs &in_args,
+                                         detail::Seq<IIs...>,
+                                         detail::Seq<OIs...>)
+        {
+            // FIXME: decay?
+            using R   = std::tuple<typename MetaType<Outs>::type...>;
+            const R r = K::outMeta( get_in_meta<Ins>(in_meta, in_args, IIs)... );
+            return GMetaArgs{ GMetaArg(std::get<OIs>(r))... };
+        }
+        // FIXME: help users identify what outMeta must look like (via default impl w/static_assert?)
+
+        static GMetaArgs getOutMeta(const GMetaArgs &in_meta,
+                                    const GArgs &in_args)
+        {
+            return getOutMeta_impl(in_meta,
+                                   in_args,
+                                   typename detail::MkSeq<sizeof...(Ins)>::type(),
+                                   typename detail::MkSeq<sizeof...(Outs)>::type());
+        }
+    };
+
+    // 4.2 - case for a single return value
+    // FIXME: How to avoid duplication here?
+    template<typename K, typename... Ins, typename Out>
+    struct MetaHelper<K, std::tuple<Ins...>, Out >
+    {
+        template<int... IIs>
+        static GMetaArgs getOutMeta_impl(const GMetaArgs &in_meta,
+                                         const GArgs &in_args,
+                                         detail::Seq<IIs...>)
+        {
+            // FIXME: decay?
+            using R = typename MetaType<Out>::type;
+            const R r = K::outMeta( get_in_meta<Ins>(in_meta, in_args, IIs)... );
+            return GMetaArgs{ GMetaArg(r) };
+        }
+        // FIXME: help users identify what outMeta must look like (via default impl w/static_assert?)
+
+        static GMetaArgs getOutMeta(const GMetaArgs &in_meta,
+                                    const GArgs &in_args)
+        {
+            return getOutMeta_impl(in_meta,
+                                   in_args,
+                                   typename detail::MkSeq<sizeof...(Ins)>::type());
+        }
+    };
+
+} // namespace detail
+
+// GKernelType and GKernelTypeM are base classes which implement typed ::on()
+// method based on kernel signature. GKernelTypeM stands for multiple-return-value kernels
+//
+// G_TYPED_KERNEL and G_TYPED_KERNEL_M macros inherit user classes from GKernelType and
+// GKernelTypeM respectively.
+
+template<typename K, typename... R, typename... Args>
+class GKernelTypeM<K, std::function<std::tuple<R...>(Args...)> >:
+        public detail::MetaHelper<K, std::tuple<Args...>, std::tuple<R...> >
+{
+    template<int... IIs>
+    static std::tuple<R...> yield(cv::GCall &call, detail::Seq<IIs...>)
+    {
+        return std::make_tuple(detail::Yield<R>::yield(call, IIs)...);
+    }
+
+public:
+    using InArgs  = std::tuple<Args...>;
+    using OutArgs = std::tuple<R...>;
+
+    static std::tuple<R...> on(Args... args)
+    {
+        cv::GCall call(GKernel{K::id(), &K::getOutMeta, {detail::GTypeTraits<R>::shape...}});
+        call.pass(args...);
+        return yield(call, typename detail::MkSeq<sizeof...(R)>::type());
+    }
+};
+
+template<typename, typename> class GKernelType;
+
+template<typename K, typename R, typename... Args>
+class GKernelType<K, std::function<R(Args...)> >:
+        public detail::MetaHelper<K, std::tuple<Args...>, R >
+{
+public:
+    using InArgs  = std::tuple<Args...>;
+    using OutArgs = std::tuple<R>;
+
+    static R on(Args... args)
+    {
+        cv::GCall call(GKernel{K::id(), &K::getOutMeta, {detail::GTypeTraits<R>::shape}});
+        call.pass(args...);
+        return detail::Yield<R>::yield(call, 0);
+    }
+};
+
+} // namespace cv
+
+
+// FIXME: I don't know a better way so far. Feel free to suggest one
+// The problem is that every typed kernel should have ::id(), but the body
+// of the class is defined by the user (with outMeta, other stuff)
+
+#define G_ID_HELPER_CLASS(Class)  Class##IdHelper
+
+#define G_ID_HELPER_BODY(Class, Id)                                         \
+    namespace detail                                                        \
+    {                                                                       \
+        struct G_ID_HELPER_CLASS(Class)                                     \
+        {                                                                   \
+            static constexpr const char * id() {return Id;};                \
+        };                                                                  \
+    }
+
+#define G_TYPED_KERNEL(Class, API, Id)                                      \
+    G_ID_HELPER_BODY(Class, Id)                                             \
+    struct Class final: public cv::GKernelType<Class, std::function API >,  \
+                        public detail::G_ID_HELPER_CLASS(Class)
+// {body} is to be defined by user
+
+#define G_TYPED_KERNEL_M(Class, API, Id)                                    \
+    G_ID_HELPER_BODY(Class, Id)                                             \
+    struct Class final: public cv::GKernelTypeM<Class, std::function API >, \
+                        public detail::G_ID_HELPER_CLASS(Class)             \
+// {body} is to be defined by user
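+
+// A minimal usage sketch (a hypothetical kernel, for illustration only --
+// the class name, signature and textual id below are not part of this header;
+// the pattern mirrors the kernels declared in gapi/imgproc.hpp):
+//
+//     G_TYPED_KERNEL(GMyCopy, <cv::GMat(cv::GMat)>, "sample.custom.copy")
+//     {
+//         static cv::GMatDesc outMeta(cv::GMatDesc in) { return in; }
+//     };
+//
+// Once declared, GMyCopy::on(m) can be used when building G-API expressions.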
+
+namespace cv
+{
+// Declare unite_policy in the cv:: namespace
+enum class unite_policy
+{
+    REPLACE,
+    KEEP
+};
+
+namespace gapi
+{
+    // Prework: model "Device" API before it gets to G-API headers.
+    // FIXME: Don't mix with internal Backends class!
+    class GAPI_EXPORTS GBackend
+    {
+    public:
+        class Priv;
+
+        // TODO: make it template (call `new` within??)
+        GBackend();
+        explicit GBackend(std::shared_ptr<Priv> &&p);
+
+        Priv& priv();
+        const Priv& priv() const;
+        std::size_t hash() const;
+
+        bool operator== (const GBackend &rhs) const;
+
+    private:
+        std::shared_ptr<Priv> m_priv;
+    };
+
+    inline bool operator != (const GBackend &lhs, const GBackend &rhs)
+    {
+        return !(lhs == rhs);
+    }
+} // namespace gapi
+} // namespace cv
+
+namespace std
+{
+    template<> struct hash<cv::gapi::GBackend>
+    {
+        std::size_t operator() (const cv::gapi::GBackend &b) const
+        {
+            return b.hash();
+        }
+    };
+} // namespace std
+
+
+namespace cv {
+namespace gapi {
+    /** \addtogroup gapi_compile_args
+     * @{
+     */
+
+    // Lookup order is in fact a vector of Backends to traverse during look-up
+    /**
+     * @brief Priority list of backends to use during kernel
+     *   resolution process.
+     *
+     * Priority is descending -- the first backend in the list has the
+     * top priority, and the last one has the lowest priority.
+     *
+     * If there are multiple implementations available for a kernel at
+     * the moment of graph compilation, a kernel (and thus a backend)
+     * will be selected according to this order (if the parameter is passed).
+     *
+     * Default order is not specified (and by default, only
+     * CPU(OpenCV) backend is involved in graph compilation).
+     */
+    using GLookupOrder = std::vector<GBackend>;
+    /**
+     * @brief Create a backend lookup order -- priority list of
+     * backends to use during graph compilation process.
+     *
+     * @sa GLookupOrder, @ref gapi_std_backends
+     */
+    inline GLookupOrder lookup_order(std::initializer_list<GBackend> &&list)
+    {
+        return GLookupOrder(std::move(list));
+    }
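+
+    // A minimal sketch of building a lookup order. The GPU backend used here
+    // is just an example -- any cv::gapi::GBackend value works:
+    //
+    //     cv::gapi::GLookupOrder order =
+    //         cv::gapi::lookup_order({cv::gapi::gpu::backend()});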
+
+    // FIXME: Hide implementation
+    /**
+     * @brief A container class for heterogeneous kernel
+     * implementation collections.
+     *
+     * GKernelPackage is a special container class which stores kernel
+     * _implementations_. Objects of this class are created and passed
+     * to cv::GComputation::compile() to specify which kernels to use
+     * in the compiled graph. GKernelPackage may contain kernels of
+     * different backends, i.e. it can be heterogeneous.
+     *
+     * The easiest way to create a kernel package is to use the function
+     * cv::gapi::kernels(). This template function takes kernel
+     * implementations in the form of a type list (variadic template) and
+     * generates a kernel package atop of that.
+     *
+     * Kernel packages can also be generated programmatically, starting
+     * with an empty package (created with the default constructor)
+     * and then populating it with kernels via calls to
+     * GKernelPackage::include() (a short sketch follows the class
+     * below). Note this method is also a template
+     * one since G-API kernel implementations are _types_, not objects.
+     *
+     * Finally, two kernel packages can be combined into a new one
+     * with the function cv::gapi::combine(). Different rules apply
+     * to this process; see also cv::gapi::unite_policy for
+     * details.
+     */
+    class GAPI_EXPORTS GKernelPackage
+    {
+        /// @private
+        using S = std::unordered_map<std::string, GKernelImpl>;
+
+        /// @private
+        using M = std::unordered_map<GBackend, S>;
+
+        /// @private
+        M m_backend_kernels;
+
+    protected:
+        /// @private
+        // Check if package contains ANY implementation of a kernel API
+        // by API textual id.
+        bool includesAPI(const std::string &id) const;
+
+        /// @private
+        // Remove ALL implementations of the given API (identified by ID)
+        void removeAPI(const std::string &id);
+
+    public:
+        /**
+         * @brief Returns the total number of kernels in the package
+         * (across all backends included)
+         *
+         * @return a number of kernels in the package
+         */
+        std::size_t size() const;
+
+        /**
+         * @brief Test if a particular kernel _implementation_ KImpl is
+         * included in this kernel package.
+         *
+         * @sa includesAPI()
+         *
+         * @return true if there is such kernel, false otherwise.
+         */
+        template<typename KImpl>
+        bool includes() const
+        {
+            const auto set_iter = m_backend_kernels.find(KImpl::backend());
+            return (set_iter != m_backend_kernels.end())
+                ? (set_iter->second.count(KImpl::API::id()) > 0)
+                : false;
+        }
+
+        /**
+         * @brief Remove all kernels associated with the given backend
+         * from the package.
+         *
+         * Does nothing if there are no kernels of this backend in the package.
+         *
+         * @param backend backend which kernels to remove
+         */
+        void remove(const GBackend& backend);
+
+        /**
+         * @brief Remove all kernels implementing the given API from
+         * the package.
+         *
+         * Does nothing if there are no kernels implementing the given interface.
+         */
+        template<typename KAPI>
+        void remove()
+        {
+            removeAPI(KAPI::id());
+        }
+
+        // FIXME: Rename to includes() and distinguish API/impl case by
+        //     statically?
+        /**
+         * Check if package contains ANY implementation of a kernel API
+         * by API type.
+         */
+        template<typename KAPI>
+        bool includesAPI() const
+        {
+            return includesAPI(KAPI::id());
+        }
+
+        /**
+         * @brief Find a kernel (by its API), given the look-up order.
+         *
+         * If the order is empty, returns the first suitable implementation.
+         * Throws if nothing found.
+         *
+         * @return Backend which hosts matching kernel implementation.
+         *
+         * @sa cv::gapi::lookup_order
+         */
+        template<typename KAPI>
+        GBackend lookup(const GLookupOrder &order = {}) const
+        {
+            return lookup(KAPI::id(), order).first;
+        }
+
+        /// @private
+        std::pair<cv::gapi::GBackend, cv::GKernelImpl>
+        lookup(const std::string &id, const GLookupOrder &order = {}) const;
+
+        // FIXME: No overwrites allowed?
+        /**
+         * @brief Put a new kernel implementation KImpl into package.
+         *
+         * @param up unite policy to use. If the package already has an
+         * implementation of this kernel (probably from another
+         * backend), and cv::unite_policy::KEEP is passed, the
+         * existing implementation remains in the package; on
+         * cv::unite_policy::REPLACE all other existing
+         * implementations are first dropped from the package.
+         */
+        template<typename KImpl>
+        void include(const cv::unite_policy up = cv::unite_policy::KEEP)
+        {
+            auto backend     = KImpl::backend();
+            auto kernel_id   = KImpl::API::id();
+            auto kernel_impl = GKernelImpl{KImpl::kernel()};
+            if (up == cv::unite_policy::REPLACE) removeAPI(kernel_id);
+            else GAPI_Assert(up == cv::unite_policy::KEEP);
+
+            // Regardless of the policy, store new impl in its storage slot.
+            m_backend_kernels[backend][kernel_id] = std::move(kernel_impl);
+        }
+
+        /**
+         * @brief Lists all backends which are included into package
+         *
+         * @return vector of backends
+         */
+        std::vector<GBackend> backends() const;
+
+        // TODO: Doxygen bug -- it wants me to place this comment
+        // here, not below.
+        /**
+         * @brief Create a new package based on `lhs` and `rhs`,
+         * with the unite policy defined by `policy`.
+         *
+         * @param lhs "Left-hand-side" package in the process
+         * @param rhs "Right-hand-side" package in the process
+         * @param policy Unite policy which is used in case of conflicts
+         * -- when the same kernel API is implemented in both packages by
+         * different backends; cv::unite_policy::KEEP keeps both
+         * implementations in the resulting package, while
+         * cv::unite_policy::REPLACE gives precedence to kernels from
+         * the "Right-hand-side".
+         *
+         * @return a new kernel package.
+         */
+        friend GAPI_EXPORTS GKernelPackage combine(const GKernelPackage  &lhs,
+                                                   const GKernelPackage  &rhs,
+                                                   const cv::unite_policy policy);
+    };
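+
+    // A short sketch of programmatic package population, as promised above.
+    // GMyCopyGPUImpl is a hypothetical kernel implementation type (e.g. one
+    // defined with a backend-specific macro such as GAPI_GPU_KERNEL):
+    //
+    //     cv::gapi::GKernelPackage pkg;              // empty package
+    //     pkg.include<GMyCopyGPUImpl>();             // unite_policy::KEEP by default
+    //     bool ok = pkg.includes<GMyCopyGPUImpl>();  // true now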
+
+    /**
+     * @brief Create a kernel package object containing kernels
+     * specified in variadic template argument.
+     *
+     * In G-API, kernel implementations are _types_. Every backend has
+     * its own kernel API (like GAPI_OCV_KERNEL() and
+     * GAPI_FLUID_KERNEL()) but all of those APIs define a new type for
+     * each kernel implementation.
+     *
+     * Use this function to pass kernel implementations (defined
+     * either way) to the system. Example:
+     *
+     * @snippet modules/gapi/samples/api_ref_snippets.cpp kernels_snippet
+     *
+     * Note that kernels() itself is a function returning an object, not
+     * a type, so having `()` at the end is important -- it must be a
+     * function call.
+     */
+    template<typename... KK> GKernelPackage kernels()
+    {
+        GKernelPackage pkg;
+
+        // For those who wonder - below is a trick to call a number of
+        // methods based on a parameter pack (the zeroes just help to hide these
+        // calls in a sequence which expands the parameter pack).
+        // Just note that `f(),a` always equals `a` (with f() called!)
+        // and parentheses are used to hide the function call in the expanded sequence.
+        // The leading 0 helps to handle the case when KK is an empty list (kernels<>()).
+
+        int unused[] = { 0, (pkg.include<KK>(), 0)... };
+        cv::util::suppress_unused_warning(unused);
+        return pkg;
+    }
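+
+    // A minimal sketch of kernels() usage; the implementation types listed
+    // here are hypothetical placeholders:
+    //
+    //     auto pkg = cv::gapi::kernels<GMyCopyGPUImpl, GMyBlurGPUImpl>();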
+
+    /** @} */
+
+    GAPI_EXPORTS GKernelPackage combine(const GKernelPackage  &lhs,
+                                        const GKernelPackage  &rhs,
+                                        const cv::unite_policy policy);
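+
+    // A hedged usage sketch: merge two packages (assumed to exist), letting
+    // `rhs` win on conflicting kernel APIs:
+    //
+    //     auto merged = cv::gapi::combine(pkgA, pkgB, cv::unite_policy::REPLACE);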
+} // namespace gapi
+
+namespace detail
+{
+    template<> struct CompileArgTag<cv::gapi::GKernelPackage>
+    {
+        static const char* tag() { return "gapi.kernel_package"; }
+    };
+    template<> struct CompileArgTag<cv::gapi::GLookupOrder>
+    {
+        static const char* tag() { return "gapi.lookup_order"; }
+    };
+} // namespace detail
+} // namespace cv
+
+#endif // OPENCV_GAPI_GKERNEL_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gmat.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gmat.hpp
new file mode 100644 (file)
index 0000000..0fa5342
--- /dev/null
@@ -0,0 +1,149 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GMAT_HPP
+#define OPENCV_GAPI_GMAT_HPP
+
+#include <ostream>
+#include <memory>                 // std::shared_ptr
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/gcommon.hpp> // GShape
+
+#include "opencv2/gapi/own/types.hpp" // cv::gapi::own::Size
+#include "opencv2/gapi/own/convert.hpp" // to_own
+#include "opencv2/gapi/own/assert.hpp"
+
+// TODO GAPI_EXPORTS or so
+namespace cv
+{
+// Forward declarations; GNode and GOrigin are internal
+// (user-inaccessible) classes.
+class GNode;
+struct GOrigin;
+
+/** \addtogroup gapi_data_objects
+ * @{
+ *
+ * @brief Data-representing objects which can be used to build G-API
+ * expressions.
+ */
+
+class GAPI_EXPORTS GMat
+{
+public:
+    GMat();                                 // Empty constructor
+    GMat(const GNode &n, std::size_t out);  // Operation result constructor
+
+    GOrigin& priv();                        // Internal use only
+    const GOrigin& priv()  const;           // Internal use only
+
+private:
+    std::shared_ptr<GOrigin> m_priv;
+};
+
+/** @} */
+
+/**
+ * \addtogroup gapi_meta_args
+ * @{
+ */
+struct GAPI_EXPORTS GMatDesc
+{
+    // FIXME: Default initializers in C++14
+    int depth;
+    int chan;
+    cv::gapi::own::Size size; // NB.: no multi-dimensional cases covered yet
+
+    inline bool operator== (const GMatDesc &rhs) const
+    {
+        return depth == rhs.depth && chan == rhs.chan && size == rhs.size;
+    }
+
+    inline bool operator!= (const GMatDesc &rhs) const
+    {
+        return !(*this == rhs);
+    }
+
+    // Meta combinator: return a new GMatDesc which differs in size by delta
+    // (all other fields are taken unchanged from this GMatDesc)
+    // FIXME: a better name?
+    GMatDesc withSizeDelta(cv::gapi::own::Size delta) const
+    {
+        GMatDesc desc(*this);
+        desc.size += delta;
+        return desc;
+    }
+#if !defined(GAPI_STANDALONE)
+    GMatDesc withSizeDelta(cv::Size delta) const
+    {
+        return withSizeDelta(to_own(delta));
+    }
+
+    GMatDesc withSize(cv::Size sz) const
+    {
+        return withSize(to_own(sz));
+    }
+#endif // !defined(GAPI_STANDALONE)
+    // Meta combinator: return a new GMatDesc which differs in size by delta
+    // (all other fields are taken unchanged from this GMatDesc)
+    //
+    // This is an overload.
+    GMatDesc withSizeDelta(int dx, int dy) const
+    {
+        return withSizeDelta(cv::gapi::own::Size{dx,dy});
+    }
+
+    GMatDesc withSize(cv::gapi::own::Size sz) const
+    {
+        GMatDesc desc(*this);
+        desc.size = sz;
+        return desc;
+    }
+
+    // Meta combinator: return a new GMatDesc with specified data depth.
+    // (all other fields are taken unchanged from this GMatDesc)
+    GMatDesc withDepth(int ddepth) const
+    {
+        GAPI_Assert(CV_MAT_CN(ddepth) == 1 || ddepth == -1);
+        GMatDesc desc(*this);
+        if (ddepth != -1) desc.depth = ddepth;
+        return desc;
+    }
+
+    // Meta combinator: return a new GMatDesc with specified data depth
+    // and number of channels.
+    // (all other fields are taken unchanged from this GMatDesc)
+    GMatDesc withType(int ddepth, int dchan) const
+    {
+        GAPI_Assert(CV_MAT_CN(ddepth) == 1 || ddepth == -1);
+        GMatDesc desc = withDepth(ddepth);
+        desc.chan = dchan;
+        return desc;
+    }
+};
+
+static inline GMatDesc empty_gmat_desc() { return GMatDesc{-1,-1,{-1,-1}}; }
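+
+// A minimal sketch of the meta combinators above; the 640x480 8-bit
+// single-channel descriptor is illustrative only:
+//
+//     cv::GMatDesc in{CV_8U, 1, {640, 480}};
+//     cv::GMatDesc half = in.withSizeDelta(-320, -240); // 320x240, CV_8UC1
+//     cv::GMatDesc fdsc = in.withDepth(CV_32F);         // 640x480, CV_32FC1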
+
+#if !defined(GAPI_STANDALONE)
+class Mat;
+GAPI_EXPORTS GMatDesc descr_of(const cv::Mat &mat);
+GAPI_EXPORTS GMatDesc descr_of(const cv::UMat &mat);
+#endif // !defined(GAPI_STANDALONE)
+
+/** @} */
+
+namespace gapi { namespace own {
+    class Mat;
+    GAPI_EXPORTS GMatDesc descr_of(const Mat &mat);
+}}//gapi::own
+
+std::ostream& operator<<(std::ostream& os, const cv::GMatDesc &desc);
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GMAT_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gmetaarg.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gmetaarg.hpp
new file mode 100644 (file)
index 0000000..473be34
--- /dev/null
@@ -0,0 +1,66 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GMETAARG_HPP
+#define OPENCV_GAPI_GMETAARG_HPP
+
+#include <vector>
+#include <type_traits>
+
+#include "opencv2/gapi/util/util.hpp"
+#include "opencv2/gapi/util/variant.hpp"
+
+#include "opencv2/gapi/gmat.hpp"
+#include "opencv2/gapi/gscalar.hpp"
+#include "opencv2/gapi/garray.hpp"
+
+namespace cv
+{
+// FIXME: Rename to GMeta?
+// FIXME: user shouldn't deal with it - put to detail?
+// GMetaArg is a union type over descriptions of G-types which can serve as
+// GComputation's in/output slots.
+//
+// GMetaArg objects are passed as arguments to GComputation::compile()
+// to specify which data a compiled computation should be specialized on.
+// For manual compile(), the user must supply this metadata; in the case of
+// apply(), the metadata is taken from the arguments the computation operates on.
+//
+// The first type (monostate) corresponds to an "uninitialized"/"unresolved" meta.
+using GMetaArg = util::variant
+    < util::monostate
+    , GMatDesc
+    , GScalarDesc
+    , GArrayDesc
+    >;
+std::ostream& operator<<(std::ostream& os, const GMetaArg &);
+
+using GMetaArgs = std::vector<GMetaArg>;
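+
+// A hedged sketch of supplying metadata manually for compile(); the
+// descriptor values are illustrative only:
+//
+//     cv::GMetaArgs metas = { cv::GMetaArg(cv::GMatDesc{CV_8U, 3, {640, 480}}) };
+//     // ...then pass `metas` to GComputation::compile()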
+
+namespace detail
+{
+    // These traits are used by GComputation::compile()
+
+    // FIXME: is_constructible<T> doesn't work as variant doesn't do any SFINAE
+    // in its current template constructor
+
+    template<typename T> struct is_meta_descr    : std::false_type {};
+    template<> struct is_meta_descr<GMatDesc>    : std::true_type {};
+    template<> struct is_meta_descr<GScalarDesc> : std::true_type {};
+    template<> struct is_meta_descr<GArrayDesc>  : std::true_type {};
+
+    template<typename... Ts>
+    using are_meta_descrs = all_satisfy<is_meta_descr, Ts...>;
+
+    template<typename... Ts>
+    using are_meta_descrs_but_last = all_satisfy<is_meta_descr, typename all_but_last<Ts...>::type>;
+
+} // namespace detail
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GMETAARG_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gproto.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gproto.hpp
new file mode 100644 (file)
index 0000000..8b53d9b
--- /dev/null
@@ -0,0 +1,96 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GPROTO_HPP
+#define OPENCV_GAPI_GPROTO_HPP
+
+#include <type_traits>
+#include <vector>
+#include <ostream>
+
+#include "opencv2/gapi/util/variant.hpp"
+
+#include "opencv2/gapi/gmat.hpp"
+#include "opencv2/gapi/gscalar.hpp"
+#include "opencv2/gapi/garray.hpp"
+#include "opencv2/gapi/garg.hpp"
+#include "opencv2/gapi/gmetaarg.hpp"
+
+namespace cv {
+
+// FIXME: user shouldn't deal with it - put to detail?
+// GProtoArg is a union type over G-types which can serve as
+// GComputation's in/output slots. In other words, GProtoArg
+// wraps any type which can serve as G-API exchange type.
+//
+// In Runtime, GProtoArgs are substituted with appropriate GRunArgs.
+//
+// GProtoArg objects are constructed in-place when the user describes
+// (captures) computations; the user doesn't interact with these types
+// directly.
+using GProtoArg = util::variant
+    < GMat
+    , GScalar
+    , detail::GArrayU // instead of GArray<T>
+    >;
+
+using GProtoArgs = std::vector<GProtoArg>;
+
+namespace detail
+{
+template<typename... Ts> inline GProtoArgs packArgs(Ts... args)
+{
+    return GProtoArgs{ GProtoArg(wrap_gapi_helper<Ts>::wrap(args))... };
+}
+
+}
+
+template<class Tag>
+struct GIOProtoArgs
+{
+public:
+    explicit GIOProtoArgs(const GProtoArgs& args) : m_args(args) {}
+    explicit GIOProtoArgs(GProtoArgs &&args)      : m_args(std::move(args)) {}
+
+    GProtoArgs m_args;
+};
+
+struct In_Tag{};
+struct Out_Tag{};
+
+using GProtoInputArgs  = GIOProtoArgs<In_Tag>;
+using GProtoOutputArgs = GIOProtoArgs<Out_Tag>;
+
+// Perfect forwarding
+template<typename... Ts> inline GProtoInputArgs GIn(Ts&&... ts)
+{
+    return GProtoInputArgs(detail::packArgs(std::forward<Ts>(ts)...));
+}
+
+template<typename... Ts> inline GProtoOutputArgs GOut(Ts&&... ts)
+{
+    return GProtoOutputArgs(detail::packArgs(std::forward<Ts>(ts)...));
+}
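+
+// A minimal sketch of GIn()/GOut() when defining a computation. GMyCopy is a
+// hypothetical kernel API (declared elsewhere with G_TYPED_KERNEL):
+//
+//     cv::GMat in;
+//     cv::GMat out = GMyCopy::on(in);
+//     cv::GComputation c(cv::GIn(in), cv::GOut(out));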
+
+// Extract run-time arguments from node origin
+// Can be used to extract constant values associated with G-objects
+// (like GScalar) at graph construction time
+GRunArg value_of(const GOrigin &origin);
+
+// Transform run-time computation arguments into a collection of metadata
+// extracted from those arguments
+GMetaArg  GAPI_EXPORTS descr_of(const GRunArg  &arg );
+GMetaArgs GAPI_EXPORTS descr_of(const GRunArgs &args);
+
+// Transform a run-time operation result argument into metadata extracted from that argument
+// Used to compare the metadata generated at compile time with the metadata of the operation result at run time
+GMetaArg  GAPI_EXPORTS descr_of(const GRunArgP& argp);
+
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GPROTO_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/core.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/core.hpp
new file mode 100644 (file)
index 0000000..98d49b5
--- /dev/null
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GPU_CORE_API_HPP
+#define OPENCV_GAPI_GPU_CORE_API_HPP
+
+#include <opencv2/core/cvdef.h>     // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+
+namespace cv {
+namespace gapi {
+namespace core {
+namespace gpu {
+
+GAPI_EXPORTS GKernelPackage kernels();
+
+} // namespace gpu
+} // namespace core
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_GPU_CORE_API_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/ggpukernel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/ggpukernel.hpp
new file mode 100644 (file)
index 0000000..e5a6215
--- /dev/null
@@ -0,0 +1,244 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GGPUKERNEL_HPP
+#define OPENCV_GAPI_GGPUKERNEL_HPP
+
+#include <vector>
+#include <functional>
+#include <map>
+#include <unordered_map>
+
+#include <opencv2/core/mat.hpp>
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/gapi/gkernel.hpp>
+#include <opencv2/gapi/garg.hpp>
+
+// FIXME: namespace scheme for backends?
+namespace cv {
+
+namespace gimpl
+{
+    // Forward-declare an internal class
+    class GGPUExecutable;
+} // namespace gimpl
+
+namespace gapi
+{
+namespace gpu
+{
+    /**
+     * \addtogroup gapi_std_backends G-API Standard backends
+     * @{
+     */
+    /**
+     * @brief Get a reference to GPU backend.
+     *
+     * At the moment, the GPU backend is built atop the OpenCV
+     * "Transparent API" (T-API), see cv::UMat for details.
+     *
+     * @sa gapi_std_backends
+     */
+    GAPI_EXPORTS cv::gapi::GBackend backend();
+    /** @} */
+} // namespace gpu
+} // namespace gapi
+
+
+// Represents arguments which are passed to a wrapped GPU function
+// FIXME: put into detail?
+class GAPI_EXPORTS GGPUContext
+{
+public:
+    // Generic accessor API
+    template<typename T>
+    const T& inArg(int input) { return m_args.at(input).get<T>(); }
+
+    // Syntax sugar
+    const cv::UMat&  inMat(int input);
+    cv::UMat&  outMatR(int output); // FIXME: Avoid cv::Mat m = ctx.outMatR()
+
+    const cv::gapi::own::Scalar& inVal(int input);
+    cv::gapi::own::Scalar& outValR(int output); // FIXME: Avoid cv::gapi::own::Scalar s = ctx.outValR()
+    template<typename T> std::vector<T>& outVecR(int output) // FIXME: the same issue
+    {
+        return outVecRef(output).wref<T>();
+    }
+
+protected:
+    detail::VectorRef& outVecRef(int output);
+
+    std::vector<GArg> m_args;
+    std::unordered_map<std::size_t, GRunArgP> m_results;
+
+
+    friend class gimpl::GGPUExecutable;
+};
+
+class GAPI_EXPORTS GGPUKernel
+{
+public:
+    // This function is kernel's execution entry point (does the processing work)
+    using F = std::function<void(GGPUContext &)>;
+
+    GGPUKernel();
+    explicit GGPUKernel(const F& f);
+
+    void apply(GGPUContext &ctx);
+
+protected:
+    F m_f;
+};
+
+// FIXME: This is an ugly ad-hoc implementation. TODO: refactor
+
+namespace detail
+{
+template<class T> struct gpu_get_in;
+template<> struct gpu_get_in<cv::GMat>
+{
+    static cv::UMat    get(GGPUContext &ctx, int idx) { return ctx.inMat(idx); }
+};
+template<> struct gpu_get_in<cv::GScalar>
+{
+    static cv::Scalar get(GGPUContext &ctx, int idx) { return to_ocv(ctx.inVal(idx)); }
+};
+template<typename U> struct gpu_get_in<cv::GArray<U> >
+{
+    static const std::vector<U>& get(GGPUContext &ctx, int idx) { return ctx.inArg<VectorRef>(idx).rref<U>(); }
+};
+template<class T> struct gpu_get_in
+{
+    static T get(GGPUContext &ctx, int idx) { return ctx.inArg<T>(idx); }
+};
+
+struct tracked_cv_umat{
+    // TODO: Think if T-API could reallocate UMat to a proper size - how do we handle this?
+    //tracked_cv_umat(cv::UMat& m) : r{(m)}, original_data{m.getMat(ACCESS_RW).data} {}
+    tracked_cv_umat(cv::UMat& m) : r{ (m) }, original_data{ nullptr } {}
+    cv::UMat r;
+    uchar* original_data;
+
+    operator cv::UMat& (){ return r;}
+    void validate() const{
+        //if (r.getMat(ACCESS_RW).data != original_data)
+        //{
+        //    util::throw_error
+        //        (std::logic_error
+        //         ("OpenCV kernel output parameter was reallocated. \n"
+        //          "Incorrect meta data was provided ?"));
+        //}
+
+    }
+};
+
+struct scalar_wrapper_gpu
+{
+    //FIXME reuse CPU (OpenCV) plugin code
+    scalar_wrapper_gpu(cv::gapi::own::Scalar& s) : m_s{cv::gapi::own::to_ocv(s)}, m_org_s(s) {};
+    operator cv::Scalar& () { return m_s; }
+    void writeBack() const  { m_org_s = to_own(m_s); }
+
+    cv::Scalar m_s;
+    cv::gapi::own::Scalar& m_org_s;
+};
+
+template<typename... Outputs>
+void postprocess_gpu(Outputs&... outs)
+{
+    struct
+    {
+        void operator()(tracked_cv_umat* bm) { bm->validate(); }
+        void operator()(scalar_wrapper_gpu* sw) { sw->writeBack(); }
+        void operator()(...) {                  }
+
+    } validate;
+    //dummy array to unfold parameter pack
+    int dummy[] = { 0, (validate(&outs), 0)... };
+    cv::util::suppress_unused_warning(dummy);
+}
+
+template<class T> struct gpu_get_out;
+template<> struct gpu_get_out<cv::GMat>
+{
+    static tracked_cv_umat get(GGPUContext &ctx, int idx)
+    {
+        auto& r = ctx.outMatR(idx);
+        return{ r };
+    }
+};
+template<> struct gpu_get_out<cv::GScalar>
+{
+    static scalar_wrapper_gpu get(GGPUContext &ctx, int idx)
+    {
+        auto& s = ctx.outValR(idx);
+        return{ s };
+    }
+};
+template<typename U> struct gpu_get_out<cv::GArray<U> >
+{
+    static std::vector<U>& get(GGPUContext &ctx, int idx) { return ctx.outVecR<U>(idx);  }
+};
+
+template<typename, typename, typename>
+struct GPUCallHelper;
+
+// FIXME: probably can be simplified with std::apply or analogue.
+template<typename Impl, typename... Ins, typename... Outs>
+struct GPUCallHelper<Impl, std::tuple<Ins...>, std::tuple<Outs...> >
+{
+    template<typename... Inputs>
+    struct call_and_postprocess
+    {
+        template<typename... Outputs>
+        static void call(Inputs&&... ins, Outputs&&... outs)
+        {
+            // Not using std::forward on outs is deliberate in order to
+            // cause a compilation error by trying to bind rvalue references to lvalue references
+            Impl::run(std::forward<Inputs>(ins)..., outs...);
+
+            postprocess_gpu(outs...);
+        }
+    };
+
+    template<int... IIs, int... OIs>
+    static void call_impl(GGPUContext &ctx, detail::Seq<IIs...>, detail::Seq<OIs...>)
+    {
+        // TODO: Make sure that OpenCV kernels do not reallocate memory for output parameters
+        // by comparing their state (data ptr) before and after the call.
+        // Convert own::Scalar to cv::Scalar before calling the kernel, run the kernel,
+        // then convert cv::Scalar back to own::Scalar and write back the results.
+        call_and_postprocess<decltype(gpu_get_in<Ins>::get(ctx, IIs))...>::call(gpu_get_in<Ins>::get(ctx, IIs)..., gpu_get_out<Outs>::get(ctx, OIs)...);
+    }
+
+    static void call(GGPUContext &ctx)
+    {
+        call_impl(ctx,
+            typename detail::MkSeq<sizeof...(Ins)>::type(),
+            typename detail::MkSeq<sizeof...(Outs)>::type());
+    }
+};
+
+} // namespace detail
+
+template<class Impl, class K>
+class GGPUKernelImpl: public detail::GPUCallHelper<Impl, typename K::InArgs, typename K::OutArgs>
+{
+    using P = detail::GPUCallHelper<Impl, typename K::InArgs, typename K::OutArgs>;
+
+public:
+    using API = K;
+
+    static cv::gapi::GBackend backend()  { return cv::gapi::gpu::backend(); }
+    static cv::GGPUKernel     kernel()   { return GGPUKernel(&P::call);     }
+};
+
+#define GAPI_GPU_KERNEL(Name, API) struct Name: public cv::GGPUKernelImpl<Name, API>
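+
+// A hedged sketch of a GPU kernel implementation. GMyCopy is a hypothetical
+// GMat->GMat kernel API; the run() signature follows GPUCallHelper's
+// argument expansion above:
+//
+//     GAPI_GPU_KERNEL(GGPUMyCopy, GMyCopy)
+//     {
+//         static void run(const cv::UMat &in, cv::UMat &out)
+//         {
+//             in.copyTo(out);
+//         }
+//     };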
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GGPUKERNEL_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/imgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gpu/imgproc.hpp
new file mode 100644 (file)
index 0000000..6071dda
--- /dev/null
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GPU_IMGPROC_API_HPP
+#define OPENCV_GAPI_GPU_IMGPROC_API_HPP
+
+#include <opencv2/core/cvdef.h>     // GAPI_EXPORTS
+#include <opencv2/gapi/gkernel.hpp> // GKernelPackage
+
+namespace cv {
+namespace gapi {
+namespace imgproc {
+namespace gpu {
+
+GAPI_EXPORTS GKernelPackage kernels();
+
+} // namespace gpu
+} // namespace imgproc
+} // namespace gapi
+} // namespace cv
+
+
+#endif // OPENCV_GAPI_GPU_IMGPROC_API_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gscalar.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gscalar.hpp
new file mode 100644 (file)
index 0000000..dd1205b
--- /dev/null
@@ -0,0 +1,83 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GSCALAR_HPP
+#define OPENCV_GAPI_GSCALAR_HPP
+
+#include <ostream>
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/gcommon.hpp> // GShape
+#include <opencv2/gapi/util/optional.hpp>
+#include "opencv2/gapi/own/scalar.hpp"
+
+namespace cv
+{
+// Forward declarations; GNode and GOrigin are internal
+// (user-inaccessible) classes.
+class GNode;
+struct GOrigin;
+
+/** \addtogroup gapi_data_objects
+ * @{
+ */
+
+class GAPI_EXPORTS GScalar
+{
+public:
+    GScalar();                                         // Empty constructor
+    explicit GScalar(const cv::gapi::own::Scalar& s);  // Constant value constructor from cv::gapi::own::Scalar
+    explicit GScalar(cv::gapi::own::Scalar&& s);       // Constant value move-constructor from cv::gapi::own::Scalar
+#if !defined(GAPI_STANDALONE)
+    explicit GScalar(const cv::Scalar& s);             // Constant value constructor from cv::Scalar
+#endif  // !defined(GAPI_STANDALONE)
+    GScalar(double v0);                                // Constant value constructor from double
+    GScalar(const GNode &n, std::size_t out);          // Operation result constructor
+
+    GOrigin& priv();                                   // Internal use only
+    const GOrigin& priv()  const;                      // Internal use only
+
+private:
+    std::shared_ptr<GOrigin> m_priv;
+};
+
+/** @} */
+
+/**
+ * \addtogroup gapi_meta_args
+ * @{
+ */
+struct GScalarDesc
+{
+    // NB.: right now it is empty
+
+    inline bool operator== (const GScalarDesc &) const
+    {
+        return true; // NB: implement this method if GScalar meta appears
+    }
+
+    inline bool operator!= (const GScalarDesc &rhs) const
+    {
+        return !(*this == rhs);
+    }
+};
+
+static inline GScalarDesc empty_scalar_desc() { return GScalarDesc(); }
+
+#if !defined(GAPI_STANDALONE)
+GAPI_EXPORTS GScalarDesc descr_of(const cv::Scalar            &scalar);
+#endif // !defined(GAPI_STANDALONE)
+/** @} */
+
+GAPI_EXPORTS GScalarDesc descr_of(const cv::gapi::own::Scalar &scalar);
+
+std::ostream& operator<<(std::ostream& os, const cv::GScalarDesc &desc);
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_GSCALAR_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gtype_traits.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gtype_traits.hpp
new file mode 100644 (file)
index 0000000..d05e02e
--- /dev/null
@@ -0,0 +1,152 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GTYPE_TRAITS_HPP
+#define OPENCV_GAPI_GTYPE_TRAITS_HPP
+
+#include <vector>
+#include <type_traits>
+
+#include <opencv2/gapi/gmat.hpp>
+#include <opencv2/gapi/gscalar.hpp>
+#include <opencv2/gapi/garray.hpp>
+#include <opencv2/gapi/gcommon.hpp>
+#include <opencv2/gapi/own/convert.hpp>
+
+namespace cv
+{
+namespace detail
+{
+    // FIXME: These traits, the enum, and the possibly numerous switch(kind)
+    // blocks may be replaced with a special Handler<T> object or with
+    // a double dispatch
+    enum class ArgKind: int
+    {
+        OPAQUE,       // Unknown, generic, opaque-to-GAPI data type - STATIC
+        GOBJREF,      // <internal> reference to object
+        GMAT,         // a cv::GMat
+        GSCALAR,      // a cv::GScalar
+        GARRAY,       // a cv::GArrayU (note - exactly GArrayU, not GArray<T>!)
+    };
+
+    // Describe G-API types (G-types) with traits.  Mostly used by
+    // cv::GArg to store meta information about types passed into
+    // operation arguments. Please note that cv::GComputation is
+    // defined on GProtoArgs, not GArgs!
+    template<typename T> struct GTypeTraits;
+    template<typename T> struct GTypeTraits
+    {
+        static constexpr const ArgKind kind = ArgKind::OPAQUE;
+    };
+    template<>           struct GTypeTraits<cv::GMat>
+    {
+        static constexpr const ArgKind kind = ArgKind::GMAT;
+        static constexpr const GShape shape = GShape::GMAT;
+    };
+    template<>           struct GTypeTraits<cv::GScalar>
+    {
+        static constexpr const ArgKind kind = ArgKind::GSCALAR;
+        static constexpr const GShape shape = GShape::GSCALAR;
+    };
+    template<class T> struct GTypeTraits<cv::GArray<T> >
+    {
+        static constexpr const ArgKind kind = ArgKind::GARRAY;
+        static constexpr const GShape shape = GShape::GARRAY;
+        using host_type  = std::vector<T>;
+        using strip_type = cv::detail::VectorRef;
+        static cv::detail::GArrayU   wrap_value(const cv::GArray<T>  &t) { return t.strip();}
+        static cv::detail::VectorRef wrap_in   (const std::vector<T> &t) { return detail::VectorRef(t); }
+        static cv::detail::VectorRef wrap_out  (      std::vector<T> &t) { return detail::VectorRef(t); }
+    };
+
+    // Tests if Trait for type T requires extra marshalling ("custom wrap") or not.
+    // If Traits<T> has wrap_value() defined, it does.
+    template<class T> struct has_custom_wrap
+    {
+        template<class,class> class check;
+        template<typename C> static std::true_type  test(check<C, decltype(&GTypeTraits<C>::wrap_value)> *);
+        template<typename C> static std::false_type test(...);
+        using type = decltype(test<T>(nullptr));
+        static const constexpr bool value = std::is_same<std::true_type, decltype(test<T>(nullptr))>::value;
+    };
+
+    // Resolve a Host type back to its associated G-Type.
+    // FIXME: Probably it can be avoided
+    template<typename T> struct GTypeOf;
+#if !defined(GAPI_STANDALONE)
+    template<>           struct GTypeOf<cv::Mat>               { using type = cv::GMat;      };
+    template<>           struct GTypeOf<cv::Scalar>            { using type = cv::GScalar;   };
+#endif // !defined(GAPI_STANDALONE)
+    template<>           struct GTypeOf<cv::gapi::own::Mat>    { using type = cv::GMat;      };
+    template<>           struct GTypeOf<cv::gapi::own::Scalar> { using type = cv::GScalar;   };
+    template<typename U> struct GTypeOf<std::vector<U> >       { using type = cv::GArray<U>; };
+    template<class T> using g_type_of_t = typename GTypeOf<T>::type;
+
+    // Marshalling helper for G-types and its Host types. Helps G-API
+    // to store G types in internal generic containers for further
+    // processing. Implements the following callbacks:
+    //
+    // * wrap() - converts user-facing G-type into an internal one
+    //   for internal storage.
+    //   Used when G-API operation is instantiated (G<Kernel>::on(),
+    //   etc) during expressing a pipeline. Mostly returns input
+    //   value "as is" except the case when G-type is a template. For
+    //   template G-classes, calls custom wrap() from Traits.
+    //   The value returned by wrap() is then wrapped into GArg() and
+    //   stored in G-API metadata.
+    //
+    //   Example:
+    //   - cv::GMat arguments are passed as-is.
+    //   - integers, pointers, STL containers, user types are passed as-is.
+    //   - cv::GArray<T> is converted to cv::GArrayU.
+    //
+    // * wrap_in() / wrap_out() - convert Host type associated with
+    //   G-type to internal representation type.
+    //
+    //   - For "simple" (non-template) G-types, returns value as-is.
+    //     Example: cv::GMat has host type cv::Mat, when user passes a
+    //              cv::Mat, system stores it internally as cv::Mat.
+    //
+    //   - For "complex" (template) G-types, utilizes custom
+    //     wrap_in()/wrap_out() as described in Traits.
+    //     Example: cv::GArray<T> has host type std::vector<T>, when
+    //              user passes a std::vector<T>, system stores it
+    //              internally as VectorRef (with <T> stripped away).
+    template<typename T, class Custom = void> struct WrapValue
+    {
+        static auto wrap(const T& t) ->
+            typename std::remove_reference<T>::type
+        {
+            return static_cast<typename std::remove_reference<T>::type>(t);
+        }
+
+        template<typename U> static U  wrap_in (const U &u) { return  u;  }
+        template<typename U> static U* wrap_out(U &u)       { return &u;  }
+    };
+    template<typename T> struct WrapValue<T, typename std::enable_if<has_custom_wrap<T>::value>::type>
+    {
+        static auto wrap(const T& t) -> decltype(GTypeTraits<T>::wrap_value(t))
+        {
+            return GTypeTraits<T>::wrap_value(t);
+        }
+        template<typename U> static auto wrap_in (const U &u) -> typename GTypeTraits<T>::strip_type
+        {
+            return GTypeTraits<T>::wrap_in(u);
+        }
+        template<typename U> static auto wrap_out(U &u) -> typename GTypeTraits<T>::strip_type
+        {
+            return GTypeTraits<T>::wrap_out(u);
+        }
+    };
+
+    template<typename T> using wrap_gapi_helper = WrapValue<typename std::decay<T>::type>;
+    template<typename T> using wrap_host_helper = WrapValue<typename std::decay<g_type_of_t<T> >::type>;
+
+} // namespace detail
+} // namespace cv
+
+#endif // OPENCV_GAPI_GTYPE_TRAITS_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gtyped.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/gtyped.hpp
new file mode 100644 (file)
index 0000000..a966f26
--- /dev/null
@@ -0,0 +1,187 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GTYPED_HPP
+#define OPENCV_GAPI_GTYPED_HPP
+#if !defined(GAPI_STANDALONE)
+
+#include <vector>
+
+#include "opencv2/gapi/gcomputation.hpp"
+#include "opencv2/gapi/gcompiled.hpp"
+#include "opencv2/gapi/gproto.hpp"
+#include "opencv2/gapi/gcommon.hpp"
+
+namespace cv {
+
+namespace detail
+{
+    // FIXME: How to prevent coolhackers from extending it by their own types?
+    // FIXME: ...Should we care?
+    template<typename T> struct ProtoToParam;
+    template<> struct ProtoToParam<cv::GMat>    { using type = cv::Mat; };
+    template<> struct ProtoToParam<cv::GScalar> { using type = cv::Scalar; };
+    template<typename U> struct ProtoToParam<cv::GArray<U> > { using type = std::vector<U>; };
+    template<typename T> using ProtoToParamT = typename ProtoToParam<T>::type;
+
+    template<typename T> struct ProtoToMeta;
+    template<> struct ProtoToMeta<cv::GMat>     { using type = cv::GMatDesc; };
+    template<> struct ProtoToMeta<cv::GScalar>  { using type = cv::GScalarDesc; };
+    template<typename U> struct ProtoToMeta<cv::GArray<U> > { using type = cv::GArrayDesc; };
+    template<typename T> using ProtoToMetaT = typename ProtoToMeta<T>::type;
+
+    // Workaround for MSVC 19.0 bug
+    template <typename T>
+    auto make_default()->decltype(T{}) {return {};}
+} // namespace detail
+
+template<typename> class GComputationT;
+
+// Single return value implementation
+template<typename R, typename... Args> class GComputationT<R(Args...)>
+{
+public:
+    typedef std::function<R(Args...)> Gen;
+
+    class GCompiledT
+    {
+    private:
+        friend class GComputationT<R(Args...)>;
+
+        cv::GCompiled m_comp;
+
+        explicit GCompiledT(const cv::GCompiled &comp) : m_comp(comp) {}
+
+    public:
+        GCompiledT() {}
+
+        void operator()(detail::ProtoToParamT<Args>... inArgs,
+                        detail::ProtoToParamT<R> &outArg)
+        {
+            m_comp(cv::gin(inArgs...), cv::gout(outArg));
+        }
+
+        explicit operator bool() const
+        {
+            return static_cast<bool>(m_comp);
+        }
+    };
+
+private:
+    typedef std::pair<R, GProtoInputArgs > Captured;
+
+    Captured capture(const Gen& g, Args... args)
+    {
+        return Captured(g(args...), cv::GIn(args...));
+    }
+
+    Captured m_capture;
+    cv::GComputation m_comp;
+
+public:
+    GComputationT(const Gen &generator)
+        : m_capture(capture(generator, detail::make_default<Args>()...))
+        , m_comp(cv::GProtoInputArgs(std::move(m_capture.second)),
+                 cv::GOut(m_capture.first))
+    {
+    }
+
+    void apply(detail::ProtoToParamT<Args>... inArgs,
+               detail::ProtoToParamT<R> &outArg)
+    {
+        m_comp.apply(cv::gin(inArgs...), cv::gout(outArg));
+    }
+
+    GCompiledT compile(detail::ProtoToMetaT<Args>... inDescs)
+    {
+        GMetaArgs inMetas = { GMetaArg(inDescs)... };
+        return GCompiledT(m_comp.compile(std::move(inMetas), GCompileArgs()));
+    }
+
+    GCompiledT compile(detail::ProtoToMetaT<Args>... inDescs, GCompileArgs &&args)
+    {
+        GMetaArgs inMetas = { GMetaArg(inDescs)... };
+        return GCompiledT(m_comp.compile(std::move(inMetas), std::move(args)));
+    }
+};
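+
+// A minimal usage sketch of the single-return specialization; the graph body
+// (a 3x3 box filter via cv::gapi::filter2D) is illustrative only:
+//
+//     cv::GComputationT<cv::GMat(cv::GMat)> box([](cv::GMat in) {
+//         cv::Mat k = cv::Mat::ones(3, 3, CV_32F) / 9.0f;
+//         return cv::gapi::filter2D(in, -1, k);
+//     });
+//     cv::Mat input = cv::Mat::zeros(480, 640, CV_8UC1), output;
+//     box.apply(input, output);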
+
+// Multiple (fixed) return value implementation. FIXME: How to avoid copy-paste?
+template<typename... R, typename... Args> class GComputationT<std::tuple<R...>(Args...)>
+{
+public:
+    typedef std::function<std::tuple<R...>(Args...)> Gen;
+
+    class GCompiledT
+    {
+    private:
+        friend class GComputationT<std::tuple<R...>(Args...)>;
+
+        cv::GCompiled m_comp;
+        explicit GCompiledT(const cv::GCompiled &comp) : m_comp(comp) {}
+
+    public:
+        GCompiledT() {}
+
+        void operator()(detail::ProtoToParamT<Args>... inArgs,
+                        detail::ProtoToParamT<R>&... outArgs)
+        {
+            m_comp(cv::gin(inArgs...), cv::gout(outArgs...));
+        }
+
+        explicit operator bool() const
+        {
+            return static_cast<bool>(m_comp);
+        }
+    };
+
+private:
+    typedef std::pair<GProtoArgs, GProtoArgs> Captured;
+
+    template<int... IIs>
+    Captured capture(GProtoArgs &&args, const std::tuple<R...> &rr, detail::Seq<IIs...>)
+    {
+        return Captured(cv::GOut(std::get<IIs>(rr)...).m_args, args);
+    }
+
+    Captured capture(const Gen& g, Args... args)
+    {
+        return capture(cv::GIn(args...).m_args, g(args...), typename detail::MkSeq<sizeof...(R)>::type());
+    }
+
+    Captured m_capture;
+    cv::GComputation m_comp;
+
+public:
+    GComputationT(const Gen &generator)
+        : m_capture(capture(generator, detail::make_default<Args>()...))
+        , m_comp(cv::GProtoInputArgs(std::move(m_capture.second)),
+                 cv::GProtoOutputArgs(std::move(m_capture.first)))
+    {
+    }
+
+    void apply(detail::ProtoToParamT<Args>... inArgs,
+               detail::ProtoToParamT<R>&... outArgs)
+    {
+        m_comp.apply(cv::gin(inArgs...), cv::gout(outArgs...));
+    }
+
+    GCompiledT compile(detail::ProtoToMetaT<Args>... inDescs)
+    {
+        GMetaArgs inMetas = { GMetaArg(inDescs)... };
+        return GCompiledT(m_comp.compile(std::move(inMetas), GCompileArgs()));
+    }
+
+    GCompiledT compile(detail::ProtoToMetaT<Args>... inDescs, GCompileArgs &&args)
+    {
+        GMetaArgs inMetas = { GMetaArg(inDescs)... };
+        return GCompiledT(m_comp.compile(std::move(inMetas), std::move(args)));
+    }
+};
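+
+// A hedged sketch of the multiple-return specialization; the two-output
+// graph body (both outputs derived via cv::gapi::filter2D) is illustrative:
+//
+//     cv::GComputationT<std::tuple<cv::GMat, cv::GMat>(cv::GMat)> two(
+//         [](cv::GMat in) {
+//             cv::Mat k = cv::Mat::ones(3, 3, CV_32F) / 9.0f;
+//             cv::GMat once = cv::gapi::filter2D(in, -1, k);
+//             return std::make_tuple(once, cv::gapi::filter2D(once, -1, k));
+//         });
+//     cv::Mat in = cv::Mat::zeros(480, 640, CV_8UC1), o1, o2;
+//     two.apply(in, o1, o2);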
+
+} // namespace cv
+#endif // !defined(GAPI_STANDALONE)
+#endif // OPENCV_GAPI_GTYPED_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/imgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/imgproc.hpp
new file mode 100644 (file)
index 0000000..aeed9fa
--- /dev/null
@@ -0,0 +1,677 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_IMGPROC_HPP
+#define OPENCV_GAPI_IMGPROC_HPP
+
+#include "opencv2/imgproc.hpp"
+
+#include <utility> // std::tuple
+
+#include "opencv2/gapi/gkernel.hpp"
+#include "opencv2/gapi/gmat.hpp"
+#include "opencv2/gapi/gscalar.hpp"
+
+
+/** \defgroup gapi_imgproc G-API image processing functionality
+@{
+    @defgroup gapi_filters Graph API: Image filters
+    @defgroup gapi_colorconvert Graph API: Converting image from one color space to another
+@}
+ */
+
+namespace cv { namespace gapi {
+
+namespace imgproc {
+    using GMat3 = std::tuple<GMat,GMat,GMat>; // FIXME: how to avoid this?
+
+    G_TYPED_KERNEL(GFilter2D, <GMat(GMat,int,Mat,Point,Scalar,int,Scalar)>,"org.opencv.imgproc.filters.filter2D") {
+        static GMatDesc outMeta(GMatDesc in, int ddepth, Mat, Point, Scalar, int, Scalar) {
+            return in.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GSepFilter, <GMat(GMat,int,Mat,Mat,Point,Scalar,int,Scalar)>, "org.opencv.imgproc.filters.sepfilter") {
+        static GMatDesc outMeta(GMatDesc in, int ddepth, Mat, Mat, Point, Scalar, int, Scalar) {
+            return in.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GBoxFilter, <GMat(GMat,int,Size,Point,bool,int,Scalar)>, "org.opencv.imgproc.filters.boxfilter") {
+        static GMatDesc outMeta(GMatDesc in, int ddepth, Size, Point, bool, int, Scalar) {
+            return in.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GBlur, <GMat(GMat,Size,Point,int,Scalar)>,         "org.opencv.imgproc.filters.blur"){
+        static GMatDesc outMeta(GMatDesc in, Size, Point, int, Scalar) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GGaussBlur, <GMat(GMat,Size,double,double,int,Scalar)>, "org.opencv.imgproc.filters.gaussianBlur") {
+        static GMatDesc outMeta(GMatDesc in, Size, double, double, int, Scalar) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GMedianBlur, <GMat(GMat,int)>, "org.opencv.imgproc.filters.medianBlur") {
+        static GMatDesc outMeta(GMatDesc in, int) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GErode, <GMat(GMat,Mat,Point,int,int,Scalar)>, "org.opencv.imgproc.filters.erode") {
+        static GMatDesc outMeta(GMatDesc in, Mat, Point, int, int, Scalar) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GDilate, <GMat(GMat,Mat,Point,int,int,Scalar)>, "org.opencv.imgproc.filters.dilate") {
+        static GMatDesc outMeta(GMatDesc in, Mat, Point, int, int, Scalar) {
+            return in;
+        }
+    };
+
+    G_TYPED_KERNEL(GSobel, <GMat(GMat,int,int,int,int,double,double,int,Scalar)>, "org.opencv.imgproc.filters.sobel") {
+        static GMatDesc outMeta(GMatDesc in, int ddepth, int, int, int, double, double, int, Scalar) {
+            return in.withDepth(ddepth);
+        }
+    };
+
+    G_TYPED_KERNEL(GEqHist, <GMat(GMat)>, "org.opencv.imgproc.equalizeHist"){
+        static GMatDesc outMeta(GMatDesc in) {
+            return in.withType(CV_8U, 1);
+        }
+    };
+
+    G_TYPED_KERNEL(GCanny, <GMat(GMat,double,double,int,bool)>, "org.opencv.imgproc.canny"){
+        static GMatDesc outMeta(GMatDesc in, double, double, int, bool) {
+            return in.withType(CV_8U, 1);
+        }
+    };
+
+    G_TYPED_KERNEL(GRGB2YUV, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.rgb2yuv") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GYUV2RGB, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.yuv2rgb") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GRGB2Lab, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.rgb2lab") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GBGR2LUV, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.bgr2luv") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GLUV2BGR, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.luv2bgr") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GYUV2BGR, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.yuv2bgr") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GBGR2YUV, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.bgr2yuv") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in; // type still remains CV_8UC3;
+        }
+    };
+
+    G_TYPED_KERNEL(GRGB2Gray, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.rgb2gray") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in.withType(CV_8U, 1);
+        }
+    };
+
+    G_TYPED_KERNEL(GRGB2GrayCustom, <GMat(GMat,float,float,float)>, "org.opencv.imgproc.colorconvert.rgb2graycustom") {
+        static GMatDesc outMeta(GMatDesc in, float, float, float) {
+            return in.withType(CV_8U, 1);
+        }
+    };
+
+    G_TYPED_KERNEL(GBGR2Gray, <GMat(GMat)>, "org.opencv.imgproc.colorconvert.bgr2gray") {
+        static GMatDesc outMeta(GMatDesc in) {
+            return in.withType(CV_8U, 1);
+        }
+    };
+}
+
+
+//! @addtogroup gapi_filters
+//! @{
+/** @brief Applies a separable linear filter to a matrix (image).
+
+The function applies a separable linear filter to the matrix. That is, first, every row of src is
+filtered with the 1D kernel kernelX. Then, every column of the result is filtered with the 1D
+kernel kernelY. The final result is returned.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note In case of floating-point computation, rounding to nearest even is performed
+if hardware supports it (if not - to the nearest value).
+
+@note Function textual ID is "org.opencv.imgproc.filters.sepfilter"
+@param src Source image.
+@param ddepth desired depth of the destination image (the following combinations of src.depth() and ddepth are supported:
+
+        src.depth() = CV_8U, ddepth = -1/CV_16S/CV_32F/CV_64F
+        src.depth() = CV_16U/CV_16S, ddepth = -1/CV_32F/CV_64F
+        src.depth() = CV_32F, ddepth = -1/CV_32F/CV_64F
+        src.depth() = CV_64F, ddepth = -1/CV_64F
+
+when ddepth=-1, the output image will have the same depth as the source)
+@param kernelX Coefficients for filtering each row.
+@param kernelY Coefficients for filtering each column.
+@param anchor Anchor position within the kernel. The default value \f$(-1,-1)\f$ means that the anchor
+is at the kernel center.
+@param delta Value added to the filtered results before storing them.
+@param borderType Pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of constant border type
+@sa  boxFilter, gaussianBlur, medianBlur
+ */
+GAPI_EXPORTS GMat sepFilter(const GMat& src, int ddepth, const Mat& kernelX, const Mat& kernelY, const Point& anchor /*FIXME: = Point(-1,-1)*/,
+                            const Scalar& delta /*FIXME = GScalar(0)*/, int borderType = BORDER_DEFAULT,
+                            const Scalar& borderValue = Scalar(0));
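+
+// Illustrative usage sketch (not part of the original header); kx, ky and the
+// input/output Mats are placeholder names:
+//
+//     cv::GMat in;
+//     cv::GMat out = cv::gapi::sepFilter(in, CV_32F, kx, ky,
+//                                        cv::Point(-1,-1), cv::Scalar(0));
+//     cv::GComputation(in, out).apply(input, output);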
+
+/** @brief Convolves an image with the kernel.
+
+The function applies an arbitrary linear filter to an image. When
+the aperture is partially outside the image, the function interpolates outlier pixel values
+according to the specified border mode.
+
+The function does actually compute correlation, not the convolution:
+
+\f[\texttt{dst} (x,y) =  \sum _{ \stackrel{0\leq x' < \texttt{kernel.cols},}{0\leq y' < \texttt{kernel.rows}} }  \texttt{kernel} (x',y')* \texttt{src} (x+x'- \texttt{anchor.x} ,y+y'- \texttt{anchor.y} )\f]
+
+That is, the kernel is not mirrored around the anchor point. If you need a real convolution, flip
+the kernel using flip and set the new anchor to `(kernel.cols - anchor.x - 1, kernel.rows -
+anchor.y - 1)`.
+
+Supported matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+Output image must have the same size and number of channels as the input image.
+@note Rounding to nearest even is performed if the hardware supports it; otherwise, rounding to nearest is used.
+
+@note Function textual ID is "org.opencv.imgproc.filters.filter2D"
+
+@param src input image.
+@param ddepth desired depth of the destination image
+@param kernel convolution kernel (or rather a correlation kernel), a single-channel floating point
+matrix; if you want to apply different kernels to different channels, split the image into
+separate color planes using split and process them individually.
+@param anchor anchor of the kernel that indicates the relative position of a filtered point within
+the kernel; the anchor should lie within the kernel; default value (-1,-1) means that the anchor
+is at the kernel center.
+@param delta optional value added to the filtered pixels before storing them in dst.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of constant border type
+@sa  sepFilter
+ */
+GAPI_EXPORTS GMat filter2D(const GMat& src, int ddepth, const Mat& kernel, const Point& anchor = Point(-1,-1), const Scalar& delta = Scalar(0),
+                           int borderType = BORDER_DEFAULT, const Scalar& borderValue = Scalar(0));
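+
+// Illustrative sketch (not part of the original header): to get a true
+// convolution instead of correlation, flip the kernel and adjust the anchor as
+// described above (kernel and anchor are placeholder names):
+//
+//     cv::Mat flipped;
+//     cv::flip(kernel, flipped, -1); // flip around both axes
+//     cv::Point a(kernel.cols - anchor.x - 1, kernel.rows - anchor.y - 1);
+//     cv::GMat out = cv::gapi::filter2D(in, -1, flipped, a);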
+
+
+/** @brief Blurs an image using the box filter.
+
+The function smooths an image using the kernel:
+
+\f[\texttt{K} =  \alpha \begin{bmatrix} 1 & 1 & 1 &  \cdots & 1 & 1  \\ 1 & 1 & 1 &  \cdots & 1 & 1  \\ \hdotsfor{6} \\ 1 & 1 & 1 &  \cdots & 1 & 1 \end{bmatrix}\f]
+
+where
+
+\f[\alpha = \fork{\frac{1}{\texttt{ksize.width*ksize.height}}}{when \texttt{normalize=true}}{1}{otherwise}\f]
+
+Unnormalized box filter is useful for computing various integral characteristics over each pixel
+neighborhood, such as covariance matrices of image derivatives (used in dense optical flow
+algorithms, and so on). If you need to compute pixel sums over variable-size windows, use cv::integral.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note Rounding to nearest even is performed if the hardware supports it; otherwise, rounding to nearest is used.
+
+@note Function textual ID is "org.opencv.imgproc.filters.boxfilter"
+
+@param src Source image.
+@param dtype the output image depth (use -1 to keep the input image depth).
+@param ksize blurring kernel size.
+@param anchor Anchor position within the kernel. The default value \f$(-1,-1)\f$ means that the anchor
+is at the kernel center.
+@param normalize flag, specifying whether the kernel is normalized by its area or not.
+@param borderType Pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of constant border type
+@sa  sepFilter, gaussianBlur, medianBlur, integral
+ */
+GAPI_EXPORTS GMat boxFilter(const GMat& src, int dtype, const Size& ksize, const Point& anchor = Point(-1,-1),
+                            bool normalize = true, int borderType = BORDER_DEFAULT,
+                            const Scalar& borderValue = Scalar(0));
+
+/** @brief Blurs an image using the normalized box filter.
+
+The function smooths an image using the kernel:
+
+\f[\texttt{K} =  \frac{1}{\texttt{ksize.width*ksize.height}} \begin{bmatrix} 1 & 1 & 1 &  \cdots & 1 & 1  \\ 1 & 1 & 1 &  \cdots & 1 & 1  \\ \hdotsfor{6} \\ 1 & 1 & 1 &  \cdots & 1 & 1  \\ \end{bmatrix}\f]
+
+The call `blur(src, ksize, anchor, borderType)` is equivalent to `boxFilter(src, src.type(), ksize,
+anchor, true, borderType)`.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note Rounding to nearest even is performed if the hardware supports it; otherwise, rounding to nearest is used.
+
+@note Function textual ID is "org.opencv.imgproc.filters.blur"
+
+@param src Source image.
+@param ksize blurring kernel size.
+@param anchor anchor point; default value Point(-1,-1) means that the anchor is at the kernel
+center.
+@param borderType border mode used to extrapolate pixels outside of the image, see cv::BorderTypes
+@param borderValue border value in case of constant border type
+@sa  boxFilter, bilateralFilter, GaussianBlur, medianBlur
+ */
+GAPI_EXPORTS GMat blur(const GMat& src, const Size& ksize, const Point& anchor = Point(-1,-1),
+                       int borderType = BORDER_DEFAULT, const Scalar& borderValue = Scalar(0));
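+
+// Illustrative sketch (not part of the original header): with dtype == -1 and
+// normalize == true, boxFilter is expected to match blur:
+//
+//     cv::GMat a = cv::gapi::blur(in, cv::Size(3,3));
+//     cv::GMat b = cv::gapi::boxFilter(in, -1, cv::Size(3,3));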
+
+
+/** @brief Blurs an image using a Gaussian filter.
+
+The function convolves the source image with the specified Gaussian kernel.
+Output image must have the same type and number of channels as the input image.
+
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note Rounding to nearest even is performed if the hardware supports it; otherwise, rounding to nearest is used.
+
+@note Function textual ID is "org.opencv.imgproc.filters.gaussianBlur"
+
+@param src input image;
+@param ksize Gaussian kernel size. ksize.width and ksize.height can differ but they both must be
+positive and odd. Or, they can be zeros, and then they are computed from sigma.
+@param sigmaX Gaussian kernel standard deviation in X direction.
+@param sigmaY Gaussian kernel standard deviation in Y direction; if sigmaY is zero, it is set to be
+equal to sigmaX; if both sigmas are zero, they are computed from ksize.width and ksize.height,
+respectively (see cv::getGaussianKernel for details); to fully control the result regardless of
+possible future modifications of all this semantics, it is recommended to specify all of ksize,
+sigmaX, and sigmaY.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of constant border type
+@sa  sepFilter, boxFilter, medianBlur
+ */
+GAPI_EXPORTS GMat gaussianBlur(const GMat& src, const Size& ksize, double sigmaX, double sigmaY = 0,
+                               int borderType = BORDER_DEFAULT, const Scalar& borderValue = Scalar(0));
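+
+// Illustrative sketch (not part of the original header): a 5x5 Gaussian with
+// sigmaX = 1.5; sigmaY defaults to sigmaX:
+//
+//     cv::GMat out = cv::gapi::gaussianBlur(in, cv::Size(5,5), 1.5);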
+
+/** @brief Blurs an image using the median filter.
+
+The function smooths an image using the median filter with the \f$\texttt{ksize} \times
+\texttt{ksize}\f$ aperture. Each channel of a multi-channel image is processed independently.
+Output image must have the same type, size, and number of channels as the input image.
+@note Rounding to nearest even is performed if the hardware supports it; otherwise, rounding to nearest is used.
+The median filter uses cv::BORDER_REPLICATE internally to cope with border pixels, see cv::BorderTypes
+
+@note Function textual ID is "org.opencv.imgproc.filters.medianBlur"
+
+@param src input matrix (image)
+@param ksize aperture linear size; it must be odd and greater than 1, for example: 3, 5, 7 ...
+@sa  boxFilter, gaussianBlur
+ */
+GAPI_EXPORTS GMat medianBlur(const GMat& src, int ksize);
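+
+// Illustrative sketch (not part of the original header): a 3x3 median filter,
+// commonly used to suppress salt-and-pepper noise:
+//
+//     cv::GMat out = cv::gapi::medianBlur(in, 3);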
+
+/** @brief Erodes an image by using a specific structuring element.
+
+The function erodes the source image using the specified structuring element that determines the
+shape of a pixel neighborhood over which the minimum is taken:
+
+\f[\texttt{dst} (x,y) =  \min _{(x',y'):  \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f]
+
+Erosion can be applied several (iterations) times. In case of multi-channel images, each channel is processed independently.
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, and @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note Rounding to nearest even is performed if the hardware supports it; otherwise, rounding to nearest is used.
+
+@note Function textual ID is "org.opencv.imgproc.filters.erode"
+
+@param src input image
+@param kernel structuring element used for erosion; if `element=Mat()`, a `3 x 3` rectangular
+structuring element is used. Kernel can be created using getStructuringElement.
+@param anchor position of the anchor within the element; default value (-1, -1) means that the
+anchor is at the element center.
+@param iterations number of times erosion is applied.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of a constant border
+@sa  dilate
+ */
+GAPI_EXPORTS GMat erode(const GMat& src, const Mat& kernel, const Point& anchor = Point(-1,-1), int iterations = 1,
+                        int borderType = BORDER_CONSTANT,
+                        const  Scalar& borderValue = morphologyDefaultBorderValue());
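+
+// Illustrative sketch (not part of the original header): two erosion passes
+// with a 5x5 elliptic element built by OpenCV's getStructuringElement:
+//
+//     cv::Mat k = cv::getStructuringElement(cv::MORPH_ELLIPSE, cv::Size(5,5));
+//     cv::GMat out = cv::gapi::erode(in, k, cv::Point(-1,-1), 2);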
+
+/** @brief Erodes an image by using a 3 by 3 rectangular structuring element.
+
+The function erodes the source image using a 3 x 3 rectangular structuring element anchored at its center.
+Erosion can be applied several (iterations) times. In case of multi-channel images, each channel is processed independently.
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, and @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note Rounding to nearest even is performed if the hardware supports it; otherwise, rounding to nearest is used.
+
+@param src input image
+@param iterations number of times erosion is applied.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of a constant border
+@sa  erode, dilate3x3
+ */
+GAPI_EXPORTS GMat erode3x3(const GMat& src, int iterations = 1,
+                           int borderType = BORDER_CONSTANT,
+                           const  Scalar& borderValue = morphologyDefaultBorderValue());
+
+/** @brief Dilates an image by using a specific structuring element.
+
+The function dilates the source image using the specified structuring element that determines the
+shape of a pixel neighborhood over which the maximum is taken:
+\f[\texttt{dst} (x,y) =  \max _{(x',y'):  \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f]
+
+Dilation can be applied several (iterations) times. In case of multi-channel images, each channel is processed independently.
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, and @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note Rounding to nearest even is performed if the hardware supports it; otherwise, rounding to nearest is used.
+
+@note Function textual ID is "org.opencv.imgproc.filters.dilate"
+
+@param src input image.
+@param kernel structuring element used for dilation; if `element=Mat()`, a `3 x 3` rectangular
+structuring element is used. Kernel can be created using getStructuringElement.
+@param anchor position of the anchor within the element; default value (-1, -1) means that the
+anchor is at the element center.
+@param iterations number of times dilation is applied.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of a constant border
+@sa  erode, morphologyEx, getStructuringElement
+ */
+GAPI_EXPORTS GMat dilate(const GMat& src, const Mat& kernel, const Point& anchor = Point(-1,-1), int iterations = 1,
+                         int borderType = BORDER_CONSTANT,
+                         const  Scalar& borderValue = morphologyDefaultBorderValue());
+
+/** @brief Dilates an image by using a 3 by 3 rectangular structuring element.
+
+The function dilates the source image using a 3 x 3 rectangular structuring element that determines the
+shape of a pixel neighborhood over which the maximum is taken:
+\f[\texttt{dst} (x,y) =  \max _{(x',y'):  \, \texttt{element} (x',y') \ne0 } \texttt{src} (x+x',y+y')\f]
+
+Dilation can be applied several (iterations) times. In case of multi-channel images, each channel is processed independently.
+Supported input matrix data types are @ref CV_8UC1, @ref CV_8UC3, @ref CV_16UC1, @ref CV_16SC1, and @ref CV_32FC1.
+Output image must have the same type, size, and number of channels as the input image.
+@note Rounding to nearest even is performed if the hardware supports it; otherwise, rounding to nearest is used.
+
+@note Function textual ID is "org.opencv.imgproc.filters.dilate"
+
+@param src input image.
+@param iterations number of times dilation is applied.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of a constant border
+@sa  dilate, erode3x3
+ */
+GAPI_EXPORTS GMat dilate3x3(const GMat& src, int iterations = 1,
+                            int borderType = BORDER_CONSTANT,
+                            const  Scalar& borderValue = morphologyDefaultBorderValue());
+
+/** @brief Calculates the first, second, third, or mixed image derivatives using an extended Sobel operator.
+
+In all cases except one, the \f$\texttt{ksize} \times \texttt{ksize}\f$ separable kernel is used to
+calculate the derivative. When \f$\texttt{ksize = 1}\f$, the \f$3 \times 1\f$ or \f$1 \times 3\f$
+kernel is used (that is, no Gaussian smoothing is done). `ksize = 1` can only be used for the first
+or the second x- or y- derivatives.
+
+There is also the special value `ksize = FILTER_SCHARR (-1)` that corresponds to the \f$3\times3\f$ Scharr
+filter that may give more accurate results than the \f$3\times3\f$ Sobel. The Scharr aperture is
+
+\f[\vecthreethree{-3}{0}{3}{-10}{0}{10}{-3}{0}{3}\f]
+
+for the x-derivative, or transposed for the y-derivative.
+
+The function calculates an image derivative by convolving the image with the appropriate kernel:
+
+\f[\texttt{dst} =  \frac{\partial^{xorder+yorder} \texttt{src}}{\partial x^{xorder} \partial y^{yorder}}\f]
+
+The Sobel operators combine Gaussian smoothing and differentiation, so the result is more or less
+resistant to noise. Most often, the function is called with ( xorder = 1, yorder = 0, ksize = 3)
+or ( xorder = 0, yorder = 1, ksize = 3) to calculate the first x- or y- image derivative. The first
+case corresponds to a kernel of:
+
+\f[\vecthreethree{-1}{0}{1}{-2}{0}{2}{-1}{0}{1}\f]
+
+The second case corresponds to a kernel of:
+
+\f[\vecthreethree{-1}{-2}{-1}{0}{0}{0}{1}{2}{1}\f]
+
+@note Rounding to nearest even is performed if the hardware supports it; otherwise, rounding to nearest is used.
+
+@note Function textual ID is "org.opencv.imgproc.filters.sobel"
+
+@param src input image.
+@param ddepth output image depth, see @ref filter_depths "combinations"; in the case of
+    8-bit input images it will result in truncated derivatives.
+@param dx order of the derivative x.
+@param dy order of the derivative y.
+@param ksize size of the extended Sobel kernel; it must be odd.
+@param scale optional scale factor for the computed derivative values; by default, no scaling is
+applied (see cv::getDerivKernels for details).
+@param delta optional delta value that is added to the results prior to storing them in dst.
+@param borderType pixel extrapolation method, see cv::BorderTypes
+@param borderValue border value in case of constant border type
+@sa filter2D, gaussianBlur, cartToPolar
+ */
+GAPI_EXPORTS GMat Sobel(const GMat& src, int ddepth, int dx, int dy, int ksize = 3,
+                        double scale = 1, double delta = 0,
+                        int borderType = BORDER_DEFAULT,
+                        const Scalar& borderValue = Scalar(0));
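+
+// Illustrative sketch (not part of the original header): first-order x- and
+// y-derivatives with the default 3x3 aperture:
+//
+//     cv::GMat gx = cv::gapi::Sobel(in, CV_16S, 1, 0);
+//     cv::GMat gy = cv::gapi::Sobel(in, CV_16S, 0, 1);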
+
+/** @brief Finds edges in an image using the Canny algorithm.
+
+The function finds edges in the input image and marks them in the output map edges using the
+Canny algorithm. The smallest value between threshold1 and threshold2 is used for edge linking. The
+largest value is used to find initial segments of strong edges. See
+<http://en.wikipedia.org/wiki/Canny_edge_detector>
+
+@note Function textual ID is "org.opencv.imgproc.filters.canny"
+
+@param image 8-bit input image.
+@param threshold1 first threshold for the hysteresis procedure.
+@param threshold2 second threshold for the hysteresis procedure.
+@param apertureSize aperture size for the Sobel operator.
+@param L2gradient a flag, indicating whether a more accurate \f$L_2\f$ norm
+\f$=\sqrt{(dI/dx)^2 + (dI/dy)^2}\f$ should be used to calculate the image gradient magnitude (
+L2gradient=true ), or whether the default \f$L_1\f$ norm \f$=|dI/dx|+|dI/dy|\f$ is enough (
+L2gradient=false ).
+ */
+GAPI_EXPORTS GMat Canny(const GMat& image, double threshold1, double threshold2,
+                        int apertureSize = 3, bool L2gradient = false);
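+
+// Illustrative sketch (not part of the original header); the thresholds below
+// are arbitrary placeholders:
+//
+//     cv::GMat edges = cv::gapi::Canny(in, 32, 128, 3, true);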
+
+/** @brief Equalizes the histogram of a grayscale image.
+
+The function equalizes the histogram of the input image using the following algorithm:
+
+- Calculate the histogram \f$H\f$ for src .
+- Normalize the histogram so that the sum of histogram bins is 255.
+- Compute the integral of the histogram:
+\f[H'_i =  \sum _{0  \le j < i} H(j)\f]
+- Transform the image using \f$H'\f$ as a look-up table: \f$\texttt{dst}(x,y) = H'(\texttt{src}(x,y))\f$
+
+The algorithm normalizes the brightness and increases the contrast of the image.
+@note The returned image is of the same size and type as input.
+
+@note Function textual ID is "org.opencv.imgproc.equalizeHist"
+
+@param src Source 8-bit single channel image.
+ */
+GAPI_EXPORTS GMat equalizeHist(const GMat& src);
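+
+// Illustrative sketch (not part of the original header): equalize a grayscale
+// view of an RGB input (RGB2Gray is declared later in this header):
+//
+//     cv::GMat eq = cv::gapi::equalizeHist(cv::gapi::RGB2Gray(in));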
+
+//! @} gapi_filters
+
+//! @addtogroup gapi_colorconvert
+//! @{
+/** @brief Converts an image from RGB color space to grayscale.
+The conventional ranges for R, G, and B channel values are 0 to 255.
+The resulting gray color value is computed as
+\f[\texttt{dst} (I)= \texttt{0.299} * \texttt{src}(I).R + \texttt{0.587} * \texttt{src}(I).G  + \texttt{0.114} * \texttt{src}(I).B \f]
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.rgb2gray"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa RGB2YUV
+ */
+GAPI_EXPORTS GMat RGB2Gray(const GMat& src);
+
+/** @overload
+The resulting gray color value is computed as
+\f[\texttt{dst} (I)= \texttt{rY} * \texttt{src}(I).R + \texttt{gY} * \texttt{src}(I).G  + \texttt{bY} * \texttt{src}(I).B \f]
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.rgb2graycustom"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@param rY float multiplier for R channel.
+@param gY float multiplier for G channel.
+@param bY float multiplier for B channel.
+@sa RGB2YUV
+ */
+GAPI_EXPORTS GMat RGB2Gray(const GMat& src, float rY, float gY, float bY);
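+
+// Illustrative sketch (not part of the original header); the weights below are
+// placeholders, not the BT.601 defaults:
+//
+//     cv::GMat gray = cv::gapi::RGB2Gray(in, 0.30f, 0.59f, 0.11f);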
+
+/** @brief Converts an image from BGR color space to grayscale.
+The conventional ranges for B, G, and R channel values are 0 to 255.
+The resulting gray color value is computed as
+\f[\texttt{dst} (I)= \texttt{0.114} * \texttt{src}(I).B + \texttt{0.587} * \texttt{src}(I).G  + \texttt{0.299} * \texttt{src}(I).R \f]
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.bgr2gray"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa BGR2LUV
+ */
+GAPI_EXPORTS GMat BGR2Gray(const GMat& src);
+
+/** @brief Converts an image from RGB color space to YUV color space.
+
+The function converts an input image from RGB color space to YUV.
+The conventional ranges for R, G, and B channel values are 0 to 255.
+
+In case of linear transformations, the range does not matter. But in case of a non-linear
+transformation, an input RGB image should be normalized to the proper value range to get the correct
+results, as is the case for the RGB \f$\rightarrow\f$ Y\*u\*v\* transformation.
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.rgb2yuv"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa YUV2RGB, RGB2Lab
+*/
+GAPI_EXPORTS GMat RGB2YUV(const GMat& src);
+
+/** @brief Converts an image from BGR color space to LUV color space.
+
+The function converts an input image from BGR color space to LUV.
+The conventional ranges for B, G, and R channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.bgr2luv"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa RGB2Lab, RGB2LUV
+*/
+GAPI_EXPORTS GMat BGR2LUV(const GMat& src);
+
+/** @brief Converts an image from LUV color space to BGR color space.
+
+The function converts an input image from LUV color space to BGR.
+The conventional ranges for B, G, and R channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.luv2bgr"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa BGR2LUV
+*/
+GAPI_EXPORTS GMat LUV2BGR(const GMat& src);
+
+/** @brief Converts an image from YUV color space to BGR color space.
+
+The function converts an input image from YUV color space to BGR.
+The conventional ranges for B, G, and R channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.yuv2bgr"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa BGR2YUV
+*/
+GAPI_EXPORTS GMat YUV2BGR(const GMat& src);
+
+/** @brief Converts an image from BGR color space to YUV color space.
+
+The function converts an input image from BGR color space to YUV.
+The conventional ranges for B, G, and R channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.bgr2yuv"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa YUV2BGR
+*/
+GAPI_EXPORTS GMat BGR2YUV(const GMat& src);
+
+/** @brief Converts an image from RGB color space to Lab color space.
+
+The function converts an input image from RGB color space to Lab.
+The conventional ranges for R, G, and B channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.rgb2lab"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+@sa RGB2YUV, RGB2LUV
+*/
+GAPI_EXPORTS GMat RGB2Lab(const GMat& src);
+
+/** @brief Converts an image from YUV color space to RGB.
+The function converts an input image from YUV color space to RGB.
+The conventional ranges for Y, U, and V channel values are 0 to 255.
+
+Output image must be 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@note Function textual ID is "org.opencv.imgproc.colorconvert.yuv2rgb"
+
+@param src input image: 8-bit unsigned 3-channel image @ref CV_8UC3.
+
+@sa RGB2Lab, RGB2YUV
+*/
+GAPI_EXPORTS GMat YUV2RGB(const GMat& src);
+
+//! @} gapi_colorconvert
+} //namespace gapi
+} //namespace cv
+
+#endif // OPENCV_GAPI_IMGPROC_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/opencv_includes.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/opencv_includes.hpp
new file mode 100644 (file)
index 0000000..5acf280
--- /dev/null
@@ -0,0 +1,21 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OPENCV_INCLUDES_HPP
+#define OPENCV_GAPI_OPENCV_INCLUDES_HPP
+
+#if !defined(GAPI_STANDALONE)
+#  include <opencv2/core/mat.hpp>
+#  include <opencv2/core/cvdef.h>
+#  include <opencv2/core/types.hpp>
+#  include <opencv2/core/base.hpp>
+#else   // Without OpenCV
+#  include <opencv2/gapi/own/cvdefs.hpp>
+#endif // !defined(GAPI_STANDALONE)
+
+#endif // OPENCV_GAPI_OPENCV_INCLUDES_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/operators.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/operators.hpp
new file mode 100644 (file)
index 0000000..27a1d80
--- /dev/null
@@ -0,0 +1,69 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OPERATORS_HPP
+#define OPENCV_GAPI_OPERATORS_HPP
+
+#include "opencv2/gapi/gmat.hpp"
+#include "opencv2/gapi/gscalar.hpp"
+
+GAPI_EXPORTS cv::GMat operator+(const cv::GMat&    lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator+(const cv::GMat&    lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator+(const cv::GScalar& lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator-(const cv::GMat&    lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator-(const cv::GMat&    lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator-(const cv::GScalar& lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator*(const cv::GMat&    lhs, float              rhs);
+GAPI_EXPORTS cv::GMat operator*(float              lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator*(const cv::GMat&    lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator*(const cv::GScalar& lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator/(const cv::GMat&    lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator/(const cv::GScalar& lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator/(const cv::GMat&    lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator&(const cv::GMat&    lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator|(const cv::GMat&    lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator^(const cv::GMat&    lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator~(const cv::GMat&    lhs);
+
+GAPI_EXPORTS cv::GMat operator&(const cv::GScalar& lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator|(const cv::GScalar& lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator^(const cv::GScalar& lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator&(const cv::GMat& lhs, const cv::GScalar&    rhs);
+GAPI_EXPORTS cv::GMat operator|(const cv::GMat& lhs, const cv::GScalar&    rhs);
+GAPI_EXPORTS cv::GMat operator^(const cv::GMat& lhs, const cv::GScalar&    rhs);
+
+GAPI_EXPORTS cv::GMat operator>(const cv::GMat&    lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator>=(const cv::GMat&   lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator<(const cv::GMat&    lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator<=(const cv::GMat&   lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator==(const cv::GMat&   lhs, const cv::GMat&    rhs);
+GAPI_EXPORTS cv::GMat operator!=(const cv::GMat&   lhs, const cv::GMat&    rhs);
+
+GAPI_EXPORTS cv::GMat operator>(const cv::GMat&    lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator>=(const cv::GMat&   lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator<(const cv::GMat&    lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator<=(const cv::GMat&   lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator==(const cv::GMat&   lhs, const cv::GScalar& rhs);
+GAPI_EXPORTS cv::GMat operator!=(const cv::GMat&   lhs, const cv::GScalar& rhs);
+
+GAPI_EXPORTS cv::GMat operator>(const cv::GScalar&    lhs, const cv::GMat& rhs);
+GAPI_EXPORTS cv::GMat operator>=(const cv::GScalar&   lhs, const cv::GMat& rhs);
+GAPI_EXPORTS cv::GMat operator<(const cv::GScalar&    lhs, const cv::GMat& rhs);
+GAPI_EXPORTS cv::GMat operator<=(const cv::GScalar&   lhs, const cv::GMat& rhs);
+GAPI_EXPORTS cv::GMat operator==(const cv::GScalar&   lhs, const cv::GMat& rhs);
+GAPI_EXPORTS cv::GMat operator!=(const cv::GScalar&   lhs, const cv::GMat& rhs);
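+
+// Illustrative usage sketch (not part of the original header): like the
+// functions in cv::gapi, these operators construct graph nodes rather than
+// computing values immediately:
+//
+//     cv::GMat a, b;
+//     cv::GMat sum  = a + b;
+//     cv::GMat mask = (a > b) & (a != b);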
+
+
+
+#endif // OPENCV_GAPI_OPERATORS_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/assert.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/assert.hpp
new file mode 100644 (file)
index 0000000..8d3feff
--- /dev/null
@@ -0,0 +1,41 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OWN_ASSERT_HPP
+#define OPENCV_GAPI_OWN_ASSERT_HPP
+
+#if !defined(GAPI_STANDALONE)
+#include <opencv2/core/base.hpp>
+#define GAPI_Assert(expr) CV_Assert(expr)
+
+#else
+#include <stdexcept>
+#include <sstream>
+#include "opencv2/gapi/util/throw.hpp"
+
+namespace detail
+{
+    inline void assert_abort(const char* str, int line, const char* file, const char* func)
+    {
+        std::stringstream ss;
+        ss << file << ":" << line << ": Assertion " << str << " in function " << func << " failed\n";
+        cv::util::throw_error(std::logic_error(ss.str()));
+    }
+}
+
+#define GAPI_Assert(expr) \
+{ if (!(expr)) ::detail::assert_abort(#expr, __LINE__, __FILE__, __func__); }
+
+#endif
+
+#ifdef NDEBUG
+#  define GAPI_DbgAssert(expr)
+#else
+#  define GAPI_DbgAssert(expr) GAPI_Assert(expr)
+#endif
+
+#endif // OPENCV_GAPI_OWN_ASSERT_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/convert.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/convert.hpp
new file mode 100644 (file)
index 0000000..8c1feb4
--- /dev/null
@@ -0,0 +1,50 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OWN_CONVERT_HPP
+#define OPENCV_GAPI_OWN_CONVERT_HPP
+
+#if !defined(GAPI_STANDALONE)
+
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/own/types.hpp>
+#include <opencv2/gapi/own/mat.hpp>
+#include "opencv2/gapi/own/scalar.hpp"
+
+namespace cv
+{
+    inline cv::gapi::own::Mat to_own(Mat const& m) { return {m.rows, m.cols, m.type(), m.data, m.step};};
+           cv::gapi::own::Mat to_own(Mat&&) = delete;
+
+    inline cv::gapi::own::Scalar to_own(const cv::Scalar& s) { return {s[0], s[1], s[2], s[3]}; };
+
+    inline cv::gapi::own::Size to_own (const Size& s) { return {s.width, s.height}; };
+
+    inline cv::gapi::own::Rect to_own (const Rect& r) { return {r.x, r.y, r.width, r.height}; };
+
+
+
+namespace gapi
+{
+namespace own
+{
+    inline cv::Mat to_ocv(Mat const& m) { return {m.rows, m.cols, m.type(), m.data, m.step};};
+           cv::Mat to_ocv(Mat&&)    = delete;
+
+    inline cv::Scalar to_ocv(const Scalar& s) { return {s[0], s[1], s[2], s[3]}; };
+
+    inline cv::Size to_ocv (const Size& s) { return cv::Size(s.width, s.height); };
+
+    inline cv::Rect to_ocv (const Rect& r) { return cv::Rect(r.x, r.y, r.width, r.height); };
+
+} // namespace own
+} // namespace gapi
+} // namespace cv
+
+#endif // !defined(GAPI_STANDALONE)
+
+#endif // OPENCV_GAPI_OWN_CONVERT_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/cvdefs.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/cvdefs.hpp
new file mode 100644 (file)
index 0000000..e110536
--- /dev/null
@@ -0,0 +1,146 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_CV_DEFS_HPP
+#define OPENCV_GAPI_CV_DEFS_HPP
+
+#if defined(GAPI_STANDALONE)
+
+// Simulate OpenCV definitions taken from various
+// OpenCV interface headers if G-API is built in a
+// standalone mode.
+
+// interface.h:
+
+typedef unsigned char uchar;
+typedef          char schar;
+
+typedef unsigned short ushort;
+
+#define CV_CN_MAX     512
+#define CV_CN_SHIFT   3
+#define CV_DEPTH_MAX  (1 << CV_CN_SHIFT)
+
+
+#define CV_8U   0
+#define CV_8S   1
+#define CV_16U  2
+#define CV_16S  3
+#define CV_32S  4
+#define CV_32F  5
+#define CV_64F  6
+#define CV_USRTYPE1 7
+
+#define CV_MAT_DEPTH_MASK       (CV_DEPTH_MAX - 1)
+#define CV_MAT_DEPTH(flags)     ((flags) & CV_MAT_DEPTH_MASK)
+
+#define CV_MAKETYPE(depth,cn) (CV_MAT_DEPTH(depth) + (((cn)-1) << CV_CN_SHIFT))
+#define CV_MAKE_TYPE CV_MAKETYPE
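+
+// For example (illustrative): CV_8UC3 expands to
+// CV_MAT_DEPTH(CV_8U) + ((3-1) << CV_CN_SHIFT) == 0 + (2 << 3) == 16.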
+
+#define CV_8UC1 CV_MAKETYPE(CV_8U,1)
+#define CV_8UC2 CV_MAKETYPE(CV_8U,2)
+#define CV_8UC3 CV_MAKETYPE(CV_8U,3)
+#define CV_8UC4 CV_MAKETYPE(CV_8U,4)
+#define CV_8UC(n) CV_MAKETYPE(CV_8U,(n))
+
+#define CV_8SC1 CV_MAKETYPE(CV_8S,1)
+#define CV_8SC2 CV_MAKETYPE(CV_8S,2)
+#define CV_8SC3 CV_MAKETYPE(CV_8S,3)
+#define CV_8SC4 CV_MAKETYPE(CV_8S,4)
+#define CV_8SC(n) CV_MAKETYPE(CV_8S,(n))
+
+#define CV_16UC1 CV_MAKETYPE(CV_16U,1)
+#define CV_16UC2 CV_MAKETYPE(CV_16U,2)
+#define CV_16UC3 CV_MAKETYPE(CV_16U,3)
+#define CV_16UC4 CV_MAKETYPE(CV_16U,4)
+#define CV_16UC(n) CV_MAKETYPE(CV_16U,(n))
+
+#define CV_16SC1 CV_MAKETYPE(CV_16S,1)
+#define CV_16SC2 CV_MAKETYPE(CV_16S,2)
+#define CV_16SC3 CV_MAKETYPE(CV_16S,3)
+#define CV_16SC4 CV_MAKETYPE(CV_16S,4)
+#define CV_16SC(n) CV_MAKETYPE(CV_16S,(n))
+
+#define CV_32SC1 CV_MAKETYPE(CV_32S,1)
+#define CV_32SC2 CV_MAKETYPE(CV_32S,2)
+#define CV_32SC3 CV_MAKETYPE(CV_32S,3)
+#define CV_32SC4 CV_MAKETYPE(CV_32S,4)
+#define CV_32SC(n) CV_MAKETYPE(CV_32S,(n))
+
+#define CV_32FC1 CV_MAKETYPE(CV_32F,1)
+#define CV_32FC2 CV_MAKETYPE(CV_32F,2)
+#define CV_32FC3 CV_MAKETYPE(CV_32F,3)
+#define CV_32FC4 CV_MAKETYPE(CV_32F,4)
+#define CV_32FC(n) CV_MAKETYPE(CV_32F,(n))
+
+#define CV_64FC1 CV_MAKETYPE(CV_64F,1)
+#define CV_64FC2 CV_MAKETYPE(CV_64F,2)
+#define CV_64FC3 CV_MAKETYPE(CV_64F,3)
+#define CV_64FC4 CV_MAKETYPE(CV_64F,4)
+#define CV_64FC(n) CV_MAKETYPE(CV_64F,(n))
+
+// cvdef.h:
+
+#define CV_MAT_CN_MASK          ((CV_CN_MAX - 1) << CV_CN_SHIFT)
+#define CV_MAT_CN(flags)        ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
+#define CV_MAT_TYPE_MASK        (CV_DEPTH_MAX*CV_CN_MAX - 1)
+#define CV_MAT_TYPE(flags)      ((flags) & CV_MAT_TYPE_MASK)
+#define CV_MAT_CONT_FLAG_SHIFT  14
+#define CV_MAT_CONT_FLAG        (1 << CV_MAT_CONT_FLAG_SHIFT)
+#define CV_IS_MAT_CONT(flags)   ((flags) & CV_MAT_CONT_FLAG)
+#define CV_IS_CONT_MAT          CV_IS_MAT_CONT
+#define CV_SUBMAT_FLAG_SHIFT    15
+#define CV_SUBMAT_FLAG          (1 << CV_SUBMAT_FLAG_SHIFT)
+#define CV_IS_SUBMAT(flags)     ((flags) & CV_MAT_SUBMAT_FLAG)
+
+///** Size of each channel item,
+//   0x8442211 = 1000 0100 0100 0010 0010 0001 0001 ~ array of sizeof(arr_type_elem) */
+//#define CV_ELEM_SIZE1(type) \
+//    ((((sizeof(size_t)<<28)|0x8442211) >> CV_MAT_DEPTH(type)*4) & 15)
+
+#define CV_MAT_TYPE(flags)      ((flags) & CV_MAT_TYPE_MASK)
+
+/** 0x3a50 = 11 10 10 01 01 00 00 ~ array of log2(sizeof(arr_type_elem)) */
+#define CV_ELEM_SIZE(type) \
+    (CV_MAT_CN(type) << ((((sizeof(size_t)/4+1)*16384|0x3a50) >> CV_MAT_DEPTH(type)*2) & 3))
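+
+// For example (illustrative): CV_ELEM_SIZE(CV_8UC3) == 3 (three 1-byte
+// channels) and CV_ELEM_SIZE(CV_32FC1) == 4 (one 4-byte channel).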
+
+// base.h:
+namespace cv
+{
+enum BorderTypes {
+    BORDER_CONSTANT    = 0, //!< `iiiiii|abcdefgh|iiiiiii`  with some specified `i`
+    BORDER_REPLICATE   = 1, //!< `aaaaaa|abcdefgh|hhhhhhh`
+    BORDER_REFLECT     = 2, //!< `fedcba|abcdefgh|hgfedcb`
+    BORDER_WRAP        = 3, //!< `cdefgh|abcdefgh|abcdefg`
+    BORDER_REFLECT_101 = 4, //!< `gfedcb|abcdefgh|gfedcba`
+    BORDER_TRANSPARENT = 5, //!< `uvwxyz|abcdefgh|ijklmno`
+
+    BORDER_REFLECT101  = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
+    BORDER_DEFAULT     = BORDER_REFLECT_101, //!< same as BORDER_REFLECT_101
+    BORDER_ISOLATED    = 16 //!< do not look outside of ROI
+};
+// imgproc.hpp:
+enum InterpolationFlags{
+    INTER_NEAREST        = 0,
+    INTER_LINEAR         = 1,
+    INTER_CUBIC          = 2,
+    INTER_AREA           = 3,
+    INTER_LANCZOS4       = 4,
+    INTER_LINEAR_EXACT   = 5,
+    INTER_MAX            = 7,
+};
+} // namespace cv
+
+static inline int cvFloor( double value )
+{
+    int i = (int)value;
+    return i - (i > value);
+}
+
+#endif //  defined(GAPI_STANDALONE)
+
+#endif //  OPENCV_GAPI_CV_DEFS_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/exports.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/exports.hpp
new file mode 100644 (file)
index 0000000..0d955d0
--- /dev/null
@@ -0,0 +1,28 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OWN_EXPORTS_HPP
+#define OPENCV_GAPI_OWN_EXPORTS_HPP
+
+#   if 0
+#       include <opencv2/core/base.hpp>
+#       define GAPI_EXPORTS CV_EXPORTS
+
+#   else
+#       if defined _WIN32
+#           define GAPI_EXPORTS __declspec(dllexport)
+#       elif defined __GNUC__ && __GNUC__ >= 4
+#           define GAPI_EXPORTS __attribute__ ((visibility ("default")))
+#       endif
+
+#       ifndef GAPI_EXPORTS
+#           define GAPI_EXPORTS
+#       endif
+
+#   endif
+
+#endif // OPENCV_GAPI_OWN_EXPORTS_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/mat.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/mat.hpp
new file mode 100644 (file)
index 0000000..73f3afc
--- /dev/null
@@ -0,0 +1,291 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OWN_MAT_HPP
+#define OPENCV_GAPI_OWN_MAT_HPP
+
+#include "opencv2/gapi/opencv_includes.hpp"
+#include "opencv2/gapi/own/types.hpp"
+#include "opencv2/gapi/own/scalar.hpp"
+#include "opencv2/gapi/own/saturate.hpp"
+#include "opencv2/gapi/own/assert.hpp"
+
+#include <memory>                   //std::shared_ptr
+#include <cstring>                  //std::memcpy
+#include <algorithm>                //std::copy_n
+#include "opencv2/gapi/util/throw.hpp"
+
+namespace cv { namespace gapi { namespace own {
+    namespace detail {
+        template <typename T, unsigned char channels>
+        void assign_row(void* ptr, int cols, Scalar const& s)
+        {
+            auto p = static_cast<T*>(ptr);
+            for (int c = 0; c < cols; c++)
+            {
+                for (int ch = 0; ch < channels; ch++)
+                {
+                    p[c * channels + ch] = saturate<T>(s[ch], roundd);
+                }
+            }
+        }
+
+        inline size_t default_step(int type, int cols)
+        {
+            return CV_ELEM_SIZE(type) * cols;
+        }
+        //Matrix header, i.e. fields that are unique to each Mat object.
+        //A dedicated class is needed to implement custom behavior on move (erasing the state of the moved-from object)
+        struct MatHeader{
+            enum { AUTO_STEP = 0};
+            enum { TYPE_MASK = 0x00000FFF  };
+
+            MatHeader() = default;
+
+            MatHeader(int _rows, int _cols, int type, void* _data, size_t _step)
+            : flags((type & TYPE_MASK)), rows(_rows), cols(_cols), data((uchar*)_data), step(_step == AUTO_STEP ? detail::default_step(type, _cols) : _step)
+            {}
+
+            MatHeader(const MatHeader& ) = default;
+            MatHeader(MatHeader&& src) : MatHeader(src) // reuse copy constructor here
+            {
+                MatHeader empty; //give it a name to call copy(not move) assignment below
+                src = empty;
+            }
+            MatHeader& operator=(const MatHeader& ) = default;
+            MatHeader& operator=(MatHeader&& src)
+            {
+                *this = src; //calling a copy assignment here, not move one
+                MatHeader empty; //give it a name to call copy(not move) assignment below
+                src = empty;
+                return *this;
+            }
+            /*! includes several bit-fields:
+                 - depth
+                 - number of channels
+             */
+            int flags = 0;
+
+            //! the number of rows and columns or (-1, -1) when the matrix has more than 2 dimensions
+            int rows = 0, cols = 0;
+            //! pointer to the data
+            uchar* data = nullptr;
+            size_t step = 0;
+        };
+    }
+    //concise version of cv::Mat suitable for GAPI needs (used when no dependence on OpenCV is required)
+    class Mat : public detail::MatHeader{
+    public:
+
+        Mat() = default;
+
+        /** @overload
+        @param _rows Number of rows in a 2D array.
+        @param _cols Number of columns in a 2D array.
+        @param _type Array type. Use CV_8UC1, ..., CV_64FC4 to create 1-4 channel matrices, or
+        CV_8UC(n), ..., CV_64FC(n) to create multi-channel (up to CV_CN_MAX channels) matrices.
+        @param _data Pointer to the user data. Matrix constructors that take data and step parameters do not
+        allocate matrix data. Instead, they just initialize the matrix header that points to the specified
+        data, which means that no data is copied. This operation is very efficient and can be used to
+        process external data using OpenCV functions. The external data is not automatically deallocated, so
+        you should take care of it.
+        @param _step Number of bytes each matrix row occupies. The value should include the padding bytes at
+        the end of each row, if any. If the parameter is missing (set to AUTO_STEP ), no padding is assumed
+        and the actual step is calculated as cols*elemSize(). See Mat::elemSize.
+        */
+        Mat(int _rows, int _cols, int _type, void* _data, size_t _step = AUTO_STEP)
+        : MatHeader (_rows, _cols, _type, _data, _step)
+        {}
+
+        Mat(Mat const& src, const Rect& roi )
+        : Mat(src)
+        {
+           rows = roi.height;
+           cols = roi.width;
+           data = ptr(roi.y, roi.x);
+        }
+
+        Mat(Mat const& src) = default;
+        Mat(Mat&& src) = default;
+
+        Mat& operator=(Mat const& src) = default;
+        Mat& operator=(Mat&& src) = default;
+
+        /** @brief Sets all or some of the array elements to the specified value.
+        @param s Assigned scalar converted to the actual array type.
+        */
+        Mat& operator = (const Scalar& s)
+        {
+            constexpr unsigned max_channels = 4; //Scalar can't fit more than 4
+            const auto channels = static_cast<unsigned int>(this->channels());
+            GAPI_Assert(channels <= max_channels);
+
+            using func_p_t = void (*)(void*, int, Scalar const&);
+            using detail::assign_row;
+            #define TABLE_ENTRY(type)  {assign_row<type, 1>, assign_row<type, 2>, assign_row<type, 3>, assign_row<type, 4>}
+            static constexpr func_p_t func_tbl[][max_channels] = {
+                    TABLE_ENTRY(uchar),
+                    TABLE_ENTRY(schar),
+                    TABLE_ENTRY(ushort),
+                    TABLE_ENTRY(short),
+                    TABLE_ENTRY(int),
+                    TABLE_ENTRY(float),
+                    TABLE_ENTRY(double)
+            };
+            #undef TABLE_ENTRY
+
+            static_assert(CV_8U == 0 && CV_8S == 1  && CV_16U == 2 && CV_16S == 3
+                       && CV_32S == 4 && CV_32F == 5 && CV_64F == 6,
+                       "OCV type ids used as indexes to array, thus exact numbers are important!"
+            );
+
+            const auto depth = static_cast<unsigned int>(this->depth());
+            GAPI_Assert(depth < sizeof(func_tbl)/sizeof(func_tbl[0]));
+
+            for (int r = 0; r < rows; ++r)
+            {
+                auto* f = func_tbl[depth][channels -1];
+                (*f)(static_cast<void *>(ptr(r)), cols, s );
+            }
+            return *this;
+        }
+
+        /** @brief Returns the matrix element size in bytes.
+
+        The method returns the matrix element size in bytes. For example, if the matrix type is CV_16SC3 ,
+        the method returns 3\*sizeof(short) or 6.
+         */
+        size_t elemSize() const
+        {
+            return CV_ELEM_SIZE(type());
+        }
+        /** @brief Returns the type of a matrix element.
+
+        The method returns a matrix element type. This is an identifier compatible with the CvMat type
+        system, like CV_16SC3 or 16-bit signed 3-channel array, and so on.
+         */
+        int type() const            {return CV_MAT_TYPE(flags);}
+
+        /** @brief Returns the depth of a matrix element.
+
+        The method returns the identifier of the matrix element depth (the type of each individual channel).
+        For example, for a 16-bit signed element array, the method returns CV_16S . A complete list of
+        matrix types contains the following values:
+        -   CV_8U - 8-bit unsigned integers ( 0..255 )
+        -   CV_8S - 8-bit signed integers ( -128..127 )
+        -   CV_16U - 16-bit unsigned integers ( 0..65535 )
+        -   CV_16S - 16-bit signed integers ( -32768..32767 )
+        -   CV_32S - 32-bit signed integers ( -2147483648..2147483647 )
+        -   CV_32F - 32-bit floating-point numbers ( -FLT_MAX..FLT_MAX, INF, NAN )
+        -   CV_64F - 64-bit floating-point numbers ( -DBL_MAX..DBL_MAX, INF, NAN )
+         */
+        int depth() const           {return CV_MAT_DEPTH(flags);}
+
+        /** @brief Returns the number of matrix channels.
+
+        The method returns the number of matrix channels.
+         */
+        int channels() const        {return CV_MAT_CN(flags);}
+
+        /**
+        @param _rows New number of rows.
+        @param _cols New number of columns.
+        @param _type New matrix type.
+         */
+        void create(int _rows, int _cols, int _type)
+        {
+            create({_cols, _rows}, _type);
+        }
+        /** @overload
+        @param _size Alternative new matrix size specification: Size(cols, rows)
+        @param _type New matrix type.
+        */
+        void create(Size _size, int _type)
+        {
+            if (_size != Size{cols, rows} )
+            {
+                Mat tmp{_size.height, _size.width, _type, nullptr};
+                tmp.memory.reset(new uchar[ tmp.step * tmp.rows], [](uchar * p){delete[] p;});
+                tmp.data = tmp.memory.get();
+
+                *this = std::move(tmp);
+            }
+        }
+
+        /** @brief Copies the matrix to another one.
+
+        The method copies the matrix data to another matrix. Before copying the data, the method invokes :
+        @code
+            m.create(this->size(), this->type());
+        @endcode
+        so that the destination matrix is reallocated if needed. While m.copyTo(m); works flawlessly, the
+        function does not handle the case of a partial overlap between the source and the destination
+        matrices.
+         */
+        void copyTo(Mat& dst) const
+        {
+            dst.create(rows, cols, type());
+            for (int r = 0; r < rows; ++r)
+            {
+                std::copy_n(ptr(r), detail::default_step(type(),cols), dst.ptr(r));
+            }
+        }
+
+        /** @brief Returns true if the array has no elements.
+
+        The method returns true if Mat::total() is 0 or if Mat::data is NULL. Because of pop_back() and
+        resize() methods `M.total() == 0` does not imply that `M.data == NULL`.
+         */
+        bool empty() const
+        {
+            return data == nullptr || total() == 0;
+        }
+
+        /** @brief Returns the total number of array elements.
+
+        The method returns the number of array elements (a number of pixels if the array represents an
+        image).
+         */
+        size_t total() const
+        {
+            return static_cast<size_t>(rows * cols);
+        }
+
+
+        /** @overload
+        @param roi Extracted submatrix specified as a rectangle.
+        */
+        Mat operator()( const Rect& roi ) const
+        {
+            return Mat{*this, roi};
+        }
+
+
+        /** @brief Returns a pointer to the specified matrix row.
+
+        The methods return `uchar*` or typed pointer to the specified matrix row. See the sample in
+        Mat::isContinuous to know how to use these methods.
+        @param row Index along the dimension 0
+        @param col Index along the dimension 1
+        */
+        uchar* ptr(int row, int col = 0)
+        {
+            return const_cast<uchar*>(const_cast<const Mat*>(this)->ptr(row,col));
+        }
+        /** @overload */
+        const uchar* ptr(int row, int col = 0) const
+        {
+            return data + step * row + CV_ELEM_SIZE(type()) * col;
+        }
+
+
+    private:
+        //actual memory allocated for storage, or nullptr if the object is a non-owning view over external memory
+        std::shared_ptr<uchar> memory;
+    };
+
+} //namespace own
+} //namespace gapi
+} //namespace cv
+
+#endif /* OPENCV_GAPI_OWN_MAT_HPP */
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/saturate.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/saturate.hpp
new file mode 100644 (file)
index 0000000..207dcde
--- /dev/null
@@ -0,0 +1,90 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OWN_SATURATE_HPP
+#define OPENCV_GAPI_OWN_SATURATE_HPP
+
+#include <cmath>
+
+#include <limits>
+#include <type_traits>
+
+#include <opencv2/gapi/own/assert.hpp>
+
+namespace cv { namespace gapi { namespace own {
+//-----------------------------
+//
+// Numeric cast with saturation
+//
+//-----------------------------
+
+template<typename DST, typename SRC>
+static inline DST saturate(SRC x)
+{
+    // only integral types please!
+    GAPI_DbgAssert(std::is_integral<DST>::value &&
+                   std::is_integral<SRC>::value);
+
+    if (std::is_same<DST, SRC>::value)
+        return static_cast<DST>(x);
+
+    if (sizeof(DST) > sizeof(SRC))
+        return static_cast<DST>(x);
+
+    // compiler must recognize this saturation,
+    // so compile saturate<s16>(a + b) with adds
+    // instruction (e.g.: _mm_adds_epi16 if x86)
+    return x < std::numeric_limits<DST>::min()?
+               std::numeric_limits<DST>::min():
+           x > std::numeric_limits<DST>::max()?
+               std::numeric_limits<DST>::max():
+           static_cast<DST>(x);
+}
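+
+// For example (illustrative): saturate<uchar>(300) == 255,
+// saturate<uchar>(-5) == 0, saturate<short>(100000) == 32767.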
+
+// Note that OpenCV rounds differently:
+// - like std::round() for add, subtract
+// - like std::rint() for multiply, divide
+template<typename DST, typename SRC, typename R>
+static inline DST saturate(SRC x, R round)
+{
+    if (std::is_floating_point<DST>::value)
+    {
+        return static_cast<DST>(x);
+    }
+    else if (std::is_integral<SRC>::value)
+    {
+        GAPI_DbgAssert(std::is_integral<DST>::value &&
+                       std::is_integral<SRC>::value);
+        return saturate<DST>(x);
+    }
+    else
+    {
+        GAPI_DbgAssert(std::is_integral<DST>::value &&
+                 std::is_floating_point<SRC>::value);
+#ifdef _WIN32
+// Suppress warning about converting x to floating-point
+// Note that x is already floating-point at this point
+#pragma warning(disable: 4244)
+#endif
+        int ix = static_cast<int>(round(x));
+#ifdef _WIN32
+#pragma warning(default: 4244)
+#endif
+        return saturate<DST>(ix);
+    }
+}
+
+// explicit suffix 'd' for double type
+inline double  ceild(double x) { return std::ceil(x); }
+inline double floord(double x) { return std::floor(x); }
+inline double roundd(double x) { return std::round(x); }
+inline double  rintd(double x) { return std::rint(x); }
+
+} //namespace own
+} //namespace gapi
+} //namespace cv
+#endif /* OPENCV_GAPI_OWN_SATURATE_HPP */
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/scalar.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/scalar.hpp
new file mode 100644 (file)
index 0000000..bda91c8
--- /dev/null
@@ -0,0 +1,47 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GAPI_OWN_SCALAR_HPP
+#define OPENCV_GAPI_GAPI_OWN_SCALAR_HPP
+
+#include <algorithm>  // std::equal
+#include <iterator>   // std::begin, std::end
+
+#include <opencv2/gapi/own/exports.hpp>
+
+namespace cv
+{
+namespace gapi
+{
+namespace own
+{
+
+class GAPI_EXPORTS Scalar
+{
+public:
+    Scalar() = default;
+    explicit Scalar(double v0) { val[0] = v0; };
+    Scalar(double v0, double v1, double v2 = 0, double v3 = 0)
+        : val{v0, v1, v2, v3}
+    {
+    }
+
+    const double& operator[](int i) const { return val[i]; }
+          double& operator[](int i)       { return val[i]; }
+
+    static Scalar all(double v0) { return Scalar(v0, v0, v0, v0); }
+
+    double val[4] = {0};
+};
+
+inline bool operator==(const Scalar& lhs, const Scalar& rhs)
+{
+    return std::equal(std::begin(lhs.val), std::end(lhs.val), std::begin(rhs.val));
+}
+
+} // namespace own
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_GAPI_OWN_SCALAR_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/types.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/own/types.hpp
new file mode 100644 (file)
index 0000000..20445ee
--- /dev/null
@@ -0,0 +1,135 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_TYPES_HPP
+#define OPENCV_GAPI_TYPES_HPP
+
+#include <algorithm>              // std::max, std::min
+#include <ostream>
+
+namespace cv
+{
+namespace gapi
+{
+namespace own
+{
+
+class Point
+{
+public:
+    Point() = default;
+    Point(int _x, int _y) : x(_x),  y(_y)  {};
+
+    int x = 0;
+    int y = 0;
+};
+
+class Rect
+{
+public:
+    Rect() = default;
+    Rect(int _x, int _y, int _width, int _height) : x(_x), y(_y),   width(_width),  height(_height)  {};
+#if !defined(GAPI_STANDALONE)
+    Rect(const cv::Rect& other) : x(other.x), y(other.y), width(other.width), height(other.height) {};
+    inline Rect& operator=(const cv::Rect& other)
+    {
+        x = other.x;
+        y = other.y;
+        width  = other.width;
+        height = other.height;
+        return *this;
+    }
+#endif // !defined(GAPI_STANDALONE)
+
+    int x      = 0; //!< x coordinate of the top-left corner
+    int y      = 0; //!< y coordinate of the top-left corner
+    int width  = 0; //!< width of the rectangle
+    int height = 0; //!< height of the rectangle
+};
+
+inline bool operator==(const Rect& lhs, const Rect& rhs)
+{
+    return lhs.x == rhs.x && lhs.y == rhs.y && lhs.width == rhs.width && lhs.height == rhs.height;
+}
+
+inline bool operator!=(const Rect& lhs, const Rect& rhs)
+{
+    return !(lhs == rhs);
+}
+
+inline Rect& operator&=(Rect& lhs, const Rect& rhs)
+{
+    int x1 = std::max(lhs.x, rhs.x);
+    int y1 = std::max(lhs.y, rhs.y);
+    lhs.width  = std::min(lhs.x + lhs.width,  rhs.x + rhs.width) -  x1;
+    lhs.height = std::min(lhs.y + lhs.height, rhs.y + rhs.height) - y1;
+    lhs.x = x1;
+    lhs.y = y1;
+    if( lhs.width <= 0 || lhs.height <= 0 )
+        lhs = Rect();
+    return lhs;
+}
+
+inline const Rect operator&(const Rect& lhs, const Rect& rhs)
+{
+    Rect result = lhs;
+    return result &= rhs;
+}
+
+inline std::ostream& operator<<(std::ostream& o, const Rect& rect)
+{
+    return o << "[" << rect.width << " x " << rect.height << " from (" << rect.x << ", " << rect.y << ")]";
+}
+
+class Size
+{
+public:
+    Size() = default;
+    Size(int _width, int _height) : width(_width),  height(_height)  {};
+#if !defined(GAPI_STANDALONE)
+    Size(const cv::Size& other) : width(other.width), height(other.height) {};
+    inline Size& operator=(const cv::Size& rhs)
+    {
+        width  = rhs.width;
+        height = rhs.height;
+        return *this;
+    }
+#endif // !defined(GAPI_STANDALONE)
+
+    int width  = 0;
+    int height = 0;
+};
+
+inline Size& operator+=(Size& lhs, const Size& rhs)
+{
+    lhs.width  += rhs.width;
+    lhs.height += rhs.height;
+    return lhs;
+}
+
+inline bool operator==(const Size& lhs, const Size& rhs)
+{
+    return lhs.width == rhs.width && lhs.height == rhs.height;
+}
+
+inline bool operator!=(const Size& lhs, const Size& rhs)
+{
+    return !(lhs == rhs);
+}
+
+
+inline std::ostream& operator<<(std::ostream& o, const Size& s)
+{
+    o << "[" << s.width << " x " << s.height << "]";
+    return o;
+}
+
+} // namespace own
+} // namespace gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_TYPES_HPP
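
A usage sketch for the own:: geometry types, assuming a standalone build (GAPI_STANDALONE defined) so the cv::Rect/cv::Size interop paths above are compiled out:

    #define GAPI_STANDALONE  // assumption: keeps the example free of OpenCV core headers
    #include <cassert>
    #include <sstream>
    #include <opencv2/gapi/own/types.hpp>

    int main()
    {
        using cv::gapi::own::Rect;
        using cv::gapi::own::Size;

        Rect a(0, 0, 4, 4), b(2, 2, 4, 4);
        Rect isect = a & b;                          // intersection
        assert(isect == Rect(2, 2, 2, 2));
        assert((a & Rect(10, 10, 2, 2)) == Rect());  // empty result collapses to Rect()

        Size s(2, 3);
        s += Size(1, 1);
        assert(s == Size(3, 4));

        std::ostringstream os;
        os << isect;
        assert(os.str() == "[2 x 2 from (2, 2)]");
        return 0;
    }
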
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/any.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/any.hpp
new file mode 100644 (file)
index 0000000..3146cb6
--- /dev/null
@@ -0,0 +1,186 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_UTIL_ANY_HPP
+#define OPENCV_GAPI_UTIL_ANY_HPP
+
+#include <memory>
+#include <type_traits>
+#include <typeinfo>
+#include <utility>
+
+#include "opencv2/gapi/util/throw.hpp"
+
+#if defined(_MSC_VER)
+   // disable MSVC warning on "multiple copy constructors specified"
+#  pragma warning(disable: 4521)
+#endif
+
+namespace cv
+{
+
+namespace internal
+{
+    template <class T, class Source>
+    T down_cast(Source operand)
+    {
+#if defined(__GXX_RTTI) || defined(_CPPRTTI)
+       return dynamic_cast<T>(operand);
+#else
+    #warning "Using static_cast instead of dynamic_cast because RTTI is disabled"
+       return static_cast<T>(operand);
+#endif
+    }
+}
+
+namespace util
+{
+   class bad_any_cast : public std::bad_cast
+   {
+   public:
+       virtual const char* what() const noexcept override
+       {
+           return "Bad any cast";
+       }
+   };
+
+   //modeled against C++17 std::any
+
+   class any
+   {
+   private:
+      struct holder;
+      using holder_ptr = std::unique_ptr<holder>;
+      struct holder
+      {
+         virtual holder_ptr clone() = 0;
+         virtual ~holder() = default;
+      };
+
+      template <typename value_t>
+      struct holder_impl : holder
+      {
+         value_t v;
+         template<typename arg_t>
+         holder_impl(arg_t&& a) : v(std::forward<arg_t>(a)) {}
+         holder_ptr clone() override { return holder_ptr(new holder_impl (v));}
+      };
+
+      holder_ptr hldr;
+   public:
+      template<class value_t>
+      any(value_t&& arg) :  hldr(new holder_impl<typename std::decay<value_t>::type>( std::forward<value_t>(arg))) {}
+
+      any(any const& src) : hldr( src.hldr ? src.hldr->clone() : nullptr) {}
+      // simple hack to avoid writing enable_if<not any> for the template constructor
+      any(any & src) : any (const_cast<any const&>(src)) {}
+
+      any()       = default;
+      any(any&& ) = default;
+
+      any& operator=(any&&) = default;
+
+      any& operator=(any const& src)
+      {
+         any copy(src);
+         swap(*this, copy);
+         return *this;
+      }
+
+      template<class value_t>
+      friend value_t* any_cast(any* operand);
+
+      template<class value_t>
+      friend const value_t* any_cast(const any* operand);
+
+      template<class value_t>
+      friend value_t& unsafe_any_cast(any& operand);
+
+      template<class value_t>
+      friend const value_t& unsafe_any_cast(const any& operand);
+
+      friend void swap(any & lhs, any& rhs)
+      {
+         swap(lhs.hldr, rhs.hldr);
+      }
+
+   };
+
+   template<class value_t>
+   value_t* any_cast(any* operand)
+   {
+      auto casted = internal::down_cast<any::holder_impl<typename std::decay<value_t>::type> *>(operand->hldr.get());
+      if (casted){
+         return & (casted->v);
+      }
+      return nullptr;
+   }
+
+   template<class value_t>
+   const value_t* any_cast(const any* operand)
+   {
+      auto casted = internal::down_cast<any::holder_impl<typename std::decay<value_t>::type> *>(operand->hldr.get());
+      if (casted){
+         return & (casted->v);
+      }
+      return nullptr;
+   }
+
+   template<class value_t>
+   value_t& any_cast(any& operand)
+   {
+      auto ptr = any_cast<value_t>(&operand);
+      if (ptr)
+      {
+         return *ptr;
+      }
+
+      throw_error(bad_any_cast());
+   }
+
+
+   template<class value_t>
+   const value_t& any_cast(const any& operand)
+   {
+      auto ptr = any_cast<value_t>(&operand);
+      if (ptr)
+      {
+         return *ptr;
+      }
+
+      throw_error(bad_any_cast());
+   }
+
+   template<class value_t>
+   inline value_t& unsafe_any_cast(any& operand)
+   {
+#ifdef DEBUG
+      return any_cast<value_t>(operand);
+#else
+      return static_cast<any::holder_impl<typename std::decay<value_t>::type> *>(operand.hldr.get())->v;
+#endif
+   }
+
+   template<class value_t>
+   inline const value_t& unsafe_any_cast(const any& operand)
+   {
+#ifdef DEBUG
+      return any_cast<value_t>(operand);
+#else
+      return static_cast<any::holder_impl<typename std::decay<value_t>::type> *>(operand.hldr.get())->v;
+#endif
+   }
+
+} // namespace util
+} // namespace cv
+
+#if defined(_MSC_VER)
+   // Re-enable the "multiple copy constructors specified" warning
+#  pragma warning(default: 4521)
+#endif
+
+#endif // OPENCV_GAPI_UTIL_ANY_HPP
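
A usage sketch for util::any; note the pointer form of any_cast returns nullptr on a type mismatch while the reference form throws bad_any_cast:

    #include <cassert>
    #include <string>
    #include <opencv2/gapi/util/any.hpp>

    int main()
    {
        cv::util::any a(42);                          // stores a decayed copy
        assert(*cv::util::any_cast<int>(&a) == 42);
        assert(cv::util::any_cast<std::string>(&a) == nullptr);

        a = cv::util::any(std::string("hi"));         // replace the held value
        assert(cv::util::any_cast<std::string>(a) == "hi");

        try {
            cv::util::any_cast<int>(a);               // wrong type: throws
            assert(false);
        } catch (const cv::util::bad_any_cast&) {}
        return 0;
    }
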
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/compiler_hints.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/compiler_hints.hpp
new file mode 100644 (file)
index 0000000..575655e
--- /dev/null
@@ -0,0 +1,21 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+#ifndef OPENCV_GAPI_UTIL_COMPILER_HINTS_HPP
+#define OPENCV_GAPI_UTIL_COMPILER_HINTS_HPP
+
+namespace cv
+{
+namespace util
+{
+    //! Utility template function to prevent "unused" warnings by various compilers.
+    template<typename T> void suppress_unused_warning( const T& ) {}
+} // namespace util
+} // namespace cv
+
+#define UNUSED(x) cv::util::suppress_unused_warning(x)
+
+#endif /* OPENCV_GAPI_UTIL_COMPILER_HINTS_HPP */
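
The macro simply routes its argument through an empty template function, which counts as a use; a minimal sketch:

    #include <opencv2/gapi/util/compiler_hints.hpp>

    void on_event(int event_id, void* context)
    {
        UNUSED(context);   // silences unused-parameter diagnostics without a cast
        UNUSED(event_id);  // same effect as the classic (void)event_id idiom
    }
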
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/optional.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/optional.hpp
new file mode 100644 (file)
index 0000000..54126d6
--- /dev/null
@@ -0,0 +1,178 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_UTIL_OPTIONAL_HPP
+#define OPENCV_GAPI_UTIL_OPTIONAL_HPP
+
+#include "opencv2/gapi/util/variant.hpp"
+
+// A poor man's `optional` implementation, incompletely modeled against C++17 spec.
+namespace cv
+{
+namespace util
+{
+    class bad_optional_access: public std::exception
+    {
+    public:
+        virtual const char *what() const noexcept override
+        {
+            return "Bad optional access";
+        }
+    };
+
+    // TODO: nullopt_t
+
+    // Interface ///////////////////////////////////////////////////////////////
+    template<typename T> class optional
+    {
+    public:
+        // Constructors
+        // NB: there were issues with Clang 3.8 when '= default' was used
+        // instead of '{}'
+        optional() {};
+        optional(const optional&) = default;
+        explicit optional(T &&value) noexcept;
+        explicit optional(const T &value) noexcept;
+        optional(optional &&) noexcept;
+        // TODO: optional(nullopt_t) noexcept;
+        // TODO: optional(const optional<U> &)
+        // TODO: optional(optional<U> &&)
+        // TODO: optional(Args&&...)
+        // TODO: optional(initializer_list<U>)
+        // TODO: optional(U&& value);
+
+        // Assignment
+        optional& operator=(const optional& rhs) = default;
+        optional& operator=(optional&& rhs);
+
+        // Observers
+        T* operator-> ();
+        const T* operator-> () const;
+        T& operator* ();
+        const T& operator* () const;
+        // TODO: && versions
+
+        operator bool() const noexcept;
+        bool has_value() const noexcept;
+
+        T& value();
+        const T& value() const;
+        // TODO: && versions
+
+        template<class U>
+        T value_or(U &&default_value) const;
+
+        void swap(optional &other) noexcept;
+        void reset() noexcept;
+        // TODO: emplace
+
+        // TODO: operator==, !=, <, <=, >, >=
+
+    private:
+        struct nothing {};
+        util::variant<nothing, T> m_holder;
+    };
+
+    template<class T>
+    optional<typename std::decay<T>::type> make_optional(T&& value);
+
+    // TODO: Args... and initializer_list versions
+
+    // Implementation //////////////////////////////////////////////////////////
+    template<class T> optional<T>::optional(T &&v) noexcept
+        : m_holder(v)
+    {
+    }
+
+    template<class T> optional<T>::optional(const T &v) noexcept
+        : m_holder(v)
+    {
+    }
+
+    template<class T> optional<T>::optional(optional&& rhs) noexcept
+        : m_holder(std::move(rhs.m_holder))
+    {
+        rhs.reset();
+    }
+
+    template<class T> optional<T>& optional<T>::operator=(optional&& rhs)
+    {
+        m_holder = std::move(rhs.m_holder);
+        rhs.reset();
+        return *this;
+    }
+
+    template<class T> T* optional<T>::operator-> ()
+    {
+        return & *(*this);
+    }
+
+    template<class T> const T* optional<T>::operator-> () const
+    {
+        return & *(*this);
+    }
+
+    template<class T> T& optional<T>::operator* ()
+    {
+        return this->value();
+    }
+
+    template<class T> const T& optional<T>::operator* () const
+    {
+        return this->value();
+    }
+
+    template<class T> optional<T>::operator bool() const noexcept
+    {
+        return this->has_value();
+    }
+
+    template<class T> bool optional<T>::has_value() const noexcept
+    {
+        return util::holds_alternative<T>(m_holder);
+    }
+
+    template<class T> T& optional<T>::value()
+    {
+        if (!this->has_value())
+            throw_error(bad_optional_access());
+        return util::get<T>(m_holder);
+    }
+
+    template<class T> const T& optional<T>::value() const
+    {
+        if (!this->has_value())
+            throw_error(bad_optional_access());
+        return util::get<T>(m_holder);
+    }
+
+    template<class T>
+    template<class U> T optional<T>::value_or(U &&default_value) const
+    {
+        return (this->has_value() ? this->value() : T(default_value));
+    }
+
+    template<class T> void optional<T>::swap(optional<T> &other) noexcept
+    {
+        m_holder.swap(other.m_holder);
+    }
+
+    template<class T> void optional<T>::reset() noexcept
+    {
+        if (this->has_value())
+            m_holder = nothing{};
+    }
+
+    template<class T>
+    optional<typename std::decay<T>::type> make_optional(T&& value)
+    {
+        return optional<typename std::decay<T>::type>(std::forward<T>(value));
+    }
+} // namespace util
+} // namespace cv
+
+#endif // OPENCV_GAPI_UTIL_OPTIONAL_HPP
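
A usage sketch for util::optional covering the parts implemented above (construction, emptiness checks, value_or, reset):

    #include <cassert>
    #include <opencv2/gapi/util/optional.hpp>

    int main()
    {
        cv::util::optional<int> o;          // empty by default
        assert(!o.has_value());
        assert(o.value_or(-1) == -1);       // fallback when empty

        o = cv::util::make_optional(5);     // deduces optional<int>
        assert(o && *o == 5);

        o.reset();                          // back to the empty state
        assert(!o.has_value());
        return 0;
    }
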
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/throw.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/throw.hpp
new file mode 100644 (file)
index 0000000..689bf58
--- /dev/null
@@ -0,0 +1,36 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_UTIL_THROW_HPP
+#define OPENCV_GAPI_UTIL_THROW_HPP
+
+#include <utility>  // std::forward
+
+#if !defined(__EXCEPTIONS)
+#include <stdlib.h>
+#include <stdio.h>
+#endif
+
+namespace cv
+{
+namespace util
+{
+template <class ExceptionType>
+[[noreturn]] void throw_error(ExceptionType &&e)
+{
+#if defined(__EXCEPTIONS) || defined(_CPPUNWIND)
+    throw std::forward<ExceptionType>(e);
+#else
+    fprintf(stderr, "An exception thrown! %s\n" , e.what());
+    fflush(stderr);
+    abort();
+#endif
+}
+} // namespace util
+} // namespace cv
+
+#endif // OPENCV_GAPI_UTIL_THROW_HPP
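
A sketch of how throw_error behaves: with exceptions enabled it throws the forwarded object, while under -fno-exceptions it prints e.what() to stderr and aborts:

    #include <stdexcept>
    #include <opencv2/gapi/util/throw.hpp>

    int checked_fd(int fd)
    {
        if (fd < 0)
            cv::util::throw_error(std::invalid_argument("negative file descriptor"));
        return fd;
    }
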
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/util.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/util.hpp
new file mode 100644 (file)
index 0000000..d0378e0
--- /dev/null
@@ -0,0 +1,92 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_UTIL_HPP
+#define OPENCV_GAPI_UTIL_HPP
+
+#include <tuple>   // std::tuple, std::tuple_element, std::tuple_size
+
+// \cond HIDDEN_SYMBOLS
+// This header file contains some generic utility functions which are
+// used in other G-API Public API headers.
+//
+// PLEASE don't put any stuff here if it is NOT used in public API headers!
+
+namespace cv
+{
+namespace detail
+{
+    // Recursive integer sequence type, useful for enumerating elements of
+    // template parameter packs.
+    template<int... I> struct Seq     { using next = Seq<I..., sizeof...(I)>; };
+    template<int Sz>   struct MkSeq   { using type = typename MkSeq<Sz-1>::type::next; };
+    template<>         struct MkSeq<0>{ using type = Seq<>; };
+
+    // Checks if elements of variadic template satisfy the given Predicate.
+    // Implemented via tuple, with an interface to accept plain type lists
+    template<template<class> class, typename, typename...> struct all_satisfy;
+
+    template<template<class> class F, typename T, typename... Ts>
+    struct all_satisfy<F, std::tuple<T, Ts...> >
+    {
+        static const constexpr bool value = F<T>::value
+            && all_satisfy<F, std::tuple<Ts...> >::value;
+    };
+    template<template<class> class F, typename T>
+    struct all_satisfy<F, std::tuple<T> >
+    {
+        static const constexpr bool value = F<T>::value;
+    };
+
+    template<template<class> class F, typename T, typename... Ts>
+    struct all_satisfy: public all_satisfy<F, std::tuple<T, Ts...> > {};
+
+    // Permute given tuple type C with given integer sequence II
+    // Sequence may be less than tuple C size.
+    template<class, class> struct permute_tuple;
+
+    template<class C, int... IIs>
+    struct permute_tuple<C, Seq<IIs...> >
+    {
+        using type = std::tuple< typename std::tuple_element<IIs, C>::type... >;
+    };
+
+    // Given T..., generates a type sequence of sizeof...(T)-1 elements
+    // which is T... without its last element
+    // Implemented via tuple, with an interface to accept plain type lists
+    template<typename T, typename... Ts> struct all_but_last;
+
+    template<typename T, typename... Ts>
+    struct all_but_last<std::tuple<T, Ts...> >
+    {
+        using C    = std::tuple<T, Ts...>;
+        using S    = typename MkSeq<std::tuple_size<C>::value - 1>::type;
+        using type = typename permute_tuple<C, S>::type;
+    };
+
+    template<typename T, typename... Ts>
+    struct all_but_last: public all_but_last<std::tuple<T, Ts...> > {};
+
+    template<typename... Ts>
+    using all_but_last_t = typename all_but_last<Ts...>::type;
+
+    // NB.: This is here because there's no constexpr std::max in C++11
+    template<std::size_t S0, std::size_t... SS> struct max_of_t
+    {
+        static constexpr const std::size_t rest  = max_of_t<SS...>::value;
+        static constexpr const std::size_t value = rest > S0 ? rest : S0;
+    };
+    template<std::size_t S> struct max_of_t<S>
+    {
+        static constexpr const std::size_t value = S;
+    };
+} // namespace detail
+} // namespace cv
+
+// \endcond
+
+#endif //  OPENCV_GAPI_UTIL_HPP
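
These metafunctions are all compile-time; a few static_asserts illustrating them (illustrative additions, not part of the header):

    #include <tuple>
    #include <type_traits>
    #include <opencv2/gapi/util/util.hpp>

    // MkSeq<3> expands to Seq<0, 1, 2>
    static_assert(std::is_same<cv::detail::MkSeq<3>::type,
                               cv::detail::Seq<0, 1, 2>>::value, "");

    // all_satisfy accepts a plain type list (or a std::tuple of types)
    static_assert( cv::detail::all_satisfy<std::is_integral, int, char, long>::value, "");
    static_assert(!cv::detail::all_satisfy<std::is_integral, int, float>::value, "");

    // all_but_last_t drops the trailing element of a tuple's type list
    static_assert(std::is_same<cv::detail::all_but_last_t<std::tuple<int, float, char>>,
                               std::tuple<int, float>>::value, "");

    // constexpr max over a size_t pack (no constexpr std::max in C++11)
    static_assert(cv::detail::max_of_t<1, 8, 3>::value == 8, "");
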
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/variant.hpp b/inference-engine/thirdparty/fluid/modules/gapi/include/opencv2/gapi/util/variant.hpp
new file mode 100644 (file)
index 0000000..cb0270a
--- /dev/null
@@ -0,0 +1,377 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_UTIL_VARIANT_HPP
+#define OPENCV_GAPI_UTIL_VARIANT_HPP
+
+#include <array>
+#include <type_traits>
+
+#include "opencv2/gapi/util/throw.hpp"
+#include "opencv2/gapi/util/util.hpp" // max_of_t
+
+// A poor man's `variant` implementation, incompletely modeled against C++17 spec.
+namespace cv
+{
+namespace util
+{
+    namespace detail
+    {
+        template<std::size_t I, typename Target, typename First, typename... Remaining>
+        struct type_list_index_helper
+        {
+            static const constexpr bool is_same = std::is_same<Target, First>::value;
+            static const constexpr std::size_t value =
+                std::conditional<is_same, std::integral_constant<std::size_t, I>, type_list_index_helper<I + 1, Target, Remaining...>>::type::value;
+        };
+
+        template<std::size_t I, typename Target, typename First>
+        struct type_list_index_helper<I, Target, First>
+        {
+            static_assert(std::is_same<Target, First>::value, "Type not found");
+            static const constexpr std::size_t value = I;
+        };
+
+
+        template<class T, class U, class V> using are_different =
+            std::enable_if<!std::is_same<typename std::decay<T>::type,
+                                         typename std::decay<U>::type>::value,
+                           V>;
+    }
+
+    template<typename Target, typename... Types>
+    struct type_list_index
+    {
+        static const constexpr std::size_t value = detail::type_list_index_helper<0, Target, Types...>::value;
+    };
+
+    class bad_variant_access: public std::exception
+    {
+    public:
+        virtual const char *what() const noexcept override
+        {
+            return "Bad variant access";
+        }
+    };
+
+    // Interface ///////////////////////////////////////////////////////////////
+    struct monostate {};
+    inline bool operator==(const util::monostate&, const util::monostate&)
+    {
+        return true;
+    }
+
+    template<typename... Ts> // FIXME: no references, arrays, and void
+    class variant
+    {
+        // FIXME: Replace with std::aligned_union after gcc4.8 support is dropped
+        static constexpr const std::size_t S = cv::detail::max_of_t<sizeof(Ts)...>::value;
+        static constexpr const std::size_t A = cv::detail::max_of_t<alignof(Ts)...>::value;
+        using Memory = typename std::aligned_storage<S, A>::type[1];
+
+        template<typename T> struct cctr_h {
+            static void help(Memory memory, const Memory from) {
+                new (memory) T(*reinterpret_cast<const T*>(from));
+            }
+        };
+
+        template<typename T> struct vctr_h {
+            static void help(Memory memory, const void* pval) {
+                new (memory) T(*reinterpret_cast<const T*>(pval));
+            }
+        };
+
+        template<typename T> struct mctr_h {
+            static void help(Memory memory, void *pval) {
+                new (memory) T(std::move(*reinterpret_cast<T*>(pval)));
+            }
+        };
+
+        template<typename T> struct copy_h {
+            static void help(Memory to, const Memory from) {
+                *reinterpret_cast<T*>(to) = *reinterpret_cast<const T*>(from);
+            }
+        };
+
+        template<typename T> struct move_h {
+            static void help(Memory to, const Memory from) {
+                *reinterpret_cast<T*>(to) = std::move(*reinterpret_cast<const T*>(from));
+            }
+        };
+
+        template<typename T> struct swap_h {
+            static void help(Memory to, Memory from) {
+                std::swap(*reinterpret_cast<T*>(to), *reinterpret_cast<T*>(from));
+            }
+        };
+
+        template<typename T> struct dtor_h {
+            static void help(Memory memory) {
+                (void) memory; // suppress MSVC unreferenced-parameter warning
+                reinterpret_cast<T*>(memory)->~T();
+            }
+        };
+
+        template<typename T> struct equal_h {
+            static bool help(const Memory lhs, const Memory rhs) {
+                const T& t_lhs = *reinterpret_cast<const T*>(lhs);
+                const T& t_rhs = *reinterpret_cast<const T*>(rhs);
+                return t_lhs == t_rhs;
+            }
+        };
+
+        typedef void (*CCtr) (Memory, const Memory);  // Copy c-tor (variant)
+        typedef void (*VCtr) (Memory, const void*);   // Copy c-tor (value)
+        typedef void (*MCtr) (Memory, void*);         // Generic move c-tor
+        typedef void (*Copy) (Memory, const Memory);  // Copy assignment
+        typedef void (*Move) (Memory, const Memory);  // Move assignment
+        typedef void (*Swap) (Memory, Memory);        // Swap
+        typedef void (*Dtor) (Memory);                // Destructor
+
+        typedef bool (*Equal)(const Memory, const Memory); // Equality test (external)
+
+        static constexpr std::array<CCtr, sizeof...(Ts)> cctrs(){ return {{(&cctr_h<Ts>::help)...}};}
+        static constexpr std::array<VCtr, sizeof...(Ts)> vctrs(){ return {{(&vctr_h<Ts>::help)...}};}
+        static constexpr std::array<MCtr, sizeof...(Ts)> mctrs(){ return {{(&mctr_h<Ts>::help)...}};}
+        static constexpr std::array<Copy, sizeof...(Ts)> cpyrs(){ return {{(&copy_h<Ts>::help)...}};}
+        static constexpr std::array<Move, sizeof...(Ts)> mvers(){ return {{(&move_h<Ts>::help)...}};}
+        static constexpr std::array<Swap, sizeof...(Ts)> swprs(){ return {{(&swap_h<Ts>::help)...}};}
+        static constexpr std::array<Dtor, sizeof...(Ts)> dtors(){ return {{(&dtor_h<Ts>::help)...}};}
+
+        std::size_t m_index = 0;
+
+    protected:
+        template<typename T, typename... Us> friend T& get(variant<Us...> &v);
+        template<typename T, typename... Us> friend const T& get(const variant<Us...> &v);
+        template<typename... Us> friend bool operator==(const variant<Us...> &lhs,
+                                                        const variant<Us...> &rhs);
+        Memory memory;
+
+    public:
+        // Constructors
+        variant() noexcept;
+        variant(const variant& other);
+        variant(variant&& other) noexcept;
+        template<typename T> explicit variant(const T& t);
+        // are_different is a SFINAE trick to avoid variant(T &&t) with T=variant
+        // for some reason, this version is called instead of variant(variant&& o) when
+        // variant is used in STL containers (examples: vector assignment)
+        template<typename T> explicit variant(T&& t, typename detail::are_different<variant, T, int>::type = 0);
+        // template<class T, class... Args> explicit variant(Args&&... args);
+        // FIXME: other constructors
+
+        // Destructor
+        ~variant();
+
+        // Assignment
+        variant& operator=(const variant& rhs);
+        variant& operator=(variant &&rhs) noexcept;
+
+        // SFINAE trick to avoid operator=(T&&) with T=variant<>, see comment above
+        template<class T>
+        typename detail::are_different<variant, T, variant&>
+        ::type operator=(T&& t) noexcept;
+
+        // Observers
+        std::size_t index() const noexcept;
+        // FIXME: valueless_by_exception()
+
+        // Modifiers
+        // FIXME: emplace()
+        void swap(variant &rhs) noexcept;
+
+        // Non-C++17!
+        template<typename T> static constexpr std::size_t index_of();
+    };
+
+    // FIXME: visit
+
+    template<typename T, typename... Types>
+    T& get(util::variant<Types...> &v);
+
+    template<typename T, typename... Types>
+    const T& get(const util::variant<Types...> &v);
+
+    template<typename T, typename... Types>
+    bool holds_alternative(const util::variant<Types...> &v) noexcept;
+
+    // FIXME: T&&, const TT&& versions.
+
+    // Implementation //////////////////////////////////////////////////////////
+    template<typename... Ts>
+    variant<Ts...>::variant() noexcept
+    {
+        typedef typename std::tuple_element<0, std::tuple<Ts...> >::type TFirst;
+        new (memory) TFirst();
+    }
+
+    template<typename... Ts>
+    variant<Ts...>::variant(const variant &other)
+        : m_index(other.m_index)
+    {
+        (cctrs()[m_index])(memory, other.memory);
+    }
+
+    template<typename... Ts>
+    variant<Ts...>::variant(variant &&other) noexcept
+        : m_index(other.m_index)
+    {
+        (mctrs()[m_index])(memory, other.memory);
+    }
+
+    template<typename... Ts>
+    template<class T>
+    variant<Ts...>::variant(const T& t)
+        : m_index(util::type_list_index<T, Ts...>::value)
+    {
+        (vctrs()[m_index])(memory, &t);
+    }
+
+    template<typename... Ts>
+    template<class T>
+    variant<Ts...>::variant(T&& t, typename detail::are_different<variant, T, int>::type)
+        : m_index(util::type_list_index<typename std::remove_reference<T>::type, Ts...>::value)
+    {
+        (mctrs()[m_index])(memory, &t);
+    }
+
+    template<typename... Ts>
+    variant<Ts...>::~variant()
+    {
+        (dtors()[m_index])(memory);
+    }
+
+    template<typename... Ts>
+    variant<Ts...>& variant<Ts...>::operator=(const variant<Ts...> &rhs)
+    {
+        if (m_index != rhs.m_index)
+        {
+            (dtors()[    m_index])(memory);
+            (cctrs()[rhs.m_index])(memory, rhs.memory);
+            m_index = rhs.m_index;
+        }
+        else
+        {
+            (cpyrs()[rhs.m_index])(memory, rhs.memory);
+        }
+        return *this;
+    }
+
+    template<typename... Ts>
+    variant<Ts...>& variant<Ts...>::operator=(variant<Ts...> &&rhs) noexcept
+    {
+        if (m_index != rhs.m_index)
+        {
+            (dtors()[    m_index])(memory);
+            (mctrs()[rhs.m_index])(memory, rhs.memory);
+            m_index = rhs.m_index;
+        }
+        else
+        {
+            (mvers()[rhs.m_index])(memory, rhs.memory);
+        }
+        return *this;
+    }
+
+    template<typename... Ts>
+    template<class T> typename detail::are_different<variant<Ts...>, T, variant<Ts...>&>
+    ::type variant<Ts...>::operator=(T&& t) noexcept
+    {
+        // FIXME: No version with implicit type conversion available!
+        static const constexpr std::size_t t_index =
+            util::type_list_index<T, Ts...>::value;
+
+        if (t_index == m_index)
+        {
+            util::get<T>(*this) = std::move(t);
+            return *this;
+        }
+        else return (*this = variant(std::move(t)));
+    }
+
+    template<typename... Ts>
+    std::size_t util::variant<Ts...>::index() const noexcept
+    {
+        return m_index;
+    }
+
+    template<typename... Ts>
+    void variant<Ts...>::swap(variant<Ts...> &rhs) noexcept
+    {
+        if (m_index == rhs.index())
+        {
+            (swprs()[m_index](memory, rhs.memory));
+        }
+        else
+        {
+            variant<Ts...> tmp(std::move(*this));
+            *this = std::move(rhs);
+            rhs   = std::move(tmp);
+        }
+    }
+
+    template<typename... Ts>
+    template<typename T>
+    constexpr std::size_t variant<Ts...>::index_of()
+    {
+        return util::type_list_index<T, Ts...>::value; // FIXME: tests!
+    }
+
+    template<typename T, typename... Types>
+    T& get(util::variant<Types...> &v)
+    {
+        const constexpr std::size_t t_index =
+            util::type_list_index<T, Types...>::value;
+
+        if (v.index() == t_index)
+            return reinterpret_cast<T&>(v.memory);
+        else
+            throw_error(bad_variant_access());
+    }
+
+    template<typename T, typename... Types>
+    const T& get(const util::variant<Types...> &v)
+    {
+        const constexpr std::size_t t_index =
+            util::type_list_index<T, Types...>::value;
+
+        if (v.index() == t_index)
+            return reinterpret_cast<const T&>(v.memory);
+        else
+            throw_error(bad_variant_access());
+    }
+
+    template<typename T, typename... Types>
+    bool holds_alternative(const util::variant<Types...> &v) noexcept
+    {
+        return v.index() == util::variant<Types...>::template index_of<T>();
+    }
+
+    template<typename... Us> bool operator==(const variant<Us...> &lhs,
+                                             const variant<Us...> &rhs)
+    {
+        using V = variant<Us...>;
+
+        // Instantiate table only here since it requires operator== for <Us...>
+        // <Us...> should have operator== only if this one is used, not in general
+        static const std::array<typename V::Equal, sizeof...(Us)> eqs = {
+            {(&V::template equal_h<Us>::help)...}
+        };
+        if (lhs.index() != rhs.index())
+            return false;
+        return (eqs[lhs.index()])(lhs.memory, rhs.memory);
+    }
+
+    template<typename... Us> bool operator!=(const variant<Us...> &lhs,
+                                             const variant<Us...> &rhs)
+    {
+        return !(lhs == rhs);
+    }
+} // namespace util
+} // namespace cv
+
+#endif // OPENCV_GAPI_UTIL_VARIANT_HPP
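
A usage sketch for util::variant; the default constructor builds the first alternative, and get() on the wrong alternative throws bad_variant_access:

    #include <cassert>
    #include <string>
    #include <opencv2/gapi/util/variant.hpp>

    int main()
    {
        using V = cv::util::variant<int, std::string>;

        V v;                                  // holds a value-initialized int
        assert(v.index() == 0 && cv::util::get<int>(v) == 0);

        v = std::string("hello");             // operator=(T&&) switches alternatives
        assert(cv::util::holds_alternative<std::string>(v));
        assert(cv::util::get<std::string>(v) == "hello");

        try {
            cv::util::get<int>(v);            // wrong alternative: throws
            assert(false);
        } catch (const cv::util::bad_variant_access&) {}
        return 0;
    }
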
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests.cpp
new file mode 100644 (file)
index 0000000..2df4d88
--- /dev/null
@@ -0,0 +1,9 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "perf_precomp.hpp"
+#include "gapi_core_perf_tests_inl.hpp"
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests.hpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests.hpp
new file mode 100644 (file)
index 0000000..8af7b1a
--- /dev/null
@@ -0,0 +1,76 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_CORE_PERF_TESTS_HPP
+#define OPENCV_GAPI_CORE_PERF_TESTS_HPP
+
+
+#include "../../test/common/gapi_tests_common.hpp"
+#include "opencv2/gapi/core.hpp"
+
+namespace opencv_test
+{
+  using namespace perf;
+
+  enum bitwiseOp
+  {
+      AND = 0,
+      OR = 1,
+      XOR = 2,
+      NOT = 3
+  };
+
+//------------------------------------------------------------------------------
+
+    class AddPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class AddCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class SubPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class SubCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class SubRCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class MulPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class MulDoublePerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class MulCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class DivPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class DivCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class DivRCPerfTest : public TestPerfParams<tuple<compare_f,cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class MaskPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class MeanPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class Polar2CartPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
+    class Cart2PolarPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
+    class CmpPerfTest : public TestPerfParams<tuple<CmpTypes, cv::Size, MatType, cv::GCompileArgs>> {};
+    class CmpWithScalarPerfTest : public TestPerfParams<tuple<CmpTypes, cv::Size, MatType, cv::GCompileArgs>> {};
+    class BitwisePerfTest : public TestPerfParams<tuple<bitwiseOp, cv::Size, MatType, cv::GCompileArgs>> {};
+    class BitwiseNotPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class SelectPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class MinPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class MaxPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class AbsDiffPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class AbsDiffCPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class SumPerfTest : public TestPerfParams<tuple<cv::Size, MatType, double, cv::GCompileArgs>> {};
+    class AddWeightedPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, double, cv::GCompileArgs>> {};
+    class NormPerfTest : public TestPerfParams<tuple<NormTypes, cv::Size, MatType, double, cv::GCompileArgs>> {};
+    class IntegralPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class ThresholdPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class ThresholdOTPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class InRangePerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class Split3PerfTest : public TestPerfParams<tuple<cv::Size, cv::GCompileArgs>> {};
+    class Split4PerfTest : public TestPerfParams<tuple<cv::Size, cv::GCompileArgs>> {};
+    class Merge3PerfTest : public TestPerfParams<tuple<cv::Size, cv::GCompileArgs>> {};
+    class Merge4PerfTest : public TestPerfParams<tuple<cv::Size, cv::GCompileArgs>> {};
+    class RemapPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class FlipPerfTest : public TestPerfParams<tuple<cv::Size, MatType, int, cv::GCompileArgs>> {};
+    class CropPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::Rect, cv::GCompileArgs>> {};
+    class ConcatHorPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class ConcatHorVecPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class ConcatVertPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class ConcatVertVecPerfTest : public TestPerfParams<tuple<cv::Size, MatType, cv::GCompileArgs>> {};
+    class LUTPerfTest : public TestPerfParams<tuple<MatType, MatType, cv::Size, cv::GCompileArgs>> {};
+    class ConvertToPerfTest : public TestPerfParams<tuple<MatType, int, cv::Size, cv::GCompileArgs>> {};
+    class ResizePerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, cv::Size, cv::GCompileArgs>> {};
+    class ResizeFxFyPerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, double, double, cv::GCompileArgs>> {};
+}
+#endif // OPENCV_GAPI_CORE_PERF_TESTS_HPP
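
These fixtures only declare the parameter tuples; backend-specific instantiations live in separate files. A hypothetical sketch of one such instantiation (CORE_CPU, szVGA and sz1080p follow the usual OpenCV perf-test conventions and are assumptions here, not names defined in this header):

    // Hypothetical CPU-backend instantiation of AddPerfTest.
    INSTANTIATE_TEST_CASE_P(AddPerfTestCPU, AddPerfTest,
        testing::Combine(testing::Values(szVGA, sz1080p),      // cv::Size
                         testing::Values(CV_8UC1, CV_32FC1),   // input MatType
                         testing::Values(-1),                  // dtype: keep input depth
                         testing::Values(cv::compile_args(CORE_CPU))));
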
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_core_perf_tests_inl.hpp
new file mode 100644 (file)
index 0000000..f49e061
--- /dev/null
@@ -0,0 +1,1841 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_CORE_PERF_TESTS_INL_HPP
+#define OPENCV_GAPI_CORE_PERF_TESTS_INL_HPP
+
+#include <iostream>
+
+#include "gapi_core_perf_tests.hpp"
+
+namespace opencv_test
+{
+using namespace perf;
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(AddPerfTest, TestPerformance)
+{
+    Size sz = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int dtype = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    initMatsRandU(type, sz, dtype, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::add(in_mat1, in_mat2, out_mat_ocv, cv::noArray(), dtype);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, out;
+    out = cv::gapi::add(in1, in2, dtype);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(AddCPerfTest, TestPerformance)
+{
+    Size sz = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int dtype = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    initMatsRandU(type, sz, dtype, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::add(in_mat1, sc, out_mat_ocv, cv::noArray(), dtype);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out;
+    cv::GScalar sc1;
+    out = cv::gapi::addC(in1, sc1, dtype);
+    cv::GComputation c(GIn(in1, sc1), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(SubPerfTest, TestPerformance)
+{
+    Size sz = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int dtype = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    initMatsRandU(type, sz, dtype, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::subtract(in_mat1, in_mat2, out_mat_ocv, cv::noArray(), dtype);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, out;
+    out = cv::gapi::sub(in1, in2, dtype);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(SubCPerfTest, TestPerformance)
+{
+    Size sz = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int dtype = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    initMatsRandU(type, sz, dtype, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::subtract(in_mat1, sc, out_mat_ocv, cv::noArray(), dtype);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out;
+    cv::GScalar sc1;
+    out = cv::gapi::subC(in1, sc1, dtype);
+    cv::GComputation c(GIn(in1, sc1), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(SubRCPerfTest, TestPerformance)
+{
+    Size sz = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int dtype = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    initMatsRandU(type, sz, dtype, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::subtract(sc, in_mat1, out_mat_ocv, cv::noArray(), dtype);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out;
+    cv::GScalar sc1;
+    out = cv::gapi::subRC(sc1, in1, dtype);
+    cv::GComputation c(GIn(in1, sc1), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(MulPerfTest, TestPerformance)
+{
+    Size sz = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int dtype = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    initMatsRandU(type, sz, dtype, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::multiply(in_mat1, in_mat2, out_mat_ocv, 1.0, dtype);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, out;
+    out = cv::gapi::mul(in1, in2, 1.0, dtype);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(MulDoublePerfTest, TestPerformance)
+{
+    Size sz = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int dtype = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    auto& rng = cv::theRNG();
+    double d = rng.uniform(0.0, 10.0);
+    initMatrixRandU(type, sz, dtype, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::multiply(in_mat1, d, out_mat_ocv, 1, dtype);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out;
+    out = cv::gapi::mulC(in1, d, dtype);
+    cv::GComputation c(in1, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(MulCPerfTest, TestPerformance)
+{
+    Size sz = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int dtype = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    initMatsRandU(type, sz, dtype, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::multiply(in_mat1, sc, out_mat_ocv, 1, dtype);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out;
+    cv::GScalar sc1;
+    out = cv::gapi::mulC(in1, sc1, dtype);
+    cv::GComputation c(GIn(in1, sc1), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(DivPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    Size sz = get<1>(GetParam());
+    MatType type = get<2>(GetParam());
+    int dtype = get<3>(GetParam());
+    cv::GCompileArgs compile_args = get<4>(GetParam());
+
+    // FIXIT Unstable input data for divide
+    initMatsRandU(type, sz, dtype, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::divide(in_mat1, in_mat2, out_mat_ocv, dtype);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, out;
+    out = cv::gapi::div(in1, in2, dtype);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(DivCPerfTest, TestPerformance)
+{
+    Size sz = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int dtype = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    // FIXIT Unstable input data for divide
+    initMatsRandU(type, sz, dtype, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::divide(in_mat1, sc, out_mat_ocv, 1.0, dtype);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out;
+    cv::GScalar sc1;
+    out = cv::gapi::divC(in1, sc1, 1.0, dtype);
+    cv::GComputation c(GIn(in1, sc1), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(DivRCPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    Size sz = get<1>(GetParam());
+    MatType type = get<2>(GetParam());
+    int dtype = get<3>(GetParam());
+    cv::GCompileArgs compile_args = get<4>(GetParam());
+
+    // FIXIT Unstable input data for divide
+    initMatsRandU(type, sz, dtype, false);
+
+    // FIXIT Unstable input data for divide, don't process zeros
+    sc += Scalar::all(1);
+    in_mat1 += 1;
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::divide(sc, in_mat1, out_mat_ocv, 1.0, dtype);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out;
+    cv::GScalar sc1;
+    out = cv::gapi::divRC(sc1, in1, 1.0, dtype);
+    cv::GComputation c(GIn(in1, sc1), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(MaskPerfTest, TestPerformance)
+{
+    Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatrixRandU(type, sz_in, type, false);
+    in_mat2 = cv::Mat(sz_in, CV_8UC1);
+    cv::randu(in_mat2, cv::Scalar::all(0), cv::Scalar::all(255));
+    in_mat2 = in_mat2 > 128;
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    out_mat_ocv = cv::Mat::zeros(in_mat1.size(), in_mat1.type());
+    in_mat1.copyTo(out_mat_ocv, in_mat2);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in, m;
+    auto out = cv::gapi::mask(in, m);
+    cv::GComputation c(cv::GIn(in, m), cv::GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(cv::gin(in_mat1, in_mat2), cv::gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(cv::gin(in_mat1, in_mat2), cv::gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(MeanPerfTest, TestPerformance)
+{
+    Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatrixRandU(type, sz_in, false);
+    cv::Scalar out_norm;
+    cv::Scalar out_norm_ocv;
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    out_norm_ocv = cv::mean(in_mat1);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::mean(in);
+    cv::GComputation c(cv::GIn(in), cv::GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(cv::gin(in_mat1), cv::gout(out_norm), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(cv::gin(in_mat1), cv::gout(out_norm), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(out_norm[0], out_norm_ocv[0]);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(Polar2CartPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    Size sz_in = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandU(CV_32FC1, sz_in, CV_32FC1, false);
+    cv::Mat out_mat2;
+    cv::Mat out_mat_ocv2;
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::polarToCart(in_mat1, in_mat2, out_mat_ocv, out_mat_ocv2);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, out1, out2;
+    std::tie(out1, out2) = cv::gapi::polarToCart(in1, in2);
+    cv::GComputation c(GIn(in1, in2), GOut(out1, out2));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi, out_mat2), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi, out_mat2), std::move(compile_args));
+    }
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    EXPECT_TRUE(cmpF(out_mat_ocv2, out_mat2));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(Cart2PolarPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    Size sz_in = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandU(CV_32FC1, sz_in, CV_32FC1, false);
+    cv::Mat out_mat2(sz_in, CV_32FC1);
+    cv::Mat out_mat_ocv2(sz_in, CV_32FC1);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::cartToPolar(in_mat1, in_mat2, out_mat_ocv, out_mat_ocv2);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, out1, out2;
+    std::tie(out1, out2) = cv::gapi::cartToPolar(in1, in2);
+    cv::GComputation c(GIn(in1, in2), GOut(out1, out2));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi, out_mat2), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi, out_mat2), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    EXPECT_TRUE(cmpF(out_mat_ocv2, out_mat2));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(CmpPerfTest, TestPerformance)
+{
+    CmpTypes opType = get<0>(GetParam());
+    cv::Size sz = get<1>(GetParam());
+    MatType type = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    initMatsRandU(type, sz, CV_8U, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::compare(in_mat1, in_mat2, out_mat_ocv, opType);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, out;
+    switch (opType)
+    {
+    case CMP_EQ: out = cv::gapi::cmpEQ(in1, in2); break;
+    case CMP_GT: out = cv::gapi::cmpGT(in1, in2); break;
+    case CMP_GE: out = cv::gapi::cmpGE(in1, in2); break;
+    case CMP_LT: out = cv::gapi::cmpLT(in1, in2); break;
+    case CMP_LE: out = cv::gapi::cmpLE(in1, in2); break;
+    case CMP_NE: out = cv::gapi::cmpNE(in1, in2); break;
+    default: FAIL() << "no such compare operation type for two matrices!";
+    }
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(CmpWithScalarPerfTest, TestPerformance)
+{
+    CmpTypes opType = get<0>(GetParam());
+    cv::Size sz = get<1>(GetParam());
+    MatType type = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    initMatsRandU(type, sz, CV_8U, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::compare(in_mat1, sc, out_mat_ocv, opType);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out;
+    cv::GScalar in2;
+    switch (opType)
+    {
+    case CMP_EQ: out = cv::gapi::cmpEQ(in1, in2); break;
+    case CMP_GT: out = cv::gapi::cmpGT(in1, in2); break;
+    case CMP_GE: out = cv::gapi::cmpGE(in1, in2); break;
+    case CMP_LT: out = cv::gapi::cmpLT(in1, in2); break;
+    case CMP_LE: out = cv::gapi::cmpLE(in1, in2); break;
+    case CMP_NE: out = cv::gapi::cmpNE(in1, in2); break;
+    default: FAIL() << "no such compare operation type for matrix and scalar!";
+    }
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(BitwisePerfTest, TestPerformance)
+{
+    bitwiseOp opType = get<0>(GetParam());
+    cv::Size sz = get<1>(GetParam());
+    MatType type = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    initMatsRandU(type, sz, type, false);
+
+    // G-API code & corresponding OpenCV code ////////////////////////////////
+    cv::GMat in1, in2, out;
+    switch (opType)
+    {
+    case AND:
+    {
+        out = cv::gapi::bitwise_and(in1, in2);
+        cv::bitwise_and(in_mat1, in_mat2, out_mat_ocv);
+        break;
+    }
+    case OR:
+    {
+        out = cv::gapi::bitwise_or(in1, in2);
+        cv::bitwise_or(in_mat1, in_mat2, out_mat_ocv);
+        break;
+    }
+    case XOR:
+    {
+        out = cv::gapi::bitwise_xor(in1, in2);
+        cv::bitwise_xor(in_mat1, in_mat2, out_mat_ocv);
+        break;
+    }
+    default:
+    {
+        FAIL() << "no such bitwise operation type!";
+    }
+    }
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+    EXPECT_EQ(out_mat_gapi.size(), sz);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(BitwiseNotPerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatrixRandU(type, sz_in, type, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::bitwise_not(in_mat1, out_mat_ocv);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in, out;
+    out = cv::gapi::bitwise_not(in);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(SelectPerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandU(type, sz_in, type, false);
+    cv::Mat in_mask(sz_in, CV_8UC1);
+    cv::randu(in_mask, cv::Scalar::all(0), cv::Scalar::all(255));
+
+    // OpenCV code ///////////////////////////////////////////////////////////
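+    // Reference for gapi::select below: copy the "else" matrix (in_mat2)
+    // first, then overwrite the positions where the mask is non-zero
+    // with the "then" matrix (in_mat1).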
+    in_mat2.copyTo(out_mat_ocv);
+    in_mat1.copyTo(out_mat_ocv, in_mask);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, in3, out;
+    out = cv::gapi::select(in1, in2, in3);
+    cv::GComputation c(GIn(in1, in2, in3), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2, in_mask), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2, in_mask), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(MinPerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandU(type, sz_in, type, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::min(in_mat1, in_mat2, out_mat_ocv);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, out;
+    out = cv::gapi::min(in1, in2);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(MaxPerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandU(type, sz_in, type, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::max(in_mat1, in_mat2, out_mat_ocv);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, out;
+    out = cv::gapi::max(in1, in2);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(AbsDiffPerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandU(type, sz_in, type, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::absdiff(in_mat1, in_mat2, out_mat_ocv);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, out;
+    out = cv::gapi::absDiff(in1, in2);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(AbsDiffCPerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandU(type, sz_in, type, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::absdiff(in_mat1, sc, out_mat_ocv);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out;
+    cv::GScalar sc1;
+    out = cv::gapi::absDiffC(in1, sc1);
+    cv::GComputation c(cv::GIn(in1, sc1), cv::GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(SumPerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    double tolerance = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    initMatrixRandU(type, sz_in, false);
+    cv::Scalar out_sum;
+    cv::Scalar out_sum_ocv;
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    out_sum_ocv = cv::sum(in_mat1);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::sum(in);
+    cv::GComputation c(cv::GIn(in), cv::GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(cv::gin(in_mat1), cv::gout(out_sum), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(cv::gin(in_mat1), cv::gout(out_sum), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    {
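+        // Relative-error check: |GAPI - OCV| / max(1, |OCV|) <= tolerance.
+        // The max(1, .) guard keeps the check meaningful when the sum is
+        // close to zero.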
+        EXPECT_LE(std::abs(out_sum[0] - out_sum_ocv[0]) / std::max(1.0, std::abs(out_sum_ocv[0])), tolerance)
+            << "OCV=" << out_sum_ocv[0] << "   GAPI=" << out_sum[0];
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(AddWeightedPerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int dtype = get<2>(GetParam());
+    double tolerance = get<3>(GetParam());
+    cv::GCompileArgs compile_args = get<4>(GetParam());
+
+    auto& rng = cv::theRNG();
+    double alpha = rng.uniform(0.0, 1.0);
+    double beta = rng.uniform(0.0, 1.0);
+    double gamma = rng.uniform(0.0, 1.0);
+    initMatsRandU(type, sz_in, dtype, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::addWeighted(in_mat1, alpha, in_mat2, beta, gamma, out_mat_ocv, dtype);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2;
+    auto out = cv::gapi::addWeighted(in1, alpha, in2, beta, gamma, dtype);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check
+    if (0)
+    {
+        // Note that we cannot expect bitwise results for add-weighted:
+        //
+        //    tmp = src1*alpha + src2*beta + gamma;
+        //    dst = saturate<DST>( round(tmp) );
+        //
+        // Because tmp is floating-point, dst depends on compiler optimizations
+        //
+        // However, we must expect good accuracy of tmp, and correct rounding
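+        //
+        // Illustration (hypothetical values): if alpha*src1 + beta*src2 + gamma
+        // lands near 2.5, one implementation may compute 2.4999999f and
+        // another 2.5000001f, so the rounded integral results differ by 1
+        // even though both are accurate to ~1e-7.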
+
+        cv::Mat failures;
+
+        if (out_mat_ocv.type() == CV_32FC1)
+        {
+            // result: float - may vary in 7th decimal digit
+            failures = abs(out_mat_gapi - out_mat_ocv) > abs(out_mat_ocv) * 1e-6;
+        }
+        else
+        {
+            // result: integral - rounding may vary if fractional part of tmp
+            //                    is nearly 0.5
+
+            cv::Mat inexact, incorrect, diff, tmp;
+
+            inexact = out_mat_gapi != out_mat_ocv;
+
+            // even if rounded differently, check if still rounded correctly
+            cv::addWeighted(in_mat1, alpha, in_mat2, beta, gamma, tmp, CV_32F);
+            cv::subtract(out_mat_gapi, tmp, diff, cv::noArray(), CV_32F);
+            incorrect = abs(diff) >= tolerance;  // e.g. 0.5000005f, relative to 6 digits
+
+            failures = inexact & incorrect;
+        }
+
+        EXPECT_EQ(0, cv::countNonZero(failures));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(NormPerfTest, TestPerformance)
+{
+    NormTypes opType = get<0>(GetParam());
+    cv::Size sz = get<1>(GetParam());
+    MatType type = get<2>(GetParam());
+    double tolerance = get<3>(GetParam());
+    cv::GCompileArgs compile_args = get<4>(GetParam());
+
+    initMatrixRandU(type, sz, type, false);
+    cv::Scalar out_norm;
+    cv::Scalar out_norm_ocv;
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    out_norm_ocv = cv::norm(in_mat1, opType);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1;
+    cv::GScalar out;
+    switch (opType)
+    {
+    case NORM_L1: out = cv::gapi::normL1(in1); break;
+    case NORM_L2: out = cv::gapi::normL2(in1); break;
+    case NORM_INF: out = cv::gapi::normInf(in1); break;
+    default: FAIL() << "no such norm operation type!";
+    }
+    cv::GComputation c(GIn(in1), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1), gout(out_norm), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1), gout(out_norm), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    {
+        EXPECT_LE(std::abs(out_norm[0] - out_norm_ocv[0]) / std::max(1.0, std::abs(out_norm_ocv[0])), tolerance)
+            << "OCV=" << out_norm_ocv[0] << "   GAPI=" << out_norm[0];
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(IntegralPerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
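+    // cv::integral needs a wider accumulator than the input: 32-bit int
+    // for 8-bit sources, 64-bit float otherwise. The second (squared-sum)
+    // output is requested as CV_64F below.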
+    MatType type_out = (type == CV_8U) ? CV_32SC1 : CV_64FC1;
+
+    in_mat1 = cv::Mat(sz_in, type);
+    cv::randu(in_mat1, cv::Scalar::all(0), cv::Scalar::all(255));
+
+    cv::Size sz_out = cv::Size(sz_in.width + 1, sz_in.height + 1);
+    cv::Mat out_mat1(sz_out, type_out);
+    cv::Mat out_mat_ocv1(sz_out, type_out);
+
+    cv::Mat out_mat2(sz_out, CV_64FC1);
+    cv::Mat out_mat_ocv2(sz_out, CV_64FC1);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::integral(in_mat1, out_mat_ocv1, out_mat_ocv2);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out1, out2;
+    std::tie(out1, out2) = cv::gapi::integral(in1, type_out, CV_64FC1);
+    cv::GComputation c(cv::GIn(in1), cv::GOut(out1, out2));
+
+    // Warm-up graph engine:
+    c.apply(cv::gin(in_mat1), cv::gout(out_mat1, out_mat2), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(cv::gin(in_mat1), cv::gout(out_mat1, out_mat2), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_ocv1 != out_mat1));
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_ocv2 != out_mat2));
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(ThresholdPerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int tt = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    cv::Scalar thr = initScalarRandU(50);
+    cv::Scalar maxval = initScalarRandU(50) + cv::Scalar(50, 50, 50, 50);
+    initMatrixRandU(type, sz_in, type, false);
+    cv::Scalar out_scalar;
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::threshold(in_mat1, out_mat_ocv, thr.val[0], maxval.val[0], tt);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out;
+    cv::GScalar th1, mv1;
+    out = cv::gapi::threshold(in1, th1, mv1, tt);
+    cv::GComputation c(GIn(in1, th1, mv1), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, thr, maxval), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, thr, maxval), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(ThresholdOTPerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int tt = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    cv::Scalar maxval = initScalarRandU(50) + cv::Scalar(50, 50, 50, 50);
+    initMatrixRandU(type, sz_in, type, false);
+    cv::Scalar out_gapi_scalar;
+    double ocv_res;
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    ocv_res = cv::threshold(in_mat1, out_mat_ocv, maxval.val[0], maxval.val[0], tt);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out;
+    cv::GScalar mv1, scout;
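+    // This gapi::threshold overload also returns the computed threshold
+    // value (meaningful, e.g., for cv::THRESH_OTSU or cv::THRESH_TRIANGLE).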
+    std::tie(out, scout) = cv::gapi::threshold(in1, mv1, tt);
+    cv::GComputation c(cv::GIn(in1, mv1), cv::GOut(out, scout));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, maxval), gout(out_mat_gapi, out_gapi_scalar), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, maxval), gout(out_mat_gapi, out_gapi_scalar), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    EXPECT_EQ(ocv_res, out_gapi_scalar.val[0]);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(InRangePerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    cv::Scalar thrLow = initScalarRandU(100);
+    cv::Scalar thrUp = initScalarRandU(100) + cv::Scalar(100, 100, 100, 100);
+    initMatrixRandU(type, sz_in, type, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::inRange(in_mat1, thrLow, thrUp, out_mat_ocv);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1;
+    cv::GScalar th1, mv1;
+    auto out = cv::gapi::inRange(in1, th1, mv1);
+    cv::GComputation c(GIn(in1, th1, mv1), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, thrLow, thrUp), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, thrLow, thrUp), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(Split3PerfTest, TestPerformance)
+{
+    Size sz_in = get<0>(GetParam());
+    cv::GCompileArgs compile_args = get<1>(GetParam());
+
+    initMatrixRandU(CV_8UC3, sz_in, CV_8UC1);
+    cv::Mat out_mat2 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat3 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv2 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv3 = cv::Mat(sz_in, CV_8UC1);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    std::vector<cv::Mat> out_mats_ocv = { out_mat_ocv, out_mat_ocv2, out_mat_ocv3 };
+    cv::split(in_mat1, out_mats_ocv);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out1, out2, out3;
+    std::tie(out1, out2, out3) = cv::gapi::split3(in1);
+    cv::GComputation c(cv::GIn(in1), cv::GOut(out1, out2, out3));
+
+    // Warm-up graph engine:
+    c.apply(cv::gin(in_mat1), cv::gout(out_mat_gapi, out_mat2, out_mat3), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(cv::gin(in_mat1), cv::gout(out_mat_gapi, out_mat2, out_mat3), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+    EXPECT_EQ(0, cv::norm(out_mat_ocv2, out_mat2, NORM_INF));
+    EXPECT_EQ(0, cv::norm(out_mat_ocv3, out_mat3, NORM_INF));
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(Split4PerfTest, TestPerformance)
+{
+    Size sz_in = get<0>(GetParam());
+    cv::GCompileArgs compile_args = get<1>(GetParam());
+
+    initMatrixRandU(CV_8UC4, sz_in, CV_8UC1);
+    cv::Mat out_mat2 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat3 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat4 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv2 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv3 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv4 = cv::Mat(sz_in, CV_8UC1);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    std::vector<cv::Mat> out_mats_ocv = { out_mat_ocv, out_mat_ocv2, out_mat_ocv3, out_mat_ocv4 };
+    cv::split(in_mat1, out_mats_ocv);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out1, out2, out3, out4;
+    std::tie(out1, out2, out3, out4) = cv::gapi::split4(in1);
+    cv::GComputation c(cv::GIn(in1), cv::GOut(out1, out2, out3, out4));
+
+    // Warm-up graph engine:
+    c.apply(cv::gin(in_mat1), cv::gout(out_mat_gapi, out_mat2, out_mat3, out_mat4), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(cv::gin(in_mat1), cv::gout(out_mat_gapi, out_mat2, out_mat3, out_mat4), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+    EXPECT_EQ(0, cv::norm(out_mat_ocv2, out_mat2, NORM_INF));
+    EXPECT_EQ(0, cv::norm(out_mat_ocv3, out_mat3, NORM_INF));
+    EXPECT_EQ(0, cv::norm(out_mat_ocv4, out_mat4, NORM_INF));
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(Merge3PerfTest, TestPerformance)
+{
+    Size sz_in = get<0>(GetParam());
+    cv::GCompileArgs compile_args = get<1>(GetParam());
+
+    initMatsRandU(CV_8UC1, sz_in, CV_8UC3);
+    cv::Mat in_mat3(sz_in, CV_8UC1);
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+    cv::randn(in_mat3, mean, stddev);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    std::vector<cv::Mat> in_mats_ocv = { in_mat1, in_mat2, in_mat3 };
+    cv::merge(in_mats_ocv, out_mat_ocv);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, in3;
+    auto out = cv::gapi::merge3(in1, in2, in3);
+    cv::GComputation c(cv::GIn(in1, in2, in3), cv::GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(cv::gin(in_mat1, in_mat2, in_mat3), cv::gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(cv::gin(in_mat1, in_mat2, in_mat3), cv::gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(Merge4PerfTest, TestPerformance)
+{
+    Size sz_in = get<0>(GetParam());
+    cv::GCompileArgs compile_args = get<1>(GetParam());
+
+    initMatsRandU(CV_8UC1, sz_in, CV_8UC3);
+    cv::Mat in_mat3(sz_in, CV_8UC1);
+    cv::Mat in_mat4(sz_in, CV_8UC1);
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+    cv::randn(in_mat3, mean, stddev);
+    cv::randn(in_mat4, mean, stddev);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    std::vector<cv::Mat> in_mats_ocv = { in_mat1, in_mat2, in_mat3, in_mat4 };
+    cv::merge(in_mats_ocv, out_mat_ocv);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, in3, in4;
+    auto out = cv::gapi::merge4(in1, in2, in3, in4);
+    cv::GComputation c(cv::GIn(in1, in2, in3, in4), cv::GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(cv::gin(in_mat1, in_mat2, in_mat3, in_mat4), cv::gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(cv::gin(in_mat1, in_mat2, in_mat3, in_mat4), cv::gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(RemapPerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatrixRandU(type, sz_in, type, false);
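+    // CV_16SC2 maps store interleaved integer (x, y) coordinates; with this
+    // fixed-point format cv::remap accepts an empty second map.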
+    cv::Mat in_map1(sz_in, CV_16SC2);
+    cv::Mat in_map2 = cv::Mat();
+    cv::randu(in_map1, cv::Scalar::all(0), cv::Scalar::all(255));
+    cv::Scalar bv = cv::Scalar();
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::remap(in_mat1, out_mat_ocv, in_map1, in_map2, cv::INTER_NEAREST, cv::BORDER_REPLICATE, bv);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1;
+    auto out = cv::gapi::remap(in1, in_map1, in_map2, cv::INTER_NEAREST, cv::BORDER_REPLICATE, bv);
+    cv::GComputation c(in1, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(FlipPerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int flipCode = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    initMatrixRandU(type, sz_in, type, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::flip(in_mat1, out_mat_ocv, flipCode);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::flip(in, flipCode);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(CropPerfTest, TestPerformance)
+{
+    cv::Size sz_in = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::Rect rect_to = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    initMatrixRandU(type, sz_in, type, false);
+    cv::Size sz_out = cv::Size(rect_to.width, rect_to.height);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::Mat(in_mat1, rect_to).copyTo(out_mat_ocv);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::crop(in, rect_to);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+    EXPECT_EQ(out_mat_gapi.size(), sz_out);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(ConcatHorPerfTest, TestPerformance)
+{
+    cv::Size sz_out = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    int wpart = sz_out.width / 4;
+
+    cv::Size sz_in1 = cv::Size(wpart, sz_out.height);
+    cv::Size sz_in2 = cv::Size(sz_out.width - wpart, sz_out.height);
+
+    in_mat1 = cv::Mat(sz_in1, type);
+    in_mat2 = cv::Mat(sz_in2, type);
+
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::randn(in_mat1, mean, stddev);
+    cv::randn(in_mat2, mean, stddev);
+
+    out_mat_gapi = cv::Mat(sz_out, type);
+    out_mat_ocv = cv::Mat(sz_out, type);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::hconcat(in_mat1, in_mat2, out_mat_ocv);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, in2;
+    auto out = cv::gapi::concatHor(in1, in2);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(ConcatHorVecPerfTest, TestPerformance)
+{
+    cv::Size sz_out = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    int wpart1 = sz_out.width / 3;
+    int wpart2 = sz_out.width / 2;
+
+    cv::Size sz_in1 = cv::Size(wpart1, sz_out.height);
+    cv::Size sz_in2 = cv::Size(wpart2, sz_out.height);
+    cv::Size sz_in3 = cv::Size(sz_out.width - wpart1 - wpart2, sz_out.height);
+
+    in_mat1 = cv::Mat(sz_in1, type);
+    in_mat2 = cv::Mat(sz_in2, type);
+    cv::Mat in_mat3(sz_in3, type);
+
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::randn(in_mat1, mean, stddev);
+    cv::randn(in_mat2, mean, stddev);
+    cv::randn(in_mat3, mean, stddev);
+
+    out_mat_gapi = cv::Mat(sz_out, type);
+    out_mat_ocv = cv::Mat(sz_out, type);
+
+    std::vector <cv::Mat> cvmats = { in_mat1, in_mat2, in_mat3 };
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::hconcat(cvmats, out_mat_ocv);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    std::vector <cv::GMat> mats(3);
+    auto out = cv::gapi::concatHor(mats);
+    cv::GComputation c({ mats[0], mats[1], mats[2] }, { out });
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2, in_mat3), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2, in_mat3), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(ConcatVertPerfTest, TestPerformance)
+{
+    cv::Size sz_out = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    int hpart = sz_out.height * 2 / 3;
+
+    cv::Size sz_in1 = cv::Size(sz_out.width, hpart);
+    cv::Size sz_in2 = cv::Size(sz_out.width, sz_out.height - hpart);
+
+    in_mat1 = cv::Mat(sz_in1, type);
+    in_mat2 = cv::Mat(sz_in2, type);
+
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::randn(in_mat1, mean, stddev);
+    cv::randn(in_mat2, mean, stddev);
+
+    out_mat_gapi = cv::Mat(sz_out, type);
+    out_mat_ocv = cv::Mat(sz_out, type);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::vconcat(in_mat1, in_mat2, out_mat_ocv);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2;
+    auto out = cv::gapi::concatVert(in1, in2);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(ConcatVertVecPerfTest, TestPerformance)
+{
+    cv::Size sz_out = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    int hpart1 = sz_out.height * 2 / 5;
+    int hpart2 = sz_out.height / 5;
+
+    cv::Size sz_in1 = cv::Size(sz_out.width, hpart1);
+    cv::Size sz_in2 = cv::Size(sz_out.width, hpart2);
+    cv::Size sz_in3 = cv::Size(sz_out.width, sz_out.height - hpart1 - hpart2);
+
+    in_mat1 = cv::Mat(sz_in1, type);
+    in_mat2 = cv::Mat(sz_in2, type);
+    cv::Mat in_mat3(sz_in3, type);
+
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::randn(in_mat1, mean, stddev);
+    cv::randn(in_mat2, mean, stddev);
+    cv::randn(in_mat3, mean, stddev);
+
+    out_mat_gapi = cv::Mat(sz_out, type);
+    out_mat_ocv = cv::Mat(sz_out, type);
+
+    std::vector <cv::Mat> cvmats = { in_mat1, in_mat2, in_mat3 };
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::vconcat(cvmats, out_mat_ocv);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    std::vector <cv::GMat> mats(3);
+    auto out = cv::gapi::concatVert(mats);
+    cv::GComputation c({ mats[0], mats[1], mats[2] }, { out });
+
+    // Warm-up graph engine:
+    c.apply(gin(in_mat1, in_mat2, in_mat3), gout(out_mat_gapi), std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(gin(in_mat1, in_mat2, in_mat3), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(LUTPerfTest, TestPerformance)
+{
+    MatType type_mat = get<0>(GetParam());
+    MatType type_lut = get<1>(GetParam());
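+    // cv::LUT output: depth comes from the LUT, channel count from the input.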
+    MatType type_out = CV_MAKETYPE(CV_MAT_DEPTH(type_lut), CV_MAT_CN(type_mat));
+    cv::Size sz_in = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+
+    initMatrixRandU(type_mat, sz_in, type_out);
+    cv::Size sz_lut = cv::Size(1, 256);
+    cv::Mat in_lut(sz_lut, type_lut);
+    cv::randu(in_lut, cv::Scalar::all(0), cv::Scalar::all(255));
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::LUT(in_mat1, in_lut, out_mat_ocv);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::LUT(in, in_lut);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_INF));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(ConvertToPerfTest, TestPerformance)
+{
+    MatType type_mat = get<0>(GetParam());
+    int depth_to = get<1>(GetParam());
+    cv::Size sz_in = get<2>(GetParam());
+    cv::GCompileArgs compile_args = get<3>(GetParam());
+    MatType type_out = CV_MAKETYPE(depth_to, CV_MAT_CN(type_mat));
+
+    initMatrixRandU(type_mat, sz_in, type_out);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    in_mat1.convertTo(out_mat_ocv, depth_to);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::convertTo(in, depth_to);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    // FIXIT unreliable check: EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(ResizePerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int interp = get<2>(GetParam());
+    cv::Size sz_in = get<3>(GetParam());
+    cv::Size sz_out = get<4>(GetParam());
+    cv::GCompileArgs compile_args = get<5>(GetParam());
+
+    in_mat1 = cv::Mat(sz_in, type);
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+    cv::randn(in_mat1, mean, stddev);
+    out_mat_gapi = cv::Mat(sz_out, type);
+    out_mat_ocv = cv::Mat(sz_out, type);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::resize(in_mat1, out_mat_ocv, sz_out, 0.0, 0.0, interp);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::resize(in, sz_out, 0.0, 0.0, interp);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(ResizeFxFyPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    MatType type = get<1>(GetParam());
+    int interp = get<2>(GetParam());
+    cv::Size sz_in = get<3>(GetParam());
+    double fx = get<4>(GetParam());
+    double fy = get<5>(GetParam());
+    cv::GCompileArgs compile_args = get<6>(GetParam());
+
+    in_mat1 = cv::Mat(sz_in, type);
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+    cv::randn(in_mat1, mean, stddev);
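+    // Derive the output size from the scale factors, mirroring how
+    // cv::resize computes dsize when only fx/fy are given.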
+    cv::Size sz_out = cv::Size(saturate_cast<int>(sz_in.width * fx), saturate_cast<int>(sz_in.height * fy));
+    out_mat_gapi = cv::Mat(sz_out, type);
+    out_mat_ocv = cv::Mat(sz_out, type);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::resize(in_mat1, out_mat_ocv, sz_out, fx, fy, interp);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::resize(in, sz_out, fx, fy, interp);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    }
+
+    // Comparison ////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+}
+#endif // OPENCV_GAPI_CORE_PERF_TESTS_INL_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests.cpp
new file mode 100644 (file)
index 0000000..5a2ffb8
--- /dev/null
@@ -0,0 +1,9 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "perf_precomp.hpp"
+#include "gapi_imgproc_perf_tests_inl.hpp"
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests.hpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests.hpp
new file mode 100644 (file)
index 0000000..750c069
--- /dev/null
@@ -0,0 +1,46 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_IMGPROC_PERF_TESTS_HPP
+#define OPENCV_GAPI_IMGPROC_PERF_TESTS_HPP
+
+#include "../../test/common/gapi_tests_common.hpp"
+#include "opencv2/gapi/imgproc.hpp"
+
+namespace opencv_test
+{
+
+  using namespace perf;
+
+  //------------------------------------------------------------------------------
+
+class SepFilterPerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, int, cv::GCompileArgs>> {};
+class Filter2DPerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, int, int, cv::GCompileArgs>> {};
+class BoxFilterPerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, int, int, cv::GCompileArgs>> {};
+class BlurPerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, int, cv::GCompileArgs>> {};
+class GaussianBlurPerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, cv::GCompileArgs>> {};
+class MedianBlurPerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, cv::GCompileArgs>> {};
+class ErodePerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, int, cv::GCompileArgs>> {};
+class Erode3x3PerfTest : public TestPerfParams<tuple<compare_f, MatType, cv::Size, int, cv::GCompileArgs>> {};
+class DilatePerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, int, cv::GCompileArgs>> {};
+class Dilate3x3PerfTest : public TestPerfParams<tuple<compare_f, MatType, cv::Size, int, cv::GCompileArgs>> {};
+class SobelPerfTest : public TestPerfParams<tuple<compare_f, MatType, int, cv::Size, int, int, int, cv::GCompileArgs>> {};
+class CannyPerfTest : public TestPerfParams<tuple<compare_f, MatType, cv::Size, double, double, int, bool, cv::GCompileArgs>> {};
+class EqHistPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
+class RGB2GrayPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
+class BGR2GrayPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
+class RGB2YUVPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
+class YUV2RGBPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
+class RGB2LabPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
+class BGR2LUVPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
+class LUV2BGRPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
+class BGR2YUVPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
+class YUV2BGRPerfTest : public TestPerfParams<tuple<compare_f, cv::Size, cv::GCompileArgs>> {};
+}
+#endif //OPENCV_GAPI_IMGPROC_PERF_TESTS_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/common/gapi_imgproc_perf_tests_inl.hpp
new file mode 100644 (file)
index 0000000..5a13cfe
--- /dev/null
@@ -0,0 +1,909 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_IMGPROC_PERF_TESTS_INL_HPP
+#define OPENCV_GAPI_IMGPROC_PERF_TESTS_INL_HPP
+
+
+#include <iostream>
+
+#include "gapi_imgproc_perf_tests.hpp"
+
+namespace opencv_test
+{
+
+  using namespace perf;
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(SepFilterPerfTest, TestPerformance)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int kernSize = 0, dtype = 0;
+    cv::Size sz;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, kernSize, sz, dtype, compile_args) = GetParam();
+
+    cv::Mat kernelX(kernSize, 1, CV_32F);
+    cv::Mat kernelY(kernSize, 1, CV_32F);
+    randu(kernelX, -1, 1);
+    randu(kernelY, -1, 1);
+    initMatsRandN(type, sz, dtype, false);
+
+    cv::Point anchor = cv::Point(-1, -1);
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::sepFilter2D(in_mat1, out_mat_ocv, dtype, kernelX, kernelY);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::sepFilter(in, dtype, kernelX, kernelY, anchor, cv::Scalar());
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(Filter2DPerfTest, TestPerformance)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int kernSize = 0, borderType = 0, dtype = 0;
+    cv::Size sz;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, kernSize, sz, borderType, dtype, compile_args) = GetParam();
+
+    initMatsRandN(type, sz, dtype, false);
+
+    cv::Point anchor = {-1, -1};
+    double delta = 0;
+
+    cv::Mat kernel = cv::Mat(kernSize, kernSize, CV_32FC1);
+    cv::Scalar kernMean = cv::Scalar::all(1.0);
+    cv::Scalar kernStddev = cv::Scalar::all(2.0/3);
+    randn(kernel, kernMean, kernStddev);
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::filter2D(in_mat1, out_mat_ocv, dtype, kernel, anchor, delta, borderType);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::filter2D(in, dtype, kernel, anchor, delta, borderType);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(BoxFilterPerfTest, TestPerformance)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int filterSize = 0, borderType = 0, dtype = 0;
+    cv::Size sz;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, filterSize, sz, borderType, dtype, compile_args) = GetParam();
+
+    initMatsRandN(type, sz, dtype, false);
+
+    cv::Point anchor = {-1, -1};
+    bool normalize = true;
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::boxFilter(in_mat1, out_mat_ocv, dtype, cv::Size(filterSize, filterSize), anchor, normalize, borderType);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::boxFilter(in, dtype, cv::Size(filterSize, filterSize), anchor, normalize, borderType);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(BlurPerfTest, TestPerformance)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int filterSize = 0, borderType = 0;
+    cv::Size sz;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, filterSize, sz, borderType, compile_args) = GetParam();
+
+    initMatsRandN(type, sz, type, false);
+
+    cv::Point anchor = {-1, -1};
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::blur(in_mat1, out_mat_ocv, cv::Size(filterSize, filterSize), anchor, borderType);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::blur(in, cv::Size(filterSize, filterSize), anchor, borderType);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(GaussianBlurPerfTest, TestPerformance)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int kernSize = 0;
+    cv::Size sz;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, kernSize, sz, compile_args) = GetParam();
+
+    cv::Size kSize = cv::Size(kernSize, kernSize);
+    auto& rng = cv::theRNG();
+    double sigmaX = rng();
+    initMatsRandN(type, sz, type, false);
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::GaussianBlur(in_mat1, out_mat_ocv, kSize, sigmaX);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::gaussianBlur(in, kSize, sigmaX);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(MedianBlurPerfTest, TestPerformance)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int kernSize = 0;
+    cv::Size sz;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, kernSize, sz, compile_args) = GetParam();
+
+    initMatsRandN(type, sz, type, false);
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::medianBlur(in_mat1, out_mat_ocv, kernSize);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::medianBlur(in, kernSize);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(ErodePerfTest, TestPerformance)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int kernSize = 0, kernType = 0;
+    cv::Size sz;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, kernSize, sz, kernType, compile_args) = GetParam();
+
+    initMatsRandN(type, sz, type, false);
+
+    cv::Mat kernel = cv::getStructuringElement(kernType, cv::Size(kernSize, kernSize));
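+    // The same structuring element is used for the OpenCV reference and the G-API graph.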
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::erode(in_mat1, out_mat_ocv, kernel);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::erode(in, kernel);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(Erode3x3PerfTest, TestPerformance)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int numIters = 0;
+    cv::Size sz;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, sz, numIters, compile_args) = GetParam();
+
+    initMatsRandN(type, sz, type, false);
+
+    cv::Mat kernel = cv::getStructuringElement(cv::MorphShapes::MORPH_RECT, cv::Size(3, 3));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::erode(in_mat1, out_mat_ocv, kernel, cv::Point(-1, -1), numIters);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::erode3x3(in, numIters);
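+    // erode3x3() is shorthand for erode() with a 3x3 rectangular kernel, matching the reference above.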
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(DilatePerfTest, TestPerformance)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int kernSize = 0, kernType = 0;
+    cv::Size sz;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, kernSize, sz, kernType, compile_args) = GetParam();
+
+    initMatsRandN(type, sz, type, false);
+
+    cv::Mat kernel = cv::getStructuringElement(kernType, cv::Size(kernSize, kernSize));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::dilate(in_mat1, out_mat_ocv, kernel);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::dilate(in, kernel);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(Dilate3x3PerfTest, TestPerformance)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int numIters = 0;
+    cv::Size sz;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, sz, numIters, compile_args) = GetParam();
+
+    initMatsRandN(type, sz, type, false);
+
+    cv::Mat kernel = cv::getStructuringElement(cv::MorphShapes::MORPH_RECT, cv::Size(3, 3));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::dilate(in_mat1, out_mat_ocv, kernel, cv::Point(-1,-1), numIters);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::dilate3x3(in, numIters);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(SobelPerfTest, TestPerformance)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int kernSize = 0, dtype = 0, dx = 0, dy = 0;
+    cv::Size sz;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, kernSize, sz, dtype, dx, dy, compile_args) = GetParam();
+
+    initMatsRandN(type, sz, dtype, false);
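+    // The output matrix uses dtype: Sobel derivatives generally need a deeper type
+    // (e.g. CV_16S or CV_32F) than the input to avoid overflow.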
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::Sobel(in_mat1, out_mat_ocv, dtype, dx, dy, kernSize);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::Sobel(in, dtype, dx, dy, kernSize);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(CannyPerfTest, TestPerformance)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int apSize = 0;
+    double thrLow = 0.0, thrUp = 0.0;
+    cv::Size sz;
+    bool l2gr = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, sz, thrLow, thrUp, apSize, l2gr, compile_args) = GetParam();
+
+    initMatsRandN(type, sz, CV_8UC1, false);
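+    // Canny always yields a single-channel 8-bit edge map, hence the CV_8UC1 output.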
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::Canny(in_mat1, out_mat_ocv, thrLow, thrUp, apSize, l2gr);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::Canny(in, thrLow, thrUp, apSize, l2gr);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(EqHistPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    Size sz = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandN(CV_8UC1, sz, CV_8UC1, false);
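+    // cv::equalizeHist() supports only 8-bit single-channel images.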
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::equalizeHist(in_mat1, out_mat_ocv);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::equalizeHist(in);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(RGB2GrayPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    Size sz = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandN(CV_8UC3, sz, CV_8UC1, false);
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_RGB2GRAY);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::RGB2Gray(in);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(BGR2GrayPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    Size sz = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandN(CV_8UC3, sz, CV_8UC1, false);
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_BGR2GRAY);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::BGR2Gray(in);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(RGB2YUVPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    Size sz = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandN(CV_8UC3, sz, CV_8UC3, false);
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_RGB2YUV);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::RGB2YUV(in);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(YUV2RGBPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    Size sz = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandN(CV_8UC3, sz, CV_8UC3, false);
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_YUV2RGB);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::YUV2RGB(in);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(RGB2LabPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    Size sz = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandN(CV_8UC3, sz, CV_8UC3, false);
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_RGB2Lab);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::RGB2Lab(in);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(BGR2LUVPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    Size sz = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandN(CV_8UC3, sz, CV_8UC3, false);
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_BGR2Luv);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::BGR2LUV(in);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(LUV2BGRPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    Size sz = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandN(CV_8UC3, sz, CV_8UC3, false);
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_Luv2BGR);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::LUV2BGR(in);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(BGR2YUVPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    Size sz = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandN(CV_8UC3, sz, CV_8UC3, false);
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_BGR2YUV);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::BGR2YUV(in);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+PERF_TEST_P_(YUV2BGRPerfTest, TestPerformance)
+{
+    compare_f cmpF = get<0>(GetParam());
+    Size sz = get<1>(GetParam());
+    cv::GCompileArgs compile_args = get<2>(GetParam());
+
+    initMatsRandN(CV_8UC3, sz, CV_8UC3, false);
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_YUV2BGR);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::YUV2BGR(in);
+    cv::GComputation c(in, out);
+
+    // Warm-up graph engine:
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    TEST_CYCLE()
+    {
+        c.apply(in_mat1, out_mat_gapi);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+
+    SANITY_CHECK_NOTHING();
+}
+
+//------------------------------------------------------------------------------
+
+} // namespace opencv_test
+#endif //OPENCV_GAPI_IMGPROC_PERF_TESTS_INL_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_core_perf_tests_cpu.cpp
new file mode 100644 (file)
index 0000000..6957401
--- /dev/null
@@ -0,0 +1,286 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "../perf_precomp.hpp"
+#include "../common/gapi_core_perf_tests.hpp"
+#include "opencv2/gapi/cpu/core.hpp"
+
+#define CORE_CPU cv::gapi::core::cpu::kernels()
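+// CORE_CPU expands to the CPU (OpenCV-based) core kernel package. Passing it
+// through cv::compile_args() pins every instantiation below to that backend,
+// e.g.:
+//   cv::GComputation c(in, out);
+//   c.apply(in_mat, out_mat, cv::compile_args(CORE_CPU));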
+
+namespace opencv_test
+{
+
+
+INSTANTIATE_TEST_CASE_P(AddPerfTestCPU, AddPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(AddCPerfTestCPU, AddCPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SubPerfTestCPU, SubPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SubCPerfTestCPU, SubCPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SubRCPerfTestCPU, SubRCPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(MulPerfTestCPU, MulPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(MulDoublePerfTestCPU, MulDoublePerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(MulCPerfTestCPU, MulCPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(DivPerfTestCPU, DivPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(DivCPerfTestCPU, DivCPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(DivRCPerfTestCPU, DivRCPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(MaskPerfTestCPU, MaskPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_16UC1, CV_16SC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(MeanPerfTestCPU, MeanPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Polar2CartPerfTestCPU, Polar2CartPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Cart2PolarPerfTestCPU, Cart2PolarPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(CmpPerfTestCPU, CmpPerfTest,
+    Combine(Values(CMP_EQ, CMP_GE, CMP_NE, CMP_GT, CMP_LT, CMP_LE),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(CmpWithScalarPerfTestCPU, CmpWithScalarPerfTest,
+    Combine(Values(CMP_EQ, CMP_GE, CMP_NE, CMP_GT, CMP_LT, CMP_LE),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BitwisePerfTestCPU, BitwisePerfTest,
+    Combine(Values(AND, OR, XOR),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BitwiseNotPerfTestCPU, BitwiseNotPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SelectPerfTestCPU, SelectPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(MinPerfTestCPU, MinPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(MaxPerfTestCPU, MaxPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(AbsDiffPerfTestCPU, AbsDiffPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestCPU, AbsDiffCPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SumPerfTestCPU, SumPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(0.0),
+        Values(cv::compile_args(CORE_CPU))));
+
+// FIXME: Comparison introduced by YL doesn't work with C3
+INSTANTIATE_TEST_CASE_P(AddWeightedPerfTestCPU, AddWeightedPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, /*CV_8UC3,*/ CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(-1, CV_8U, CV_16U, CV_32F),
+        Values(0.5000005),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(NormPerfTestCPU, NormPerfTest,
+    Combine(Values(NORM_INF, NORM_L1, NORM_L2),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(0.0),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(IntegralPerfTestCPU, IntegralPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ThresholdPerfTestCPU, ThresholdPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ThresholdPerfTestCPU, ThresholdOTPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1),
+        Values(cv::THRESH_OTSU, cv::THRESH_TRIANGLE),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(InRangePerfTestCPU, InRangePerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Split3PerfTestCPU, Split3PerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Split4PerfTestCPU, Split4PerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Merge3PerfTestCPU, Merge3PerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Merge4PerfTestCPU, Merge4PerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(RemapPerfTestCPU, RemapPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(FlipPerfTestCPU, FlipPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(0, 1, -1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(CropPerfTestCPU, CropPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::Rect(10, 8, 20, 35), cv::Rect(4, 10, 37, 50)),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ConcatHorPerfTestCPU, ConcatHorPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ConcatHorVecPerfTestCPU, ConcatHorVecPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ConcatVertPerfTestCPU, ConcatVertPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ConcatVertVecPerfTestCPU, ConcatVertVecPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(LUTPerfTestCPU, LUTPerfTest,
+    Combine(Values(CV_8UC1, CV_8UC3),
+        Values(CV_8UC1),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(LUTPerfTestCustomCPU, LUTPerfTest,
+    Combine(Values(CV_8UC3),
+        Values(CV_8UC3),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(CORE_CPU))));
+
+
+INSTANTIATE_TEST_CASE_P(ConvertToPerfTestCPU, ConvertToPerfTest,
+    Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_32FC1),
+        Values(CV_8U, CV_16U, CV_16S, CV_32F),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ResizePerfTestCPU, ResizePerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(CV_8UC1, CV_16UC1, CV_16SC1),
+        Values(cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_AREA),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(cv::Size(64, 64),
+            cv::Size(30, 30)),
+        Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ResizeFxFyPerfTestCPU, ResizeFxFyPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(CV_8UC1, CV_16UC1, CV_16SC1),
+        Values(cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_AREA),
+        Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(0.5, 0.1),
+        Values(0.5, 0.1),
+        Values(cv::compile_args(CORE_CPU))));
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_cpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_cpu.cpp
new file mode 100644 (file)
index 0000000..ea3d753
--- /dev/null
@@ -0,0 +1,188 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "../perf_precomp.hpp"
+#include "../common/gapi_imgproc_perf_tests.hpp"
+#include "opencv2/gapi/cpu/imgproc.hpp"
+
+
+#define IMGPROC_CPU cv::gapi::imgproc::cpu::kernels()
+
+namespace opencv_test
+{
+
+INSTANTIATE_TEST_CASE_P(SepFilterPerfTestCPU_8U, SepFilterPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(CV_8UC1, CV_8UC3),
+        Values(3),
+        Values(szVGA, sz720p, sz1080p),
+        Values(-1, CV_16S, CV_32F),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SepFilterPerfTestCPU_other, SepFilterPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(3),
+        Values(szVGA, sz720p, sz1080p),
+        Values(-1, CV_32F),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Filter2DPerfTestCPU, Filter2DPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(3, 4, 5, 7),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::BORDER_DEFAULT),
+        Values(-1, CV_32F),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BoxFilterPerfTestCPU, BoxFilterPerfTest,
+    Combine(Values(AbsTolerance(0).to_compare_f()),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(3, 5),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::BORDER_DEFAULT),
+        Values(-1, CV_32F),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BlurPerfTestCPU, BlurPerfTest,
+    Combine(Values(AbsTolerance(0).to_compare_f()),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(3, 5),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::BORDER_DEFAULT),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(GaussianBlurPerfTestCPU, GaussianBlurPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(3, 5),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(MedianBlurPerfTestCPU, MedianBlurPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(3, 5),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ErodePerfTestCPU, ErodePerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(3, 5),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::MorphShapes::MORPH_RECT,
+            cv::MorphShapes::MORPH_CROSS,
+            cv::MorphShapes::MORPH_ELLIPSE),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Erode3x3PerfTestCPU, Erode3x3PerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(szVGA, sz720p, sz1080p),
+        Values(1, 2, 4),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(DilatePerfTestCPU, DilatePerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(3, 5),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::MorphShapes::MORPH_RECT,
+            cv::MorphShapes::MORPH_CROSS,
+            cv::MorphShapes::MORPH_ELLIPSE),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Dilate3x3PerfTestCPU, Dilate3x3PerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(szVGA, sz720p, sz1080p),
+        Values(1, 2, 4),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SobelPerfTestCPU, SobelPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
+        Values(3, 5),
+        Values(szVGA, sz720p, sz1080p),
+        Values(-1, CV_16S, CV_32F),
+        Values(0, 1),
+        Values(1, 2),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SobelPerfTestCPU32F, SobelPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(CV_32FC1),
+        Values(3, 5),
+        Values(szVGA, sz720p, sz1080p),
+        Values(CV_32F),
+        Values(0, 1),
+        Values(1, 2),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(CannyPerfTestCPU, CannyPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(CV_8UC1, CV_8UC3),
+        Values(szVGA, sz720p, sz1080p),
+        Values(3.0, 120.0),
+        Values(125.0, 240.0),
+        Values(3, 5),
+        Values(true, false),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(EqHistPerfTestCPU, EqHistPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestCPU, RGB2GrayPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestCPU, BGR2GrayPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestCPU, RGB2YUVPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestCPU, YUV2RGBPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestCPU, RGB2LabPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestCPU, BGR2LUVPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(LUV2BGRPerfTestCPU, LUV2BGRPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestCPU, BGR2YUVPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestCPU, YUV2BGRPerfTest,
+    Combine(Values(AbsExact().to_compare_f()),
+        Values(szVGA, sz720p, sz1080p),
+        Values(cv::compile_args(IMGPROC_CPU))));
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/cpu/gapi_imgproc_perf_tests_fluid.cpp
new file mode 100644 (file)
index 0000000..a5d13e6
--- /dev/null
@@ -0,0 +1,76 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "../perf_precomp.hpp"
+#include "../common/gapi_imgproc_perf_tests.hpp"
+
+#define IMGPROC_FLUID cv::gapi::imgproc::fluid::kernels()
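+// IMGPROC_FLUID selects G-API's Fluid backend (a pipelined, row-based CPU
+// implementation). Tolerance-based comparators are used below because its
+// results may differ slightly from the OpenCV reference.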
+
+namespace opencv_test
+{
+
+    INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid, SobelPerfTest,
+        Combine(Values(AbsExact().to_compare_f()),
+            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),  // add CV_32FC1 when ready
+            Values(3),                                     // add 5x5 once supported
+            Values(szVGA, sz720p, sz1080p),
+            Values(-1, CV_16S, CV_32F),
+            Values(0, 1),
+            Values(1, 2),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+    INSTANTIATE_TEST_CASE_P(SobelPerfTestFluid32F, SobelPerfTest,
+        Combine(Values(ToleranceFilter(1e-3f, 0.0).to_compare_f()),
+            Values(CV_32FC1),
+            Values(3),                                     // add 5x5 once supported
+            Values(szVGA, sz720p, sz1080p),
+            Values(CV_32F),
+            Values(0, 1),
+            Values(1, 2),
+            Values(cv::compile_args(IMGPROC_FLUID))));
+
+    INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestFluid, RGB2GrayPerfTest,
+        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                Values(szVGA, sz720p, sz1080p),
+                Values(cv::compile_args(IMGPROC_FLUID))));
+
+    INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestFluid, BGR2GrayPerfTest,
+        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                Values(szVGA, sz720p, sz1080p),
+                Values(cv::compile_args(IMGPROC_FLUID))));
+
+    INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestFluid, RGB2YUVPerfTest,
+        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                Values(szVGA, sz720p, sz1080p),
+                Values(cv::compile_args(IMGPROC_FLUID))));
+
+    INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestFluid, YUV2RGBPerfTest,
+        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                Values(szVGA, sz720p, sz1080p),
+                Values(cv::compile_args(IMGPROC_FLUID))));
+
+    INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestFluid, BGR2YUVPerfTest,
+        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                Values(szVGA, sz720p, sz1080p),
+                Values(cv::compile_args(IMGPROC_FLUID))));
+
+    INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestFluid, YUV2BGRPerfTest,
+        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                Values(szVGA, sz720p, sz1080p),
+                Values(cv::compile_args(IMGPROC_FLUID))));
+
+    INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestFluid, BGR2LUVPerfTest,
+        Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
+                Values(szVGA, sz720p, sz1080p),
+                Values(cv::compile_args(IMGPROC_FLUID))));
+
+    INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestFluid, RGB2LabPerfTest,
+        Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
+                Values(szVGA, sz720p, sz1080p),
+                Values(cv::compile_args(IMGPROC_FLUID))));
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/gpu/gapi_core_perf_tests_gpu.cpp
new file mode 100644 (file)
index 0000000..652cbae
--- /dev/null
@@ -0,0 +1,291 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "../perf_precomp.hpp"
+#include "../common/gapi_core_perf_tests.hpp"
+#include "opencv2/gapi/gpu/core.hpp"
+
+#define CORE_GPU cv::gapi::core::gpu::kernels()
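+// CORE_GPU selects the OpenCL-based GPU kernel package; several comparators
+// below use relative/absolute tolerances to absorb floating-point differences
+// between device and host results.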
+
+namespace opencv_test
+{
+
+INSTANTIATE_TEST_CASE_P(AddPerfTestGPU, AddPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(AddCPerfTestGPU, AddCPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(SubPerfTestGPU, SubPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(SubCPerfTestGPU, SubCPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(SubRCPerfTestGPU, SubRCPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(MulPerfTestGPU, MulPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(MulDoublePerfTestGPU, MulDoublePerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(MulCPerfTestGPU, MulCPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(DivPerfTestGPU, DivPerfTest,
+                        Combine(Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_f()),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(DivCPerfTestGPU, DivCPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(DivRCPerfTestGPU, DivRCPerfTest,
+                        Combine(Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_f()),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+                                Values(cv::compile_args(CORE_GPU))));
+// TODO: the mask test doesn't work on the GPU backend yet, hence disabled:
+#if 0
+INSTANTIATE_TEST_CASE_P(MaskPerfTestGPU, MaskPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::compile_args(CORE_GPU))));
+#endif
+
+INSTANTIATE_TEST_CASE_P(MeanPerfTestGPU, MeanPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Polar2CartPerfTestGPU, Polar2CartPerfTest,
+                        Combine(Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_f()),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Cart2PolarPerfTestGPU, Cart2PolarPerfTest,
+                        Combine(Values(Tolerance_FloatRel_IntAbs(1e-2, 2).to_compare_f()),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(CmpPerfTestGPU, CmpPerfTest,
+                        Combine(Values(CMP_EQ, CMP_GE, CMP_NE, CMP_GT, CMP_LT, CMP_LE),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(CmpWithScalarPerfTestGPU, CmpWithScalarPerfTest,
+                        Combine(Values(CMP_EQ, CMP_GE, CMP_NE, CMP_GT, CMP_LT, CMP_LE),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(BitwisePerfTestGPU, BitwisePerfTest,
+                        Combine(Values(AND, OR, XOR),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(BitwiseNotPerfTestGPU, BitwiseNotPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(SelectPerfTestGPU, SelectPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(MinPerfTestGPU, MinPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(MaxPerfTestGPU, MaxPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(AbsDiffPerfTestGPU, AbsDiffPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestGPU, AbsDiffCPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(SumPerfTestGPU, SumPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(4.0), //TODO: too relaxed?
+                                Values(cv::compile_args(CORE_GPU))));
+
+// FIXME: Comparison introduced by YL doesn't work with C3
+INSTANTIATE_TEST_CASE_P(AddWeightedPerfTestGPU, AddWeightedPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, /*CV_8UC3,*/ CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+                                Values(0.50005),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(NormPerfTestGPU, NormPerfTest,
+                        Combine(Values(NORM_INF, NORM_L1, NORM_L2),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(4.0), //TODO: too relaxed?
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(IntegralPerfTestGPU, IntegralPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ThresholdPerfTestGPU, ThresholdPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ThresholdPerfTestGPU, ThresholdOTPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1 ),
+                                Values(cv::THRESH_OTSU, cv::THRESH_TRIANGLE),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(InRangePerfTestGPU, InRangePerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Split3PerfTestGPU, Split3PerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Split4PerfTestGPU, Split4PerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Merge3PerfTestGPU, Merge3PerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Merge4PerfTestGPU, Merge4PerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(RemapPerfTestGPU, RemapPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(FlipPerfTestGPU, FlipPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(0,1,-1),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(CropPerfTestGPU, CropPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Rect(10, 8, 20, 35), cv::Rect(4, 10, 37, 50)),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ConcatHorPerfTestGPU, ConcatHorPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ConcatVertPerfTestGPU, ConcatVertPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+// TODO: fix this backend to allow ConcatVertVec / ConcatHorVec
+#if 0
+INSTANTIATE_TEST_CASE_P(ConcatHorVecPerfTestGPU, ConcatHorVecPerfTest,
+    Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+        Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+        Values(cv::compile_args(CORE_GPU))));
+
+
+INSTANTIATE_TEST_CASE_P(ConcatVertVecPerfTestGPU, ConcatVertVecPerfTest,
+                        Combine(Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::compile_args(CORE_GPU))));
+#endif
+
+INSTANTIATE_TEST_CASE_P(LUTPerfTestGPU, LUTPerfTest,
+                        Combine(Values(CV_8UC1, CV_8UC3),
+                                Values(CV_8UC1),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(LUTPerfTestCustomGPU, LUTPerfTest,
+                        Combine(Values(CV_8UC3),
+                                Values(CV_8UC3),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+
+INSTANTIATE_TEST_CASE_P(ConvertToPerfTestGPU, ConvertToPerfTest,
+                        Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_32FC1),
+                                Values(CV_8U, CV_16U, CV_16S, CV_32F),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ResizePerfTestGPU, ResizePerfTest,
+                        Combine(Values(AbsSimilarPoints(2, 0.05).to_compare_f()),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values(cv::Size(64,64),
+                                       cv::Size(30,30)),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ResizeFxFyPerfTestGPU, ResizeFxFyPerfTest,
+                        Combine(Values(AbsSimilarPoints(2, 0.05).to_compare_f()),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values( szSmall128, szVGA, sz720p, sz1080p ),
+                                Values(0.5, 0.1),
+                                Values(0.5, 0.1),
+                                Values(cv::compile_args(CORE_GPU))));
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/gpu/gapi_imgproc_perf_tests_gpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/gpu/gapi_imgproc_perf_tests_gpu.cpp
new file mode 100644 (file)
index 0000000..14ef606
--- /dev/null
@@ -0,0 +1,180 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "../perf_precomp.hpp"
+#include "../common/gapi_imgproc_perf_tests.hpp"
+#include "opencv2/gapi/gpu/imgproc.hpp"
+
+#define IMGPROC_GPU cv::gapi::imgproc::gpu::kernels()
+
+namespace opencv_test
+{
+
+
+INSTANTIATE_TEST_CASE_P(SepFilterPerfTestGPU_8U, SepFilterPerfTest,
+                        Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3),
+                                Values(3),
+                                Values(szVGA, sz720p, sz1080p),
+                                Values(-1, CV_16S, CV_32F),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(SepFilterPerfTestGPU_other, SepFilterPerfTest,
+                        Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+                                Values(CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3),
+                                Values(szVGA, sz720p, sz1080p),
+                                Values(-1, CV_32F),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+
+
+INSTANTIATE_TEST_CASE_P(Filter2DPerfTestGPU, Filter2DPerfTest,
+                        Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3, 4, 5, 7),
+                                Values(szVGA, sz720p, sz1080p),
+                                Values(cv::BORDER_DEFAULT),
+                                Values(-1, CV_32F),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(BoxFilterPerfTestGPU, BoxFilterPerfTest,
+                        Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+                                Values(/*CV_8UC1,*/ CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3,5),
+                                Values(szVGA, sz720p, sz1080p),
+                                Values(cv::BORDER_DEFAULT),
+                                Values(-1, CV_32F),
+                                Values(cv::compile_args(IMGPROC_GPU)))); //TODO: 8UC1 doesn't work
+
+INSTANTIATE_TEST_CASE_P(BlurPerfTestGPU, BlurPerfTest,
+                        Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3, 5),
+                                Values(szVGA, sz720p, sz1080p),
+                                Values(cv::BORDER_DEFAULT),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(GaussianBlurPerfTestGPU, GaussianBlurPerfTest,
+                        Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()), //TODO: too relaxed?
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3, 5),
+                                Values(szVGA, sz720p, sz1080p),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(MedianBlurPerfTestGPU, MedianBlurPerfTest,
+                         Combine(Values(AbsExact().to_compare_f()),
+                                 Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                 Values(3, 5),
+                                 Values(szVGA, sz720p, sz1080p),
+                                 Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ErodePerfTestGPU, ErodePerfTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3, 5),
+                                Values(szVGA, sz720p, sz1080p),
+                                Values(cv::MorphShapes::MORPH_RECT,
+                                       cv::MorphShapes::MORPH_CROSS,
+                                       cv::MorphShapes::MORPH_ELLIPSE),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Erode3x3PerfTestGPU, Erode3x3PerfTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(szVGA, sz720p, sz1080p),
+                                Values(1,2,4),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(DilatePerfTestGPU, DilatePerfTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3, 5),
+                                Values(szVGA, sz720p, sz1080p),
+                                Values(cv::MorphShapes::MORPH_RECT,
+                                       cv::MorphShapes::MORPH_CROSS,
+                                       cv::MorphShapes::MORPH_ELLIPSE),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Dilate3x3PerfTestGPU, Dilate3x3PerfTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(szVGA, sz720p, sz1080p),
+                                Values(1,2,4),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(SobelPerfTestGPU, SobelPerfTest,
+                        Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1/*, CV_32FC1*/), //TODO: CV_32FC1 fails accuracy
+                                Values(3, 5),
+                                Values(szVGA, sz720p, sz1080p),
+                                Values(-1, CV_32F),
+                                Values(0, 1),
+                                Values(1, 2),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(CannyPerfTestGPU, CannyPerfTest,
+                        Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3),
+                                Values(szVGA, sz720p, sz1080p),
+                                Values(3.0, 120.0),
+                                Values(125.0, 240.0),
+                                Values(3, 5),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(EqHistPerfTestGPU, EqHistPerfTest,
+                        Combine(Values(AbsExact().to_compare_f()),  // FIXIT: unreliable check
+                        Values(szVGA, sz720p, sz1080p),
+                        Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(RGB2GrayPerfTestGPU, RGB2GrayPerfTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                        Values(szVGA, sz720p, sz1080p),
+                        Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(BGR2GrayPerfTestGPU, BGR2GrayPerfTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                        Values(szVGA, sz720p, sz1080p),
+                        Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(RGB2YUVPerfTestGPU, RGB2YUVPerfTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                        Values(szVGA, sz720p, sz1080p),
+                        Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(YUV2RGBPerfTestGPU, YUV2RGBPerfTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                        Values(szVGA, sz720p, sz1080p),
+                        Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(RGB2LabPerfTestGPU, RGB2LabPerfTest,
+                        Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
+                        Values(szVGA, sz720p, sz1080p),
+                        Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(BGR2LUVPerfTestGPU, BGR2LUVPerfTest,
+                        Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
+                        Values(szVGA, sz720p, sz1080p),
+                        Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(LUV2BGRPerfTestGPU, LUV2BGRPerfTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                        Values(szVGA, sz720p, sz1080p),
+                        Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(BGR2YUVPerfTestGPU, BGR2YUVPerfTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                        Values(szVGA, sz720p, sz1080p),
+                        Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(YUV2BGRPerfTestGPU, YUV2BGRPerfTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                        Values(szVGA, sz720p, sz1080p),
+                        Values(cv::compile_args(IMGPROC_GPU))));
+
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/internal/gapi_compiler_perf_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/internal/gapi_compiler_perf_tests.cpp
new file mode 100644 (file)
index 0000000..48786b6
--- /dev/null
@@ -0,0 +1,45 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "perf_precomp.hpp"
+#include "../../test/common/gapi_tests_common.hpp"
+
+namespace opencv_test
+{
+using namespace perf;
+
+class CompilerPerfTest : public TestPerfParams<tuple<cv::Size, MatType>> {};
+PERF_TEST_P_(CompilerPerfTest, TestPerformance)
+{
+  const auto params = GetParam();
+  Size sz = get<0>(params);
+  MatType type = get<1>(params);
+
+  initMatsRandU(type, sz, type, false);
+
+  // G-API code ////////////////////////////////////////////////////////////
+  cv::GMat in;
+  auto splitted = cv::gapi::split3(in);
+  auto add1 = cv::gapi::addC({1}, std::get<0>(splitted));
+  auto add2 = cv::gapi::addC({2}, std::get<1>(splitted));
+  auto add3 = cv::gapi::addC({3}, std::get<2>(splitted));
+  auto out = cv::gapi::merge3(add1, add2, add3);
+
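+  // Note: graph construction and compilation happen inside the measured
+  // cycle, so this benchmark captures G-API compilation overhead rather
+  // than pure execution time.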
+  TEST_CYCLE()
+  {
+      cv::GComputation c(in, out);
+      c.apply(in_mat1, out_mat_gapi, cv::compile_args(cv::gapi::core::fluid::kernels()));
+  }
+
+  SANITY_CHECK_NOTHING();
+}
+
+INSTANTIATE_TEST_CASE_P(CompilerPerfTest, CompilerPerfTest,
+                        Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
+                                Values(CV_8UC3)));
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/perf_main.cpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/perf_main.cpp
new file mode 100644 (file)
index 0000000..8d6d77e
--- /dev/null
@@ -0,0 +1,11 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "perf_precomp.hpp"
+//#include "../test/test_precomp.hpp"
+
+CV_PERF_TEST_MAIN(gapi)
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/perf/perf_precomp.hpp b/inference-engine/thirdparty/fluid/modules/gapi/perf/perf_precomp.hpp
new file mode 100644 (file)
index 0000000..abd7cbe
--- /dev/null
@@ -0,0 +1,25 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef __OPENCV_GAPI_PERF_PRECOMP_HPP__
+#define __OPENCV_GAPI_PERF_PRECOMP_HPP__
+
+#include <cstdint>
+#include <vector>
+
+#include "opencv2/ts.hpp"
+#include "opencv2/gapi.hpp"
+#include "opencv2/gapi/imgproc.hpp"
+#include "opencv2/gapi/core.hpp"
+#include "opencv2/gapi/cpu/gcpukernel.hpp"
+#include "opencv2/gapi/gpu/ggpukernel.hpp"
+#include "opencv2/gapi/operators.hpp"
+
+#include "opencv2/gapi/fluid/core.hpp"
+#include "opencv2/gapi/fluid/imgproc.hpp"
+
+#endif // __OPENCV_GAPI_PERF_PRECOMP_HPP__
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/samples/api_example.cpp b/inference-engine/thirdparty/fluid/modules/gapi/samples/api_example.cpp
new file mode 100644 (file)
index 0000000..a731000
--- /dev/null
@@ -0,0 +1,34 @@
+#include <opencv2/videoio.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/gapi.hpp>
+#include <opencv2/gapi/core.hpp>
+#include <opencv2/gapi/imgproc.hpp>
+
+int main(int argc, char *argv[])
+{
+    cv::VideoCapture cap;
+    if (argc > 1) cap.open(argv[1]);
+    else cap.open(0);
+    CV_Assert(cap.isOpened());
+
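+    // Define the G-API graph: downscale the frame 2x, compute Canny edges
+    // on a blurred grayscale copy, and blend the edges into the green
+    // channel of the downscaled image.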
+    cv::GMat in;
+    cv::GMat vga      = cv::gapi::resize(in, cv::Size(), 0.5, 0.5);
+    cv::GMat gray     = cv::gapi::BGR2Gray(vga);
+    cv::GMat blurred  = cv::gapi::blur(gray, cv::Size(5,5));
+    cv::GMat edges    = cv::gapi::Canny(blurred, 32, 128, 3);
+    cv::GMat b,g,r;
+    std::tie(b,g,r)   = cv::gapi::split3(vga);
+    cv::GMat out      = cv::gapi::merge3(b, g | edges, r);
+    cv::GComputation ac(in, out);
+
+    cv::Mat input_frame;
+    cv::Mat output_frame;
+    CV_Assert(cap.read(input_frame));
+    do
+    {
+        ac.apply(input_frame, output_frame);
+        cv::imshow("output", output_frame);
+    } while (cap.read(input_frame) && cv::waitKey(30) < 0);
+
+    return 0;
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/samples/api_ref_snippets.cpp b/inference-engine/thirdparty/fluid/modules/gapi/samples/api_ref_snippets.cpp
new file mode 100644 (file)
index 0000000..5e8859d
--- /dev/null
@@ -0,0 +1,82 @@
+#include <opencv2/videoio.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/gapi.hpp>
+#include <opencv2/gapi/core.hpp>
+#include <opencv2/gapi/imgproc.hpp>
+
+#include <opencv2/gapi/cpu/gcpukernel.hpp>
+
+#include <opencv2/gapi/fluid/core.hpp>
+#include <opencv2/gapi/fluid/imgproc.hpp>
+
+G_TYPED_KERNEL(IAdd, <cv::GMat(cv::GMat)>, "test.custom.add") {
+    static cv::GMatDesc outMeta(const cv::GMatDesc &in) { return in; }
+};
+G_TYPED_KERNEL(IFilter2D, <cv::GMat(cv::GMat)>, "test.custom.filter2d") {
+    static cv::GMatDesc outMeta(const cv::GMatDesc &in) { return in; }
+};
+G_TYPED_KERNEL(IRGB2YUV, <cv::GMat(cv::GMat)>, "test.custom.rgb2yuv") {
+    static cv::GMatDesc outMeta(const cv::GMatDesc &in) { return in; }
+};
+GAPI_OCV_KERNEL(CustomAdd,      IAdd)      { static void run(cv::Mat, cv::Mat &) {} };
+GAPI_OCV_KERNEL(CustomFilter2D, IFilter2D) { static void run(cv::Mat, cv::Mat &) {} };
+GAPI_OCV_KERNEL(CustomRGB2YUV,  IRGB2YUV)  { static void run(cv::Mat, cv::Mat &) {} };
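+// Note: the run() bodies above are intentionally empty - these kernel
+// implementations only need to compile so that the snippets below can
+// reference them.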
+
+int main(int argc, char *argv[])
+{
+    if (argc < 3)
+        return -1;
+
+    cv::Mat input = cv::imread(argv[1]);
+    cv::Mat output;
+
+    {
+    //! [graph_def]
+    cv::GMat in;
+    cv::GMat gx = cv::gapi::Sobel(in, CV_32F, 1, 0);
+    cv::GMat gy = cv::gapi::Sobel(in, CV_32F, 0, 1);
+    cv::GMat g  = cv::gapi::sqrt(cv::gapi::mul(gx, gx) + cv::gapi::mul(gy, gy));
+    cv::GMat out = cv::gapi::convertTo(g, CV_8U);
+    //! [graph_def]
+
+    //! [graph_decl_apply]
+    //! [graph_cap_full]
+    cv::GComputation sobelEdge(cv::GIn(in), cv::GOut(out));
+    //! [graph_cap_full]
+    sobelEdge.apply(input, output);
+    //! [graph_decl_apply]
+
+    //! [apply_with_param]
+    cv::gapi::GKernelPackage kernels = cv::gapi::combine
+        (cv::gapi::core::fluid::kernels(),
+         cv::gapi::imgproc::fluid::kernels(),
+         cv::unite_policy::KEEP);
+    sobelEdge.apply(input, output, cv::compile_args(kernels));
+    //! [apply_with_param]
+
+    //! [graph_cap_sub]
+    cv::GComputation sobelEdgeSub(cv::GIn(gx, gy), cv::GOut(out));
+    //! [graph_cap_sub]
+    }
+    //! [graph_gen]
+    cv::GComputation sobelEdgeGen([](){
+            cv::GMat in;
+            cv::GMat gx = cv::gapi::Sobel(in, CV_32F, 1, 0);
+            cv::GMat gy = cv::gapi::Sobel(in, CV_32F, 0, 1);
+            cv::GMat g  = cv::gapi::sqrt(cv::gapi::mul(gx, gx) + cv::gapi::mul(gy, gy));
+            cv::GMat out = cv::gapi::convertTo(g, CV_8U);
+            return cv::GComputation(in, out);
+        });
+    //! [graph_gen]
+
+    cv::imwrite(argv[2], output);
+
+    //! [kernels_snippet]
+    cv::gapi::GKernelPackage pkg = cv::gapi::kernels
+        < CustomAdd
+        , CustomFilter2D
+        , CustomRGB2YUV
+        >();
+    //! [kernels_snippet]
+    return 0;
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/samples/kernel_api_snippets.cpp b/inference-engine/thirdparty/fluid/modules/gapi/samples/kernel_api_snippets.cpp
new file mode 100644 (file)
index 0000000..a30161d
--- /dev/null
@@ -0,0 +1,157 @@
+// [filter2d_api]
+#include <opencv2/gapi.hpp>
+
+G_TYPED_KERNEL(GFilter2D,
+               <cv::GMat(cv::GMat,int,cv::Mat,cv::Point,double,int,cv::Scalar)>,
+               "org.opencv.imgproc.filters.filter2D")
+{
+    static cv::GMatDesc                 // outMeta's return value type
+    outMeta(cv::GMatDesc    in       ,  // descriptor of input GMat
+            int             ddepth   ,  // depth parameter
+            cv::Mat      /* coeffs */,  // (unused)
+            cv::Point    /* anchor */,  // (unused)
+            double       /* scale  */,  // (unused)
+            int          /* border */,  // (unused)
+            cv::Scalar   /* bvalue */ ) // (unused)
+    {
+        return in.withDepth(ddepth);
+    }
+};
+// [filter2d_api]
+
+cv::GMat filter2D(cv::GMat  ,
+                  int       ,
+                  cv::Mat   ,
+                  cv::Point ,
+                  double    ,
+                  int       ,
+                  cv::Scalar);
+
+// [filter2d_wrap]
+cv::GMat filter2D(cv::GMat   in,
+                  int        ddepth,
+                  cv::Mat    k,
+                  cv::Point  anchor  = cv::Point(-1,-1),
+                  double     scale   = 0.,
+                  int        border  = cv::BORDER_DEFAULT,
+                  cv::Scalar bval    = cv::Scalar(0))
+{
+    return GFilter2D::on(in, ddepth, k, anchor, scale, border, bval);
+}
+// [filter2d_wrap]
+
+// [compound]
+#include <opencv2/gapi/gcompoundkernel.hpp>       // GAPI_COMPOUND_KERNEL()
+
+using PointArray2f = cv::GArray<cv::Point2f>;
+
+G_TYPED_KERNEL(HarrisCorners,
+               <PointArray2f(cv::GMat,int,double,double,int,double)>,
+               "org.opencv.imgproc.harris_corner")
+{
+    static cv::GArrayDesc outMeta(const cv::GMatDesc &,
+                                  int,
+                                  double,
+                                  double,
+                                  int,
+                                  double)
+    {
+        // No special metadata for arrays in G-API (yet)
+        return cv::empty_array_desc();
+    }
+};
+
+// Define Fluid-backend-local kernels which form GoodFeatures
+G_TYPED_KERNEL(HarrisResponse,
+               <cv::GMat(cv::GMat,double,int,double)>,
+               "org.opencv.fluid.harris_response")
+{
+    static cv::GMatDesc outMeta(const cv::GMatDesc &in,
+                                double,
+                                int,
+                                double)
+    {
+        return in.withType(CV_32F, 1);
+    }
+};
+
+G_TYPED_KERNEL(ArrayNMS,
+               <PointArray2f(cv::GMat,int,double)>,
+               "org.opencv.cpu.nms_array")
+{
+    static cv::GArrayDesc outMeta(const cv::GMatDesc &,
+                                  int,
+                                  double)
+    {
+        return cv::empty_array_desc();
+    }
+};
+
+GAPI_COMPOUND_KERNEL(GFluidHarrisCorners, HarrisCorners)
+{
+    static PointArray2f
+    expand(cv::GMat in,
+           int      maxCorners,
+           double   quality,
+           double   minDist,
+           int      blockSize,
+           double   k)
+    {
+        cv::GMat response = HarrisResponse::on(in, quality, blockSize, k);
+        return ArrayNMS::on(response, maxCorners, minDist);
+    }
+};
+
+// Then implement HarrisResponse as a Fluid kernel and ArrayNMS
+// as a generic (OpenCV) kernel.
+// [compound]
+
+// [filter2d_ocv]
+#include <opencv2/gapi/cpu/gcpukernel.hpp>     // GAPI_OCV_KERNEL()
+#include <opencv2/imgproc.hpp>                 // cv::filter2D()
+
+GAPI_OCV_KERNEL(GCPUFilter2D, GFilter2D)
+{
+    static void
+    run(const cv::Mat    &in,       // in - derived from GMat
+        const int         ddepth,   // opaque (passed as-is)
+        const cv::Mat    &k,        // opaque (passed as-is)
+        const cv::Point  &anchor,   // opaque (passed as-is)
+        const double      delta,    // opaque (passed as-is)
+        const int         border,   // opaque (passed as-is)
+        const cv::Scalar &,         // opaque (passed as-is)
+        cv::Mat          &out)      // out - derived from GMat (retval)
+    {
+        cv::filter2D(in, out, ddepth, k, anchor, delta, border);
+    }
+};
+// [filter2d_ocv]
+
+#include <iostream>  // std::cout in main() below
+
+int main(int, char *[])
+{
+    std::cout << "This sample is incomplete. It is used as code snippets in documentation." << std::endl;
+
+cv::Mat conv_kernel_mat;
+
+{
+// [filter2d_on]
+cv::GMat in;
+cv::GMat out = GFilter2D::on(/* GMat    */  in,
+                             /* int     */  -1,
+                             /* Mat     */  conv_kernel_mat,
+                             /* Point   */  cv::Point(-1,-1),
+                             /* double  */  0.,
+                             /* int     */  cv::BORDER_DEFAULT,
+                             /* Scalar  */  cv::Scalar(0));
+// [filter2d_on]
+}
+
+{
+// [filter2d_wrap_call]
+cv::GMat in;
+cv::GMat out = filter2D(in, -1, conv_kernel_mat);
+// [filter2d_wrap_call]
+}
+
+return 0;
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/README.md b/inference-engine/thirdparty/fluid/modules/gapi/src/api/README.md
new file mode 100644 (file)
index 0000000..970f730
--- /dev/null
@@ -0,0 +1 @@
+This directory contains the implementation of the G-API frontend (public API classes).
\ No newline at end of file
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gapi_priv.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gapi_priv.cpp
new file mode 100644 (file)
index 0000000..744db16
--- /dev/null
@@ -0,0 +1,44 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+#include <ade/util/assert.hpp>
+
+#include "api/gapi_priv.hpp"
+#include "api/gnode_priv.hpp"
+
+cv::GOrigin::GOrigin(GShape s,
+                    const cv::GNode& n,
+                    std::size_t p,
+                    const cv::gimpl::HostCtor c)
+    : shape(s), node(n), port(p), ctor(c)
+{
+}
+
+cv::GOrigin::GOrigin(GShape s, cv::gimpl::ConstVal v)
+    : shape(s), node(cv::GNode::Const()), value(v), port(INVALID_PORT)
+{
+}
+
+bool cv::detail::GOriginCmp::operator() (const cv::GOrigin &lhs,
+                                         const cv::GOrigin &rhs) const
+{
+    const GNode::Priv* lhs_p = &lhs.node.priv();
+    const GNode::Priv* rhs_p = &rhs.node.priv();
+    if (lhs_p == rhs_p)
+    {
+        if (lhs.port == rhs.port)
+        {
+            // A data Origin is uniquely identified by its {node/port} pair.
+            // Having two Origins with the same {node/port} but different
+            // shapes (data formats) is illegal!
+            GAPI_Assert(lhs.shape == rhs.shape);
+        }
+        return lhs.port < rhs.port;
+    }
+    else return lhs_p < rhs_p;
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gapi_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gapi_priv.hpp
new file mode 100644 (file)
index 0000000..edab0a0
--- /dev/null
@@ -0,0 +1,77 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_PRIV_HPP
+#define OPENCV_GAPI_PRIV_HPP
+
+#include <set>   // set
+#include <map>   // map
+#include <limits>
+
+#include "opencv2/gapi/util/variant.hpp"   // variant
+#include "opencv2/gapi/garray.hpp"         // ConstructVec
+#include "opencv2/gapi/gscalar.hpp"
+#include "opencv2/gapi/gcommon.hpp"
+
+#include "opencv2/gapi/opencv_includes.hpp"
+
+#include "api/gnode.hpp"
+
+namespace cv
+{
+
+namespace gimpl
+{
+    // Union type for various user-defined type constructors (GArray<T>, etc)
+    // FIXME: Replace construct-only API with a more generic one
+    //    (probably with bits of introspection)
+    // Not required for non-user-defined types (GMat, GScalar, etc)
+    using HostCtor = util::variant
+        < util::monostate
+        , detail::ConstructVec
+        >;
+
+    using ConstVal = util::variant
+        < util::monostate
+        , cv::gapi::own::Scalar
+        >;
+}
+
+// TODO namespace gimpl?
+
+struct GOrigin
+{
+    static constexpr const std::size_t INVALID_PORT = std::numeric_limits<std::size_t>::max();
+
+    GOrigin(GShape s,
+            const GNode& n,
+            std::size_t p = INVALID_PORT,
+            const gimpl::HostCtor h = {});
+    GOrigin(GShape s, gimpl::ConstVal value);
+
+    const GShape          shape;           // Shape of a produced object
+    const GNode           node;            // a GNode which produces an object
+    const gimpl::ConstVal value;           // A node can have an initial constant value; currently only scalars are supported
+    const std::size_t     port;            // GNode's output number; FIXME: "= max_size" in C++14
+    gimpl::HostCtor       ctor;            // FIXME: replace with an interface?
+};
+
+namespace detail
+{
+    struct GOriginCmp
+    {
+        bool operator() (const GOrigin &lhs, const GOrigin &rhs) const;
+    };
+} // namespace detail
+
+// TODO introduce a hash on GOrigin and define this via unordered_ ?
+using GOriginSet = std::set<GOrigin, detail::GOriginCmp>;
+template<typename T> using GOriginMap = std::map<GOrigin, T, detail::GOriginCmp>;
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_PRIV_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/garray.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/garray.cpp
new file mode 100644 (file)
index 0000000..0fd19a7
--- /dev/null
@@ -0,0 +1,45 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+#include "opencv2/gapi/garray.hpp"
+#include "api/gapi_priv.hpp" // GOrigin
+
+// cv::detail::GArrayU public implementation ///////////////////////////////////
+cv::detail::GArrayU::GArrayU()
+    : m_priv(new GOrigin(GShape::GARRAY, cv::GNode::Param()))
+{
+}
+
+cv::detail::GArrayU::GArrayU(const GNode &n, std::size_t out)
+    : m_priv(new GOrigin(GShape::GARRAY, n, out))
+{
+}
+
+cv::GOrigin& cv::detail::GArrayU::priv()
+{
+    return *m_priv;
+}
+
+const cv::GOrigin& cv::detail::GArrayU::priv() const
+{
+    return *m_priv;
+}
+
+void cv::detail::GArrayU::setConstructFcn(ConstructVec &&cv)
+{
+    m_priv->ctor = std::move(cv);
+}
+
+namespace cv {
+std::ostream& operator<<(std::ostream& os, const cv::GArrayDesc &)
+{
+    // FIXME: add type information here
+    os << "(array)";
+    return os;
+}
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gbackend.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gbackend.cpp
new file mode 100644 (file)
index 0000000..8144d21
--- /dev/null
@@ -0,0 +1,353 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+#include <memory> // unique_ptr
+
+#include "opencv2/gapi/gkernel.hpp"
+#include "opencv2/gapi/own/convert.hpp"
+
+#include "api/gbackend_priv.hpp"
+#include "backends/common/gbackend.hpp"
+#include "compiler/gobjref.hpp"
+#include "compiler/gislandmodel.hpp"
+
+// GBackend private implementation /////////////////////////////////////////////
+void cv::gapi::GBackend::Priv::unpackKernel(ade::Graph             & /*graph  */ ,
+                                            const ade::NodeHandle  & /*op_node*/ ,
+                                            const GKernelImpl      & /*impl   */ )
+{
+    // A default implementation is kept here because Priv
+    // is instantiated directly by some tests.
+    // In a number of tests Priv is even instantiated as a mock backend
+    // object, and this method is then called on the mocks (doing nothing).
+    // FIXME: add a warning message here
+    // FIXME: Do something with this! Ideally this function should be "= 0";
+}
+
+std::unique_ptr<cv::gimpl::GIslandExecutable>
+cv::gapi::GBackend::Priv::compile(const ade::Graph&,
+                                  const GCompileArgs&,
+                                  const std::vector<ade::NodeHandle> &) const
+{
+    // ...and this method is here for the same reason!
+    GAPI_Assert(false);
+    return {};
+}
+
+void cv::gapi::GBackend::Priv::addBackendPasses(ade::ExecutionEngineSetupContext &)
+{
+    // Do nothing by default; plugins may override this to
+    // add custom (backend-specific) graph transformations
+}
+
+// GBackend public implementation //////////////////////////////////////////////
+cv::gapi::GBackend::GBackend()
+{
+}
+
+cv::gapi::GBackend::GBackend(std::shared_ptr<cv::gapi::GBackend::Priv> &&p)
+    : m_priv(std::move(p))
+{
+}
+
+cv::gapi::GBackend::Priv& cv::gapi::GBackend::priv()
+{
+    return *m_priv;
+}
+
+const cv::gapi::GBackend::Priv& cv::gapi::GBackend::priv() const
+{
+    return *m_priv;
+}
+
+std::size_t cv::gapi::GBackend::hash() const
+{
+    return std::hash<const cv::gapi::GBackend::Priv*>{}(m_priv.get());
+}
+
+bool cv::gapi::GBackend::operator== (const cv::gapi::GBackend &rhs) const
+{
+    return m_priv == rhs.m_priv;
+}
+
+// Abstract Host-side data manipulation ////////////////////////////////////////
+// Reused between CPU backend and more generic GExecutor
+namespace cv {
+namespace gimpl {
+namespace magazine {
+
+// FIXME implement the below functions with visit()?
+
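+// bindInArg() stores an input run-time argument in the magazine slot for its
+// resource id, converting between cv:: and cv::gapi::own:: types (or wrapping
+// the data into cv::UMat) as required.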
+void bindInArg(Mag& mag, const RcDesc &rc, const GRunArg &arg, bool is_umat)
+{
+    switch (rc.shape)
+    {
+    case GShape::GMAT:
+    {
+        switch (arg.index())
+        {
+        case GRunArg::index_of<cv::gapi::own::Mat>() :
+            if (is_umat)
+            {
+#if !defined(GAPI_STANDALONE)
+                auto& mag_umat = mag.template slot<cv::UMat>()[rc.id];
+                mag_umat = to_ocv(util::get<cv::gapi::own::Mat>(arg)).getUMat(ACCESS_READ);
+#else
+                util::throw_error(std::logic_error("UMat is not supported in standalone build"));
+#endif // !defined(GAPI_STANDALONE)
+            }
+            else
+            {
+                auto& mag_mat = mag.template slot<cv::gapi::own::Mat>()[rc.id];
+                mag_mat = util::get<cv::gapi::own::Mat>(arg);
+            }
+            break;
+#if !defined(GAPI_STANDALONE)
+        case GRunArg::index_of<cv::Mat>() :
+            if (is_umat)
+            {
+                auto& mag_umat = mag.template slot<cv::UMat>()[rc.id];
+                mag_umat = (util::get<cv::UMat>(arg));
+            }
+            else
+            {
+                auto& mag_mat = mag.template slot<cv::gapi::own::Mat>()[rc.id];
+                mag_mat = to_own(util::get<cv::Mat>(arg));
+            }
+            break;
+#endif //  !defined(GAPI_STANDALONE)
+        default: util::throw_error(std::logic_error("content type of the runtime argument does not match the resource description"));
+        }
+        break;
+    }
+
+
+    case GShape::GSCALAR:
+    {
+        auto& mag_scalar = mag.template slot<cv::gapi::own::Scalar>()[rc.id];
+        switch (arg.index())
+        {
+            case GRunArg::index_of<cv::gapi::own::Scalar>() : mag_scalar = util::get<cv::gapi::own::Scalar>(arg); break;
+#if !defined(GAPI_STANDALONE)
+            case GRunArg::index_of<cv::Scalar>()            : mag_scalar = to_own(util::get<cv::Scalar>(arg));    break;
+#endif //  !defined(GAPI_STANDALONE)
+            default: util::throw_error(std::logic_error("content type of the runtime argument does not match the resource description"));
+        }
+        break;
+    }
+
+    case GShape::GARRAY:
+        mag.template slot<cv::detail::VectorRef>()[rc.id] = util::get<cv::detail::VectorRef>(arg);
+        break;
+
+    default:
+        util::throw_error(std::logic_error("Unsupported GShape type"));
+    }
+}
+
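+// bindOutArg() does the same for outputs: the object the user passed (by
+// pointer) is stored in the corresponding magazine slot.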
+void bindOutArg(Mag& mag, const RcDesc &rc, const GRunArgP &arg, bool is_umat)
+{
+    switch (rc.shape)
+    {
+    case GShape::GMAT:
+    {
+        switch (arg.index())
+        {
+        case GRunArgP::index_of<cv::gapi::own::Mat*>() :
+            if (is_umat)
+            {
+#if !defined(GAPI_STANDALONE)
+                auto& mag_umat = mag.template slot<cv::UMat>()[rc.id];
+                mag_umat = to_ocv(*(util::get<cv::gapi::own::Mat*>(arg))).getUMat(ACCESS_RW);
+#else
+                util::throw_error(std::logic_error("UMat is not supported in standalone build"));
+#endif // !defined(GAPI_STANDALONE)
+            }
+            else
+            {
+                auto& mag_mat = mag.template slot<cv::gapi::own::Mat>()[rc.id];
+                mag_mat = *util::get<cv::gapi::own::Mat*>(arg);
+            }
+            break;
+#if !defined(GAPI_STANDALONE)
+        case GRunArgP::index_of<cv::Mat*>() :
+            if (is_umat)
+            {
+                auto& mag_umat = mag.template slot<cv::UMat>()[rc.id];
+                mag_umat = (*util::get<cv::UMat*>(arg));
+            }
+            else
+            {
+                auto& mag_mat = mag.template slot<cv::gapi::own::Mat>()[rc.id];
+                mag_mat = to_own(*util::get<cv::Mat*>(arg));
+            }
+            break;
+#endif //  !defined(GAPI_STANDALONE)
+        default: util::throw_error(std::logic_error("content type of the runtime argument does not match the resource description"));
+        }
+        break;
+    }
+
+    case GShape::GSCALAR:
+    {
+        auto& mag_scalar = mag.template slot<cv::gapi::own::Scalar>()[rc.id];
+        switch (arg.index())
+        {
+            case GRunArgP::index_of<cv::gapi::own::Scalar*>() : mag_scalar = *util::get<cv::gapi::own::Scalar*>(arg); break;
+#if !defined(GAPI_STANDALONE)
+            case GRunArgP::index_of<cv::Scalar*>()            : mag_scalar = to_own(*util::get<cv::Scalar*>(arg)); break;
+#endif //  !defined(GAPI_STANDALONE)
+            default: util::throw_error(std::logic_error("content type of the runtime argument does not match the resource description"));
+        }
+        break;
+    }
+    case GShape::GARRAY:
+        mag.template slot<cv::detail::VectorRef>()[rc.id] = util::get<cv::detail::VectorRef>(arg);
+        break;
+
+    default:
+        util::throw_error(std::logic_error("Unsupported GShape type"));
+        break;
+    }
+}
+
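+// resetInternalData() re-initializes the magazine slot for a Data object with
+// INTERNAL storage: a GArray is reconstructed via its stored ctor, a GScalar
+// is reset to a default value, and a GMat is left as-is.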
+void resetInternalData(Mag& mag, const Data &d)
+{
+    if (d.storage != Data::Storage::INTERNAL)
+        return;
+
+    switch (d.shape)
+    {
+    case GShape::GARRAY:
+        util::get<cv::detail::ConstructVec>(d.ctor)
+            (mag.template slot<cv::detail::VectorRef>()[d.rc]);
+        break;
+
+    case GShape::GSCALAR:
+        mag.template slot<cv::gapi::own::Scalar>()[d.rc] = cv::gapi::own::Scalar();
+        break;
+
+    case GShape::GMAT:
+        // Do nothing here - FIXME: unify with initInternalData?
+        break;
+
+    default:
+        util::throw_error(std::logic_error("Unsupported GShape type"));
+        break;
+    }
+}
+
+cv::GRunArg getArg(const Mag& mag, const RcDesc &ref)
+{
+    // Wrap associated CPU object (either host or an internal one)
+    switch (ref.shape)
+    {
+    case GShape::GMAT:    return GRunArg(mag.template slot<cv::gapi::own::Mat>().at(ref.id));
+    case GShape::GSCALAR: return GRunArg(mag.template slot<cv::gapi::own::Scalar>().at(ref.id));
+    // Note: .at() is intentional for GArray as object MUST be already there
+    //   (and constructed by either bindIn/Out or resetInternal)
+    case GShape::GARRAY:  return GRunArg(mag.template slot<cv::detail::VectorRef>().at(ref.id));
+    default:
+        util::throw_error(std::logic_error("Unsupported GShape type"));
+        break;
+    }
+}
+
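+// getObjPtr() returns a writable reference (GRunArgP) to the magazine slot of
+// an output resource, creating the slot on demand for GMat/GScalar objects.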
+cv::GRunArgP getObjPtr(Mag& mag, const RcDesc &rc, bool is_umat)
+{
+    switch (rc.shape)
+    {
+    case GShape::GMAT:
+        if (is_umat)
+        {
+#if !defined(GAPI_STANDALONE)
+            return GRunArgP(&mag.template slot<cv::UMat>()[rc.id]);
+#else
+            util::throw_error(std::logic_error("UMat is not supported in standalone build"));
+#endif //  !defined(GAPI_STANDALONE)
+        }
+        else
+            return GRunArgP(&mag.template slot<cv::gapi::own::Mat>()[rc.id]);
+    case GShape::GSCALAR: return GRunArgP(&mag.template slot<cv::gapi::own::Scalar>()[rc.id]);
+    // Note: .at() is intentional for GArray as object MUST be already there
+    //   (and constructed by either bindIn/Out or resetInternal)
+    case GShape::GARRAY:
+        // FIXME(DM): For a reason still unknown to me, without the
+        // const_cast to const, move semantics kicks in here: the value
+        // from the map is moved into the returned GRunArgP, leaving the
+        // map with a broken value. I've spent a few late Friday hours
+        // debugging this!
+        return GRunArgP(const_cast<const Mag&>(mag)
+                        .template slot<cv::detail::VectorRef>().at(rc.id));
+    default:
+        util::throw_error(std::logic_error("Unsupported GShape type"));
+        break;
+    }
+}
+
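+// writeBack() propagates results from the magazine to the user's output
+// objects: scalars are copied out, while for mats it only asserts that the
+// output buffer was not reallocated during execution.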
+void writeBack(const Mag& mag, const RcDesc &rc, GRunArgP &g_arg, bool is_umat)
+{
+    switch (rc.shape)
+    {
+    case GShape::GARRAY:
+        // Do nothing - should we really do anything here?
+        break;
+
+    case GShape::GMAT:
+    {
+        // Simply check that memory was not reallocated, i.e.
+        // both instances of Mat point to the same memory
+        uchar* out_arg_data = nullptr;
+        switch (g_arg.index())
+        {
+            case GRunArgP::index_of<cv::gapi::own::Mat*>() : out_arg_data = util::get<cv::gapi::own::Mat*>(g_arg)->data; break;
+#if !defined(GAPI_STANDALONE)
+            case GRunArgP::index_of<cv::Mat*>()            : out_arg_data = util::get<cv::Mat*>(g_arg)->data; break;
+            case GRunArgP::index_of<cv::UMat*>()           : out_arg_data = (util::get<cv::UMat*>(g_arg))->getMat(ACCESS_RW).data; break;
+#endif //  !defined(GAPI_STANDALONE)
+            default: util::throw_error(std::logic_error("content type of the runtime argument does not match the resource description"));
+        }
+        if (is_umat)
+        {
+#if !defined(GAPI_STANDALONE)
+            auto& in_mag = mag.template slot<cv::UMat>().at(rc.id);
+            GAPI_Assert((out_arg_data == (in_mag.getMat(ACCESS_RW).data)) && "data for output parameters was reallocated");
+#else
+            util::throw_error(std::logic_error("UMat is not supported in standalone build"));
+#endif // !defined(GAPI_STANDALONE)
+        }
+        else
+        {
+            auto& in_mag = mag.template slot<cv::gapi::own::Mat>().at(rc.id);
+            GAPI_Assert((out_arg_data == in_mag.data) && "data for output parameters was reallocated");
+        }
+        break;
+    }
+
+    case GShape::GSCALAR:
+    {
+        switch (g_arg.index())
+        {
+            case GRunArgP::index_of<cv::gapi::own::Scalar*>() : *util::get<cv::gapi::own::Scalar*>(g_arg) = mag.template slot<cv::gapi::own::Scalar>().at(rc.id); break;
+#if !defined(GAPI_STANDALONE)
+            case GRunArgP::index_of<cv::Scalar*>()            : *util::get<cv::Scalar*>(g_arg) = cv::gapi::own::to_ocv(mag.template slot<cv::gapi::own::Scalar>().at(rc.id)); break;
+#endif //  !defined(GAPI_STANDALONE)
+            default: util::throw_error(std::logic_error("content type of the runtime argument does not match the resource description"));
+        }
+        break;
+    }
+
+    default:
+        util::throw_error(std::logic_error("Unsupported GShape type"));
+        break;
+    }
+}
+
+} // namespace magazine
+} // namespace gimpl
+} // namespace cv
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gbackend_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gbackend_priv.hpp
new file mode 100644 (file)
index 0000000..1c6e297
--- /dev/null
@@ -0,0 +1,53 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef GAPI_API_GBACKEND_PRIV_HPP
+#define GAPI_API_GBACKEND_PRIV_HPP
+
+#include <memory>
+#include <unordered_set>
+
+#include <ade/graph.hpp>
+#include <ade/passes/pass_base.hpp> // passes::PassContext
+#include <ade/execution_engine/execution_engine.hpp> // ..SetupContext
+
+#include "opencv2/gapi/gcommon.hpp"
+#include "opencv2/gapi/gkernel.hpp"
+
+namespace cv
+{
+namespace gimpl
+{
+    class GBackend;
+    class GIslandExecutable;
+} // namespace gimpl
+} // namespace cv
+
+// GAPI_EXPORTS is here to make tests build on Windows
+class GAPI_EXPORTS cv::gapi::GBackend::Priv
+{
+public:
+    using EPtr = std::unique_ptr<cv::gimpl::GIslandExecutable>;
+
+    virtual void unpackKernel(ade::Graph            &graph,
+                              const ade::NodeHandle &op_node,
+                              const GKernelImpl     &impl);
+
+    // FIXME: since backends are not passed to ADE anymore,
+    // there's no need in having both cv::gimpl::GBackend
+    // and cv::gapi::GBackend - these two things can be unified
+    // NOTE - nodes are guaranteed to be topologically sorted.
+    virtual EPtr compile(const ade::Graph   &graph,
+                         const GCompileArgs &args,
+                         const std::vector<ade::NodeHandle> &nodes) const;
+
+    virtual void addBackendPasses(ade::ExecutionEngineSetupContext &);
+
+    virtual ~Priv() = default;
+};
+
+#endif // GAPI_API_GBACKEND_PRIV_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcall.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcall.cpp
new file mode 100644 (file)
index 0000000..2dd823d
--- /dev/null
@@ -0,0 +1,65 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+#include <cassert>
+#include "opencv2/gapi/gcall.hpp"
+#include "api/gcall_priv.hpp"
+
+// GCall private implementation ////////////////////////////////////////////////
+cv::GCall::Priv::Priv(const cv::GKernel &k)
+    : m_k(k)
+{
+}
+
+// GCall public implementation /////////////////////////////////////////////////
+
+cv::GCall::GCall(const cv::GKernel &k)
+    : m_priv(new Priv(k))
+{
+    // Here we have a reference to GNode,
+    // and GNode has a reference to us. Cycle! Now see destructor.
+    m_priv->m_node = GNode::Call(*this);
+}
+
+cv::GCall::~GCall()
+{
+    // When a GCall object is destroyed (and GCall::Priv is likely still alive,
+    // as there might be other references), reset m_node to break cycle.
+    m_priv->m_node = GNode();
+}
+
+void cv::GCall::setArgs(std::vector<GArg> &&args)
+{
+    // FIXME: Check if argument number is matching kernel prototype
+    m_priv->m_args = std::move(args);
+}
+
+cv::GMat cv::GCall::yield(int output)
+{
+    return cv::GMat(m_priv->m_node, output);
+}
+
+cv::GScalar cv::GCall::yieldScalar(int output)
+{
+    return cv::GScalar(m_priv->m_node, output);
+}
+
+cv::detail::GArrayU cv::GCall::yieldArray(int output)
+{
+    return cv::detail::GArrayU(m_priv->m_node, output);
+}
+
+cv::GCall::Priv& cv::GCall::priv()
+{
+    return *m_priv;
+}
+
+const cv::GCall::Priv& cv::GCall::priv() const
+{
+    return *m_priv;
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcall_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcall_priv.hpp
new file mode 100644 (file)
index 0000000..ffb122e
--- /dev/null
@@ -0,0 +1,37 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GCALL_PRIV_HPP
+#define OPENCV_GCALL_PRIV_HPP
+
+#include <vector>
+#include <unordered_map>
+
+#include "opencv2/gapi/garg.hpp"
+#include "opencv2/gapi/gcall.hpp"
+#include "opencv2/gapi/gkernel.hpp"
+
+#include "api/gnode.hpp"
+
+namespace cv {
+
+class GCall::Priv
+{
+public:
+    std::vector<GArg> m_args;
+    const GKernel     m_k;
+
+    // FIXME: Document that there's no recursion here.
+    // TODO: Rename to "constructionNode" or something similar to reflect its lifetime
+    GNode             m_node;
+
+    explicit Priv(const GKernel &k);
+};
+
+}
+
+#endif // OPENCV_GCALL_PRIV_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcomputation.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcomputation.cpp
new file mode 100644 (file)
index 0000000..ab761ed
--- /dev/null
@@ -0,0 +1,238 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+#include <algorithm> // remove_if
+#include <cctype>    // isspace (non-locale version)
+#include <ade/util/algorithm.hpp>
+
+#include "logger.hpp" // GAPI_LOG
+
+#include "opencv2/gapi/gcomputation.hpp"
+#include "opencv2/gapi/gkernel.hpp"
+
+#include "api/gcomputation_priv.hpp"
+#include "api/gcall_priv.hpp"
+#include "api/gnode_priv.hpp"
+
+#include "compiler/gmodelbuilder.hpp"
+#include "compiler/gcompiler.hpp"
+
+// cv::GComputation private implementation /////////////////////////////////////
+// <none>
+
+// cv::GComputation public implementation //////////////////////////////////////
+cv::GComputation::GComputation(const Generator& gen)
+    : m_priv(gen().m_priv)
+{
+}
+
+cv::GComputation::GComputation(GMat in, GMat out)
+    : cv::GComputation(cv::GIn(in), cv::GOut(out))
+{
+}
+
+
+cv::GComputation::GComputation(GMat in, GScalar out)
+    : cv::GComputation(cv::GIn(in), cv::GOut(out))
+{
+}
+
+cv::GComputation::GComputation(GMat in1, GMat in2, GMat out)
+    : cv::GComputation(cv::GIn(in1, in2), cv::GOut(out))
+{
+}
+
+cv::GComputation::GComputation(GMat in1, GMat in2, GScalar out)
+    : cv::GComputation(cv::GIn(in1, in2), cv::GOut(out))
+{
+}
+
+cv::GComputation::GComputation(const std::vector<GMat> &ins,
+                               const std::vector<GMat> &outs)
+    : m_priv(new Priv())
+{
+    const auto wrap = [](cv::GMat m) { return GProtoArg(m); };
+    ade::util::transform(ins,  std::back_inserter(m_priv->m_ins),  wrap);
+    ade::util::transform(outs, std::back_inserter(m_priv->m_outs), wrap);
+}
+
+cv::GComputation::GComputation(cv::GProtoInputArgs &&ins,
+                               cv::GProtoOutputArgs &&outs)
+    : m_priv(new Priv())
+{
+    m_priv->m_ins  = std::move(ins.m_args);
+    m_priv->m_outs = std::move(outs.m_args);
+}
+
+cv::GCompiled cv::GComputation::compile(GMetaArgs &&metas, GCompileArgs &&args)
+{
+    // FIXME: Cache gcompiled per parameters here?
+    cv::gimpl::GCompiler comp(*this, std::move(metas), std::move(args));
+    return comp.compile();
+}
+
+// FIXME: Introduce similar query/test method for GMetaArgs as a building block
+// for functions like this?
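+// Two meta sets share the same "format" if they agree on everything except
+// GMat sizes; this is what allows apply() to reuse an already compiled graph
+// via reshape() when only the input resolution changes.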
+static bool formats_are_same(const cv::GMetaArgs& metas1, const cv::GMetaArgs& metas2)
+{
+    return std::equal(metas1.cbegin(), metas1.cend(), metas2.cbegin(),
+                      [](const cv::GMetaArg& meta1, const cv::GMetaArg& meta2) {
+                          if (meta1.index() == meta2.index() && meta1.index() == cv::GMetaArg::index_of<cv::GMatDesc>())
+                          {
+                              const auto& desc1 = cv::util::get<cv::GMatDesc>(meta1);
+                              const auto& desc2 = cv::util::get<cv::GMatDesc>(meta2);
+
+                              // comparison by size is omitted
+                              return (desc1.chan  == desc2.chan &&
+                                      desc1.depth == desc2.depth);
+                          }
+                          else
+                          {
+                              return meta1 == meta2;
+                          }
+                     });
+}
+
+void cv::GComputation::apply(GRunArgs &&ins, GRunArgsP &&outs, GCompileArgs &&args)
+{
+    const auto in_metas = descr_of(ins);
+    // FIXME Graph should be recompiled when GCompileArgs have changed
+    if (m_priv->m_lastMetas != in_metas)
+    {
+        if (m_priv->m_lastCompiled &&
+            m_priv->m_lastCompiled.canReshape() &&
+            formats_are_same(m_priv->m_lastMetas, in_metas))
+        {
+            m_priv->m_lastCompiled.reshape(in_metas, args);
+        }
+        else
+        {
+            // FIXME: Had to construct temporary object as compile() takes && (r-value)
+            m_priv->m_lastCompiled = compile(GMetaArgs(in_metas), std::move(args));
+        }
+        m_priv->m_lastMetas = in_metas;
+    }
+    m_priv->m_lastCompiled(std::move(ins), std::move(outs));
+}
+
+void cv::GComputation::apply(const std::vector<cv::gapi::own::Mat> &ins,
+                             const std::vector<cv::gapi::own::Mat> &outs,
+                             GCompileArgs &&args)
+{
+    GRunArgs call_ins;
+    GRunArgsP call_outs;
+
+    auto tmp = outs;
+    for (const cv::gapi::own::Mat &m : ins) { call_ins.emplace_back(m);   }
+    for (      cv::gapi::own::Mat &m : tmp) { call_outs.emplace_back(&m); }
+
+    apply(std::move(call_ins), std::move(call_outs), std::move(args));
+}
+
+#if !defined(GAPI_STANDALONE)
+void cv::GComputation::apply(cv::Mat in, cv::Mat &out, GCompileArgs &&args)
+{
+    apply(cv::gin(in), cv::gout(out), std::move(args));
+    // FIXME: The following doesn't work!
+    // Operation result is not replicated into user's object
+    // apply({GRunArg(in)}, {GRunArg(out)});
+}
+
+void cv::GComputation::apply(cv::Mat in, cv::Scalar &out, GCompileArgs &&args)
+{
+    apply(cv::gin(in), cv::gout(out), std::move(args));
+}
+
+void cv::GComputation::apply(cv::Mat in1, cv::Mat in2, cv::Mat &out, GCompileArgs &&args)
+{
+    apply(cv::gin(in1, in2), cv::gout(out), std::move(args));
+}
+
+void cv::GComputation::apply(cv::Mat in1, cv::Mat in2, cv::Scalar &out, GCompileArgs &&args)
+{
+    apply(cv::gin(in1, in2), cv::gout(out), std::move(args));
+}
+
+void cv::GComputation::apply(const std::vector<cv::Mat> &ins,
+                             const std::vector<cv::Mat> &outs,
+                             GCompileArgs &&args)
+{
+    GRunArgs call_ins;
+    GRunArgsP call_outs;
+
+    // Make a temporary copy of vector outs - cv::Mats are copies anyway
+    auto tmp = outs;
+    for (const cv::Mat &m : ins) { call_ins.emplace_back(m);   }
+    for (      cv::Mat &m : tmp) { call_outs.emplace_back(&m); }
+
+    apply(std::move(call_ins), std::move(call_outs), std::move(args));
+}
+#endif // !defined(GAPI_STANDALONE)
+
+cv::GComputation::Priv& cv::GComputation::priv()
+{
+    return *m_priv;
+}
+
+const cv::GComputation::Priv& cv::GComputation::priv() const
+{
+    return *m_priv;
+}
+
+// Islands /////////////////////////////////////////////////////////////////////
+
+void cv::gapi::island(const std::string       &name,
+                            GProtoInputArgs  &&ins,
+                            GProtoOutputArgs &&outs)
+{
+    {
+        // Island must have a printable name.
+        // Forbid names which contain only spaces.
+        GAPI_Assert(!name.empty());
+        const auto first_printable_it = std::find_if_not(name.begin(), name.end(), isspace);
+        const bool likely_printable   = first_printable_it != name.end();
+        GAPI_Assert(likely_printable);
+    }
+    // Even if the name contains spaces, keep it unmodified as the user
+    // will then use this string to assign affinity, etc.
+
+    // First, set island tags on all operations from `ins` to `outs`
+    auto island = cv::gimpl::unrollExpr(ins.m_args, outs.m_args);
+    if (island.all_ops.empty())
+    {
+        util::throw_error(std::logic_error("Operation range is empty"));
+    }
+    for (auto &op_expr_node : island.all_ops)
+    {
+        auto &op_expr_node_p = op_expr_node.priv();
+
+        GAPI_Assert(op_expr_node.shape() == GNode::NodeShape::CALL);
+        const GCall&       call   = op_expr_node.call();
+        const GCall::Priv& call_p = call.priv();
+
+        if (!op_expr_node_p.m_island.empty())
+        {
+            util::throw_error(std::logic_error
+                              (  "Operation " + call_p.m_k.name
+                               + " is already assigned to island \""
+                               + op_expr_node_p.m_island + "\""));
+        }
+        else
+        {
+            op_expr_node_p.m_island = name;
+            GAPI_LOG_INFO(NULL,
+                          "Assigned " << call_p.m_k.name << "_" << &call_p <<
+                          " to island \"" << name << "\"");
+        }
+    }
+
+    // Note: this function only sets island tags on the operations in the
+    // expression tree; it is just the first step.
+    // The second step is assigning intermediate data objects to Islands,
+    // see passes::initIslands for details.
+}
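
Illustrative sketch (not part of the diff): how the apply() overloads above and cv::gapi::island() are meant to be used together. Default kernel-package resolution and header-side default arguments (e.g. dtype = -1) are assumed from the public G-API headers added elsewhere in this commit.

    #include <opencv2/gapi.hpp>
    #include <opencv2/gapi/core.hpp>

    int main()
    {
        cv::GMat in;
        cv::GMat tmp = cv::gapi::addC(in, 1.0);
        // Tag every operation between `in` and `tmp` as one island; the name
        // is kept verbatim so it can later be used to assign affinity, etc.
        cv::gapi::island("increment", cv::GIn(in), cv::GOut(tmp));
        cv::GMat out = cv::gapi::mulC(tmp, 2.0);

        cv::GComputation cc(in, out);
        cv::Mat input = cv::Mat::ones(4, 4, CV_8UC1), output;
        cc.apply(input, output);   // the (Mat, Mat&) overload defined above
        return 0;
    }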
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcomputation_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gcomputation_priv.hpp
new file mode 100644 (file)
index 0000000..13d1b9a
--- /dev/null
@@ -0,0 +1,29 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCOMPUTATION_PRIV_HPP
+#define OPENCV_GAPI_GCOMPUTATION_PRIV_HPP
+
+#include "opencv2/gapi.hpp"
+#include "opencv2/gapi/gcall.hpp"
+
+#include "opencv2/gapi/util/variant.hpp"
+
+namespace cv {
+
+class GComputation::Priv
+{
+public:
+    GCompiled   m_lastCompiled;
+    GMetaArgs   m_lastMetas; // TODO: make GCompiled remember its metas?
+    GProtoArgs  m_ins;
+    GProtoArgs  m_outs;
+};
+
+}
+
+#endif // OPENCV_GAPI_GCOMPUTATION_PRIV_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gkernel.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gkernel.cpp
new file mode 100644 (file)
index 0000000..f8c851a
--- /dev/null
@@ -0,0 +1,147 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+#include <iostream> // cerr
+#include <functional> // hash
+#include <numeric> // accumulate
+
+#include <ade/util/algorithm.hpp>
+
+#include "logger.hpp"
+#include "opencv2/gapi/gkernel.hpp"
+
+#include "api/gbackend_priv.hpp"
+
+// GKernelPackage public implementation ////////////////////////////////////////
+void cv::gapi::GKernelPackage::remove(const cv::gapi::GBackend& backend)
+{
+    m_backend_kernels.erase(backend);
+}
+
+bool cv::gapi::GKernelPackage::includesAPI(const std::string &id) const
+{
+    // In its current form this is not very efficient (O(n log n))
+    auto it = std::find_if(m_backend_kernels.begin(),
+                           m_backend_kernels.end(),
+                           [&id](const M::value_type &p) {
+                               return ade::util::contains(p.second, id);
+                           });
+    return (it != m_backend_kernels.end());
+}
+
+void cv::gapi::GKernelPackage::removeAPI(const std::string &id)
+{
+    for (auto &bk : m_backend_kernels)
+        bk.second.erase(id);
+}
+
+std::size_t cv::gapi::GKernelPackage::size() const
+{
+    return std::accumulate(m_backend_kernels.begin(),
+                           m_backend_kernels.end(),
+                           static_cast<std::size_t>(0u),
+                           [](std::size_t acc, const M::value_type& v) {
+                               return acc + v.second.size();
+                           });
+}
+
+cv::gapi::GKernelPackage cv::gapi::combine(const GKernelPackage  &lhs,
+                                           const GKernelPackage  &rhs,
+                                           const cv::unite_policy policy)
+{
+
+    if (policy == cv::unite_policy::REPLACE)
+    {
+        // REPLACE policy: if there is a collision, prefer RHS
+        // to LHS.
+        // Since the RHS package has precedence, start with its copy
+        GKernelPackage result(rhs);
+        // Now iterate over the LHS package and add a kernel if and only
+        // if there is no such kernel in the result yet
+        for (const auto &backend : lhs.m_backend_kernels)
+        {
+            for (const auto &kimpl : backend.second)
+            {
+                if (!result.includesAPI(kimpl.first))
+                    result.m_backend_kernels[backend.first].insert(kimpl);
+            }
+        }
+        return result;
+    }
+    else if (policy == cv::unite_policy::KEEP)
+    {
+        // KEEP policy: if there is a collision, just keep both versions
+        // of the kernel
+        GKernelPackage result(lhs);
+        for (const auto &p : rhs.m_backend_kernels)
+        {
+            result.m_backend_kernels[p.first].insert(p.second.begin(),
+                                                     p.second.end());
+        }
+        return result;
+    }
+    else GAPI_Assert(false);
+    return GKernelPackage();
+}
+
+std::pair<cv::gapi::GBackend, cv::GKernelImpl>
+cv::gapi::GKernelPackage::lookup(const std::string &id,
+                                 const GLookupOrder &order) const
+{
+    if (order.empty())
+    {
+        // If order is empty, return what comes first
+        auto it = std::find_if(m_backend_kernels.begin(),
+                               m_backend_kernels.end(),
+                               [&id](const M::value_type &p) {
+                                   return ade::util::contains(p.second, id);
+                               });
+        if (it != m_backend_kernels.end())
+        {
+            // FIXME: Two lookups!
+            return std::make_pair(it->first, it->second.find(id)->second);
+        }
+    }
+    else
+    {
+        // There is order, so:
+        // 1. Limit search scope only to specified backends
+        //    FIXME: Currently it is not configurable whether the search can fall
+        //    back to other backends (not listed in order) if the kernel hasn't
+        //    been found in the look-up list
+        // 2. Query backends in the specified order
+        for (const auto &selected_backend : order)
+        {
+            const auto kernels_it = m_backend_kernels.find(selected_backend);
+            if (kernels_it == m_backend_kernels.end())
+            {
+                GAPI_LOG_WARNING(NULL,
+                                 "Backend "
+                                  << &selected_backend.priv() // FIXME: name instead
+                                  << " was listed in lookup list but was not found "
+                                     "in the package");
+                continue;
+            }
+            if (ade::util::contains(kernels_it->second, id))
+            {
+                // FIXME: two lookups!
+                return std::make_pair(selected_backend, kernels_it->second.find(id)->second);
+            }
+        }
+    }
+
+    // If we reached this point, the kernel was not found among the selected backends.
+    util::throw_error(std::logic_error("Kernel " + id + " was not found"));
+}
+
+std::vector<cv::gapi::GBackend> cv::gapi::GKernelPackage::backends() const
+{
+    std::vector<cv::gapi::GBackend> result;
+    for (const auto &p : m_backend_kernels) result.emplace_back(p.first);
+    return result;
+}
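
Illustrative sketch (not part of the diff) of how combine() above resolves collisions. The cv::gapi::core::cpu::kernels() and cv::gapi::core::fluid::kernels() package factories are assumed from the public headers added in this commit.

    #include <opencv2/gapi/gkernel.hpp>
    #include <opencv2/gapi/cpu/core.hpp>
    #include <opencv2/gapi/fluid/core.hpp>

    void combine_example()
    {
        auto cpu_pkg   = cv::gapi::core::cpu::kernels();
        auto fluid_pkg = cv::gapi::core::fluid::kernels();

        // REPLACE: on a collision, prefer the RHS (here: fluid) implementation
        auto merged = cv::gapi::combine(cpu_pkg, fluid_pkg,
                                        cv::unite_policy::REPLACE);

        // KEEP: both implementations stay; lookup() order then decides
        auto both   = cv::gapi::combine(cpu_pkg, fluid_pkg,
                                        cv::unite_policy::KEEP);
    }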
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gmat.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gmat.cpp
new file mode 100644 (file)
index 0000000..e8c5285
--- /dev/null
@@ -0,0 +1,78 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+#include <opencv2/gapi/opencv_includes.hpp>
+#include <opencv2/gapi/own/mat.hpp> //gapi::own::Mat
+
+#include "opencv2/gapi/gmat.hpp"
+#include "api/gapi_priv.hpp" // GOrigin
+
+// cv::GMat public implementation //////////////////////////////////////////////
+cv::GMat::GMat()
+    : m_priv(new GOrigin(GShape::GMAT, GNode::Param()))
+{
+}
+
+cv::GMat::GMat(const GNode &n, std::size_t out)
+    : m_priv(new GOrigin(GShape::GMAT, n, out))
+{
+}
+
+cv::GOrigin& cv::GMat::priv()
+{
+    return *m_priv;
+}
+
+const cv::GOrigin& cv::GMat::priv() const
+{
+    return *m_priv;
+}
+
+#if !defined(GAPI_STANDALONE)
+cv::GMatDesc cv::descr_of(const cv::Mat &mat)
+{
+    return GMatDesc{mat.depth(), mat.channels(), {mat.cols, mat.rows}};
+}
+cv::GMatDesc cv::descr_of(const cv::UMat &mat)
+{
+    return GMatDesc{mat.depth(), mat.channels(), {mat.cols, mat.rows}};
+}
+#endif
+
+cv::GMatDesc cv::gapi::own::descr_of(const cv::gapi::own::Mat &mat)
+{
+    return GMatDesc{mat.depth(), mat.channels(), {mat.cols, mat.rows}};
+}
+
+namespace cv {
+std::ostream& operator<<(std::ostream& os, const cv::GMatDesc &desc)
+{
+    switch (desc.depth)
+    {
+#define TT(X) case CV_##X: os << #X; break;
+        TT(8U);
+        TT(8S);
+        TT(16U);
+        TT(16S);
+        TT(32S);
+        TT(32F);
+        TT(64F);
+#undef TT
+    default:
+        os << "(user type "
+           << std::hex << desc.depth << std::dec
+           << ")";
+        break;
+    }
+
+    os << "C" << desc.chan << " ";
+    os << desc.size.width << "x" << desc.size.height;
+
+    return os;
+}
+}
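
Illustrative sketch (not part of the diff) of the metadata extraction and printing implemented above; the printed text follows operator<< exactly as defined in this file.

    #include <iostream>
    #include <opencv2/core.hpp>
    #include <opencv2/gapi/gmat.hpp>

    void describe_example()
    {
        cv::Mat m(480, 640, CV_8UC3);
        cv::GMatDesc d = cv::descr_of(m);   // {CV_8U, 3, {640, 480}}
        std::cout << d << std::endl;        // prints: 8UC3 640x480
    }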
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode.cpp
new file mode 100644 (file)
index 0000000..efda5d5
--- /dev/null
@@ -0,0 +1,89 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+#include <cassert>
+
+#include "api/gnode.hpp"
+#include "api/gnode_priv.hpp"
+
+// GNode private implementation
+cv::GNode::Priv::Priv()
+    : m_shape(NodeShape::EMPTY)
+{
+}
+
+cv::GNode::Priv::Priv(GCall c)
+    : m_shape(NodeShape::CALL), m_spec(c)
+{
+}
+
+cv::GNode::Priv::Priv(ParamTag)
+    : m_shape(NodeShape::PARAM)
+{
+}
+
+cv::GNode::Priv::Priv(ConstTag)
+    : m_shape(NodeShape::CONST_BOUNDED)
+{
+}
+
+// GNode public implementation
+cv::GNode::GNode()
+    : m_priv(new Priv())
+{
+}
+
+cv::GNode::GNode(const GCall &c)
+    : m_priv(new Priv(c))
+{
+}
+
+cv::GNode::GNode(ParamTag)
+    : m_priv(new Priv(Priv::ParamTag()))
+{
+}
+
+cv::GNode::GNode(ConstTag)
+    : m_priv(new Priv(Priv::ConstTag()))
+{
+}
+
+cv::GNode cv::GNode::Call(const GCall &c)
+{
+    return GNode(c);
+}
+
+cv::GNode cv::GNode::Param()
+{
+    return GNode(ParamTag());
+}
+
+cv::GNode cv::GNode::Const()
+{
+    return GNode(ConstTag());
+}
+
+cv::GNode::Priv& cv::GNode::priv()
+{
+    return *m_priv;
+}
+
+const cv::GNode::Priv& cv::GNode::priv() const
+{
+    return *m_priv;
+}
+
+const cv::GNode::NodeShape& cv::GNode::shape() const
+{
+    return m_priv->m_shape;
+}
+
+const cv::GCall& cv::GNode::call()  const
+{
+    return util::get<GCall>(m_priv->m_spec);
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode.hpp
new file mode 100644 (file)
index 0000000..bd6c790
--- /dev/null
@@ -0,0 +1,58 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GNODE_HPP
+#define OPENCV_GAPI_GNODE_HPP
+
+#include <memory> // std::shared_ptr
+
+namespace cv {
+
+class GCall;
+
+// TODO Move "internal" namespace
+// TODO Expose details?
+
+// This class won't be public
+
+// data GNode = Call Operation [GNode]
+//            | Const <T>
+//            | Param <GMat|GParam>
+
+class GNode
+{
+public:
+    class Priv;
+
+    // Constructors
+    GNode();                               // Empty (invalid) constructor
+    static GNode Call (const GCall &c);    // Call constructor
+    static GNode Param();                  // Param constructor
+    static GNode Const();
+
+    // Internal use only
+    Priv& priv();
+    const Priv& priv() const;
+    enum class NodeShape: unsigned int;
+
+    const NodeShape& shape() const;
+    const GCall&     call()  const;
+
+protected:
+    struct ParamTag {};
+    struct ConstTag {};
+
+    explicit GNode(const GCall &c);
+    explicit GNode(ParamTag unused);
+    explicit GNode(ConstTag unused);
+
+    std::shared_ptr<Priv> m_priv;
+};
+
+}
+
+#endif // OPENCV_GAPI_GNODE_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gnode_priv.hpp
new file mode 100644 (file)
index 0000000..5425471
--- /dev/null
@@ -0,0 +1,52 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GNODE_PRIV_HPP
+#define OPENCV_GNODE_PRIV_HPP
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include "opencv2/gapi/util/variant.hpp"
+
+#include "opencv2/gapi/gcall.hpp"
+#include "opencv2/gapi/garg.hpp"
+#include "opencv2/gapi/gkernel.hpp"
+
+#include "api/gnode.hpp"
+
+namespace cv {
+
+enum class GNode::NodeShape: unsigned int
+{
+    EMPTY,
+    CALL,
+    PARAM,
+    CONST_BOUNDED
+};
+
+class GNode::Priv
+{
+public:
+    // TODO: replace with optional?
+    typedef util::variant<util::monostate, GCall> NodeSpec;
+    const NodeShape m_shape;
+    const NodeSpec  m_spec;
+    std::string     m_island; // user-modifiable attribute
+    struct ParamTag {};
+    struct ConstTag {};
+
+    Priv();                    // Empty (invalid) constructor
+    explicit Priv(GCall c);    // Call constructor
+    explicit Priv(ParamTag u); // Param constructor
+    explicit Priv(ConstTag u); // Const constructor
+};
+
+}
+
+#endif // OPENCV_GNODE_PRIV_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gproto.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gproto.cpp
new file mode 100644 (file)
index 0000000..2482d62
--- /dev/null
@@ -0,0 +1,162 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <ade/util/algorithm.hpp>
+#include "opencv2/gapi/util/throw.hpp"
+#include "opencv2/gapi/garg.hpp"
+#include "opencv2/gapi/gproto.hpp"
+
+#include "api/gapi_priv.hpp"
+#include "api/gproto_priv.hpp"
+
+// FIXME: it should be a visitor!
+// FIXME: Reimplement with traits?
+
+const cv::GOrigin& cv::gimpl::proto::origin_of(const cv::GProtoArg &arg)
+{
+    switch (arg.index())
+    {
+    case cv::GProtoArg::index_of<cv::GMat>():
+        return util::get<cv::GMat>(arg).priv();
+
+    case cv::GProtoArg::index_of<cv::GScalar>():
+        return util::get<cv::GScalar>(arg).priv();
+
+    case cv::GProtoArg::index_of<cv::detail::GArrayU>():
+        return util::get<cv::detail::GArrayU>(arg).priv();
+
+    default:
+        util::throw_error(std::logic_error("Unsupported GProtoArg type"));
+    }
+}
+
+const cv::GOrigin& cv::gimpl::proto::origin_of(const cv::GArg &arg)
+{
+    // Generic, but not a very efficient implementation
+    // FIXME: Walking a thin line here!!! Here we rely on the fact that GArg and
+    // GProtoArg share the same underlying object, which holds while objects
+    // are reference-counted, so the return value is not a reference to a temporary.
+    return origin_of(rewrap(arg));
+}
+
+bool cv::gimpl::proto::is_dynamic(const cv::GArg& arg)
+{
+    // FIXME: refactor this method to be auto-generated from
+    // - GProtoArg variant parameter pack, and
+    // - traits over every type
+    switch (arg.kind)
+    {
+    case detail::ArgKind::GMAT:
+    case detail::ArgKind::GSCALAR:
+    case detail::ArgKind::GARRAY:
+        return true;
+
+    default:
+        return false;
+    }
+}
+
+cv::GRunArg cv::value_of(const cv::GOrigin &origin)
+{
+    switch (origin.shape)
+    {
+    case GShape::GSCALAR: return GRunArg(util::get<cv::gapi::own::Scalar>(origin.value));
+    default: util::throw_error(std::logic_error("Unsupported shape for constant"));
+    }
+}
+
+cv::GProtoArg cv::gimpl::proto::rewrap(const cv::GArg &arg)
+{
+    // FIXME: replace with a more generic any->variant
+    // (or variant<T> -> variant<U>) conversion?
+    switch (arg.kind)
+    {
+    case detail::ArgKind::GMAT:    return GProtoArg(arg.get<cv::GMat>());
+    case detail::ArgKind::GSCALAR: return GProtoArg(arg.get<cv::GScalar>());
+    case detail::ArgKind::GARRAY:  return GProtoArg(arg.get<cv::detail::GArrayU>());
+    default: util::throw_error(std::logic_error("Unsupported GArg type"));
+    }
+}
+
+cv::GMetaArg cv::descr_of(const cv::GRunArg &arg)
+{
+    switch (arg.index())
+    {
+#if !defined(GAPI_STANDALONE)
+        case GRunArg::index_of<cv::Mat>():
+            return cv::GMetaArg(descr_of(util::get<cv::Mat>(arg)));
+
+        case GRunArg::index_of<cv::Scalar>():
+            return cv::GMetaArg(descr_of(util::get<cv::Scalar>(arg)));
+#endif // !defined(GAPI_STANDALONE)
+
+        case GRunArg::index_of<cv::gapi::own::Mat>():
+            return cv::GMetaArg(descr_of(util::get<cv::gapi::own::Mat>(arg)));
+
+        case GRunArg::index_of<cv::gapi::own::Scalar>():
+            return cv::GMetaArg(descr_of(util::get<cv::gapi::own::Scalar>(arg)));
+
+        case GRunArg::index_of<cv::detail::VectorRef>():
+            return cv::GMetaArg(util::get<cv::detail::VectorRef>(arg).descr_of());
+
+        default: util::throw_error(std::logic_error("Unsupported GRunArg type"));
+    }
+}
+
+cv::GMetaArgs cv::descr_of(const cv::GRunArgs &args)
+{
+    cv::GMetaArgs metas;
+    ade::util::transform(args, std::back_inserter(metas), [](const cv::GRunArg &arg){ return descr_of(arg); });
+    return metas;
+}
+
+cv::GMetaArg cv::descr_of(const cv::GRunArgP &argp)
+{
+    switch (argp.index())
+    {
+#if !defined(GAPI_STANDALONE)
+    case GRunArgP::index_of<cv::Mat*>():               return GMetaArg(descr_of(*util::get<cv::Mat*>(argp)));
+    case GRunArgP::index_of<cv::UMat*>():              return GMetaArg(descr_of(*util::get<cv::UMat*>(argp)));
+    case GRunArgP::index_of<cv::Scalar*>():            return GMetaArg(descr_of(*util::get<cv::Scalar*>(argp)));
+#endif //  !defined(GAPI_STANDALONE)
+    case GRunArgP::index_of<cv::gapi::own::Mat*>():    return GMetaArg(descr_of(*util::get<cv::gapi::own::Mat*>(argp)));
+    case GRunArgP::index_of<cv::gapi::own::Scalar*>(): return GMetaArg(descr_of(*util::get<cv::gapi::own::Scalar*>(argp)));
+    case GRunArgP::index_of<cv::detail::VectorRef>(): return GMetaArg(util::get<cv::detail::VectorRef>(argp).descr_of());
+    default: util::throw_error(std::logic_error("Unsupported GRunArgP type"));
+    }
+}
+
+namespace cv {
+std::ostream& operator<<(std::ostream& os, const cv::GMetaArg &arg)
+{
+    // FIXME: Implement via variant visitor
+    switch (arg.index())
+    {
+    case cv::GMetaArg::index_of<util::monostate>():
+        os << "(unresolved)";
+        break;
+
+    case cv::GMetaArg::index_of<cv::GMatDesc>():
+        os << util::get<cv::GMatDesc>(arg);
+        break;
+
+    case cv::GMetaArg::index_of<cv::GScalarDesc>():
+        os << util::get<cv::GScalarDesc>(arg);
+        break;
+
+    case cv::GMetaArg::index_of<cv::GArrayDesc>():
+        os << util::get<cv::GArrayDesc>(arg);
+        break;
+    default:
+        GAPI_Assert(false);
+    }
+
+    return os;
+}
+}
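
Illustrative sketch (not part of the diff) of descr_of() over run-time arguments as implemented above; cv::gin() packing a cv::Mat and a cv::Scalar into GRunArgs is assumed from garg.hpp in the non-standalone build.

    #include <opencv2/core.hpp>
    #include <opencv2/gapi/garg.hpp>
    #include <opencv2/gapi/gproto.hpp>

    void meta_example()
    {
        cv::Mat a(32, 32, CV_8UC1);
        cv::Scalar s(1.0);
        cv::GRunArgs ins = cv::gin(a, s);          // pack run-time arguments
        // Yields [GMatDesc{8U, 1, {32, 32}}, GScalarDesc]
        cv::GMetaArgs metas = cv::descr_of(ins);
    }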
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gproto_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gproto_priv.hpp
new file mode 100644 (file)
index 0000000..2684924
--- /dev/null
@@ -0,0 +1,35 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GPROTO_PRIV_HPP
+#define OPENCV_GAPI_GPROTO_PRIV_HPP
+
+#include "opencv2/gapi/gproto.hpp"
+#include "opencv2/gapi/garg.hpp"
+
+#include "api/gapi_priv.hpp"
+
+namespace cv {
+namespace gimpl {
+namespace proto {
+
+// These methods are used by GModelBuilder only
+// FIXME: Document semantics
+
+// FIXME: GAPI_EXPORTS because of tests only!
+// FIXME: Possible dangling reference alert!!!
+GAPI_EXPORTS const GOrigin& origin_of (const GProtoArg &arg);
+GAPI_EXPORTS const GOrigin& origin_of (const GArg      &arg);
+
+bool           is_dynamic(const GArg      &arg);
+GProtoArg      rewrap    (const GArg      &arg);
+
+} // proto
+} // gimpl
+} // cv
+
+#endif // OPENCV_GAPI_GPROTO_PRIV_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/gscalar.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/gscalar.cpp
new file mode 100644 (file)
index 0000000..30f3dc9
--- /dev/null
@@ -0,0 +1,73 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include "opencv2/gapi/gscalar.hpp"
+#include "opencv2/gapi/own/convert.hpp"
+#include "api/gapi_priv.hpp" // GOrigin
+
+// cv::GScalar public implementation ///////////////////////////////////////////
+cv::GScalar::GScalar()
+    : m_priv(new GOrigin(GShape::GSCALAR, cv::GNode::Param()))
+{
+}
+
+cv::GScalar::GScalar(const GNode &n, std::size_t out)
+    : m_priv(new GOrigin(GShape::GSCALAR, n, out))
+{
+}
+
+cv::GScalar::GScalar(const cv::gapi::own::Scalar& s)
+    : m_priv(new GOrigin(GShape::GSCALAR, cv::gimpl::ConstVal(s)))
+{
+}
+
+cv::GScalar::GScalar(cv::gapi::own::Scalar&& s)
+    : m_priv(new GOrigin(GShape::GSCALAR, cv::gimpl::ConstVal(std::move(s))))
+{
+}
+
+cv::GScalar::GScalar(double v0)
+    : m_priv(new GOrigin(GShape::GSCALAR, cv::gimpl::ConstVal(cv::gapi::own::Scalar(v0))))
+{
+}
+
+cv::GOrigin& cv::GScalar::priv()
+{
+    return *m_priv;
+}
+
+const cv::GOrigin& cv::GScalar::priv() const
+{
+    return *m_priv;
+}
+
+cv::GScalarDesc cv::descr_of(const cv::gapi::own::Scalar &)
+{
+    return empty_scalar_desc();
+}
+
+#if !defined(GAPI_STANDALONE)
+cv::GScalar::GScalar(const cv::Scalar& s)
+    : m_priv(new GOrigin(GShape::GSCALAR, cv::gimpl::ConstVal(to_own(s))))
+{
+}
+
+cv::GScalarDesc cv::descr_of(const cv::Scalar& s)
+{
+    return cv::descr_of(to_own(s));
+}
+#endif // !defined(GAPI_STANDALONE)
+
+namespace cv {
+std::ostream& operator<<(std::ostream& os, const cv::GScalarDesc &)
+{
+    os << "(scalar)";
+    return os;
+}
+}
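
Illustrative sketch (not part of the diff): the non-explicit GScalar(double) constructor above lets plain numbers appear where a GScalar is expected, e.g. in the threshold() factory added later in this commit.

    #include <opencv2/gapi.hpp>
    #include <opencv2/gapi/core.hpp>
    #include <opencv2/imgproc.hpp> // cv::THRESH_BINARY

    cv::GComputation make_threshold()
    {
        cv::GMat in;
        // 127 and 255 are implicitly wrapped into cv::GScalar by the ctor above
        cv::GMat out = cv::gapi::threshold(in, 127, 255, cv::THRESH_BINARY);
        return cv::GComputation(in, out);
    }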
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/kernels_core.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/kernels_core.cpp
new file mode 100644 (file)
index 0000000..c9fe19e
--- /dev/null
@@ -0,0 +1,359 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include "opencv2/gapi/gcall.hpp"
+#include "opencv2/gapi/gscalar.hpp"
+#include "opencv2/gapi/gkernel.hpp"
+#include "opencv2/gapi/core.hpp"
+
+#include <tuple>
+#include <numeric>
+
+namespace cv { namespace gapi {
+
+GMat add(const GMat& src1, const GMat& src2, int dtype)
+{
+    return core::GAdd::on(src1, src2, dtype);
+}
+
+GMat addC(const GMat& src1, const GScalar& c, int dtype)
+{
+    return core::GAddC::on(src1, c, dtype);
+}
+
+GMat addC(const GScalar& c, const GMat& src1, int dtype)
+{
+    return core::GAddC::on(src1, c, dtype);
+}
+
+GMat sub(const GMat& src1, const GMat& src2, int dtype)
+{
+    return core::GSub::on(src1, src2, dtype);
+}
+
+GMat subC(const GMat& src1, const GScalar& c, int dtype)
+{
+    return core::GSubC::on(src1, c, dtype);
+}
+
+GMat subRC(const GScalar& c, const GMat& src, int dtype)
+{
+    return core::GSubRC::on(c, src, dtype);
+}
+
+GMat mul(const GMat& src1, const GMat& src2, double scale, int dtype)
+{
+    return core::GMul::on(src1, src2, scale, dtype);
+}
+
+GMat mulC(const GMat& src, double scale, int dtype)
+{
+    return core::GMulCOld::on(src, scale, dtype);
+}
+
+GMat mulC(const GMat& src, const GScalar& multiplier, int dtype)
+{
+    return core::GMulC::on(src, multiplier, dtype);
+}
+
+GMat mulC(const GScalar& multiplier, const GMat& src, int dtype)
+{
+    return core::GMulC::on(src, multiplier, dtype);
+}
+
+GMat div(const GMat& src1, const GMat& src2, double scale, int dtype)
+{
+    return core::GDiv::on(src1, src2, scale, dtype);
+}
+
+GMat divC(const GMat& src, const GScalar& divisor, double scale, int dtype)
+{
+    return core::GDivC::on(src, divisor, scale, dtype);
+}
+
+GMat divRC(const GScalar& divident, const GMat& src, double scale, int dtype)
+{
+    return core::GDivRC::on(divident, src, scale, dtype);
+}
+
+GScalar mean(const GMat& src)
+{
+    return core::GMean::on(src);
+}
+
+GMat mask(const GMat& src, const GMat& mask)
+{
+    return core::GMask::on(src, mask);
+}
+
+std::tuple<GMat, GMat> polarToCart(const GMat& magnitude, const GMat& angle,
+                                   bool angleInDegrees)
+{
+    return core::GPolarToCart::on(magnitude, angle, angleInDegrees);
+}
+
+std::tuple<GMat, GMat> cartToPolar(const GMat& x, const GMat& y,
+                                   bool angleInDegrees)
+{
+    return core::GCartToPolar::on(x, y, angleInDegrees);
+}
+
+GMat phase(const GMat &x, const GMat &y, bool angleInDegrees)
+{
+    return core::GPhase::on(x, y, angleInDegrees);
+}
+
+GMat cmpGT(const GMat& src1, const GMat& src2)
+{
+    return core::GCmpGT::on(src1, src2);
+}
+
+GMat cmpLT(const GMat& src1, const GMat& src2)
+{
+    return core::GCmpLT::on(src1, src2);
+}
+
+GMat cmpGE(const GMat& src1, const GMat& src2)
+{
+    return core::GCmpGE::on(src1, src2);
+}
+
+GMat cmpLE(const GMat& src1, const GMat& src2)
+{
+    return core::GCmpLE::on(src1, src2);
+}
+
+GMat cmpEQ(const GMat& src1, const GMat& src2)
+{
+    return core::GCmpEQ::on(src1, src2);
+}
+
+GMat cmpNE(const GMat& src1, const GMat& src2)
+{
+    return core::GCmpNE::on(src1, src2);
+}
+
+GMat cmpGT(const GMat& src1, const GScalar& src2)
+{
+    return core::GCmpGTScalar::on(src1, src2);
+}
+
+GMat cmpLT(const GMat& src1, const GScalar& src2)
+{
+    return core::GCmpLTScalar::on(src1, src2);
+}
+
+GMat cmpGE(const GMat& src1, const GScalar& src2)
+{
+    return core::GCmpGEScalar::on(src1, src2);
+}
+
+GMat cmpLE(const GMat& src1, const GScalar& src2)
+{
+    return core::GCmpLEScalar::on(src1, src2);
+}
+
+GMat cmpEQ(const GMat& src1, const GScalar& src2)
+{
+    return core::GCmpEQScalar::on(src1, src2);
+}
+
+GMat cmpNE(const GMat& src1, const GScalar& src2)
+{
+    return core::GCmpNEScalar::on(src1, src2);
+}
+
+GMat min(const GMat& src1, const GMat& src2)
+{
+    return core::GMin::on(src1, src2);
+}
+
+GMat max(const GMat& src1, const GMat& src2)
+{
+    return core::GMax::on(src1, src2);
+}
+
+GMat absDiff(const GMat& src1, const GMat& src2)
+{
+    return core::GAbsDiff::on(src1, src2);
+}
+
+GMat absDiffC(const GMat& src, const GScalar& c)
+{
+    return core::GAbsDiffC::on(src, c);
+}
+
+GMat bitwise_and(const GMat& src1, const GMat& src2)
+{
+    return core::GAnd::on(src1, src2);
+}
+
+GMat bitwise_and(const GMat& src1, const GScalar& src2)
+{
+    return core::GAndS::on(src1, src2);
+}
+
+GMat bitwise_or(const GMat& src1, const GMat& src2)
+{
+    return core::GOr::on(src1, src2);
+}
+
+GMat bitwise_or(const GMat& src1, const GScalar& src2)
+{
+    return core::GOrS::on(src1, src2);
+}
+
+GMat bitwise_xor(const GMat& src1, const GMat& src2)
+{
+    return core::GXor::on(src1, src2);
+}
+
+GMat bitwise_xor(const GMat& src1, const GScalar& src2)
+{
+    return core::GXorS::on(src1, src2);
+}
+
+GMat bitwise_not(const GMat& src1)
+{
+    return core::GNot::on(src1);
+}
+
+GMat select(const GMat& src1, const GMat& src2, const GMat& mask)
+{
+    return core::GSelect::on(src1, src2, mask);
+}
+
+GScalar sum(const GMat& src)
+{
+    return core::GSum::on(src);
+}
+
+GMat addWeighted(const GMat& src1, double alpha, const GMat& src2, double beta, double gamma, int dtype)
+{
+    return core::GAddW::on(src1, alpha, src2, beta, gamma, dtype);
+}
+
+GScalar normL1(const GMat& src)
+{
+    return core::GNormL1::on(src);
+}
+
+GScalar normL2(const GMat& src)
+{
+    return core::GNormL2::on(src);
+}
+
+GScalar normInf(const GMat& src)
+{
+    return core::GNormInf::on(src);
+}
+
+std::tuple<GMat, GMat> integral(const GMat& src, int sdepth, int sqdepth)
+{
+    return core::GIntegral::on(src, sdepth, sqdepth);
+}
+
+GMat threshold(const GMat& src, const GScalar& thresh, const GScalar& maxval, int type)
+{
+    GAPI_Assert(type != cv::THRESH_TRIANGLE && type != cv::THRESH_OTSU);
+    return core::GThreshold::on(src, thresh, maxval, type);
+}
+
+std::tuple<GMat, GScalar> threshold(const GMat& src, const GScalar& maxval, int type)
+{
+    GAPI_Assert(type == cv::THRESH_TRIANGLE || type == cv::THRESH_OTSU);
+    return core::GThresholdOT::on(src, maxval, type);
+}
+
+GMat inRange(const GMat& src, const GScalar& threshLow, const GScalar& threshUp)
+{
+    return core::GInRange::on(src, threshLow, threshUp);
+}
+
+std::tuple<GMat, GMat, GMat> split3(const GMat& src)
+{
+    return core::GSplit3::on(src);
+}
+
+std::tuple<GMat, GMat, GMat, GMat> split4(const GMat& src)
+{
+    return core::GSplit4::on(src);
+}
+
+GMat merge3(const GMat& src1, const GMat& src2, const GMat& src3)
+{
+    return core::GMerge3::on(src1, src2, src3);
+}
+
+GMat merge4(const GMat& src1, const GMat& src2, const GMat& src3, const GMat& src4)
+{
+    return core::GMerge4::on(src1, src2, src3, src4);
+}
+
+GMat resize(const GMat& src, const Size& dsize, double fx, double fy, int interpolation)
+{
+    return core::GResize::on(src, dsize, fx, fy, interpolation);
+}
+
+GMat remap(const GMat& src, const Mat& map1, const Mat& map2,
+           int interpolation, int borderMode,
+           const Scalar& borderValue)
+{
+    return core::GRemap::on(src, map1, map2, interpolation, borderMode, borderValue);
+}
+
+GMat flip(const GMat& src, int flipCode)
+{
+    return core::GFlip::on(src, flipCode);
+}
+
+GMat crop(const GMat& src, const Rect& rect)
+{
+    return core::GCrop::on(src, rect);
+}
+
+GMat concatHor(const GMat& src1, const GMat& src2)
+{
+    return core::GConcatHor::on(src1, src2);
+}
+
+GMat concatHor(const std::vector<GMat>& v)
+{
+    GAPI_Assert(v.size() >= 2);
+    return std::accumulate(v.begin()+1, v.end(), v[0], core::GConcatHor::on);
+}
+
+GMat concatVert(const GMat& src1, const GMat& src2)
+{
+    return core::GConcatVert::on(src1, src2);
+}
+
+GMat concatVert(const std::vector<GMat>& v)
+{
+    GAPI_Assert(v.size() >= 2);
+    return std::accumulate(v.begin()+1, v.end(), v[0], core::GConcatVert::on);
+}
+
+GMat LUT(const GMat& src, const Mat& lut)
+{
+    return core::GLUT::on(src, lut);
+}
+
+GMat convertTo(const GMat& m, int rtype, double alpha, double beta)
+{
+    return core::GConvertTo::on(m, rtype, alpha, beta);
+}
+
+GMat sqrt(const GMat& src)
+{
+    return core::GSqrt::on(src);
+}
+
+} //namespace gapi
+} //namespace cv
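
Illustrative sketch (not part of the diff): a minimal pipeline built from the factory functions above. Each call only records a node in the graph; nothing is computed until the resulting GComputation is applied. A BGR input is assumed for the channel naming.

    #include <tuple>
    #include <opencv2/gapi.hpp>
    #include <opencv2/gapi/core.hpp>

    cv::GComputation make_core_pipeline()
    {
        cv::GMat in;
        cv::GMat b, g, r;
        std::tie(b, g, r) = cv::gapi::split3(in);   // GSplit3 node
        cv::GMat sum   = cv::gapi::add(b, g);       // GAdd node
        cv::GMat mean2 = cv::gapi::mulC(sum, 0.5);  // GMulCOld node
        return cv::GComputation(in, mean2);
    }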
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/kernels_imgproc.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/kernels_imgproc.cpp
new file mode 100644 (file)
index 0000000..7c4b522
--- /dev/null
@@ -0,0 +1,144 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include "opencv2/gapi/gscalar.hpp"
+#include "opencv2/gapi/gcall.hpp"
+#include "opencv2/gapi/gkernel.hpp"
+#include "opencv2/gapi/imgproc.hpp"
+
+namespace cv { namespace gapi {
+
+GMat sepFilter(const GMat& src, int ddepth, const Mat& kernelX, const Mat& kernelY, const Point& anchor,
+               const Scalar& delta, int borderType, const Scalar& borderVal)
+{
+    return imgproc::GSepFilter::on(src, ddepth, kernelX, kernelY, anchor, delta, borderType, borderVal);
+}
+
+GMat filter2D(const GMat& src, int ddepth, const Mat& kernel, const Point& anchor, const Scalar& delta, int borderType,
+              const Scalar& bordVal)
+{
+    return imgproc::GFilter2D::on(src, ddepth, kernel, anchor, delta, borderType, bordVal);
+}
+
+GMat boxFilter(const GMat& src, int dtype, const Size& ksize, const Point& anchor,
+               bool normalize, int borderType, const Scalar& bordVal)
+{
+    return imgproc::GBoxFilter::on(src, dtype, ksize, anchor, normalize, borderType, bordVal);
+}
+
+GMat blur(const GMat& src, const Size& ksize, const Point& anchor,
+               int borderType, const Scalar& bordVal)
+{
+    return imgproc::GBlur::on(src, ksize, anchor, borderType, bordVal);
+}
+
+GMat gaussianBlur(const GMat& src, const Size& ksize, double sigmaX, double sigmaY,
+                  int borderType, const Scalar& bordVal)
+{
+    return imgproc::GGaussBlur::on(src, ksize, sigmaX, sigmaY, borderType, bordVal);
+}
+
+GMat medianBlur(const GMat& src, int ksize)
+{
+    return imgproc::GMedianBlur::on(src, ksize);
+}
+
+GMat erode(const GMat& src, const Mat& kernel, const Point& anchor, int iterations,
+           int borderType, const Scalar& borderValue )
+{
+    return imgproc::GErode::on(src, kernel, anchor, iterations, borderType, borderValue);
+}
+
+GMat erode3x3(const GMat& src, int iterations,
+           int borderType, const Scalar& borderValue )
+{
+    return erode(src, cv::Mat(), cv::Point(-1, -1), iterations, borderType, borderValue);
+}
+
+GMat dilate(const GMat& src, const Mat& kernel, const Point& anchor, int iterations,
+            int borderType, const Scalar& borderValue)
+{
+    return imgproc::GDilate::on(src, kernel, anchor, iterations, borderType, borderValue);
+}
+
+GMat dilate3x3(const GMat& src, int iterations,
+            int borderType, const Scalar& borderValue)
+{
+    return dilate(src, cv::Mat(), cv::Point(-1,-1), iterations, borderType, borderValue);
+}
+
+GMat Sobel(const GMat& src, int ddepth, int dx, int dy, int ksize,
+           double scale, double delta,
+           int borderType, const Scalar& bordVal)
+{
+    return imgproc::GSobel::on(src, ddepth, dx, dy, ksize, scale, delta, borderType, bordVal);
+}
+
+GMat equalizeHist(const GMat& src)
+{
+    return imgproc::GEqHist::on(src);
+}
+
+GMat Canny(const GMat& src, double thr1, double thr2, int apertureSize, bool l2gradient)
+{
+    return imgproc::GCanny::on(src, thr1, thr2, apertureSize, l2gradient);
+}
+
+GMat RGB2Gray(const GMat& src)
+{
+    return imgproc::GRGB2Gray::on(src);
+}
+
+GMat RGB2Gray(const GMat& src, float rY, float gY, float bY)
+{
+    return imgproc::GRGB2GrayCustom::on(src, rY, gY, bY);
+}
+
+GMat BGR2Gray(const GMat& src)
+{
+    return imgproc::GBGR2Gray::on(src);
+}
+
+GMat RGB2YUV(const GMat& src)
+{
+    return imgproc::GRGB2YUV::on(src);
+}
+
+GMat BGR2LUV(const GMat& src)
+{
+    return imgproc::GBGR2LUV::on(src);
+}
+
+GMat LUV2BGR(const GMat& src)
+{
+    return imgproc::GLUV2BGR::on(src);
+}
+
+GMat BGR2YUV(const GMat& src)
+{
+    return imgproc::GBGR2YUV::on(src);
+}
+
+GMat YUV2BGR(const GMat& src)
+{
+    return imgproc::GYUV2BGR::on(src);
+}
+
+GMat YUV2RGB(const GMat& src)
+{
+    return imgproc::GYUV2RGB::on(src);
+}
+
+GMat RGB2Lab(const GMat& src)
+{
+    return imgproc::GRGB2Lab::on(src);
+}
+
+} //namespace gapi
+} //namespace cv
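
Illustrative sketch (not part of the diff) chaining the imgproc factories above into an edge-detection graph; header-side defaults for sigmaY, border handling, and Canny's aperture are assumed.

    #include <opencv2/gapi.hpp>
    #include <opencv2/gapi/imgproc.hpp>

    cv::GComputation make_edges()
    {
        cv::GMat in;                                      // expects a BGR input
        cv::GMat gray  = cv::gapi::BGR2Gray(in);          // GBGR2Gray
        cv::GMat blur  = cv::gapi::gaussianBlur(gray, cv::Size(3, 3), 1.0);
        cv::GMat edges = cv::gapi::Canny(blur, 32, 128);  // GCanny
        return cv::GComputation(in, edges);
    }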
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/api/operators.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/api/operators.cpp
new file mode 100644 (file)
index 0000000..44fc4fa
--- /dev/null
@@ -0,0 +1,213 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include "opencv2/gapi/imgproc.hpp"
+#include "opencv2/gapi/core.hpp"
+#include "opencv2/gapi/gscalar.hpp"
+#include "opencv2/gapi/operators.hpp"
+
+cv::GMat operator+(const cv::GMat& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::add(lhs, rhs);
+}
+
+cv::GMat operator+(const cv::GMat& lhs, const cv::GScalar& rhs)
+{
+    return cv::gapi::addC(lhs, rhs);
+}
+
+cv::GMat operator+(const cv::GScalar& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::addC(rhs, lhs);
+}
+
+cv::GMat operator-(const cv::GMat& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::sub(lhs, rhs);
+}
+
+cv::GMat operator-(const cv::GMat& lhs, const cv::GScalar& rhs)
+{
+    return cv::gapi::subC(lhs, rhs);
+}
+
+cv::GMat operator-(const cv::GScalar& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::subRC(lhs, rhs);
+}
+
+cv::GMat operator*(const cv::GMat& lhs, float rhs)
+{
+    return cv::gapi::mulC(lhs, static_cast<double>(rhs));
+}
+
+cv::GMat operator*(float lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::mulC(rhs, static_cast<double>(lhs));
+}
+
+cv::GMat operator*(const cv::GMat& lhs, const cv::GScalar& rhs)
+{
+    return cv::gapi::mulC(lhs, rhs);
+}
+
+cv::GMat operator*(const cv::GScalar& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::mulC(rhs, lhs);
+}
+
+cv::GMat operator/(const cv::GMat& lhs, const cv::GScalar& rhs)
+{
+    return cv::gapi::divC(lhs, rhs, 1.0);
+}
+
+cv::GMat operator/(const cv::GMat& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::div(lhs, rhs, 1.0);
+}
+
+cv::GMat operator/(const cv::GScalar& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::divRC(lhs, rhs, 1.0);
+}
+
+cv::GMat operator&(const cv::GMat& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::bitwise_and(lhs, rhs);
+}
+
+cv::GMat operator&(const cv::GMat& lhs, const cv::GScalar& rhs)
+{
+    return cv::gapi::bitwise_and(lhs, rhs);
+}
+
+cv::GMat operator&(const cv::GScalar& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::bitwise_and(rhs, lhs);
+}
+
+cv::GMat operator|(const cv::GMat& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::bitwise_or(lhs, rhs);
+}
+
+cv::GMat operator|(const cv::GMat& lhs, const cv::GScalar& rhs)
+{
+    return cv::gapi::bitwise_or(lhs, rhs);
+}
+
+cv::GMat operator|(const cv::GScalar& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::bitwise_or(rhs, lhs);
+}
+
+cv::GMat operator^(const cv::GMat& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::bitwise_xor(lhs, rhs);
+}
+
+cv::GMat operator^(const cv::GMat& lhs, const cv::GScalar& rhs)
+{
+    return cv::gapi::bitwise_xor(lhs, rhs);
+}
+
+cv::GMat operator^(const cv::GScalar& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::bitwise_xor(rhs, lhs);
+}
+
+cv::GMat operator~(const cv::GMat& lhs)
+{
+    return cv::gapi::bitwise_not(lhs);
+}
+
+cv::GMat operator>(const cv::GMat& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::cmpGT(lhs, rhs);
+}
+
+cv::GMat operator>=(const cv::GMat& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::cmpGE(lhs, rhs);
+}
+
+cv::GMat operator<(const cv::GMat& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::cmpLT(lhs, rhs);
+}
+
+cv::GMat operator<=(const cv::GMat& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::cmpLE(lhs, rhs);
+}
+
+cv::GMat operator==(const cv::GMat& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::cmpEQ(lhs, rhs);
+}
+
+cv::GMat operator!=(const cv::GMat& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::cmpNE(lhs, rhs);
+}
+
+cv::GMat operator>(const cv::GMat& lhs, const cv::GScalar& rhs)
+{
+    return cv::gapi::cmpGT(lhs, rhs);
+}
+
+cv::GMat operator>=(const cv::GMat& lhs, const cv::GScalar& rhs)
+{
+    return cv::gapi::cmpGE(lhs, rhs);
+}
+
+cv::GMat operator<(const cv::GMat& lhs, const cv::GScalar& rhs)
+{
+    return cv::gapi::cmpLT(lhs, rhs);
+}
+
+cv::GMat operator<=(const cv::GMat& lhs, const cv::GScalar& rhs)
+{
+    return cv::gapi::cmpLE(lhs, rhs);
+}
+
+cv::GMat operator==(const cv::GMat& lhs, const cv::GScalar& rhs)
+{
+    return cv::gapi::cmpEQ(lhs, rhs);
+}
+
+cv::GMat operator!=(const cv::GMat& lhs, const cv::GScalar& rhs)
+{
+    return cv::gapi::cmpNE(lhs, rhs);
+}
+
+cv::GMat operator>(const cv::GScalar& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::cmpLT(rhs, lhs);
+}
+cv::GMat operator>=(const cv::GScalar& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::cmpLE(rhs, lhs);
+}
+cv::GMat operator<(const cv::GScalar& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::cmpGT(rhs, lhs);
+}
+cv::GMat operator<=(const cv::GScalar& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::cmpGE(rhs, lhs);
+}
+cv::GMat operator==(const cv::GScalar& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::cmpEQ(rhs, lhs);
+}
+cv::GMat operator!=(const cv::GScalar& lhs, const cv::GMat& rhs)
+{
+    return cv::gapi::cmpNE(rhs, lhs);
+}
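
Illustrative sketch (not part of the diff) of the expression style these overloads enable; each operator forwards to the corresponding cv::gapi call defined above, and the integer literal is implicitly wrapped into a GScalar.

    #include <opencv2/gapi.hpp>
    #include <opencv2/gapi/operators.hpp>

    cv::GComputation make_mask()
    {
        cv::GMat a, b;
        cv::GMat diff = a - b;       // cv::gapi::sub
        cv::GMat mask = diff > 10;   // cv::gapi::cmpGT (Mat vs Scalar)
        cv::GMat out  = a & mask;    // cv::gapi::bitwise_and
        return cv::GComputation(cv::GIn(a, b), cv::GOut(out));
    }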
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/README.md b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/README.md
new file mode 100644 (file)
index 0000000..3aeeb1e
--- /dev/null
@@ -0,0 +1,2 @@
+This directory contains various G-API backends, which provide scheduling
+logic and kernel implementations for specific targets.
\ No newline at end of file
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gbackend.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gbackend.hpp
new file mode 100644 (file)
index 0000000..613022c
--- /dev/null
@@ -0,0 +1,106 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GBACKEND_HPP
+#define OPENCV_GAPI_GBACKEND_HPP
+
+#include <string>
+#include <memory>
+
+#include <ade/node.hpp>
+
+#include "opencv2/gapi/garg.hpp"
+#include "opencv2/gapi/own/mat.hpp"
+
+#include "opencv2/gapi/util/optional.hpp"
+#include "opencv2/gapi/own/scalar.hpp"
+
+#include "compiler/gmodel.hpp"
+
+namespace cv {
+namespace gimpl {
+
+    // Forward declarations
+    struct Data;
+    struct RcDesc;
+
+namespace magazine {
+    template<typename... Ts> struct Class
+    {
+        template<typename T> using MapT = std::unordered_map<int, T>;
+        template<typename T>       MapT<T>& slot()
+        {
+            return std::get<ade::util::type_list_index<T, Ts...>::value>(slots);
+        }
+        template<typename T> const MapT<T>& slot() const
+        {
+            return std::get<ade::util::type_list_index<T, Ts...>::value>(slots);
+        }
+    private:
+        std::tuple<MapT<Ts>...> slots;
+    };
+
+} // namespace magazine
+#if !defined(GAPI_STANDALONE)
+using Mag = magazine::Class<cv::gapi::own::Mat, cv::UMat, cv::gapi::own::Scalar, cv::detail::VectorRef>;
+#else
+using Mag = magazine::Class<cv::gapi::own::Mat, cv::gapi::own::Scalar, cv::detail::VectorRef>;
+#endif
+
+namespace magazine
+{
+    void         bindInArg (Mag& mag, const RcDesc &rc, const GRunArg  &arg, bool is_umat = false);
+    void         bindOutArg(Mag& mag, const RcDesc &rc, const GRunArgP &arg, bool is_umat = false);
+
+    void         resetInternalData(Mag& mag, const Data &d);
+    cv::GRunArg  getArg    (const Mag& mag, const RcDesc &ref);
+    cv::GRunArgP getObjPtr (      Mag& mag, const RcDesc &rc, bool is_umat = false);
+    void         writeBack (const Mag& mag, const RcDesc &rc, GRunArgP &g_arg, bool is_umat = false);
+} // namespace magazine
+
+namespace detail
+{
+template<typename... Ts> struct magazine
+{
+    template<typename T> using MapT = std::unordered_map<int, T>;
+    template<typename T>       MapT<T>& slot()
+    {
+        return std::get<util::type_list_index<T, Ts...>::value>(slots);
+    }
+    template<typename T> const MapT<T>& slot() const
+    {
+        return std::get<util::type_list_index<T, Ts...>::value>(slots);
+    }
+private:
+    std::tuple<MapT<Ts>...> slots;
+};
+} // namespace detail
+
+struct GRuntimeArgs
+{
+    GRunArgs   inObjs;
+    GRunArgsP outObjs;
+};
+
+template<typename T>
+inline cv::util::optional<T> getCompileArg(const cv::GCompileArgs &args)
+{
+    for (auto &compile_arg : args)
+    {
+        if (compile_arg.tag == cv::detail::CompileArgTag<T>::tag())
+        {
+            return cv::util::optional<T>(compile_arg.get<T>());
+        }
+    }
+    return cv::util::optional<T>();
+}
+
+
+
+}} // cv::gimpl
+
+#endif // OPENCV_GAPI_GBACKEND_HPP
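
A hypothetical sketch (not part of the diff) of how a backend could query compile arguments via getCompileArg() above. MyTuning, its tag string, and tuning_level() are invented here purely for illustration; the internal gbackend.hpp header is assumed to be on the include path.

    #include "backends/common/gbackend.hpp" // internal header: cv::gimpl::getCompileArg

    struct MyTuning { int level; };         // hypothetical compile argument
    namespace cv { namespace detail {
    template<> struct CompileArgTag<MyTuning>
    {
        static const char* tag() { return "sample.my_tuning"; }
    };
    }} // namespace cv::detail

    int tuning_level(const cv::GCompileArgs &args)
    {
        // Scans args for the tag registered above, returns an empty optional
        // if the argument was not passed by the user
        auto opt = cv::gimpl::getCompileArg<MyTuning>(args);
        return opt.has_value() ? opt.value().level : 0;
    }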
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gcompoundbackend.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gcompoundbackend.cpp
new file mode 100644 (file)
index 0000000..948898f
--- /dev/null
@@ -0,0 +1,20 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include "opencv2/gapi/gcompoundkernel.hpp" // compound::backend()
+
+#include "api/gbackend_priv.hpp"
+#include "compiler/gislandmodel.hpp" // GIslandExecutable
+
+cv::gapi::GBackend cv::gapi::compound::backend()
+{
+    // A pointer to a dummy Priv is used to uniquely identify backends
+    static cv::gapi::GBackend this_backend(std::make_shared<cv::gapi::GBackend::Priv>());
+    return this_backend;
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gcompoundkernel.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/common/gcompoundkernel.cpp
new file mode 100644 (file)
index 0000000..89abcef
--- /dev/null
@@ -0,0 +1,47 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <ade/util/zip_range.hpp>   // util::indexed
+#include "opencv2/gapi/gcompoundkernel.hpp"
+#include "compiler/gobjref.hpp"
+
+// FIXME move to backends
+
+cv::detail::GCompoundContext::GCompoundContext(const cv::GArgs& in_args)
+{
+    m_args.resize(in_args.size());
+    for (const auto& it : ade::util::indexed(in_args))
+    {
+        const auto& i      = ade::util::index(it);
+        const auto& in_arg = ade::util::value(it);
+
+        if (in_arg.kind != cv::detail::ArgKind::GOBJREF)
+        {
+            m_args[i] = in_arg;
+        }
+        else
+        {
+            const cv::gimpl::RcDesc &ref = in_arg.get<cv::gimpl::RcDesc>();
+            switch (ref.shape)
+            {
+                case GShape::GMAT   : m_args[i] = GArg(GMat());    break;
+                case GShape::GSCALAR: m_args[i] = GArg(GScalar()); break;
+                case GShape::GARRAY :/* do nothing - as handled in a special way, see gcompoundkernel.hpp for details */; break;
+                default: GAPI_Assert(false);
+            }
+        }
+    }
+    GAPI_Assert(m_args.size() == in_args.size());
+}
+
+cv::detail::GCompoundKernel::GCompoundKernel(const F& f) : m_f(f)
+{
+}
+
+void cv::detail::GCompoundKernel::apply(cv::detail::GCompoundContext& ctx) { m_f(ctx); }
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpubackend.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpubackend.cpp
new file mode 100644 (file)
index 0000000..5cc8bb0
--- /dev/null
@@ -0,0 +1,229 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <functional>
+#include <unordered_set>
+
+#include <ade/util/algorithm.hpp>
+
+#include <ade/util/range.hpp>
+#include <ade/util/zip_range.hpp>
+#include <ade/util/chain_range.hpp>
+
+#include <ade/typed_graph.hpp>
+
+#include "opencv2/gapi/gcommon.hpp"
+#include "opencv2/gapi/util/any.hpp"
+#include "opencv2/gapi/gtype_traits.hpp"
+
+#include "compiler/gobjref.hpp"
+#include "compiler/gmodel.hpp"
+
+#include "backends/cpu/gcpubackend.hpp"
+#include "backends/cpu/gcpuimgproc.hpp"
+#include "backends/cpu/gcpucore.hpp"
+
+#include "api/gbackend_priv.hpp" // FIXME: Make it part of Backend SDK!
+
+// FIXME: Is there a way to take a typed graph (our GModel),
+// and create a new typed graph _ATOP_ of that (by extending with a couple of
+// new types?).
+// Alternatively, is there a way to compose types graphs?
+//
+// If not, we need to introduce that!
+using GCPUModel = ade::TypedGraph
+    < cv::gimpl::Unit
+    , cv::gimpl::Protocol
+    >;
+
+// FIXME: Same issue with Typed and ConstTyped
+using GConstGCPUModel = ade::ConstTypedGraph
+    < cv::gimpl::Unit
+    , cv::gimpl::Protocol
+    >;
+
+namespace
+{
+    class GCPUBackendImpl final: public cv::gapi::GBackend::Priv
+    {
+        virtual void unpackKernel(ade::Graph            &graph,
+                                  const ade::NodeHandle &op_node,
+                                  const cv::GKernelImpl &impl) override
+        {
+            GCPUModel gm(graph);
+            auto cpu_impl = cv::util::any_cast<cv::GCPUKernel>(impl.opaque);
+            gm.metadata(op_node).set(cv::gimpl::Unit{cpu_impl});
+        }
+
+        virtual EPtr compile(const ade::Graph &graph,
+                             const cv::GCompileArgs &,
+                             const std::vector<ade::NodeHandle> &nodes) const override
+        {
+            return EPtr{new cv::gimpl::GCPUExecutable(graph, nodes)};
+        }
+   };
+}
+
+cv::gapi::GBackend cv::gapi::cpu::backend()
+{
+    static cv::gapi::GBackend this_backend(std::make_shared<GCPUBackendImpl>());
+    return this_backend;
+}
+
+// GCPUExecutable implementation ///////////////////////////////////////////////
+cv::gimpl::GCPUExecutable::GCPUExecutable(const ade::Graph &g,
+                                          const std::vector<ade::NodeHandle> &nodes)
+    : m_g(g), m_gm(m_g)
+{
+    // Convert list of operations (which is topologically sorted already)
+    // into an execution script.
+    for (auto &nh : nodes)
+    {
+        switch (m_gm.metadata(nh).get<NodeType>().t)
+        {
+        case NodeType::OP: m_script.push_back({nh, GModel::collectOutputMeta(m_gm, nh)}); break;
+        case NodeType::DATA:
+        {
+            m_dataNodes.push_back(nh);
+            const auto &desc = m_gm.metadata(nh).get<Data>();
+            if (desc.storage == Data::Storage::CONST)
+            {
+                auto rc = RcDesc{desc.rc, desc.shape, desc.ctor};
+                magazine::bindInArg(m_res, rc, m_gm.metadata(nh).get<ConstValue>().arg);
+            }
+            // Preallocate internal Mats in advance
+            if (desc.storage == Data::Storage::INTERNAL && desc.shape == GShape::GMAT)
+            {
+                const auto mat_desc = util::get<cv::GMatDesc>(desc.meta);
+                const auto type = CV_MAKETYPE(mat_desc.depth, mat_desc.chan);
+                m_res.slot<cv::gapi::own::Mat>()[desc.rc].create(mat_desc.size, type);
+            }
+            break;
+        }
+        default: util::throw_error(std::logic_error("Unsupported NodeType type"));
+        }
+    }
+}
+
+// FIXME: Document what it does
+cv::GArg cv::gimpl::GCPUExecutable::packArg(const GArg &arg)
+{
+    // No API placeholders allowed at this point
+    // FIXME: this check has to be done somewhere at the compilation stage.
+    GAPI_Assert(   arg.kind != cv::detail::ArgKind::GMAT
+              && arg.kind != cv::detail::ArgKind::GSCALAR
+              && arg.kind != cv::detail::ArgKind::GARRAY);
+
+    if (arg.kind != cv::detail::ArgKind::GOBJREF)
+    {
+        // All other cases - pass as-is, with no transformations to GArg contents.
+        return arg;
+    }
+    GAPI_Assert(arg.kind == cv::detail::ArgKind::GOBJREF);
+
+    // Wrap associated CPU object (either host or an internal one)
+    // FIXME: object can be moved out!!! GExecutor faced that.
+    const cv::gimpl::RcDesc &ref = arg.get<cv::gimpl::RcDesc>();
+    switch (ref.shape)
+    {
+    case GShape::GMAT:    return GArg(m_res.slot<cv::gapi::own::Mat>()   [ref.id]);
+    case GShape::GSCALAR: return GArg(m_res.slot<cv::gapi::own::Scalar>()[ref.id]);
+    // Note: .at() is intentional for GArray as object MUST be already there
+    //   (and constructed by either bindIn/Out or resetInternal)
+    case GShape::GARRAY:  return GArg(m_res.slot<cv::detail::VectorRef>().at(ref.id));
+    default:
+        util::throw_error(std::logic_error("Unsupported GShape type"));
+        break;
+    }
+}
+
+void cv::gimpl::GCPUExecutable::run(std::vector<InObj>  &&input_objs,
+                                    std::vector<OutObj> &&output_objs)
+{
+    // Update resources with run-time information - what this Island
+    // has received from the user (or from another Island, or a mix of both...)
+    // FIXME: Check input/output objects against GIsland protocol
+
+    for (auto& it : input_objs)   magazine::bindInArg (m_res, it.first, it.second);
+    for (auto& it : output_objs)  magazine::bindOutArg(m_res, it.first, it.second);
+
+    // Initialize (reset) internal data nodes with user structures
+    // before processing a frame (no need to do it for external data structures)
+    GModel::ConstGraph gm(m_g);
+    for (auto nh : m_dataNodes)
+    {
+        const auto &desc = gm.metadata(nh).get<Data>();
+
+        if (   desc.storage == Data::Storage::INTERNAL
+            && !util::holds_alternative<util::monostate>(desc.ctor))
+        {
+            // FIXME: Note that compile-time constant data objects (like
+            // a value-initialized GArray<T>) also satisfy this condition
+            // and should be excluded, but now we just don't support it
+            magazine::resetInternalData(m_res, desc);
+        }
+    }
+
+    // OpenCV backend execution is not rocket science at all.
+    // Simply invoke our kernels in the proper order.
+    GConstGCPUModel gcm(m_g);
+    for (auto &op_info : m_script)
+    {
+        const auto &op = m_gm.metadata(op_info.nh).get<Op>();
+
+        // Obtain our real execution unit
+        // TODO: Should kernels be copyable?
+        GCPUKernel k = gcm.metadata(op_info.nh).get<Unit>().k;
+
+        // Initialize kernel's execution context:
+        // - Input parameters
+        GCPUContext context;
+        context.m_args.reserve(op.args.size());
+
+        using namespace std::placeholders;
+        ade::util::transform(op.args,
+                          std::back_inserter(context.m_args),
+                          std::bind(&GCPUExecutable::packArg, this, _1));
+
+        // - Output parameters.
+        // FIXME: pre-allocate internal Mats, etc, according to the known meta
+        for (const auto &out_it : ade::util::indexed(op.outs))
+        {
+            // FIXME: Can the same GArg type resolution mechanism be reused here?
+            const auto out_port  = ade::util::index(out_it);
+            const auto out_desc  = ade::util::value(out_it);
+            context.m_results[out_port] = magazine::getObjPtr(m_res, out_desc);
+        }
+
+        // Now trigger the executable unit
+        k.apply(context);
+
+        // As kernels are forbidden to allocate memory for (Mat) outputs,
+        // this code seems redundant, at least for Mats
+        // FIXME: unify with cv::detail::ensure_out_mats_not_reallocated
+        for (const auto &out_it : ade::util::indexed(op_info.expected_out_metas))
+        {
+            const auto out_index      = ade::util::index(out_it);
+            const auto expected_meta  = ade::util::value(out_it);
+            const auto out_meta       = descr_of(context.m_results[out_index]);
+
+            if (expected_meta != out_meta)
+            {
+                util::throw_error
+                    (std::logic_error
+                     ("Output meta doesn't "
+                      "coincide with the generated meta\n"
+                      "Expected: " + ade::util::to_string(expected_meta) + "\n"
+                      "Actual  : " + ade::util::to_string(out_meta)));
+            }
+        }
+    } // for(m_script)
+
+    for (auto &it : output_objs) magazine::writeBack(m_res, it.first, it.second);
+}
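
Illustrative sketch (not part of the diff) of what ultimately drives GCPUExecutable::run() above: compiling a graph with the CPU kernel package and executing it on host cv::Mat data. cv::compile_args() and the CPU package factory are assumed from the public headers in this commit.

    #include <opencv2/gapi.hpp>
    #include <opencv2/gapi/core.hpp>
    #include <opencv2/gapi/cpu/core.hpp>

    void run_on_cpu()
    {
        cv::GMat in;
        cv::GComputation cc(in, cv::gapi::bitwise_not(in));

        cv::Mat input = cv::Mat::zeros(8, 8, CV_8UC1), output;
        // Selecting the CPU package routes every kernel to this backend,
        // so the run() method above is what executes the script
        cc.apply(input, output,
                 cv::compile_args(cv::gapi::core::cpu::kernels()));
    }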
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpubackend.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpubackend.hpp
new file mode 100644 (file)
index 0000000..6ce8c48
--- /dev/null
@@ -0,0 +1,72 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCPUBACKEND_HPP
+#define OPENCV_GAPI_GCPUBACKEND_HPP
+
+#include <map>                // map
+#include <unordered_map>      // unordered_map
+#include <tuple>              // tuple
+#include <ade/util/algorithm.hpp> // type_list_index
+
+#include "opencv2/gapi/garg.hpp"
+#include "opencv2/gapi/gproto.hpp"
+#include "opencv2/gapi/cpu/gcpukernel.hpp"
+
+
+#include "api/gapi_priv.hpp"
+#include "backends/common/gbackend.hpp"
+#include "compiler/gislandmodel.hpp"
+
+namespace cv { namespace gimpl {
+
+struct Unit
+{
+    static const char *name() { return "HostKernel"; }
+    GCPUKernel k;
+};
+
+class GCPUExecutable final: public GIslandExecutable
+{
+    const ade::Graph &m_g;
+    GModel::ConstGraph m_gm;
+
+    struct OperationInfo
+    {
+        ade::NodeHandle nh;
+        GMetaArgs expected_out_metas;
+    };
+
+    // Execution script, currently absolutely naive
+    std::vector<OperationInfo> m_script;
+    // List of all resources in graph (both internal and external)
+    std::vector<ade::NodeHandle> m_dataNodes;
+
+    // Actual data of all resources in graph (both internal and external)
+    Mag m_res;
+    GArg packArg(const GArg &arg);
+
+public:
+    GCPUExecutable(const ade::Graph                   &graph,
+                   const std::vector<ade::NodeHandle> &nodes);
+
+    virtual inline bool canReshape() const override { return false; }
+    virtual inline void reshape(ade::Graph&, const GCompileArgs&) override
+    {
+        // FIXME: CPU plugin is in fact reshapeable (as it was initially,
+        // even before outMeta() has been introduced), so this limitation
+        // should be dropped.
+        util::throw_error(std::logic_error("GCPUExecutable::reshape() should never be called"));
+    }
+
+    virtual void run(std::vector<InObj>  &&input_objs,
+                     std::vector<OutObj> &&output_objs) override;
+};
+
+}}
+
+#endif // OPENCV_GAPI_GCPUBACKEND_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpucore.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpucore.cpp
new file mode 100644 (file)
index 0000000..c42f863
--- /dev/null
@@ -0,0 +1,595 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include "opencv2/gapi/core.hpp"
+#include "opencv2/gapi/cpu/core.hpp"
+#include "backends/cpu/gcpucore.hpp"
+
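+// Each GAPI_OCV_KERNEL(Impl, API) block below implements a G-API operation
+// (e.g. cv::gapi::core::GAdd) in terms of a plain OpenCV call: the static
+// run() receives the inputs by const reference and writes into output
+// arguments which the framework pre-allocates before run() is invoked.
+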
+GAPI_OCV_KERNEL(GCPUAdd, cv::gapi::core::GAdd)
+{
+    static void run(const cv::Mat& a, const cv::Mat& b, int dtype, cv::Mat& out)
+    {
+        cv::add(a, b, out, cv::noArray(), dtype);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUAddC, cv::gapi::core::GAddC)
+{
+    static void run(const cv::Mat& a, const cv::Scalar& b, int dtype, cv::Mat& out)
+    {
+        cv::add(a, b, out, cv::noArray(), dtype);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUSub, cv::gapi::core::GSub)
+{
+    static void run(const cv::Mat& a, const cv::Mat& b, int dtype, cv::Mat& out)
+    {
+        cv::subtract(a, b, out, cv::noArray(), dtype);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUSubC, cv::gapi::core::GSubC)
+{
+    static void run(const cv::Mat& a, const cv::Scalar& b, int dtype, cv::Mat& out)
+    {
+        cv::subtract(a, b, out, cv::noArray(), dtype);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUSubRC, cv::gapi::core::GSubRC)
+{
+    static void run(const cv::Scalar& a, const cv::Mat& b, int dtype, cv::Mat& out)
+    {
+        cv::subtract(a, b, out, cv::noArray(), dtype);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUMul, cv::gapi::core::GMul)
+{
+    static void run(const cv::Mat& a, const cv::Mat& b, double scale, int dtype, cv::Mat& out)
+    {
+        cv::multiply(a, b, out, scale, dtype);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUMulCOld, cv::gapi::core::GMulCOld)
+{
+    static void run(const cv::Mat& a, double b, int dtype, cv::Mat& out)
+    {
+        cv::multiply(a, b, out, 1, dtype);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUMulC, cv::gapi::core::GMulC)
+{
+    static void run(const cv::Mat& a, const cv::Scalar& b, int dtype, cv::Mat& out)
+    {
+        cv::multiply(a, b, out, 1, dtype);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUDiv, cv::gapi::core::GDiv)
+{
+    static void run(const cv::Mat& a, const cv::Mat& b, double scale, int dtype, cv::Mat& out)
+    {
+        cv::divide(a, b, out, scale, dtype);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUDivC, cv::gapi::core::GDivC)
+{
+    static void run(const cv::Mat& a, const cv::Scalar& b, double scale, int dtype, cv::Mat& out)
+    {
+        cv::divide(a, b, out, scale, dtype);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUDivRC, cv::gapi::core::GDivRC)
+{
+    static void run(const cv::Scalar& a, const cv::Mat& b, double scale, int dtype, cv::Mat& out)
+    {
+        cv::divide(a, b, out, scale, dtype);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUMask, cv::gapi::core::GMask)
+{
+    static void run(const cv::Mat& in, const cv::Mat& mask, cv::Mat& out)
+    {
+        out = cv::Mat::zeros(in.size(), in.type());
+        in.copyTo(out, mask);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUMean, cv::gapi::core::GMean)
+{
+    static void run(const cv::Mat& in, cv::Scalar& out)
+    {
+        out = cv::mean(in);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUPolarToCart, cv::gapi::core::GPolarToCart)
+{
+    static void run(const cv::Mat& magn, const cv::Mat& angle, bool angleInDegrees, cv::Mat& outx, cv::Mat& outy)
+    {
+        cv::polarToCart(magn, angle, outx, outy, angleInDegrees);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCartToPolar, cv::gapi::core::GCartToPolar)
+{
+    static void run(const cv::Mat& x, const cv::Mat& y, bool angleInDegrees, cv::Mat& outmagn, cv::Mat& outangle)
+    {
+        cv::cartToPolar(x, y, outmagn, outangle, angleInDegrees);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUPhase, cv::gapi::core::GPhase)
+{
+    static void run(const cv::Mat &x, const cv::Mat &y, bool angleInDegrees, cv::Mat &out)
+    {
+        cv::phase(x, y, out, angleInDegrees);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCmpGT, cv::gapi::core::GCmpGT)
+{
+    static void run(const cv::Mat& a, const cv::Mat& b, cv::Mat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_GT);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCmpGE, cv::gapi::core::GCmpGE)
+{
+    static void run(const cv::Mat& a, const cv::Mat& b, cv::Mat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_GE);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCmpLE, cv::gapi::core::GCmpLE)
+{
+    static void run(const cv::Mat& a, const cv::Mat& b, cv::Mat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_LE);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCmpLT, cv::gapi::core::GCmpLT)
+{
+    static void run(const cv::Mat& a, const cv::Mat& b, cv::Mat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_LT);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCmpEQ, cv::gapi::core::GCmpEQ)
+{
+    static void run(const cv::Mat& a, const cv::Mat& b, cv::Mat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_EQ);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCmpNE, cv::gapi::core::GCmpNE)
+{
+    static void run(const cv::Mat& a, const cv::Mat& b, cv::Mat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_NE);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCmpGTScalar, cv::gapi::core::GCmpGTScalar)
+{
+    static void run(const cv::Mat& a, const cv::Scalar& b, cv::Mat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_GT);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCmpGEScalar, cv::gapi::core::GCmpGEScalar)
+{
+    static void run(const cv::Mat& a, const cv::Scalar& b, cv::Mat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_GE);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCmpLEScalar, cv::gapi::core::GCmpLEScalar)
+{
+    static void run(const cv::Mat& a, const cv::Scalar& b, cv::Mat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_LE);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCmpLTScalar, cv::gapi::core::GCmpLTScalar)
+{
+    static void run(const cv::Mat& a, const cv::Scalar& b, cv::Mat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_LT);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCmpEQScalar, cv::gapi::core::GCmpEQScalar)
+{
+    static void run(const cv::Mat& a, const cv::Scalar& b, cv::Mat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_EQ);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCmpNEScalar, cv::gapi::core::GCmpNEScalar)
+{
+    static void run(const cv::Mat& a, const cv::Scalar& b, cv::Mat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_NE);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUAnd, cv::gapi::core::GAnd)
+{
+    static void run(const cv::Mat& a, const cv::Mat& b, cv::Mat& out)
+    {
+        cv::bitwise_and(a, b, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUAndS, cv::gapi::core::GAndS)
+{
+    static void run(const cv::Mat& a, const cv::Scalar& b, cv::Mat& out)
+    {
+        cv::bitwise_and(a, b, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUOr, cv::gapi::core::GOr)
+{
+    static void run(const cv::Mat& a, const cv::Mat& b, cv::Mat& out)
+    {
+        cv::bitwise_or(a, b, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUOrS, cv::gapi::core::GOrS)
+{
+    static void run(const cv::Mat& a, const cv::Scalar& b, cv::Mat& out)
+    {
+        cv::bitwise_or(a, b, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUXor, cv::gapi::core::GXor)
+{
+    static void run(const cv::Mat& a, const cv::Mat& b, cv::Mat& out)
+    {
+        cv::bitwise_xor(a, b, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUXorS, cv::gapi::core::GXorS)
+{
+    static void run(const cv::Mat& a, const cv::Scalar& b, cv::Mat& out)
+    {
+        cv::bitwise_xor(a, b, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUNot, cv::gapi::core::GNot)
+{
+    static void run(const cv::Mat& a, cv::Mat& out)
+    {
+        cv::bitwise_not(a, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUSelect, cv::gapi::core::GSelect)
+{
+    static void run(const cv::Mat& src1, const cv::Mat& src2, const cv::Mat& mask, cv::Mat& out)
+    {
+        src2.copyTo(out);
+        src1.copyTo(out, mask);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUMin, cv::gapi::core::GMin)
+{
+    static void run(const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out)
+    {
+        out = cv::min(in1, in2);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUMax, cv::gapi::core::GMax)
+{
+    static void run(const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out)
+    {
+        out = cv::max(in1, in2);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUAbsDiff, cv::gapi::core::GAbsDiff)
+{
+    static void run(const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out)
+    {
+        cv::absdiff(in1, in2, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUAbsDiffC, cv::gapi::core::GAbsDiffC)
+{
+    static void run(const cv::Mat& in1, const cv::Scalar& in2, cv::Mat& out)
+    {
+        cv::absdiff(in1, in2, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUSum, cv::gapi::core::GSum)
+{
+    static void run(const cv::Mat& in, cv::Scalar& out)
+    {
+        out = cv::sum(in);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUAddW, cv::gapi::core::GAddW)
+{
+    static void run(const cv::Mat& in1, double alpha, const cv::Mat& in2, double beta, double gamma, int dtype, cv::Mat& out)
+    {
+        cv::addWeighted(in1, alpha, in2, beta, gamma, out, dtype);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUNormL1, cv::gapi::core::GNormL1)
+{
+    static void run(const cv::Mat& in, cv::Scalar& out)
+    {
+        out = cv::norm(in, cv::NORM_L1);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUNormL2, cv::gapi::core::GNormL2)
+{
+    static void run(const cv::Mat& in, cv::Scalar& out)
+    {
+        out = cv::norm(in, cv::NORM_L2);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUNormInf, cv::gapi::core::GNormInf)
+{
+    static void run(const cv::Mat& in, cv::Scalar& out)
+    {
+        out = cv::norm(in, cv::NORM_INF);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUIntegral, cv::gapi::core::GIntegral)
+{
+    static void run(const cv::Mat& in, int sdepth, int sqdepth, cv::Mat& out, cv::Mat& outSq)
+    {
+        cv::integral(in, out, outSq, sdepth, sqdepth);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUThreshold, cv::gapi::core::GThreshold)
+{
+    static void run(const cv::Mat& in, const cv::Scalar& a, const cv::Scalar& b, int type, cv::Mat& out)
+    {
+        cv::threshold(in, out, a.val[0], b.val[0], type);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUThresholdOT, cv::gapi::core::GThresholdOT)
+{
+    static void run(const cv::Mat& in, const cv::Scalar& b, int type, cv::Mat& out, cv::Scalar& outScalar)
+    {
+        outScalar = cv::threshold(in, out, b.val[0], b.val[0], type);
+    }
+};
+
+
+GAPI_OCV_KERNEL(GCPUInRange, cv::gapi::core::GInRange)
+{
+    static void run(const cv::Mat& in, const cv::Scalar& low, const cv::Scalar& up, cv::Mat& out)
+    {
+        cv::inRange(in, low, up, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUSplit3, cv::gapi::core::GSplit3)
+{
+    static void run(const cv::Mat& in, cv::Mat &m1, cv::Mat &m2, cv::Mat &m3)
+    {
+        std::vector<cv::Mat> outMats = {m1, m2, m3};
+        cv::split(in, outMats);
+
+        // Write back. FIXME: Write a helper or avoid this nonsense completely!
+        m1 = outMats[0];
+        m2 = outMats[1];
+        m3 = outMats[2];
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUSplit4, cv::gapi::core::GSplit4)
+{
+    static void run(const cv::Mat& in, cv::Mat &m1, cv::Mat &m2, cv::Mat &m3, cv::Mat &m4)
+    {
+        std::vector<cv::Mat> outMats = {m1, m2, m3, m4};
+        cv::split(in, outMats);
+
+        // Write back. FIXME: Write a helper or avoid this nonsense completely!
+        m1 = outMats[0];
+        m2 = outMats[1];
+        m3 = outMats[2];
+        m4 = outMats[3];
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUMerge3, cv::gapi::core::GMerge3)
+{
+    static void run(const cv::Mat& in1, const cv::Mat& in2, const cv::Mat& in3, cv::Mat &out)
+    {
+        std::vector<cv::Mat> inMats = {in1, in2, in3};
+        cv::merge(inMats, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUMerge4, cv::gapi::core::GMerge4)
+{
+    static void run(const cv::Mat& in1, const cv::Mat& in2, const cv::Mat& in3, const cv::Mat& in4, cv::Mat &out)
+    {
+        std::vector<cv::Mat> inMats = {in1, in2, in3, in4};
+        cv::merge(inMats, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUResize, cv::gapi::core::GResize)
+{
+    static void run(const cv::Mat& in, cv::Size sz, double fx, double fy, int interp, cv::Mat &out)
+    {
+        cv::resize(in, out, sz, fx, fy, interp);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPURemap, cv::gapi::core::GRemap)
+{
+    static void run(const cv::Mat& in, const cv::Mat& x, const cv::Mat& y, int a, int b, cv::Scalar s, cv::Mat& out)
+    {
+        cv::remap(in, out, x, y, a, b, s);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUFlip, cv::gapi::core::GFlip)
+{
+    static void run(const cv::Mat& in, int code, cv::Mat& out)
+    {
+        cv::flip(in, out, code);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCrop, cv::gapi::core::GCrop)
+{
+    static void run(const cv::Mat& in, cv::Rect rect, cv::Mat& out)
+    {
+        cv::Mat(in, rect).copyTo(out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUConcatHor, cv::gapi::core::GConcatHor)
+{
+    static void run(const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out)
+    {
+        cv::hconcat(in1, in2, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUConcatVert, cv::gapi::core::GConcatVert)
+{
+    static void run(const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out)
+    {
+        cv::vconcat(in1, in2, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPULUT, cv::gapi::core::GLUT)
+{
+    static void run(const cv::Mat& in, const cv::Mat& lut, cv::Mat& out)
+    {
+        cv::LUT(in, lut, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUConvertTo, cv::gapi::core::GConvertTo)
+{
+    static void run(const cv::Mat& in, int rtype, double alpha, double beta, cv::Mat& out)
+    {
+        in.convertTo(out, rtype, alpha, beta);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUSqrt, cv::gapi::core::GSqrt)
+{
+    static void run(const cv::Mat& in, cv::Mat &out)
+    {
+        cv::sqrt(in, out);
+    }
+};
+
+cv::gapi::GKernelPackage cv::gapi::core::cpu::kernels()
+{
+    static auto pkg = cv::gapi::kernels
+        <  GCPUAdd
+         , GCPUAddC
+         , GCPUSub
+         , GCPUSubC
+         , GCPUSubRC
+         , GCPUMul
+         , GCPUMulC
+         , GCPUMulCOld
+         , GCPUDiv
+         , GCPUDivC
+         , GCPUDivRC
+         , GCPUMean
+         , GCPUMask
+         , GCPUPolarToCart
+         , GCPUCartToPolar
+         , GCPUPhase
+         , GCPUCmpGT
+         , GCPUCmpGE
+         , GCPUCmpLE
+         , GCPUCmpLT
+         , GCPUCmpEQ
+         , GCPUCmpNE
+         , GCPUCmpGTScalar
+         , GCPUCmpGEScalar
+         , GCPUCmpLEScalar
+         , GCPUCmpLTScalar
+         , GCPUCmpEQScalar
+         , GCPUCmpNEScalar
+         , GCPUAnd
+         , GCPUAndS
+         , GCPUOr
+         , GCPUOrS
+         , GCPUXor
+         , GCPUXorS
+         , GCPUNot
+         , GCPUSelect
+         , GCPUMin
+         , GCPUMax
+         , GCPUAbsDiff
+         , GCPUAbsDiffC
+         , GCPUSum
+         , GCPUAddW
+         , GCPUNormL1
+         , GCPUNormL2
+         , GCPUNormInf
+         , GCPUIntegral
+         , GCPUThreshold
+         , GCPUThresholdOT
+         , GCPUInRange
+         , GCPUSplit3
+         , GCPUSplit4
+         , GCPUResize
+         , GCPUMerge3
+         , GCPUMerge4
+         , GCPURemap
+         , GCPUFlip
+         , GCPUCrop
+         , GCPUConcatHor
+         , GCPUConcatVert
+         , GCPULUT
+         , GCPUConvertTo
+         , GCPUSqrt
+         >();
+    return pkg;
+}
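+
+// Usage sketch (illustrative; `comp`, `in`, and `out` are assumed to be a
+// cv::GComputation and its input/output Mats defined elsewhere):
+//   cv::gapi::GKernelPackage pkg = cv::gapi::core::cpu::kernels();
+//   comp.apply(cv::gin(in), cv::gout(out), cv::compile_args(pkg));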
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpucore.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpucore.hpp
new file mode 100644 (file)
index 0000000..77e9e82
--- /dev/null
@@ -0,0 +1,24 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCPUCORE_HPP
+#define OPENCV_GAPI_GCPUCORE_HPP
+
+#include <map>
+#include <string>
+
+#include "opencv2/gapi/cpu/gcpukernel.hpp"
+
+namespace cv { namespace gimpl {
+
+// NB: This is what a "Kernel Package" from the original Wiki doc should be.
+void loadCPUCore(std::map<std::string, cv::GCPUKernel> &kmap);
+
+}
+}
+
+#endif // OPENCV_GAPI_GCPUCORE_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpuimgproc.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpuimgproc.cpp
new file mode 100644 (file)
index 0000000..d14584b
--- /dev/null
@@ -0,0 +1,273 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include "opencv2/gapi/imgproc.hpp"
+#include "opencv2/gapi/cpu/imgproc.hpp"
+#include "backends/cpu/gcpuimgproc.hpp"
+
+GAPI_OCV_KERNEL(GCPUSepFilter, cv::gapi::imgproc::GSepFilter)
+{
+    static void run(const cv::Mat& in, int ddepth, const cv::Mat& kernX, const cv::Mat& kernY, const cv::Point& anchor, const cv::Scalar& delta,
+                    int border, const cv::Scalar& bordVal, cv::Mat &out)
+    {
+        if( border == cv::BORDER_CONSTANT )
+        {
+            cv::Mat temp_in;
+            int width_add = (kernY.cols - 1) / 2;
+            int height_add =  (kernX.rows - 1) / 2;
+            cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, border, bordVal);
+            cv::Rect rect = cv::Rect(width_add, height_add, in.cols, in.rows);
+            cv::sepFilter2D(temp_in(rect), out, ddepth, kernX, kernY, anchor, delta.val[0], border);
+        }
+        else
+            cv::sepFilter2D(in, out, ddepth, kernX, kernY, anchor, delta.val[0], border);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUBoxFilter, cv::gapi::imgproc::GBoxFilter)
+{
+    static void run(const cv::Mat& in, int ddepth, const cv::Size& ksize, const cv::Point& anchor, bool normalize, int borderType, const cv::Scalar& bordVal, cv::Mat &out)
+    {
+        if( borderType == cv::BORDER_CONSTANT )
+        {
+            cv::Mat temp_in;
+            int width_add = (ksize.width - 1) / 2;
+            int height_add =  (ksize.height - 1) / 2;
+            cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, borderType, bordVal);
+            cv::Rect rect = cv::Rect(width_add, height_add, in.cols, in.rows);
+            cv::boxFilter(temp_in(rect), out, ddepth, ksize, anchor, normalize, borderType);
+        }
+        else
+            cv::boxFilter(in, out, ddepth, ksize, anchor, normalize, borderType);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUBlur, cv::gapi::imgproc::GBlur)
+{
+    static void run(const cv::Mat& in, const cv::Size& ksize, const cv::Point& anchor, int borderType, const cv::Scalar& bordVal, cv::Mat &out)
+    {
+        if( borderType == cv::BORDER_CONSTANT )
+        {
+            cv::Mat temp_in;
+            int width_add = (ksize.width - 1) / 2;
+            int height_add =  (ksize.height - 1) / 2;
+            cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, borderType, bordVal);
+            cv::Rect rect = cv::Rect(width_add, height_add, in.cols, in.rows);
+            cv::blur(temp_in(rect), out, ksize, anchor, borderType);
+        }
+        else
+            cv::blur(in, out, ksize, anchor, borderType);
+    }
+};
+
+
+GAPI_OCV_KERNEL(GCPUFilter2D, cv::gapi::imgproc::GFilter2D)
+{
+    static void run(const cv::Mat& in, int ddepth, const cv::Mat& k, const cv::Point& anchor, const cv::Scalar& delta, int border,
+                    const cv::Scalar& bordVal, cv::Mat &out)
+    {
+        if( border == cv::BORDER_CONSTANT )
+        {
+            cv::Mat temp_in;
+            int width_add = (k.cols - 1) / 2;
+            int height_add =  (k.rows - 1) / 2;
+            cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, border, bordVal );
+            cv::Rect rect = cv::Rect(width_add, height_add, in.cols, in.rows);
+            cv::filter2D(temp_in(rect), out, ddepth, k, anchor, delta.val[0], border);
+        }
+        else
+            cv::filter2D(in, out, ddepth, k, anchor, delta.val[0], border);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUGaussBlur, cv::gapi::imgproc::GGaussBlur)
+{
+    static void run(const cv::Mat& in, const cv::Size& ksize, double sigmaX, double sigmaY, int borderType, const cv::Scalar& bordVal, cv::Mat &out)
+    {
+        if( borderType == cv::BORDER_CONSTANT )
+        {
+            cv::Mat temp_in;
+            int width_add = (ksize.width - 1) / 2;
+            int height_add =  (ksize.height - 1) / 2;
+            cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, borderType, bordVal );
+            cv::Rect rect = cv::Rect(width_add, height_add, in.cols, in.rows);
+            cv::GaussianBlur(temp_in(rect), out, ksize, sigmaX, sigmaY, borderType);
+        }
+        else
+            cv::GaussianBlur(in, out, ksize, sigmaX, sigmaY, borderType);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUMedianBlur, cv::gapi::imgproc::GMedianBlur)
+{
+    static void run(const cv::Mat& in, int ksize, cv::Mat &out)
+    {
+        cv::medianBlur(in, out, ksize);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUErode, cv::gapi::imgproc::GErode)
+{
+    static void run(const cv::Mat& in, const cv::Mat& kernel, const cv::Point& anchor, int iterations, int borderType, const cv::Scalar& borderValue, cv::Mat &out)
+    {
+        cv::erode(in, out, kernel, anchor, iterations, borderType, borderValue);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUDilate, cv::gapi::imgproc::GDilate)
+{
+    static void run(const cv::Mat& in, const cv::Mat& kernel, const cv::Point& anchor, int iterations, int borderType, const cv::Scalar& borderValue, cv::Mat &out)
+    {
+        cv::dilate(in, out, kernel, anchor, iterations, borderType, borderValue);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUSobel, cv::gapi::imgproc::GSobel)
+{
+    static void run(const cv::Mat& in, int ddepth, int dx, int dy, int ksize, double scale, double delta, int borderType,
+                    const cv::Scalar& bordVal, cv::Mat &out)
+    {
+        if( borderType == cv::BORDER_CONSTANT )
+        {
+            cv::Mat temp_in;
+            int add = (ksize - 1) / 2;
+            cv::copyMakeBorder(in, temp_in, add, add, add, add, borderType, bordVal );
+            cv::Rect rect = cv::Rect(add, add, in.cols, in.rows);
+            cv::Sobel(temp_in(rect), out, ddepth, dx, dy, ksize, scale, delta, borderType);
+        }
+        else
+            cv::Sobel(in, out, ddepth, dx, dy, ksize, scale, delta, borderType);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUEqualizeHist, cv::gapi::imgproc::GEqHist)
+{
+    static void run(const cv::Mat& in, cv::Mat &out)
+    {
+        cv::equalizeHist(in, out);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUCanny, cv::gapi::imgproc::GCanny)
+{
+    static void run(const cv::Mat& in, double thr1, double thr2, int apSize, bool l2gradient, cv::Mat &out)
+    {
+        cv::Canny(in, out, thr1, thr2, apSize, l2gradient);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPURGB2YUV, cv::gapi::imgproc::GRGB2YUV)
+{
+    static void run(const cv::Mat& in, cv::Mat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_RGB2YUV);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUYUV2RGB, cv::gapi::imgproc::GYUV2RGB)
+{
+    static void run(const cv::Mat& in, cv::Mat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_YUV2RGB);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPURGB2Lab, cv::gapi::imgproc::GRGB2Lab)
+{
+    static void run(const cv::Mat& in, cv::Mat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_RGB2Lab);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUBGR2LUV, cv::gapi::imgproc::GBGR2LUV)
+{
+    static void run(const cv::Mat& in, cv::Mat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_BGR2Luv);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUBGR2YUV, cv::gapi::imgproc::GBGR2YUV)
+{
+    static void run(const cv::Mat& in, cv::Mat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_BGR2YUV);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPULUV2BGR, cv::gapi::imgproc::GLUV2BGR)
+{
+    static void run(const cv::Mat& in, cv::Mat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_Luv2BGR);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUYUV2BGR, cv::gapi::imgproc::GYUV2BGR)
+{
+    static void run(const cv::Mat& in, cv::Mat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_YUV2BGR);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPURGB2Gray, cv::gapi::imgproc::GRGB2Gray)
+{
+    static void run(const cv::Mat& in, cv::Mat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_RGB2GRAY);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPUBGR2Gray, cv::gapi::imgproc::GBGR2Gray)
+{
+    static void run(const cv::Mat& in, cv::Mat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_BGR2GRAY);
+    }
+};
+
+GAPI_OCV_KERNEL(GCPURGB2GrayCustom, cv::gapi::imgproc::GRGB2GrayCustom)
+{
+    static void run(const cv::Mat& in, float rY, float bY, float gY, cv::Mat &out)
+    {
+        cv::Mat planes[3];
+        cv::split(in, planes);
+        out = planes[0]*rY + planes[1]*bY + planes[2]*gY;
+    }
+};
+
+cv::gapi::GKernelPackage cv::gapi::imgproc::cpu::kernels()
+{
+    static auto pkg = cv::gapi::kernels
+        < GCPUFilter2D
+        , GCPUSepFilter
+        , GCPUBoxFilter
+        , GCPUBlur
+        , GCPUGaussBlur
+        , GCPUMedianBlur
+        , GCPUErode
+        , GCPUDilate
+        , GCPUSobel
+        , GCPUCanny
+        , GCPUEqualizeHist
+        , GCPURGB2YUV
+        , GCPUYUV2RGB
+        , GCPURGB2Lab
+        , GCPUBGR2LUV
+        , GCPUBGR2YUV
+        , GCPUYUV2BGR
+        , GCPULUV2BGR
+        , GCPUBGR2Gray
+        , GCPURGB2Gray
+        , GCPURGB2GrayCustom
+        >();
+    return pkg;
+}
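+
+// Usage sketch (illustrative): imgproc kernels are usually combined with the
+// core package before compilation; assuming the combine()/unite_policy API
+// available in this G-API revision:
+//   auto pkg = cv::gapi::combine(cv::gapi::core::cpu::kernels(),
+//                                cv::gapi::imgproc::cpu::kernels(),
+//                                cv::unite_policy::KEEP);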
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpuimgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpuimgproc.hpp
new file mode 100644 (file)
index 0000000..172871a
--- /dev/null
@@ -0,0 +1,23 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCPUIMGPROC_HPP
+#define OPENCV_GAPI_GCPUIMGPROC_HPP
+
+#include <map>
+#include <string>
+
+#include "opencv2/gapi/cpu/gcpukernel.hpp"
+
+namespace cv { namespace gimpl {
+
+// NB: This is what a "Kernel Package" from the original Wiki doc should be.
+void loadCPUImgProc(std::map<std::string, cv::GCPUKernel> &kmap);
+
+}}
+
+#endif // OPENCV_GAPI_GCPUIMGPROC_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpukernel.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/cpu/gcpukernel.cpp
new file mode 100644 (file)
index 0000000..af13eed
--- /dev/null
@@ -0,0 +1,52 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <cassert>
+
+#include "opencv2/gapi/cpu/gcpukernel.hpp"
+
+const cv::gapi::own::Mat& cv::GCPUContext::inMat(int input)
+{
+    return inArg<cv::gapi::own::Mat>(input);
+}
+
+cv::gapi::own::Mat&  cv::GCPUContext::outMatR(int output)
+{
+    return *util::get<cv::gapi::own::Mat*>(m_results.at(output));
+}
+
+const cv::gapi::own::Scalar& cv::GCPUContext::inVal(int input)
+{
+    return inArg<cv::gapi::own::Scalar>(input);
+}
+
+cv::gapi::own::Scalar& cv::GCPUContext::outValR(int output)
+{
+    return *util::get<cv::gapi::own::Scalar*>(m_results.at(output));
+}
+
+cv::detail::VectorRef& cv::GCPUContext::outVecRef(int output)
+{
+    return util::get<cv::detail::VectorRef>(m_results.at(output));
+}
+
+cv::GCPUKernel::GCPUKernel()
+{
+}
+
+cv::GCPUKernel::GCPUKernel(const GCPUKernel::F &f)
+    : m_f(f)
+{
+}
+
+void cv::GCPUKernel::apply(GCPUContext &ctx)
+{
+    GAPI_Assert(m_f);
+    m_f(ctx);
+}
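+
+// Wiring sketch (illustrative, not the actual GAPI_OCV_KERNEL expansion):
+// the macro generates a callable which unpacks GCPUContext slots and forwards
+// them to the user-defined static run(), roughly (type conversions omitted):
+//   cv::GCPUKernel k([](cv::GCPUContext &ctx){
+//       MyAddImpl::run(ctx.inMat(0), ctx.inMat(1), ctx.outMatR(0));
+//   });
+//   k.apply(ctx); // invokes the stored callable via m_f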
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbackend.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbackend.cpp
new file mode 100644 (file)
index 0000000..e6eaaae
--- /dev/null
@@ -0,0 +1,1383 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <functional>
+#include <iostream>
+#include <iomanip> // std::fixed, std::setprecision
+#include <unordered_set>
+#include <stack>
+
+#include <ade/util/algorithm.hpp>
+#include <ade/util/chain_range.hpp>
+#include <ade/util/range.hpp>
+#include <ade/util/zip_range.hpp>
+
+#include <ade/typed_graph.hpp>
+#include <ade/execution_engine/execution_engine.hpp>
+
+#include "opencv2/gapi/gcommon.hpp"
+#include "logger.hpp"
+
+#include "opencv2/gapi/own/convert.hpp"
+#include "opencv2/gapi/gmat.hpp"    //for version of descr_of
+// PRIVATE STUFF!
+#include "compiler/gobjref.hpp"
+#include "compiler/gmodel.hpp"
+
+#include "backends/fluid/gfluidbuffer_priv.hpp"
+#include "backends/fluid/gfluidbackend.hpp"
+
+#include "api/gbackend_priv.hpp" // FIXME: Make it part of Backend SDK!
+
+// FIXME: Is there a way to take a typed graph (our GModel),
+// and create a new typed graph _ATOP_ of that (by extending it with a couple
+// of new types)?
+// Alternatively, is there a way to compose typed graphs?
+//
+// If not, we need to introduce that!
+using GFluidModel = ade::TypedGraph
+    < cv::gimpl::FluidUnit
+    , cv::gimpl::FluidData
+    , cv::gimpl::Protocol
+    , cv::gimpl::FluidUseOwnBorderBuffer
+    >;
+
+// FIXME: Same issue with Typed and ConstTyped
+using GConstFluidModel = ade::ConstTypedGraph
+    < cv::gimpl::FluidUnit
+    , cv::gimpl::FluidData
+    , cv::gimpl::Protocol
+    , cv::gimpl::FluidUseOwnBorderBuffer
+    >;
+
+// FluidBackend middle-layer implementation ////////////////////////////////////
+namespace
+{
+    class GFluidBackendImpl final: public cv::gapi::GBackend::Priv
+    {
+        virtual void unpackKernel(ade::Graph            &graph,
+                                  const ade::NodeHandle &op_node,
+                                  const cv::GKernelImpl &impl) override
+        {
+            GFluidModel fm(graph);
+            auto fluid_impl = cv::util::any_cast<cv::GFluidKernel>(impl.opaque);
+            fm.metadata(op_node).set(cv::gimpl::FluidUnit{fluid_impl, {}, 0, 0, 0.0});
+        }
+
+        virtual EPtr compile(const ade::Graph &graph,
+                             const cv::GCompileArgs &args,
+                             const std::vector<ade::NodeHandle> &nodes) const override
+        {
+            using namespace cv::gimpl;
+            GModel::ConstGraph g(graph);
+            auto isl_graph = g.metadata().get<IslandModel>().model;
+            GIslandModel::Graph gim(*isl_graph);
+
+            const auto num_islands = std::count_if
+                (gim.nodes().begin(), gim.nodes().end(),
+                 [&](const ade::NodeHandle &nh) {
+                    return gim.metadata(nh).get<NodeKind>().k == NodeKind::ISLAND;
+                });
+
+            const auto out_rois = cv::gimpl::getCompileArg<cv::GFluidOutputRois>(args);
+            if (num_islands > 1 && out_rois.has_value())
+                cv::util::throw_error(std::logic_error("GFluidOutputRois feature supports only one-island graphs"));
+
+            auto rois = out_rois.value_or(cv::GFluidOutputRois());
+            return EPtr{new cv::gimpl::GFluidExecutable(graph, nodes, std::move(rois.rois))};
+        }
+
+        virtual void addBackendPasses(ade::ExecutionEngineSetupContext &ectx) override;
+
+    };
+}
+
+cv::gapi::GBackend cv::gapi::fluid::backend()
+{
+    static cv::gapi::GBackend this_backend(std::make_shared<GFluidBackendImpl>());
+    return this_backend;
+}
+
+// FluidAgent implementation ///////////////////////////////////////////////////
+
+namespace cv { namespace gimpl {
+struct FluidMapper
+{
+    FluidMapper(double ratio, int lpi) : m_ratio(ratio), m_lpi(lpi) {}
+    virtual ~FluidMapper() = default;
+    virtual int firstWindow(int outCoord, int lpi) const = 0;
+    virtual std::pair<int,int> linesReadAndNextWindow(int outCoord, int lpi) const = 0;
+
+protected:
+    double m_ratio = 0.0;
+    int    m_lpi   = 0;
+};
+
+struct FluidDownscaleMapper : public FluidMapper
+{
+    virtual int firstWindow(int outCoord, int lpi) const override;
+    virtual std::pair<int,int> linesReadAndNextWindow(int outCoord, int lpi) const override;
+    using FluidMapper::FluidMapper;
+};
+
+struct FluidUpscaleMapper : public FluidMapper
+{
+    virtual int firstWindow(int outCoord, int lpi) const override;
+    virtual std::pair<int,int> linesReadAndNextWindow(int outCoord, int lpi) const override;
+    FluidUpscaleMapper(double ratio, int lpi, int inHeight) : FluidMapper(ratio, lpi), m_inHeight(inHeight) {}
+private:
+    int m_inHeight = 0;
+};
+
+struct FluidFilterAgent : public FluidAgent
+{
+private:
+    virtual int firstWindow() const override;
+    virtual std::pair<int,int> linesReadAndnextWindow() const override;
+    virtual void setRatio(double) override { /* nothing */ }
+public:
+    using FluidAgent::FluidAgent;
+};
+
+struct FluidResizeAgent : public FluidAgent
+{
+private:
+    virtual int firstWindow() const override;
+    virtual std::pair<int,int> linesReadAndnextWindow() const override;
+    virtual void setRatio(double ratio) override;
+
+    std::unique_ptr<FluidMapper> m_mapper;
+public:
+    using FluidAgent::FluidAgent;
+};
+}} // namespace cv::gimpl
+
+cv::gimpl::FluidAgent::FluidAgent(const ade::Graph &g, ade::NodeHandle nh)
+    : k(GConstFluidModel(g).metadata(nh).get<FluidUnit>().k)        // init(0)
+    , op_handle(nh)                                                 // init(1)
+    , op_name(GModel::ConstGraph(g).metadata(nh).get<Op>().k.name)  // init(2)
+{
+    std::set<int> out_w;
+    std::set<int> out_h;
+    GModel::ConstGraph cm(g);
+    for (auto out_data : nh->outNodes())
+    {
+        const auto  &d      = cm.metadata(out_data).get<Data>();
+        cv::GMatDesc d_meta = cv::util::get<cv::GMatDesc>(d.meta);
+        out_w.insert(d_meta.size.width);
+        out_h.insert(d_meta.size.height);
+    }
+
+    // Different output sizes are not supported
+    GAPI_Assert(out_w.size() == 1 && out_h.size() == 1);
+}
+
+void cv::gimpl::FluidAgent::reset()
+{
+    m_producedLines = 0;
+
+    auto lines = firstWindow();
+    for (auto &v : in_views)
+    {
+        if (v)
+        {
+            v.priv().reset(lines);
+        }
+    }
+}
+
+namespace {
+static int calcGcd (int n1, int n2)
+{
+    return (n2 == 0) ? n1 : calcGcd (n2, n1 % n2);
+}
+
+// This is an empirical formula, and it is not 100% guaranteed
+// to produce correct results in all possible cases
+// FIXME:
+// prove correctness or switch to some trusted method
+//
+// When performing a resize, input/output pixels form a cyclic
+// pattern where inH/gcd input pixels are mapped to outH/gcd
+// output pixels (the pattern repeats gcd times).
+//
+// An output pixel can partially cover some of the input pixels.
+// There are 3 possible cases:
+//
+// :___ ___:    :___ _:_ ___:    :___ __: ___ :__ ___:
+// |___|___|    |___|_:_|___|    |___|__:|___|:__|___|
+// :       :    :     :     :    :      :     :      :
+//
+// 1) No partial coverage, max window = scaleFactor;
+// 2) Partial coverage occurs on the one side of the output pixel,
+//    max window = scaleFactor + 1;
+// 3) Partial coverage occurs at both sides of the output pixel,
+//    max window = scaleFactor + 2;
+//
+// The type of coverage is determined by the remainder of the
+// inPeriodH/outPeriodH division, but this is a heuristic
+// (no proof of the opposite has been found so far).
+
+static int calcResizeWindow(int inH, int outH)
+{
+    GAPI_Assert(inH >= outH);
+    auto gcd = calcGcd(inH, outH);
+    int  inPeriodH =  inH/gcd;
+    int outPeriodH = outH/gcd;
+    int scaleFactor = inPeriodH / outPeriodH;
+
+    switch ((inPeriodH) % (outPeriodH))
+    {
+    case 0:  return scaleFactor;     break;
+    case 1:  return scaleFactor + 1; break;
+    default: return scaleFactor + 2;
+    }
+}
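+
+// Worked example (illustrative): inH = 6, outH = 4 gives gcd = 2,
+// inPeriodH = 3, outPeriodH = 2, scaleFactor = 1; since 3 % 2 == 1,
+// the window is scaleFactor + 1 = 2. Indeed, with ratio 1.5 output
+// row 0 reads input rows [0;2) and output row 1 reads rows [1;3),
+// i.e. at most two input rows per output row.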
+
+static int maxLineConsumption(const cv::GFluidKernel& k, int inH, int outH, int lpi)
+{
+    switch (k.m_kind)
+    {
+    case cv::GFluidKernel::Kind::Filter: return k.m_window + lpi - 1; break;
+    case cv::GFluidKernel::Kind::Resize:
+    {
+        if  (inH >= outH)
+        {
+            // FIXME:
+            // This is a suboptimal value, can be reduced
+            return calcResizeWindow(inH, outH) * lpi;
+        }
+        else
+        {
+            // FIXME:
+            // This is a suboptimal value, can be reduced
+            return (inH == 1) ? 1 : 2 + lpi - 1;
+        }
+    } break;
+    default: GAPI_Assert(false); return 0;
+    }
+}
+
+static int borderSize(const cv::GFluidKernel& k)
+{
+    switch (k.m_kind)
+    {
+    case cv::GFluidKernel::Kind::Filter: return (k.m_window - 1) / 2; break;
+    // Resize never reads from border pixels
+    case cv::GFluidKernel::Kind::Resize: return 0; break;
+    default: GAPI_Assert(false); return 0;
+    }
+}
+
+inline double inCoord(int outIdx, double ratio)
+{
+    return outIdx * ratio;
+}
+
+inline int windowStart(int outIdx, double ratio)
+{
+    return static_cast<int>(inCoord(outIdx, ratio) + 1e-3);
+}
+
+inline int windowEnd(int outIdx, double ratio)
+{
+    return static_cast<int>(std::ceil(inCoord(outIdx + 1, ratio) - 1e-3));
+}
+
+inline double inCoordUpscale(int outCoord, double ratio)
+{
+    // Calculate the projection of output pixel's center
+    return (outCoord + 0.5) * ratio - 0.5;
+}
+
+inline int upscaleWindowStart(int outCoord, double ratio)
+{
+    int start = static_cast<int>(inCoordUpscale(outCoord, ratio));
+    GAPI_DbgAssert(start >= 0);
+    return start;
+}
+
+inline int upscaleWindowEnd(int outCoord, double ratio, int inSz)
+{
+    int end = static_cast<int>(std::ceil(inCoordUpscale(outCoord, ratio)) + 1);
+    if (end > inSz)
+    {
+        end = inSz;
+    }
+    return end;
+}
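+
+// Worked example (illustrative): for a 2x upscale, ratio = 0.5 and output
+// row 3 projects to inCoordUpscale(3, 0.5) = 1.25, so the window is
+// [upscaleWindowStart; upscaleWindowEnd) = [1; 3) and the output row is
+// interpolated between input rows 1 and 2.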
+} // anonymous namespace
+
+int cv::gimpl::FluidDownscaleMapper::firstWindow(int outCoord, int lpi) const
+{
+    return windowEnd(outCoord + lpi - 1, m_ratio) - windowStart(outCoord, m_ratio);
+}
+
+std::pair<int,int> cv::gimpl::FluidDownscaleMapper::linesReadAndNextWindow(int outCoord, int lpi) const
+{
+    auto nextStartIdx = outCoord + 1 + m_lpi - 1;
+    auto nextEndIdx   = nextStartIdx + lpi - 1;
+
+    auto currStart = windowStart(outCoord, m_ratio);
+    auto nextStart = windowStart(nextStartIdx, m_ratio);
+    auto nextEnd   = windowEnd(nextEndIdx, m_ratio);
+
+    auto lines_read = nextStart - currStart;
+    auto next_window = nextEnd - nextStart;
+
+    return std::make_pair(lines_read, next_window);
+}
+
+int cv::gimpl::FluidUpscaleMapper::firstWindow(int outCoord, int lpi) const
+{
+    return upscaleWindowEnd(outCoord + lpi - 1, m_ratio, m_inHeight) - upscaleWindowStart(outCoord, m_ratio);
+}
+
+std::pair<int,int> cv::gimpl::FluidUpscaleMapper::linesReadAndNextWindow(int outCoord, int lpi) const
+{
+    auto nextStartIdx = outCoord + 1 + m_lpi - 1;
+    auto nextEndIdx   = nextStartIdx + lpi - 1;
+
+    auto currStart = upscaleWindowStart(outCoord, m_ratio);
+    auto nextStart = upscaleWindowStart(nextStartIdx, m_ratio);
+    auto nextEnd   = upscaleWindowEnd(nextEndIdx, m_ratio, m_inHeight);
+
+    auto lines_read = nextStart - currStart;
+    auto next_window = nextEnd - nextStart;
+
+    return std::make_pair(lines_read, next_window);
+}
+
+int cv::gimpl::FluidFilterAgent::firstWindow() const
+{
+    return k.m_window + k.m_lpi - 1;
+}
+
+std::pair<int,int> cv::gimpl::FluidFilterAgent::linesReadAndnextWindow() const
+{
+    int lpi = std::min(k.m_lpi, m_outputLines - m_producedLines - k.m_lpi);
+    return std::make_pair(k.m_lpi, k.m_window - 1 + lpi);
+}
+
+int cv::gimpl::FluidResizeAgent::firstWindow() const
+{
+    auto outIdx = out_buffers[0]->priv().y();
+    auto lpi = std::min(m_outputLines - m_producedLines, k.m_lpi);
+    return m_mapper->firstWindow(outIdx, lpi);
+}
+
+std::pair<int,int> cv::gimpl::FluidResizeAgent::linesReadAndnextWindow() const
+{
+    auto outIdx = out_buffers[0]->priv().y();
+    auto lpi = std::min(m_outputLines - m_producedLines - k.m_lpi, k.m_lpi);
+    return m_mapper->linesReadAndNextWindow(outIdx, lpi);
+}
+
+void cv::gimpl::FluidResizeAgent::setRatio(double ratio)
+{
+    if (ratio >= 1.0)
+    {
+        m_mapper.reset(new FluidDownscaleMapper(ratio, k.m_lpi));
+    }
+    else
+    {
+        m_mapper.reset(new FluidUpscaleMapper(ratio, k.m_lpi, in_views[0].meta().size.height));
+    }
+}
+
+bool cv::gimpl::FluidAgent::canRead() const
+{
+    // An agent can work if every input buffer has enough data to start
+    for (const auto& in_view : in_views)
+    {
+        if (in_view)
+        {
+            if (!in_view.ready())
+                return false;
+        }
+    }
+    return true;
+}
+
+bool cv::gimpl::FluidAgent::canWrite() const
+{
+    // An agent can work if there is space to write in its output
+    // allocated buffers
+    GAPI_DbgAssert(!out_buffers.empty());
+    auto out_begin = out_buffers.begin();
+    auto out_end   = out_buffers.end();
+    if (k.m_scratch) out_end--;
+    for (auto it = out_begin; it != out_end; ++it)
+    {
+        if ((*it)->priv().full())
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool cv::gimpl::FluidAgent::canWork() const
+{
+    return canRead() && canWrite();
+}
+
+void cv::gimpl::FluidAgent::doWork()
+{
+    GAPI_DbgAssert(m_outputLines > m_producedLines);
+    for (auto& in_view : in_views)
+    {
+        if (in_view) in_view.priv().prepareToRead();
+    }
+
+    k.m_f(in_args, out_buffers);
+
+    for (auto& in_view : in_views)
+    {
+        if (in_view)
+        {
+            auto pair = linesReadAndnextWindow();
+            in_view.priv().readDone(pair.first, pair.second);
+        };
+    }
+
+    for (auto out_buf : out_buffers)
+    {
+        out_buf->priv().writeDone();
+        // FIXME WARNING: Scratch buffers rotated here too!
+    }
+
+    m_producedLines += k.m_lpi;
+}
+
+bool cv::gimpl::FluidAgent::done() const
+{
+    // m_producedLines is a multiple of LPI, while the original
+    // height may not be.
+    return m_producedLines >= m_outputLines;
+}
+
+void cv::gimpl::FluidAgent::debug(std::ostream &os)
+{
+    os << "Fluid Agent " << std::hex << this
+       << " (" << op_name << ") --"
+       << " canWork=" << std::boolalpha << canWork()
+       << " canRead=" << std::boolalpha << canRead()
+       << " canWrite=" << std::boolalpha << canWrite()
+       << " done="    << done()
+       << " lines="   << std::dec << m_producedLines << "/" << m_outputLines
+       << " {{\n";
+    for (auto out_buf : out_buffers)
+    {
+        out_buf->debug(os);
+    }
+    os << "}}" << std::endl;
+}
+
+// GFluidExecutable implementation ////////////////////////////////////////////
+
+void cv::gimpl::GFluidExecutable::initBufferRois(std::vector<int>& readStarts,
+                                                 std::vector<cv::gapi::own::Rect>& rois,
+                                                 const std::vector<cv::gapi::own::Rect>& out_rois)
+{
+    GConstFluidModel fg(m_g);
+    auto proto = m_gm.metadata().get<Protocol>();
+    std::stack<ade::NodeHandle> nodesToVisit;
+
+    // FIXME?
+    // There is a possible case when the user passes a vector full of default Rect{}-s;
+    // it can be diagnosed and handled appropriately
+    if (proto.outputs.size() != out_rois.size())
+    {
+        GAPI_Assert(out_rois.size() == 0);
+        // No inference required, buffers will obtain roi from meta
+        return;
+    }
+
+    // First, initialize rois for output nodes, add them to traversal stack
+    for (const auto& it : ade::util::indexed(proto.out_nhs))
+    {
+        const auto idx = ade::util::index(it);
+        const auto nh  = ade::util::value(it);
+
+        const auto &d  = m_gm.metadata(nh).get<Data>();
+
+        // This is not our output
+        if (m_id_map.count(d.rc) == 0)
+        {
+            continue;
+        }
+
+        if (d.shape == GShape::GMAT)
+        {
+            auto desc = util::get<GMatDesc>(d.meta);
+            auto id = m_id_map.at(d.rc);
+            readStarts[id] = 0;
+
+            if (out_rois[idx] == gapi::own::Rect{})
+            {
+                rois[id] = gapi::own::Rect{ 0, 0, desc.size.width, desc.size.height };
+            }
+            else
+            {
+                // Only slices are supported at the moment
+                GAPI_Assert(out_rois[idx].x == 0);
+                GAPI_Assert(out_rois[idx].width == desc.size.width);
+                rois[id] = out_rois[idx];
+            }
+
+            nodesToVisit.push(nh);
+        }
+    }
+
+    // Traverse the graph from each of the output nodes
+    // and extend the roi of buffers by border_size.
+    // Each node can be visited multiple times
+    // (if a node has already been visited, a check that the inferred rois are the same is performed)
+    while (!nodesToVisit.empty())
+    {
+        const auto startNode = nodesToVisit.top();
+        nodesToVisit.pop();
+
+        if (!startNode->inNodes().empty())
+        {
+            GAPI_Assert(startNode->inNodes().size() == 1);
+            const auto& oh = startNode->inNodes().front();
+
+            const auto& data = m_gm.metadata(startNode).get<Data>();
+            // only GMats participate in the process so it's valid to obtain GMatDesc
+            const auto& meta = util::get<GMatDesc>(data.meta);
+
+            for (const auto& inNode : oh->inNodes())
+            {
+                const auto& in_data = m_gm.metadata(inNode).get<Data>();
+
+                if (in_data.shape == GShape::GMAT && fg.metadata(inNode).contains<FluidData>())
+                {
+                    const auto& in_meta = util::get<GMatDesc>(in_data.meta);
+                    const auto& fd = fg.metadata(inNode).get<FluidData>();
+
+                    auto adjFilterRoi = [](cv::gapi::own::Rect produced, int b, int max_height) {
+                        // Extend with border roi which should be produced, crop to logical image size
+                        cv::gapi::own::Rect roi = {produced.x, produced.y - b, produced.width, produced.height + 2*b};
+                        cv::gapi::own::Rect fullImg{ 0, 0, produced.width, max_height };
+                        return roi & fullImg;
+                    };
+
+                    auto adjResizeRoi = [](cv::gapi::own::Rect produced, cv::gapi::own::Size inSz, cv::gapi::own::Size outSz) {
+                        auto map = [](int outCoord, int producedSz, int inSize, int outSize) {
+                            double ratio = (double)inSize / outSize;
+                            int w0 = 0, w1 = 0;
+                            if (ratio >= 1.0)
+                            {
+                                w0 = windowStart(outCoord, ratio);
+                                w1 = windowEnd  (outCoord + producedSz - 1, ratio);
+                            }
+                            else
+                            {
+                                w0 = upscaleWindowStart(outCoord, ratio);
+                                w1 = upscaleWindowEnd(outCoord + producedSz - 1, ratio, inSize);
+                            }
+                            return std::make_pair(w0, w1);
+                        };
+
+                        auto mapY = map(produced.y, produced.height, inSz.height, outSz.height);
+                        auto y0 = mapY.first;
+                        auto y1 = mapY.second;
+
+                        auto mapX = map(produced.x, produced.width, inSz.width, outSz.width);
+                        auto x0 = mapX.first;
+                        auto x1 = mapX.second;
+
+                        cv::gapi::own::Rect roi = {x0, y0, x1 - x0, y1 - y0};
+                        return roi;
+                    };
+
+                    cv::gapi::own::Rect produced = rois[m_id_map.at(data.rc)];
+
+                    cv::gapi::own::Rect resized;
+                    switch (fg.metadata(oh).get<FluidUnit>().k.m_kind)
+                    {
+                    case GFluidKernel::Kind::Filter: resized = produced; break;
+                    case GFluidKernel::Kind::Resize: resized = adjResizeRoi(produced, in_meta.size, meta.size); break;
+                    default: GAPI_Assert(false);
+                    }
+
+                    int readStart = resized.y;
+                    cv::gapi::own::Rect roi = adjFilterRoi(resized, fd.border_size, in_meta.size.height);
+
+                    auto in_id = m_id_map.at(in_data.rc);
+                    if (rois[in_id] == cv::gapi::own::Rect{})
+                    {
+                        readStarts[in_id] = readStart;
+                        rois[in_id] = roi;
+                        // Continue traversal on internal (w.r.t. Island) data nodes only.
+                        if (fd.internal) nodesToVisit.push(inNode);
+                    }
+                    else
+                    {
+                        GAPI_Assert(readStarts[in_id] == readStart);
+                        GAPI_Assert(rois[in_id] == roi);
+                    }
+                } // if (in_data.shape == GShape::GMAT)
+            } // for (const auto& inNode : oh->inNodes())
+        } // if (!startNode->inNodes().empty())
+    } // while (!nodesToVisit.empty())
+}
+
+cv::gimpl::GFluidExecutable::GFluidExecutable(const ade::Graph &g,
+                                              const std::vector<ade::NodeHandle> &nodes,
+                                              const std::vector<cv::gapi::own::Rect> &outputRois)
+    : m_g(g), m_gm(m_g)
+{
+    GConstFluidModel fg(m_g);
+
+    // Initialize vector of data buffers, build list of operations
+    // FIXME: There _must_ be a better way to query/count the number of DATA nodes
+    std::size_t mat_count = 0;
+    std::size_t last_agent = 0;
+
+    auto grab_mat_nh = [&](ade::NodeHandle nh) {
+        auto rc = m_gm.metadata(nh).get<Data>().rc;
+        if (m_id_map.count(rc) == 0)
+        {
+            m_all_gmat_ids[mat_count] = nh;
+            m_id_map[rc] = mat_count++;
+        }
+    };
+
+    for (const auto &nh : nodes)
+    {
+        switch (m_gm.metadata(nh).get<NodeType>().t)
+        {
+        case NodeType::DATA:
+            if (m_gm.metadata(nh).get<Data>().shape == GShape::GMAT)
+                grab_mat_nh(nh);
+            break;
+
+        case NodeType::OP:
+        {
+            const auto& fu = fg.metadata(nh).get<FluidUnit>();
+            switch (fu.k.m_kind)
+            {
+            case GFluidKernel::Kind::Filter: m_agents.emplace_back(new FluidFilterAgent(m_g, nh)); break;
+            case GFluidKernel::Kind::Resize: m_agents.emplace_back(new FluidResizeAgent(m_g, nh)); break;
+            default: GAPI_Assert(false);
+            }
+            // NB.: in_buffer_ids size is equal to Arguments size, not Edges size!!!
+            m_agents.back()->in_buffer_ids.resize(m_gm.metadata(nh).get<Op>().args.size(), -1);
+            for (auto eh : nh->inEdges())
+            {
+                // FIXME: Only GMats are currently supported (which can be represented
+                // as fluid buffers)
+                if (m_gm.metadata(eh->srcNode()).get<Data>().shape == GShape::GMAT)
+                {
+                    const auto in_port = m_gm.metadata(eh).get<Input>().port;
+                    const int  in_buf  = m_gm.metadata(eh->srcNode()).get<Data>().rc;
+
+                    m_agents.back()->in_buffer_ids[in_port] = in_buf;
+                    grab_mat_nh(eh->srcNode());
+                }
+            }
+            // FIXME: Assumption that all operation outputs MUST be connected
+            m_agents.back()->out_buffer_ids.resize(nh->outEdges().size(), -1);
+            for (auto eh : nh->outEdges())
+            {
+                const auto& data = m_gm.metadata(eh->dstNode()).get<Data>();
+                const auto out_port = m_gm.metadata(eh).get<Output>().port;
+                const int  out_buf  = data.rc;
+
+                m_agents.back()->out_buffer_ids[out_port] = out_buf;
+                if (data.shape == GShape::GMAT) grab_mat_nh(eh->dstNode());
+            }
+            if (fu.k.m_scratch)
+                m_scratch_users.push_back(last_agent);
+            last_agent++;
+            break;
+        }
+        default: GAPI_Assert(false);
+        }
+    }
+
+    // Check that IDs form a contiguous set (important for further indexing)
+    GAPI_Assert(m_id_map.size() >  0);
+    GAPI_Assert(m_id_map.size() == static_cast<size_t>(mat_count));
+
+    // Actually initialize Fluid buffers
+    GAPI_LOG_INFO(NULL, "Initializing " << mat_count << " fluid buffer(s)" << std::endl);
+    m_num_int_buffers = mat_count;
+    const std::size_t num_scratch = m_scratch_users.size();
+    m_buffers.resize(m_num_int_buffers + num_scratch);
+
+    // After buffers are allocated, repack: ...
+    for (auto &agent : m_agents)
+    {
+        // a. Agent input parameters with View pointers (creating Views btw)
+        const auto &op = m_gm.metadata(agent->op_handle).get<Op>();
+        const auto &fu =   fg.metadata(agent->op_handle).get<FluidUnit>();
+        agent->in_args.resize(op.args.size());
+        agent->in_views.resize(op.args.size());
+        for (auto it : ade::util::indexed(ade::util::toRange(agent->in_buffer_ids)))
+        {
+            auto in_idx  = ade::util::index(it);
+            auto buf_idx = ade::util::value(it);
+
+            if (buf_idx >= 0)
+            {
+                // If there is an input buffer, register a view (every unique
+                // reader has its own) and store it in agent Args
+                gapi::fluid::Buffer &buffer = m_buffers.at(m_id_map.at(buf_idx));
+
+                auto inEdge = GModel::getInEdgeByPort(m_g, agent->op_handle, in_idx);
+                auto ownStorage = fg.metadata(inEdge).get<FluidUseOwnBorderBuffer>().use;
+
+                gapi::fluid::View view = buffer.mkView(fu.border_size, ownStorage);
+                // NB: It is safe to keep ptr as view lifetime is buffer lifetime
+                agent->in_views[in_idx] = view;
+                agent->in_args[in_idx]  = GArg(view);
+            }
+            else
+            {
+                // Copy(FIXME!) original args as is
+                agent->in_args[in_idx] = op.args[in_idx];
+            }
+        }
+
+        // b. Agent output parameters with Buffer pointers.
+        agent->out_buffers.resize(agent->op_handle->outEdges().size(), nullptr);
+        for (auto it : ade::util::indexed(ade::util::toRange(agent->out_buffer_ids)))
+        {
+            auto out_idx = ade::util::index(it);
+            auto buf_idx = m_id_map.at(ade::util::value(it));
+            agent->out_buffers.at(out_idx) = &m_buffers.at(buf_idx);
+        }
+    }
+
+    // After parameters are there, initialize scratch buffers
+    if (num_scratch)
+    {
+        GAPI_LOG_INFO(NULL, "Initializing " << num_scratch << " scratch buffer(s)" << std::endl);
+        std::size_t last_scratch_id = 0;
+
+        for (auto i : m_scratch_users)
+        {
+            auto &agent = m_agents.at(i);
+            GAPI_Assert(agent->k.m_scratch);
+            const std::size_t new_scratch_idx = m_num_int_buffers + last_scratch_id;
+            agent->out_buffers.emplace_back(&m_buffers[new_scratch_idx]);
+            last_scratch_id++;
+        }
+    }
+
+    makeReshape(outputRois);
+
+    std::size_t total_size = 0;
+    for (const auto &i : ade::util::indexed(m_buffers))
+    {
+        // Check that all internal and scratch buffers are allocated
+        const auto idx = ade::util::index(i);
+        const auto b   = ade::util::value(i);
+        if (idx >= m_num_int_buffers ||
+            fg.metadata(m_all_gmat_ids[idx]).get<FluidData>().internal == true)
+        {
+            GAPI_Assert(b.priv().size() > 0);
+        }
+
+        // Buffers which will be bound to real images may have a size of 0 at this moment
+        // (a non-zero-sized const border buffer may still be allocated in such buffers)
+        total_size += b.priv().size();
+    }
+    GAPI_LOG_INFO(NULL, "Internal buffers: " << std::fixed << std::setprecision(2) << static_cast<float>(total_size)/1024 << " KB\n");
+}
+
+namespace
+{
+    void resetFluidData(ade::Graph& graph)
+    {
+        using namespace cv::gimpl;
+        GModel::Graph g(graph);
+        GFluidModel fg(graph);
+        for (const auto node : g.nodes())
+        {
+            if (g.metadata(node).get<NodeType>().t == NodeType::DATA)
+            {
+                auto& fd = fg.metadata(node).get<FluidData>();
+                fd.latency         = 0;
+                fd.skew            = 0;
+                fd.max_consumption = 0;
+            }
+        }
+    }
+
+    void initFluidUnits(ade::Graph& graph)
+    {
+        using namespace cv::gimpl;
+        GModel::Graph g(graph);
+        GFluidModel fg(graph);
+
+        auto sorted = g.metadata().get<ade::passes::TopologicalSortData>().nodes();
+        for (auto node : sorted)
+        {
+            if (fg.metadata(node).contains<FluidUnit>())
+            {
+                std::set<int> in_hs, out_ws, out_hs;
+
+                for (const auto& in : node->inNodes())
+                {
+                    const auto& d = g.metadata(in).get<Data>();
+                    if (d.shape == cv::GShape::GMAT)
+                    {
+                        const auto& meta = cv::util::get<cv::GMatDesc>(d.meta);
+                        in_hs.insert(meta.size.height);
+                    }
+                }
+
+                for (const auto& out : node->outNodes())
+                {
+                    const auto& d = g.metadata(out).get<Data>();
+                    if (d.shape == cv::GShape::GMAT)
+                    {
+                        const auto& meta = cv::util::get<cv::GMatDesc>(d.meta);
+                        out_ws.insert(meta.size.width);
+                        out_hs.insert(meta.size.height);
+                    }
+                }
+
+                GAPI_Assert(in_hs.size() == 1 && out_ws.size() == 1 && out_hs.size() == 1);
+
+                auto in_h  = *in_hs .cbegin();
+                auto out_h = *out_hs.cbegin();
+
+                auto &fu = fg.metadata(node).get<FluidUnit>();
+                fu.ratio = (double)in_h / out_h;
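+                // Illustration (hypothetical numbers): a plain 2x vertical
+                // downscale with in_h == 480 and out_h == 240 gives
+                // fu.ratio == 2.0, i.e. the unit consumes on average two
+                // input lines per output line produced.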
+
+                int line_consumption = maxLineConsumption(fu.k, in_h, out_h, fu.k.m_lpi);
+                int border_size = borderSize(fu.k);
+
+                fu.border_size = border_size;
+                fu.line_consumption = line_consumption;
+
+                GModel::log(g, node, "Line consumption: " + std::to_string(fu.line_consumption));
+                GModel::log(g, node, "Border size: " + std::to_string(fu.border_size));
+            }
+        }
+    }
+
+    // FIXME!
+    // Split into initLineConsumption and initBorderSizes,
+    // call only consumption related stuff during reshape
+    void initLineConsumption(ade::Graph& graph)
+    {
+        using namespace cv::gimpl;
+        GModel::Graph g(graph);
+        GFluidModel fg(graph);
+
+        for (const auto &node : g.nodes())
+        {
+            if (fg.metadata(node).contains<FluidUnit>())
+            {
+                const auto &fu = fg.metadata(node).get<FluidUnit>();
+
+                for (const auto &in_data_node : node->inNodes())
+                {
+                    auto &fd = fg.metadata(in_data_node).get<FluidData>();
+
+                    // Update (not Set) fields here since a single data node may be
+                    // accessed by multiple consumers
+                    fd.max_consumption = std::max(fu.line_consumption, fd.max_consumption);
+                    fd.border_size     = std::max(fu.border_size, fd.border_size);
+
+                    GModel::log(g, in_data_node, "Line consumption: " + std::to_string(fd.max_consumption)
+                                + " (upd by " + std::to_string(fu.line_consumption) + ")", node);
+                    GModel::log(g, in_data_node, "Border size: " + std::to_string(fd.border_size), node);
+                }
+            }
+        }
+    }
+
+    void calcLatency(ade::Graph& graph)
+    {
+        using namespace cv::gimpl;
+        GModel::Graph g(graph);
+        GFluidModel fg(graph);
+
+        auto sorted = g.metadata().get<ade::passes::TopologicalSortData>().nodes();
+        for (const auto &node : sorted)
+        {
+            if (fg.metadata(node).contains<FluidUnit>())
+            {
+                const auto &fu = fg.metadata(node).get<FluidUnit>();
+
+                const int own_latency = fu.line_consumption - fu.border_size;
+                GModel::log(g, node, "LPI: " + std::to_string(fu.k.m_lpi));
+
+                // Output latency is max(input_latency) + own_latency
+                int in_latency = 0;
+                for (const auto &in_data_node : node->inNodes())
+                {
+                    // FIXME: ASSERT(DATA), ASSERT(FLUIDDATA)
+                    in_latency = std::max(in_latency, fg.metadata(in_data_node).get<FluidData>().latency);
+                }
+                const int out_latency = in_latency + own_latency;
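+                // Illustration (hypothetical 3x3 filter with LPI == 1): it needs
+                // 3 input lines for its first output line and has border_size == 1,
+                // so own_latency == 3 - 1 == 2. With in_latency == 0 that gives
+                // out_latency == 2; a second such filter downstream would report
+                // out_latency == 2 + 2 == 4.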
+
+                for (const auto &out_data_node : node->outNodes())
+                {
+                    // FIXME: ASSERT(DATA), ASSERT(FLUIDDATA)
+                    auto &fd     = fg.metadata(out_data_node).get<FluidData>();
+                    fd.latency   = out_latency;
+                    fd.lpi_write = fu.k.m_lpi;
+                    GModel::log(g, out_data_node, "Latency: " + std::to_string(out_latency));
+                }
+            }
+        }
+    }
+
+    void calcSkew(ade::Graph& graph)
+    {
+        using namespace cv::gimpl;
+        GModel::Graph g(graph);
+        GFluidModel fg(graph);
+
+        auto sorted = g.metadata().get<ade::passes::TopologicalSortData>().nodes();
+        for (const auto &node : sorted)
+        {
+            if (fg.metadata(node).contains<FluidUnit>())
+            {
+                int max_latency = 0;
+                for (const auto &in_data_node : node->inNodes())
+                {
+                    // FIXME: ASSERT(DATA), ASSERT(FLUIDDATA)
+                    max_latency = std::max(max_latency, fg.metadata(in_data_node).get<FluidData>().latency);
+                }
+                for (const auto &in_data_node : node->inNodes())
+                {
+                    // FIXME: ASSERT(DATA), ASSERT(FLUIDDATA)
+                    auto &fd = fg.metadata(in_data_node).get<FluidData>();
+
+                    // Update (not Set) fields here since a single data node may be
+                    // accessed by multiple consumers
+                    fd.skew = std::max(fd.skew, max_latency - fd.latency);
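+                    // Illustration (hypothetical graph): if one input comes straight
+                    // from a graph input (latency 0) and another through a 3x3 filter
+                    // (latency 2), then max_latency == 2 and the direct input gets
+                    // skew == 2 - 0 == 2, i.e. it must hold two extra lines to stay
+                    // in sync with the slower path.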
+
+                    GModel::log(g, in_data_node, "Skew: " + std::to_string(fd.skew), node);
+                }
+            }
+        }
+    }
+}
+
+void cv::gimpl::GFluidExecutable::makeReshape(const std::vector<gapi::own::Rect> &out_rois)
+{
+    GConstFluidModel fg(m_g);
+
+    // Calculate rois for each fluid buffer
+    std::vector<int> readStarts(m_num_int_buffers);
+    std::vector<cv::gapi::own::Rect> rois(m_num_int_buffers);
+    initBufferRois(readStarts, rois, out_rois);
+
+    // NB: Allocate ALL buffer objects at once, and avoid any further reallocations
+    // (since raw pointers-to-elements are taken)
+    for (const auto &it : m_all_gmat_ids)
+    {
+        auto id = it.first;
+        auto nh = it.second;
+        const auto & d  = m_gm.metadata(nh).get<Data>();
+        const auto &fd  = fg.metadata(nh).get<FluidData>();
+        const auto meta = cv::util::get<GMatDesc>(d.meta);
+
+        m_buffers[id].priv().init(meta, fd.lpi_write, readStarts[id], rois[id]);
+
+        // TODO:
+        // Introduce Storage::INTERNAL_GRAPH and Storage::INTERNAL_ISLAND?
+        if (fd.internal == true)
+        {
+            m_buffers[id].priv().allocate(fd.border, fd.border_size, fd.max_consumption, fd.skew);
+            std::stringstream stream;
+            m_buffers[id].debug(stream);
+            GAPI_LOG_INFO(NULL, stream.str());
+        }
+    }
+
+    // Allocate views, initialize agents
+    for (auto &agent : m_agents)
+    {
+        const auto &fu = fg.metadata(agent->op_handle).get<FluidUnit>();
+        for (auto it : ade::util::indexed(ade::util::toRange(agent->in_buffer_ids)))
+        {
+            auto in_idx  = ade::util::index(it);
+            auto buf_idx = ade::util::value(it);
+
+            if (buf_idx >= 0)
+            {
+                agent->in_views[in_idx].priv().allocate(fu.line_consumption, fu.border);
+            }
+        }
+
+        agent->setRatio(fu.ratio);
+        agent->m_outputLines = agent->out_buffers.front()->priv().outputLines();
+    }
+
+    // Initialize scratch buffers
+    if (m_scratch_users.size())
+    {
+        for (auto i : m_scratch_users)
+        {
+            auto &agent = m_agents.at(i);
+            GAPI_Assert(agent->k.m_scratch);
+
+            // Trigger Scratch buffer initialization method
+            agent->k.m_is(GModel::collectInputMeta(m_gm, agent->op_handle), agent->in_args, *agent->out_buffers.back());
+            std::stringstream stream;
+            agent->out_buffers.back()->debug(stream);
+            GAPI_LOG_INFO(NULL, stream.str());
+        }
+    }
+
+    // FIXME: calculate the size (lpi * ..)
+    m_script.clear();
+    m_script.reserve(10000);
+}
+
+void cv::gimpl::GFluidExecutable::reshape(ade::Graph &g, const GCompileArgs &args)
+{
+    // FIXME: Probably this needs to be integrated into a common pass re-run routine.
+    // Backends may want to mark which passes to re-run on reshape, and the framework
+    // could do it system-wide (without the need for every backend to handle reshape()
+    // directly). This design needs to be analyzed for implementation.
+    resetFluidData(g);
+    initFluidUnits(g);
+    initLineConsumption(g);
+    calcLatency(g);
+    calcSkew(g);
+    const auto out_rois = cv::gimpl::getCompileArg<cv::GFluidOutputRois>(args).value_or(cv::GFluidOutputRois());
+    makeReshape(out_rois.rois);
+}
+
+// FIXME: Document what it does
+void cv::gimpl::GFluidExecutable::bindInArg(const cv::gimpl::RcDesc &rc, const GRunArg &arg)
+{
+    switch (rc.shape)
+    {
+    case GShape::GMAT:    m_buffers[m_id_map.at(rc.id)].priv().bindTo(util::get<cv::gapi::own::Mat>(arg), true); break;
+    case GShape::GSCALAR: m_res.slot<cv::gapi::own::Scalar>()[rc.id] = util::get<cv::gapi::own::Scalar>(arg); break;
+    default: util::throw_error(std::logic_error("Unsupported GShape type"));
+    }
+}
+
+void cv::gimpl::GFluidExecutable::bindOutArg(const cv::gimpl::RcDesc &rc, const GRunArgP &arg)
+{
+    // Only GMat is supported as return type
+    switch (rc.shape)
+    {
+    case GShape::GMAT:
+        {
+            cv::GMatDesc desc = m_buffers[m_id_map.at(rc.id)].meta();
+            auto      &outMat = *util::get<cv::gapi::own::Mat*>(arg);
+            GAPI_Assert(outMat.data != nullptr);
+            GAPI_Assert(descr_of(outMat) == desc && "Output argument was not preallocated as it should be?");
+            m_buffers[m_id_map.at(rc.id)].priv().bindTo(outMat, false);
+            break;
+        }
+    default: util::throw_error(std::logic_error("Unsupported return GShape type"));
+    }
+}
+
+void cv::gimpl::GFluidExecutable::packArg(cv::GArg &in_arg, const cv::GArg &op_arg)
+{
+    GAPI_Assert(op_arg.kind != cv::detail::ArgKind::GMAT
+           && op_arg.kind != cv::detail::ArgKind::GSCALAR);
+
+    if (op_arg.kind == cv::detail::ArgKind::GOBJREF)
+    {
+        const cv::gimpl::RcDesc &ref = op_arg.get<cv::gimpl::RcDesc>();
+        if (ref.shape == GShape::GSCALAR)
+        {
+            in_arg = GArg(m_res.slot<cv::gapi::own::Scalar>()[ref.id]);
+        }
+    }
+}
+
+void cv::gimpl::GFluidExecutable::run(std::vector<InObj>  &&input_objs,
+                                      std::vector<OutObj> &&output_objs)
+{
+    // Bind input buffers from parameters
+    for (auto& it : input_objs)  bindInArg(it.first, it.second);
+    for (auto& it : output_objs) bindOutArg(it.first, it.second);
+
+    // Reset Buffers and Agents state before we go
+    for (auto &buffer : m_buffers)
+        buffer.priv().reset();
+
+    for (auto &agent : m_agents)
+    {
+        agent->reset();
+        // Pass input cv::Scalars to agent arguments
+        const auto& op = m_gm.metadata(agent->op_handle).get<Op>();
+        for (const auto& it : ade::util::indexed(op.args))
+        {
+            const auto& arg = ade::util::value(it);
+            packArg(agent->in_args[ade::util::index(it)], arg);
+        }
+    }
+
+    // Explicitly reset Scratch buffers, if any
+    for (auto scratch_i : m_scratch_users)
+    {
+        auto &agent = m_agents[scratch_i];
+        GAPI_DbgAssert(agent->k.m_scratch);
+        agent->k.m_rs(*agent->out_buffers.back());
+    }
+
+    // Now start executing our stuff!
+    // Fluid execution is:
+    // - run through list of Agents from Left to Right
+    // - for every Agent:
+    //   - if all input Buffers have enough data to fulfill
+    //     Agent's window - trigger Agent
+    //     - on trigger, Agent takes all input lines from input buffers
+    //       and produces a single output line
+    //     - once Agent finishes, input buffers get "readDone()",
+    //       and output buffers get "writeDone()"
+    //   - if there's not enough data, Agent is skipped
+    // Yes, THAT easy!
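+    //
+    // Illustration (hypothetical two-agent pipeline A -> B, where B is a 3x3
+    // filter): on the first iterations A produces lines while B is skipped
+    // (fewer than 3 lines are ready in its input buffer); once 3 lines are
+    // available B starts to fire as well, and both advance until done().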
+
+    if (m_script.empty())
+    {
+        bool complete = true;
+        do {
+            complete = true;
+            bool work_done = false;
+            for (auto &agent : m_agents)
+            {
+                // agent->debug(std::cout);
+                if (!agent->done())
+                {
+                    if (agent->canWork())
+                    {
+                        agent->doWork(); work_done = true;
+                        m_script.push_back(agent.get());
+                    }
+                    if (!agent->done())   complete = false;
+                }
+            }
+            GAPI_Assert(work_done || complete);
+        } while (!complete); // FIXME: number of iterations can be calculated statically
+    }
+    else
+    {
+        for (auto &agent : m_script)
+        {
+            agent->doWork();
+        }
+    }
+}
+
+// FIXME: these passes operate on the global graph level!!!
+// Need to fix this for heterogeneous (island-based) processing
+void GFluidBackendImpl::addBackendPasses(ade::ExecutionEngineSetupContext &ectx)
+{
+    using namespace cv::gimpl;
+
+    // FIXME: all passes were moved to "exec" stage since Fluid
+    // should check Islands configuration first (which is now quite
+    // limited), and only then continue with all other passes.
+    //
+    // The passes/stages API must be streamlined!
+    ectx.addPass("exec", "init_fluid_data", [](ade::passes::PassContext &ctx)
+    {
+        GModel::Graph g(ctx.graph);
+        if (!GModel::isActive(g, cv::gapi::fluid::backend()))  // FIXME: Rearchitect this!
+            return;
+
+        auto isl_graph = g.metadata().get<IslandModel>().model;
+        GIslandModel::Graph gim(*isl_graph);
+
+        GFluidModel fg(ctx.graph);
+
+        const auto setFluidData = [&](ade::NodeHandle nh, bool internal) {
+            FluidData fd;
+            fd.internal = internal;
+            fg.metadata(nh).set(fd);
+        };
+
+        for (const auto& nh : gim.nodes())
+        {
+            if (gim.metadata(nh).get<NodeKind>().k == NodeKind::ISLAND)
+            {
+                const auto isl = gim.metadata(nh).get<FusedIsland>().object;
+                if (isl->backend() == cv::gapi::fluid::backend())
+                {
+                    // add FluidData to all data nodes inside island
+                    for (const auto node : isl->contents())
+                    {
+                        if (g.metadata(node).get<NodeType>().t == NodeType::DATA)
+                            setFluidData(node, true);
+                    }
+
+                    // add FluidData to slot if it's read/written by fluid
+                    std::vector<ade::NodeHandle> io_handles;
+                    for (const auto &in_op : isl->in_ops())
+                    {
+                        ade::util::copy(in_op->inNodes(), std::back_inserter(io_handles));
+                    }
+                    for (const auto &out_op : isl->out_ops())
+                    {
+                        ade::util::copy(out_op->outNodes(), std::back_inserter(io_handles));
+                    }
+                    for (const auto &io_node : io_handles)
+                    {
+                        if (!fg.metadata(io_node).contains<FluidData>())
+                            setFluidData(io_node, false);
+                    }
+                } // if (fluid backend)
+            } // if (ISLAND)
+        } // for (gim.nodes())
+    });
+    // FIXME:
+    // move to unpackKernel method
+    // when https://gitlab-icv.inn.intel.com/G-API/g-api/merge_requests/66 is merged
+    ectx.addPass("exec", "init_fluid_unit_borders", [](ade::passes::PassContext &ctx)
+    {
+        GModel::Graph g(ctx.graph);
+        if (!GModel::isActive(g, cv::gapi::fluid::backend()))  // FIXME: Rearchitect this!
+            return;
+
+        GFluidModel fg(ctx.graph);
+
+        auto sorted = g.metadata().get<ade::passes::TopologicalSortData>().nodes();
+        for (auto node : sorted)
+        {
+            if (fg.metadata(node).contains<FluidUnit>())
+            {
+                // FIXME: check that op has only one data node on input
+                auto &fu = fg.metadata(node).get<FluidUnit>();
+                const auto &op = g.metadata(node).get<Op>();
+
+                // Trigger user-defined "getBorder" callback
+                fu.border = fu.k.m_b(GModel::collectInputMeta(fg, node), op.args);
+            }
+        }
+    });
+    ectx.addPass("exec", "init_fluid_units", [](ade::passes::PassContext &ctx)
+    {
+        GModel::Graph g(ctx.graph);
+        if (!GModel::isActive(g, cv::gapi::fluid::backend()))  // FIXME: Rearchitect this!
+            return;
+
+        initFluidUnits(ctx.graph);
+    });
+    ectx.addPass("exec", "init_line_consumption", [](ade::passes::PassContext &ctx)
+    {
+        GModel::Graph g(ctx.graph);
+        if (!GModel::isActive(g, cv::gapi::fluid::backend()))  // FIXME: Rearchitect this!
+            return;
+
+        initLineConsumption(ctx.graph);
+    });
+    ectx.addPass("exec", "calc_latency", [](ade::passes::PassContext &ctx)
+    {
+        GModel::Graph g(ctx.graph);
+        if (!GModel::isActive(g, cv::gapi::fluid::backend()))  // FIXME: Rearchitect this!
+            return;
+
+        calcLatency(ctx.graph);
+    });
+    ectx.addPass("exec", "calc_skew", [](ade::passes::PassContext &ctx)
+    {
+        GModel::Graph g(ctx.graph);
+        if (!GModel::isActive(g, cv::gapi::fluid::backend()))  // FIXME: Rearchitect this!
+            return;
+
+        calcSkew(ctx.graph);
+    });
+
+    ectx.addPass("exec", "init_buffer_borders", [](ade::passes::PassContext &ctx)
+    {
+        GModel::Graph g(ctx.graph);
+        if (!GModel::isActive(g, cv::gapi::fluid::backend()))  // FIXME: Rearchitect this!
+            return;
+
+        GFluidModel fg(ctx.graph);
+        auto sorted = g.metadata().get<ade::passes::TopologicalSortData>().nodes();
+        for (auto node : sorted)
+        {
+            if (fg.metadata(node).contains<FluidData>())
+            {
+                auto &fd = fg.metadata(node).get<FluidData>();
+
+                // Assign border stuff to FluidData
+
+                // In/out data nodes are bound to user data directly,
+                // so cannot be extended with a border
+                if (fd.internal == true)
+                {
+                    // For now, the border of the buffer's storage is the border
+                    // of the first reader whose border size is the same.
+                    // FIXME: find a more clever strategy of border picking
+                    // (it can be a border which is common for the majority of the
+                    // readers; we can also calculate the number of lines which
+                    // will be copied by views on each iteration and base our choice
+                    // on this criterion)
+                    auto readers = node->outNodes();
+                    const auto &candidate = ade::util::find_if(readers, [&](ade::NodeHandle nh) {
+                        return fg.metadata(nh).contains<FluidUnit>() &&
+                               fg.metadata(nh).get<FluidUnit>().border_size == fd.border_size;
+                    });
+
+                    GAPI_Assert(candidate != readers.end());
+
+                    const auto &fu = fg.metadata(*candidate).get<FluidUnit>();
+                    fd.border = fu.border;
+                }
+
+                if (fd.border)
+                {
+                    GModel::log(g, node, "Border type: " + std::to_string(fd.border->type), node);
+                }
+            }
+        }
+    });
+    ectx.addPass("exec", "init_view_borders", [](ade::passes::PassContext &ctx)
+    {
+        GModel::Graph g(ctx.graph);
+        if (!GModel::isActive(g, cv::gapi::fluid::backend()))  // FIXME: Rearchitect this!
+            return;
+
+        GFluidModel fg(ctx.graph);
+        for (auto node : g.nodes())
+        {
+            if (fg.metadata(node).contains<FluidData>())
+            {
+                auto &fd = fg.metadata(node).get<FluidData>();
+                for (auto out_edge : node->outEdges())
+                {
+                    const auto dstNode = out_edge->dstNode();
+                    if (fg.metadata(dstNode).contains<FluidUnit>())
+                    {
+                        const auto &fu = fg.metadata(dstNode).get<FluidUnit>();
+
+                        // There is no need for own storage for a view if its border is
+                        // the same as the buffer's (the view can have an equal or smaller
+                        // border size in this case)
+                        if (fu.border_size == 0 ||
+                                (fu.border && fd.border && (*fu.border == *fd.border)))
+                        {
+                            GAPI_Assert(fu.border_size <= fd.border_size);
+                            fg.metadata(out_edge).set(FluidUseOwnBorderBuffer{false});
+                        }
+                        else
+                        {
+                            fg.metadata(out_edge).set(FluidUseOwnBorderBuffer{true});
+                            GModel::log(g, out_edge, "OwnBufferStorage: true");
+                        }
+                    }
+                }
+            }
+        }
+    });
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbackend.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbackend.hpp
new file mode 100644 (file)
index 0000000..ba8b977
--- /dev/null
@@ -0,0 +1,137 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_FLUID_BACKEND_HPP
+#define OPENCV_GAPI_FLUID_BACKEND_HPP
+
+#include "opencv2/gapi/garg.hpp"
+#include "opencv2/gapi/gproto.hpp"
+#include "opencv2/gapi/fluid/gfluidkernel.hpp"
+#include "opencv2/gapi/fluid/gfluidbuffer.hpp"
+
+// PRIVATE STUFF!
+#include "backends/common/gbackend.hpp"
+#include "compiler/gislandmodel.hpp"
+
+namespace cv { namespace gimpl {
+
+struct FluidUnit
+{
+    static const char *name() { return "FluidUnit"; }
+    GFluidKernel k;
+    gapi::fluid::BorderOpt border;
+    int border_size;
+    int line_consumption;
+    double ratio;
+};
+
+struct FluidUseOwnBorderBuffer
+{
+    static const char *name() { return "FluidUseOwnBorderBuffer"; }
+    bool use;
+};
+
+struct FluidData
+{
+    static const char *name() { return "FluidData"; }
+
+    // FIXME: This structure starts looking like "FluidBuffer" meta
+    int  latency         = 0;
+    int  skew            = 0;
+    int  max_consumption = 1;
+    int  border_size     = 0;
+    int  lpi_write       = 1;
+    bool internal        = false; // is node internal to any fluid island
+    gapi::fluid::BorderOpt border;
+};
+
+struct FluidAgent
+{
+public:
+    virtual ~FluidAgent() = default;
+    FluidAgent(const ade::Graph &g, ade::NodeHandle nh);
+
+    GFluidKernel k;
+    ade::NodeHandle op_handle; // FIXME: why is it here?
+    std::string op_name;
+
+    // <  0 - not a buffer
+    // >= 0 - a buffer with RcID
+    std::vector<int> in_buffer_ids;
+    std::vector<int> out_buffer_ids;
+
+    cv::GArgs in_args;
+    std::vector<cv::gapi::fluid::View>   in_views; // sparse list of IN views
+    std::vector<cv::gapi::fluid::Buffer*> out_buffers;
+
+    // FIXME Current assumption is that outputs have EQUAL SIZES
+    int m_outputLines = 0;
+    int m_producedLines = 0;
+
+    // Execution methods
+    void reset();
+    bool canWork() const;
+    bool canRead() const;
+    bool canWrite() const;
+    void doWork();
+    bool done() const;
+
+    void debug(std::ostream& os);
+
+    // FIXME:
+    // refactor (implement a more solid replacement or
+    // drop this method completely)
+    virtual void setRatio(double ratio) = 0;
+
+private:
+    // FIXME!!!
+    // move to another class
+    virtual int firstWindow() const = 0;
+    virtual std::pair<int,int> linesReadAndnextWindow() const = 0;
+};
+
+class GFluidExecutable final: public GIslandExecutable
+{
+    const ade::Graph &m_g;
+    GModel::ConstGraph m_gm;
+
+    std::vector<std::unique_ptr<FluidAgent>> m_agents;
+    std::vector<cv::gapi::fluid::Buffer> m_buffers;
+
+    std::vector<FluidAgent*> m_script;
+
+    using Magazine = detail::magazine<cv::gapi::own::Scalar>;
+    Magazine m_res;
+
+    std::size_t m_num_int_buffers; // internal buffers counter (m_buffers - num_scratch)
+    std::vector<std::size_t> m_scratch_users;
+
+    std::unordered_map<int, std::size_t> m_id_map; // GMat id -> buffer idx map
+    std::map<std::size_t, ade::NodeHandle> m_all_gmat_ids;
+
+    void bindInArg (const RcDesc &rc, const GRunArg &arg);
+    void bindOutArg(const RcDesc &rc, const GRunArgP &arg);
+    void packArg   (GArg &in_arg, const GArg &op_arg);
+
+    void initBufferRois(std::vector<int>& readStarts, std::vector<cv::gapi::own::Rect>& rois, const std::vector<gapi::own::Rect> &out_rois);
+    void makeReshape(const std::vector<cv::gapi::own::Rect>& out_rois);
+
+public:
+    GFluidExecutable(const ade::Graph &g,
+                     const std::vector<ade::NodeHandle> &nodes,
+                     const std::vector<cv::gapi::own::Rect> &outputRois);
+
+    virtual inline bool canReshape() const override { return true; }
+    virtual void reshape(ade::Graph& g, const GCompileArgs& args) override;
+
+    virtual void run(std::vector<InObj>  &&input_objs,
+                     std::vector<OutObj> &&output_objs) override;
+};
+}} // cv::gimpl
+
+
+#endif // OPENCV_GAPI_FLUID_BACKEND_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbuffer.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbuffer.cpp
new file mode 100644 (file)
index 0000000..6672ea2
--- /dev/null
@@ -0,0 +1,760 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <iomanip>   // hex, dec (debug)
+
+#include "opencv2/gapi/own/convert.hpp"
+#include "opencv2/gapi/own/types.hpp"
+
+#include "opencv2/gapi/fluid/gfluidbuffer.hpp"
+#include "backends/fluid/gfluidbuffer_priv.hpp"
+#include "opencv2/gapi/opencv_includes.hpp"
+
+#include "backends/fluid/gfluidutils.hpp" // saturate
+
+namespace cv {
+namespace gapi {
+namespace fluid {
+bool operator == (const fluid::Border& b1, const fluid::Border& b2)
+{
+    return b1.type == b2.type && b1.value == b2.value;
+}
+} // namespace fluid
+
+// Fluid BorderHandler implementation /////////////////////////////////////////////////
+
+namespace {
+template<typename T>
+// Expected inputs:
+// row - a row buffer allocated with the border in mind (has memory for both image and border pixels)
+// length - size of the buffer with left and right borders included
+void fillBorderReplicateRow(uint8_t* row, int length, int chan, int borderSize)
+{
+    auto leftBorder  = reinterpret_cast<T*>(row);
+    auto rightBorder = leftBorder + (length - borderSize) * chan;
+    for (int b = 0; b < borderSize; b++)
+    {
+        for (int c = 0; c < chan; c++)
+        {
+            leftBorder [b*chan + c] = leftBorder [borderSize*chan + c];
+            rightBorder[b*chan + c] = rightBorder[-chan + c];
+        }
+    }
+}
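+
+// Illustration (hypothetical single-channel row, borderSize == 1, length == 6):
+// for a buffer laid out as [? a b c d ?], fillBorderReplicateRow() above yields
+// [a a b c d d]: each border cell replicates its nearest image pixel.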
+
+template<typename T>
+void fillBorderReflectRow(uint8_t* row, int length, int chan, int borderSize)
+{
+    auto leftBorder  = reinterpret_cast<T*>(row);
+    auto rightBorder = leftBorder + (length - borderSize) * chan;
+    for (int b = 0; b < borderSize; b++)
+    {
+        for (int c = 0; c < chan; c++)
+        {
+            leftBorder [b*chan + c] = leftBorder [(2*borderSize - b)*chan + c];
+            rightBorder[b*chan + c] = rightBorder[(-b - 2)*chan + c];
+        }
+    }
+}
+
+template<typename T>
+void fillConstBorderRow(uint8_t* row, int length, int chan, int borderSize, cv::gapi::own::Scalar borderValue)
+{
+    GAPI_DbgAssert(chan > 0 && chan <= 4);
+
+    auto leftBorder  = reinterpret_cast<T*>(row);
+    auto rightBorder = leftBorder + (length - borderSize) * chan;
+    for (int b = 0; b < borderSize; b++)
+    {
+        for (int c = 0; c < chan; c++)
+        {
+            leftBorder [b*chan + c] = fluid::saturate<T>(borderValue[c], fluid::roundd);
+            rightBorder[b*chan + c] = fluid::saturate<T>(borderValue[c], fluid::roundd);
+        }
+    }
+}
+
+// Fills const border pixels in the whole mat
+void fillBorderConstant(int borderSize, cv::gapi::own::Scalar borderValue, cv::gapi::own::Mat& mat)
+{
+    // cv::Scalar can contain at most 4 channels
+    GAPI_Assert(mat.channels() > 0 && mat.channels() <= 4);
+
+    auto getFillBorderRowFunc = [&](int type) {
+        switch(type)
+        {
+        case CV_8U:  return &fillConstBorderRow< uint8_t>; break;
+        case CV_16S: return &fillConstBorderRow< int16_t>; break;
+        case CV_16U: return &fillConstBorderRow<uint16_t>; break;
+        case CV_32F: return &fillConstBorderRow< float  >; break;
+        default: GAPI_Assert(false); return &fillConstBorderRow<uint8_t>;
+        }
+    };
+
+    auto fillBorderRow = getFillBorderRowFunc(mat.depth());
+    for (int y = 0; y < mat.rows; y++)
+    {
+        fillBorderRow(mat.ptr(y), mat.cols, mat.channels(), borderSize, borderValue);
+    }
+}
+} // anonymous namespace
+
+fluid::BorderHandler::BorderHandler(int border_size)
+{
+    GAPI_Assert(border_size > 0);
+    m_border_size = border_size;
+}
+
+template <int BorderType>
+fluid::BorderHandlerT<BorderType>::BorderHandlerT(int border_size, int data_type)
+    : BorderHandler(border_size)
+{
+    auto getFillBorderRowFunc = [&](int border, int depth) {
+        if (border == cv::BORDER_REPLICATE)
+        {
+            switch(depth)
+            {
+            case CV_8U:  return &fillBorderReplicateRow< uint8_t>; break;
+            case CV_16S: return &fillBorderReplicateRow< int16_t>; break;
+            case CV_16U: return &fillBorderReplicateRow<uint16_t>; break;
+            case CV_32F: return &fillBorderReplicateRow< float  >; break;
+            default: GAPI_Assert(!"Unsupported data type"); return &fillBorderReplicateRow<uint8_t>;
+            }
+        }
+        else if (border == cv::BORDER_REFLECT_101)
+        {
+            switch(depth)
+            {
+            case CV_8U:  return &fillBorderReflectRow< uint8_t>; break;
+            case CV_16S: return &fillBorderReflectRow< int16_t>; break;
+            case CV_16U: return &fillBorderReflectRow<uint16_t>; break;
+            case CV_32F: return &fillBorderReflectRow< float  >; break;
+            default: GAPI_Assert(!"Unsupported data type"); return &fillBorderReflectRow<uint8_t>;
+            }
+        }
+        else
+        {
+            GAPI_Assert(!"Unsupported border type");
+            return &fillBorderReflectRow<uint8_t>;
+        }
+    };
+
+    m_fill_border_row = getFillBorderRowFunc(BorderType, CV_MAT_DEPTH(data_type));
+}
+
+namespace {
+template <int BorderType> int getBorderIdx(int log_idx, int desc_height);
+
+template<> int getBorderIdx<cv::BORDER_REPLICATE>(int log_idx, int desc_height)
+{
+    return log_idx < 0 ? 0 : desc_height - 1;
+}
+
+template<> int getBorderIdx<cv::BORDER_REFLECT_101>(int log_idx, int desc_height)
+{
+    return log_idx < 0 ? -log_idx : 2*(desc_height - 1) - log_idx;
+}
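+
+// Illustration (hypothetical desc_height == 5): BORDER_REPLICATE maps
+// log_idx -2 -> 0 and log_idx 6 -> 4 (clamped to the edge rows), while
+// BORDER_REFLECT_101 maps log_idx -2 -> 2 and log_idx 6 -> 2*(5-1)-6 == 2
+// (mirrored around the edge rows without repeating them).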
+} // namespace
+
+template <int BorderType>
+const uint8_t* fluid::BorderHandlerT<BorderType>::inLineB(int log_idx, const BufferStorageWithBorder& data, int desc_height) const
+{
+    auto idx = getBorderIdx<BorderType>(log_idx, desc_height);
+    return data.ptr(idx);
+}
+
+fluid::BorderHandlerT<cv::BORDER_CONSTANT>::BorderHandlerT(int border_size, cv::gapi::own::Scalar border_value)
+    : BorderHandler(border_size), m_border_value(border_value)
+{ /* nothing */ }
+
+const uint8_t* fluid::BorderHandlerT<cv::BORDER_CONSTANT>::inLineB(int /*log_idx*/, const BufferStorageWithBorder& /*data*/, int /*desc_height*/) const
+{
+    return m_const_border.ptr(0, m_border_size);
+}
+
+void fluid::BorderHandlerT<cv::BORDER_CONSTANT>::fillCompileTimeBorder(BufferStorageWithBorder& data)
+{
+    m_const_border.create(1, data.cols(), data.data().type());
+    m_const_border = m_border_value;
+
+    cv::gapi::fillBorderConstant(m_border_size, m_border_value, data.data());
+}
+
+template <int BorderType>
+void fluid::BorderHandlerT<BorderType>::updateBorderPixels(BufferStorageWithBorder &data, int startLine, int nLines) const
+{
+    auto& mat   = data.data();
+    auto length = mat.cols;
+    auto chan   = mat.channels();
+
+    for (int l = startLine; l < startLine + nLines; l++)
+    {
+        auto row = mat.ptr(data.physIdx(l));
+        m_fill_border_row(row, length, chan, m_border_size);
+    }
+}
+
+std::size_t fluid::BorderHandlerT<cv::BORDER_CONSTANT>::size() const
+{
+    return m_const_border.total() * m_const_border.elemSize();
+}
+
+// Fluid BufferStorage implementation //////////////////////////////////////////
+
+void fluid::BufferStorage::updateInCache(View::Cache& cache, int start_log_idx, int nLines) const
+{
+    for (int i = 0; i < nLines; i++)
+    {
+        cache.m_linePtrs[i] = inLineB(start_log_idx + i, cache.m_desc.size.height);
+    }
+}
+
+void fluid::BufferStorage::updateOutCache(Buffer::Cache& cache, int start_log_idx, int nLines)
+{
+    for (int i = 0; i < nLines; i++)
+    {
+        cache.m_linePtrs[i] = ptr(start_log_idx + i);
+    }
+}
+
+void fluid::BufferStorageWithBorder::init(int dtype, int border_size, Border border)
+{
+    switch(border.type)
+    {
+    case cv::BORDER_CONSTANT:
+        m_borderHandler.reset(new BorderHandlerT<cv::BORDER_CONSTANT>(border_size, border.value)); break;
+    case cv::BORDER_REPLICATE:
+        m_borderHandler.reset(new BorderHandlerT<cv::BORDER_REPLICATE>(border_size, dtype)); break;
+    case cv::BORDER_REFLECT_101:
+        m_borderHandler.reset(new BorderHandlerT<cv::BORDER_REFLECT_101>(border_size, dtype)); break;
+    default:
+        GAPI_Assert(false);
+    }
+}
+
+void fluid::BufferStorageWithBorder::create(int capacity, int desc_width, int dtype)
+{
+    auto borderSize = m_borderHandler->borderSize();
+    auto width = (desc_width + 2*borderSize);
+    m_data.create(capacity, width, dtype);
+
+    m_borderHandler->fillCompileTimeBorder(*this);
+}
+
+void fluid::BufferStorageWithoutBorder::create(int capacity, int desc_width, int dtype)
+{
+    auto width = desc_width;
+    m_data.create(capacity, width, dtype);
+
+    m_is_virtual = true;
+}
+
+const uint8_t* fluid::BufferStorageWithBorder::inLineB(int log_idx, int desc_height) const
+{
+    if (log_idx < 0 || log_idx >= desc_height)
+    {
+        return m_borderHandler->inLineB(log_idx, *this, desc_height);
+    }
+    else
+    {
+        return ptr(log_idx);
+    }
+}
+
+static void copyWithoutBorder(const cv::gapi::own::Mat& src, int src_border_size, cv::gapi::own::Mat& dst, int dst_border_size, int startSrcLine, int startDstLine, int lpi)
+{
+    auto subSrc = src(cv::gapi::own::Rect{src_border_size, startSrcLine, src.cols - 2*src_border_size, lpi});
+    auto subDst = dst(cv::gapi::own::Rect{dst_border_size, startDstLine, dst.cols - 2*dst_border_size, lpi});
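+
+    // Illustration (hypothetical sizes): src.cols == 12 with src_border_size == 2
+    // selects an 8-column ROI starting at x == 2; dst.cols == 10 with
+    // dst_border_size == 1 selects an 8-column ROI at x == 1, so the image
+    // parts match and only the border pixels are skipped.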
+
+    subSrc.copyTo(subDst);
+}
+
+void fluid::BufferStorageWithoutBorder::copyTo(BufferStorageWithBorder &dst, int startLine, int nLines) const
+{
+    for (int l = startLine; l < startLine + nLines; l++)
+    {
+        copyWithoutBorder(m_data, 0, dst.data(), dst.borderSize(), physIdx(l), dst.physIdx(l), 1);
+    }
+}
+
+void fluid::BufferStorageWithBorder::copyTo(BufferStorageWithBorder &dst, int startLine, int nLines) const
+{
+    // Copy required lpi lines line by line (to avoid wrap if invoked for multiple lines)
+    for (int l = startLine; l < startLine + nLines; l++)
+    {
+        copyWithoutBorder(m_data, borderSize(), dst.data(), dst.borderSize(), physIdx(l), dst.physIdx(l), 1);
+    }
+}
+
+// FIXME? remember parent and remove src parameter?
+void fluid::BufferStorageWithBorder::updateBeforeRead(int startLine, int nLines, const BufferStorage& src)
+{
+    // TODO:
+    // Cover with tests!!
+    // (Ensure that there are no redundant copies done
+    // and only required (not fetched before) lines are copied)
+
+    GAPI_DbgAssert(startLine >= 0);
+
+    src.copyTo(*this, startLine, nLines);
+    m_borderHandler->updateBorderPixels(*this, startLine, nLines);
+}
+
+void fluid::BufferStorageWithoutBorder::updateBeforeRead(int /*startLine*/, int /*lpi*/, const BufferStorage& /*src*/)
+{
+    /* nothing */
+}
+
+void fluid::BufferStorageWithBorder::updateAfterWrite(int startLine, int nLines)
+{
+    // FIXME?
+    // Actually, startLine + nLines can be greater than the logical height, so
+    // redundant end lines (which will never be read) can be filled in the ring buffer
+    m_borderHandler->updateBorderPixels(*this, startLine, nLines);
+}
+
+void fluid::BufferStorageWithoutBorder::updateAfterWrite(int /*startLine*/, int /*lpi*/)
+{
+    /* nothing */
+}
+
+size_t fluid::BufferStorageWithBorder::size() const
+{
+    return m_data.total()*m_data.elemSize() + m_borderHandler->size();
+}
+
+size_t fluid::BufferStorageWithoutBorder::size() const
+{
+    return m_data.total()*m_data.elemSize();
+}
+
+namespace fluid {
+namespace {
+std::unique_ptr<fluid::BufferStorage> createStorage(int capacity, int desc_width, int type,
+                                                    int border_size, fluid::BorderOpt border);
+std::unique_ptr<fluid::BufferStorage> createStorage(int capacity, int desc_width, int type,
+                                                    int border_size, fluid::BorderOpt border)
+{
+    if (border)
+    {
+        std::unique_ptr<fluid::BufferStorageWithBorder> storage(new BufferStorageWithBorder);
+        storage->init(type, border_size, border.value());
+        storage->create(capacity, desc_width, type);
+        return std::move(storage);
+    }
+
+    std::unique_ptr<BufferStorageWithoutBorder> storage(new BufferStorageWithoutBorder);
+    storage->create(capacity, desc_width, type);
+    return std::move(storage);
+}
+
+std::unique_ptr<BufferStorage> createStorage(const cv::gapi::own::Mat& data, cv::gapi::own::Rect roi);
+std::unique_ptr<BufferStorage> createStorage(const cv::gapi::own::Mat& data, cv::gapi::own::Rect roi)
+{
+    std::unique_ptr<BufferStorageWithoutBorder> storage(new BufferStorageWithoutBorder);
+    storage->attach(data, roi);
+    return std::move(storage);
+}
+} // namespace
+} // namespace fluid
+
+// Fluid View implementation ///////////////////////////////////////////////////
+
+void fluid::View::Priv::reset(int linesForFirstIteration)
+{
+    GAPI_DbgAssert(m_p);
+
+    m_lines_next_iter = linesForFirstIteration;
+    m_read_caret = m_p->priv().readStart();
+}
+
+void fluid::View::Priv::readDone(int linesRead, int linesForNextIteration)
+{
+    GAPI_DbgAssert(m_p);
+    m_read_caret += linesRead;
+    m_lines_next_iter = linesForNextIteration;
+}
+
+bool fluid::View::Priv::ready() const
+{
+    auto lastWrittenLine = m_p->priv().writeStart() + m_p->linesReady();
+    // + bottom border
+    if (lastWrittenLine == m_p->meta().size.height) lastWrittenLine += m_border_size;
+    // + top border
+    lastWrittenLine += m_border_size;
+
+    auto lastRequiredLine = m_read_caret + m_lines_next_iter;
+
+    return lastWrittenLine >= lastRequiredLine;
+}
+
+fluid::ViewPrivWithoutOwnBorder::ViewPrivWithoutOwnBorder(const Buffer *parent, int borderSize)
+{
+    GAPI_Assert(parent);
+    m_p           = parent;
+    m_border_size = borderSize;
+}
+
+const uint8_t* fluid::ViewPrivWithoutOwnBorder::InLineB(int index) const
+{
+    GAPI_DbgAssert(m_p);
+
+    const auto &p_priv = m_p->priv();
+
+    GAPI_DbgAssert(index >= -m_border_size
+                && index <  -m_border_size + m_lines_next_iter);
+
+    const int log_idx = m_read_caret + index;
+
+    return p_priv.storage().inLineB(log_idx, m_p->meta().size.height);
+}
+
+void fluid::ViewPrivWithoutOwnBorder::allocate(int lineConsumption, BorderOpt)
+{
+    initCache(lineConsumption);
+}
+
+void fluid::ViewPrivWithoutOwnBorder::prepareToRead()
+{
+    const auto &storage = m_p->priv().storage();
+
+    const int start_log_idx = m_read_caret - m_border_size;
+    storage.updateInCache(m_cache, start_log_idx, m_lines_next_iter);
+}
+
+fluid::ViewPrivWithOwnBorder::ViewPrivWithOwnBorder(const Buffer *parent, int borderSize)
+{
+    GAPI_Assert(parent);
+    m_p           = parent;
+    m_border_size = borderSize;
+}
+
+void fluid::ViewPrivWithOwnBorder::allocate(int lineConsumption, BorderOpt border)
+{
+    initCache(lineConsumption);
+
+    const auto& desc = m_cache.m_desc;
+    int  type = CV_MAKETYPE(desc.depth, desc.chan);
+    m_own_storage.init(type, m_border_size, border.value());
+    m_own_storage.create(lineConsumption, desc.size.width, type);
+}
+
+void fluid::ViewPrivWithOwnBorder::prepareToRead()
+{
+    int startLine = 0;
+    int nLines = 0;
+
+    if (m_read_caret == m_p->priv().readStart())
+    {
+        // Need to fetch full window on the first iteration
+        startLine = (m_read_caret > m_border_size) ? m_read_caret - m_border_size : 0;
+        nLines = m_lines_next_iter;
+    }
+    else
+    {
+        startLine = m_read_caret + m_border_size;
+        nLines = m_lines_next_iter - 2*m_border_size;
+    }
+
+    m_own_storage.updateBeforeRead(startLine, nLines, m_p->priv().storage());
+
+    const int start_log_idx = m_read_caret - m_border_size;
+    m_own_storage.updateInCache(m_cache, start_log_idx, m_lines_next_iter);
+}
+
+std::size_t fluid::ViewPrivWithOwnBorder::size() const
+{
+    GAPI_DbgAssert(m_p);
+    return m_own_storage.size();
+}
+
+const uint8_t* fluid::ViewPrivWithOwnBorder::InLineB(int index) const
+{
+    GAPI_DbgAssert(m_p);
+    GAPI_DbgAssert(index >= -m_border_size
+                && index <  -m_border_size + m_lines_next_iter);
+
+    const int log_idx = m_read_caret + index;
+
+    return m_own_storage.inLineB(log_idx, m_p->meta().size.height);
+}
+
+bool fluid::View::ready() const
+{
+    return m_priv->ready();
+}
+
+int fluid::View::y() const
+{
+    return m_priv->m_read_caret - m_priv->m_border_size;
+}
+
+fluid::View::Priv& fluid::View::priv()
+{
+    return *m_priv;
+}
+
+const fluid::View::Priv& fluid::View::priv() const
+{
+    return *m_priv;
+}
+
+void fluid::View::Priv::initCache(int lineConsumption)
+{
+    m_cache.m_linePtrs.resize(lineConsumption);
+    m_cache.m_desc = m_p->priv().meta();
+    m_cache.m_border_size = m_border_size;
+}
+
+// Fluid Buffer implementation /////////////////////////////////////////////////
+
+fluid::Buffer::Priv::Priv(int read_start, cv::gapi::own::Rect roi)
+    : m_readStart(read_start)
+    , m_roi(roi)
+{}
+
+void fluid::Buffer::Priv::init(const cv::GMatDesc &desc,
+                               int writer_lpi,
+                               int readStartPos,
+                               cv::gapi::own::Rect roi)
+{
+    m_writer_lpi = writer_lpi;
+    m_desc       = desc;
+    m_readStart  = readStartPos;
+    m_roi        = roi == own::Rect{} ? own::Rect{ 0, 0, desc.size.width, desc.size.height }
+                                      : roi;
+    m_cache.m_linePtrs.resize(writer_lpi);
+    m_cache.m_desc = desc;
+}
+
+void fluid::Buffer::Priv::allocate(BorderOpt border,
+                                   int border_size,
+                                   int line_consumption,
+                                   int skew)
+{
+    GAPI_Assert(line_consumption > 0);
+
+    // Init physical buffer
+
+    // FIXME? combine line_consumption with skew?
+    auto data_height = std::max(line_consumption, skew) + m_writer_lpi - 1;
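+    // Illustration (hypothetical values): line_consumption == 3, skew == 2 and
+    // m_writer_lpi == 2 give data_height == max(3, 2) + 2 - 1 == 4 ring rows.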
+
+    m_storage = createStorage(data_height,
+                              m_desc.size.width,
+                              CV_MAKETYPE(m_desc.depth, m_desc.chan),
+                              border_size,
+                              border);
+
+    // Finally, initialize carets
+    m_write_caret = writeStart();
+
+    m_storage->updateOutCache(m_cache, m_write_caret, m_writer_lpi);
+}
+
+void fluid::Buffer::Priv::bindTo(const cv::gapi::own::Mat &data, bool is_input)
+{
+    // FIXME: move all these fields into a separate structure
+    GAPI_Assert(m_desc == descr_of(data));
+
+    // Currently m_writer_lpi is obtained from metadata which is shared between islands,
+    // and this assert can trigger for a slot which connects two fluid islands.
+    // m_writer_lpi is used only in write-related functions and doesn't affect
+    // a buffer which is an island's input, so it's safe to skip this check.
+    // FIXME:
+    // Bring back this check when we move to the 1 buffer <-> 1 metadata model
+    // if (is_input) GAPI_Assert(m_writer_lpi == 1);
+
+    m_storage = createStorage(data, m_roi);
+
+    m_is_input    = is_input;
+    m_write_caret = is_input ? writeEnd(): writeStart();
+    // NB: views remain the same!
+
+    m_storage->updateOutCache(m_cache, m_write_caret, m_writer_lpi);
+}
+
+bool fluid::Buffer::Priv::full() const
+{
+    int slowest_y = writeEnd();
+    if (!m_views.empty())
+    {
+        // reset with maximum possible value and then find minimum
+        slowest_y = m_desc.size.height;
+        for (const auto &v : m_views) slowest_y = std::min(slowest_y, v.y());
+    }
+
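+    // Illustration (hypothetical state): with a 4-row ring (m_storage->rows() == 4),
+    // m_write_caret == 5, lpi() == 2 and the slowest reader at y == 3, the check is
+    // 5 + 2 - 3 == 4, which is not > 4, so the buffer is not full; with the slowest
+    // reader still at y == 2 it is 5 + 2 - 2 == 5 > 4, and the writer must wait.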
+    return m_write_caret + lpi() - slowest_y > m_storage->rows();
+}
+
+void fluid::Buffer::Priv::writeDone()
+{
+    // There are possible optimizations which can be done to fill border values
+    // at graph compile time (for example, when the border is const),
+    // so there is no need to update border values after each write.
+    // If such optimizations weren't applied, fill the border for the lines
+    // which have just been written
+    m_storage->updateAfterWrite(m_write_caret, m_writer_lpi);
+
+    // The final write may produce fewer lines than LPI, so
+    // the write caret may exceed the logical buffer size
+    m_write_caret += m_writer_lpi;
+    // FIXME: add consistency check!
+
+    m_storage->updateOutCache(m_cache, m_write_caret, m_writer_lpi);
+}
+
+void fluid::Buffer::Priv::reset()
+{
+    m_write_caret = m_is_input ? writeEnd() : writeStart();
+    m_storage->updateOutCache(m_cache, m_write_caret, m_writer_lpi);
+}
+
+int fluid::Buffer::Priv::size() const
+{
+    std::size_t view_sz = 0;
+    for (const auto &v : m_views) view_sz += v.priv().size();
+
+    auto total = view_sz;
+    if (m_storage) total += m_storage->size();
+
+    // FIXME: Change API to return size_t!!!
+    return static_cast<int>(total);
+}
+
+int fluid::Buffer::Priv::linesReady() const
+{
+    if (m_is_input)
+    {
+        return m_storage->rows();
+    }
+    else
+    {
+        const int writes = std::min(m_write_caret - writeStart(), outputLines());
+        return writes;
+    }
+}
+
+uint8_t* fluid::Buffer::Priv::OutLineB(int index)
+{
+    GAPI_DbgAssert(index >= 0 && index < m_writer_lpi);
+
+    return m_storage->ptr(m_write_caret + index);
+}
+
+int fluid::Buffer::Priv::lpi() const
+{
+    // FIXME:
+    // m_write_caret can be greater than m_writeRoi.y + m_writeRoi.height, so the return value can be negative!!!
+    return std::min(writeEnd() - m_write_caret, m_writer_lpi);
+}
+
+fluid::Buffer::Buffer()
+    : m_priv(new Priv())
+    , m_cache(&m_priv->cache())
+{
+}
+
+fluid::Buffer::Buffer(const cv::GMatDesc &desc)
+    : m_priv(new Priv())
+    , m_cache(&m_priv->cache())
+{
+    int lineConsumption = 1;
+    int border = 0, skew = 0, wlpi = 1, readStart = 0;
+    cv::gapi::own::Rect roi = {0, 0, desc.size.width, desc.size.height};
+    m_priv->init(desc, wlpi, readStart, roi);
+    m_priv->allocate({}, border, lineConsumption, skew);
+}
+
+fluid::Buffer::Buffer(const cv::GMatDesc &desc,
+                      int max_line_consumption,
+                      int border_size,
+                      int skew,
+                      int wlpi,
+                      BorderOpt border)
+    : m_priv(new Priv())
+    , m_cache(&m_priv->cache())
+{
+    int readStart = 0;
+    cv::gapi::own::Rect roi = {0, 0, desc.size.width, desc.size.height};
+    m_priv->init(desc, wlpi, readStart, roi);
+    m_priv->allocate(border, border_size, max_line_consumption, skew);
+}
+
+fluid::Buffer::Buffer(const cv::gapi::own::Mat &data, bool is_input)
+    : m_priv(new Priv())
+    , m_cache(&m_priv->cache())
+{
+    int wlpi = 1, readStart = 0;
+    cv::gapi::own::Rect roi{0, 0, data.cols, data.rows};
+    m_priv->init(descr_of(data), wlpi, readStart, roi);
+    m_priv->bindTo(data, is_input);
+}
+
+int fluid::Buffer::linesReady() const
+{
+    return m_priv->linesReady();
+}
+
+int fluid::Buffer::lpi() const
+{
+    return m_priv->lpi();
+}
+
+fluid::View::View(Priv* p)
+    : m_priv(p), m_cache(&p->cache())
+{ /* nothing */ }
+
+fluid::View fluid::Buffer::mkView(int borderSize, bool ownStorage)
+{
+    // FIXME: logic outside of Priv (because View takes pointer to Buffer)
+    auto view = ownStorage ? View(new ViewPrivWithOwnBorder(this, borderSize))
+                           : View(new ViewPrivWithoutOwnBorder(this, borderSize));
+    m_priv->addView(view);
+    return view;
+}
+
+void fluid::debugBufferPriv(const fluid::Buffer& buffer, std::ostream &os)
+{
+    // FIXME Use cv::gapi::own Size and Rect with operator<<, when merged ADE-285
+    const auto& p = buffer.priv();
+    os << "Fluid buffer " << std::hex << &buffer << std::dec
+       << " " << p.m_desc.size.width << " x " << p.m_desc.size.height << "]"
+       << " readStart:" << p.m_readStart
+       << " roi:" << "[" << p.m_roi.width << " x " << p.m_roi.height << " from (" << p.m_roi.x << ", " << p.m_roi.y << ")]"
+       <<" (phys " << "[" << p.storage().cols() << " x " <<  p.storage().rows() << "]" << ") :"
+       << "  w: " << p.m_write_caret
+       << ", r: [";
+    for (const auto &v : p.m_views) { os << &v.priv() << ":" << v.y() << " "; }
+    os << "], avail: " << buffer.linesReady()
+       << std::endl;
+}
+
+void fluid::Buffer::debug(std::ostream &os) const
+{
+    debugBufferPriv(*this, os);
+}
+
+fluid::Buffer::Priv& fluid::Buffer::priv()
+{
+    return *m_priv;
+}
+
+const fluid::Buffer::Priv& fluid::Buffer::priv() const
+{
+    return *m_priv;
+}
+
+int fluid::Buffer::y() const
+{
+    return m_priv->y();
+}
+
+} // namespace gapi
+} // namespace cv
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbuffer_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidbuffer_priv.hpp
new file mode 100644 (file)
index 0000000..1f3eadc
--- /dev/null
@@ -0,0 +1,308 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_FLUID_BUFFER_PRIV_HPP
+#define OPENCV_GAPI_FLUID_BUFFER_PRIV_HPP
+
+#include <vector>
+
+#include "opencv2/gapi/fluid/gfluidbuffer.hpp"
+#include "opencv2/gapi/own/exports.hpp" // GAPI_EXPORTS
+
+namespace cv {
+namespace gapi {
+namespace fluid {
+
+class BufferStorageWithBorder;
+
+class BorderHandler
+{
+protected:
+    int m_border_size;
+
+public:
+    BorderHandler(int border_size);
+    virtual ~BorderHandler() = default;
+    virtual const uint8_t* inLineB(int log_idx, const BufferStorageWithBorder &data, int desc_height) const = 0;
+
+    // Fills border pixels after buffer allocation (if possible, i.e. for a const border)
+    inline virtual void fillCompileTimeBorder(BufferStorageWithBorder &) { /* nothing */ }
+
+    // Fills required border lines
+    inline virtual void updateBorderPixels(BufferStorageWithBorder& /*data*/, int /*startLine*/, int /*lpi*/) const { /* nothing */ }
+
+    inline int borderSize() const { return m_border_size; }
+    inline virtual std::size_t size() const { return 0; }
+};
+
+template<int BorderType>
+class BorderHandlerT : public BorderHandler
+{
+    std::function<void(uint8_t*,int,int,int)> m_fill_border_row;
+public:
+    BorderHandlerT(int border_size, int data_type);
+    virtual void updateBorderPixels(BufferStorageWithBorder& data, int startLine, int lpi) const override;
+    virtual const uint8_t* inLineB(int log_idx, const BufferStorageWithBorder &data, int desc_height) const override;
+};
+
+template<>
+class BorderHandlerT<cv::BORDER_CONSTANT> : public BorderHandler
+{
+    cv::gapi::own::Scalar m_border_value;
+    cv::gapi::own::Mat m_const_border;
+
+public:
+    BorderHandlerT(int border_size, cv::gapi::own::Scalar border_value);
+    virtual const uint8_t* inLineB(int log_idx, const BufferStorageWithBorder &data, int desc_height) const override;
+    virtual void fillCompileTimeBorder(BufferStorageWithBorder &) override;
+    virtual std::size_t size() const override;
+};
+
+class BufferStorage
+{
+protected:
+    cv::gapi::own::Mat m_data;
+
+public:
+    void updateInCache(View::Cache& cache, int start_log_idx, int nLines) const;
+    void updateOutCache(Buffer::Cache& cache, int start_log_idx, int nLines);
+
+    virtual void copyTo(BufferStorageWithBorder &dst, int startLine, int nLines) const = 0;
+
+    virtual ~BufferStorage() = default;
+
+    virtual const uint8_t* ptr(int idx) const = 0;
+    virtual       uint8_t* ptr(int idx) = 0;
+
+    inline bool empty() const { return m_data.empty(); }
+
+    inline const cv::gapi::own::Mat& data() const { return m_data; }
+    inline       cv::gapi::own::Mat& data()       { return m_data; }
+
+    inline int rows() const { return m_data.rows; }
+    inline int cols() const { return m_data.cols; }
+    inline int type() const { return m_data.type(); }
+
+    virtual const uint8_t* inLineB(int log_idx, int desc_height) const = 0;
+
+    // FIXME? remember parent and remove src parameter?
+    virtual void updateBeforeRead(int startLine, int nLines, const BufferStorage& src) = 0;
+    virtual void updateAfterWrite(int startLine, int nLines) = 0;
+
+    virtual inline int physIdx(int logIdx) const = 0;
+
+    virtual size_t size() const = 0;
+};
+
+class BufferStorageWithoutBorder final : public BufferStorage
+{
+    bool m_is_virtual = true;
+    cv::gapi::own::Rect m_roi;
+
+public:
+    virtual void copyTo(BufferStorageWithBorder &dst, int startLine, int nLines) const override;
+
+    inline virtual const uint8_t* ptr(int idx) const override
+    {
+        GAPI_DbgAssert((m_is_virtual && m_roi == cv::gapi::own::Rect{}) || (!m_is_virtual && m_roi != cv::gapi::own::Rect{}));
+        return m_data.ptr(physIdx(idx), 0);
+    }
+    inline virtual uint8_t* ptr(int idx) override
+    {
+        GAPI_DbgAssert((m_is_virtual && m_roi == cv::gapi::own::Rect{}) || (!m_is_virtual && m_roi != cv::gapi::own::Rect{}));
+        return m_data.ptr(physIdx(idx), 0);
+    }
+
+    inline void attach(const cv::gapi::own::Mat& _data, cv::gapi::own::Rect _roi)
+    {
+        m_data = _data(_roi);
+        m_roi = _roi;
+        m_is_virtual = false;
+    }
+
+    void create(int capacity, int desc_width, int type);
+
+    inline virtual const uint8_t* inLineB(int log_idx, int /*desc_height*/) const override { return ptr(log_idx); }
+
+    virtual void updateBeforeRead(int startLine, int nLines, const BufferStorage& src) override;
+    virtual void updateAfterWrite(int startLine, int nLines) override;
+
+    inline virtual int physIdx(int logIdx) const override { return (logIdx - m_roi.y) % m_data.rows; }
+
+    virtual size_t size() const override;
+};
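+
+// Worked example for the ring mapping above (illustrative): with
+// m_roi.y == 10 and m_data.rows == 4, logical lines 10..14 map to physical
+// rows 0, 1, 2, 3, 0, so logical line 14 reuses the storage row of line 10
+// once that line is no longer needed by any reader.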
+
+class BufferStorageWithBorder final: public BufferStorage
+{
+    std::unique_ptr<BorderHandler> m_borderHandler;
+
+public:
+    inline int borderSize() const { return m_borderHandler->borderSize(); }
+
+    virtual void copyTo(BufferStorageWithBorder &dst, int startLine, int nLines) const override;
+
+    inline virtual const uint8_t* ptr(int idx) const override
+    {
+        return m_data.ptr(physIdx(idx), borderSize());
+    }
+    inline virtual uint8_t* ptr(int idx) override
+    {
+        return m_data.ptr(physIdx(idx), borderSize());
+    }
+
+    void init(int depth, int border_size, Border border);
+    void create(int capacity, int desc_width, int dtype);
+
+    virtual const uint8_t* inLineB(int log_idx, int desc_height) const override;
+
+    virtual void updateBeforeRead(int startLine, int nLines, const BufferStorage &src) override;
+    virtual void updateAfterWrite(int startLine, int nLines) override;
+
+    inline virtual int physIdx(int logIdx) const override { return logIdx % m_data.rows; }
+
+    virtual size_t size() const override;
+};
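+
+// Orientation note (inferred from the code above, not from the original
+// docs): this storage keeps the left/right border columns inside the same
+// allocation, which is why ptr() offsets each row by borderSize(); physIdx()
+// here is a plain logIdx % rows, as the bordered storage appears to track
+// logical lines from 0 rather than from an ROI offset.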
+
+// FIXME: GAPI_EXPORTS is used here only to access internal methods
+// like readDone/writeDone in low-level tests
+class GAPI_EXPORTS View::Priv
+{
+    friend class View;
+protected:
+    View::Cache m_cache;
+
+    const Buffer *m_p           = nullptr; // FIXME replace with weak_ptr
+    int           m_read_caret  = -1;
+    int           m_lines_next_iter = -1;
+    int m_border_size = -1;
+
+public:
+    virtual ~Priv() = default;
+    // API used by actors/backend
+
+    const View::Cache& cache() const { return m_cache; }
+    void initCache(int lineConsumption);
+
+    virtual void allocate(int lineConsumption, BorderOpt border) = 0;
+    virtual void prepareToRead() = 0;
+
+    void readDone(int linesRead, int linesForNextIteration);
+    void reset(int linesForFirstIteration);
+
+    virtual std::size_t size() const = 0;
+
+    // Does the view have enough unread lines for the next iteration?
+    bool ready() const;
+
+    // API used (indirectly) by user code
+    virtual const uint8_t* InLineB(int index) const = 0;
+};
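+
+// Hypothetical call sequence (an illustration of the protocol above, not a
+// snippet from the backend):
+//
+//     priv.reset(lines_for_first_iteration);  // position the read caret
+//     while (priv.ready())                    // enough unread lines?
+//     {
+//         // ... the actor consumes lines via InLineB()/cache() ...
+//         priv.readDone(lines_read, lines_for_next_iteration);
+//     }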
+
+class ViewPrivWithoutOwnBorder final : public View::Priv
+{
+public:
+    // API used by actors/backend
+    ViewPrivWithoutOwnBorder(const Buffer *p, int borderSize);
+
+    virtual void allocate(int lineConsumption, BorderOpt) override;
+    virtual void prepareToRead() override;
+
+    inline virtual std::size_t size() const override { return 0; }
+
+    // API used (indirectly) by user code
+    virtual const uint8_t* InLineB(int index) const override;
+};
+
+class ViewPrivWithOwnBorder final : public View::Priv
+{
+    BufferStorageWithBorder m_own_storage;
+
+public:
+    // API used by actors/backend
+    ViewPrivWithOwnBorder(const Buffer *p, int borderSize);
+
+    virtual void allocate(int lineConsumption, BorderOpt border) override;
+    virtual void prepareToRead() override;
+    virtual std::size_t size() const override;
+
+    // API used (indirectly) by user code
+    virtual const uint8_t* InLineB(int index) const override;
+};
+
+void debugBufferPriv(const Buffer& buffer, std::ostream &os);
+
+// FIXME: GAPI_EXPORTS is used here only to access internal methods
+// like readDone/writeDone in low-level tests
+class GAPI_EXPORTS Buffer::Priv
+{
+    Buffer::Cache m_cache;
+
+    int m_writer_lpi       =  1;
+
+    cv::GMatDesc m_desc    = cv::GMatDesc{-1,-1,{-1,-1}};
+    bool m_is_input        = false;
+
+    int m_write_caret      = -1;
+
+    std::vector<View> m_views;
+
+    std::unique_ptr<BufferStorage> m_storage;
+
+    // The coordinate starting from which this buffer is assumed
+    // to be read (border not taken into account)
+    int m_readStart;
+    cv::gapi::own::Rect m_roi;
+
+    friend void debugBufferPriv(const Buffer& p, std::ostream &os);
+
+public:
+    Priv() = default;
+    Priv(int read_start, cv::gapi::own::Rect roi);
+
+    inline const BufferStorage& storage() const { return *m_storage.get(); }
+
+    // API used by actors/backend
+    void init(const cv::GMatDesc &desc,
+              int writer_lpi,
+              int readStart,
+              cv::gapi::own::Rect roi);
+
+    void allocate(BorderOpt border, int border_size, int line_consumption, int skew);
+    void bindTo(const cv::gapi::own::Mat &data, bool is_input);
+
+    inline void addView(const View& view) { m_views.push_back(view); }
+
+    inline const GMatDesc& meta() const { return m_desc; }
+
+    bool full() const;
+    void writeDone();
+    void reset();
+    int size() const;
+
+    int linesReady() const;
+
+    inline int y() const { return m_write_caret; }
+
+    inline int writer_lpi()     const { return m_writer_lpi; }
+
+    // API used (indirectly) by user code
+    uint8_t* OutLineB(int index = 0);
+    int lpi() const;
+
+    inline int readStart()   const { return m_readStart; }
+    inline int writeStart()  const { return m_roi.y; }
+    inline int writeEnd()    const { return m_roi.y + m_roi.height; }
+    inline int outputLines() const { return m_roi.height; }
+
+    inline const Buffer::Cache& cache() const { return m_cache; }
+};
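+
+// Orientation note (inferred from the members above): m_write_caret is the
+// index of the next line to be produced, so y() reports the writer's current
+// row; readers compare linesReady() against their own carets (see
+// View::Priv::ready()) to decide whether another iteration may run.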
+
+} // namespace cv::gapi::fluid
+} // namespace cv::gapi
+} // namespace cv
+
+#endif // OPENCV_GAPI_FLUID_BUFFER_PRIV_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidcore.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidcore.cpp
new file mode 100644 (file)
index 0000000..16a63e2
--- /dev/null
@@ -0,0 +1,2193 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+#if !defined(GAPI_STANDALONE)
+
+#include "precomp.hpp"
+
+#include "opencv2/gapi/own/assert.hpp"
+#include "opencv2/core/traits.hpp"
+#include "opencv2/core/hal/hal.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+
+#include "opencv2/gapi/core.hpp"
+
+#include "opencv2/gapi/fluid/gfluidbuffer.hpp"
+#include "opencv2/gapi/fluid/gfluidkernel.hpp"
+#include "opencv2/gapi/fluid/core.hpp"
+
+#include "gfluidbuffer_priv.hpp"
+#include "gfluidbackend.hpp"
+#include "gfluidutils.hpp"
+
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+
+namespace cv {
+namespace gapi {
+namespace fluid {
+
+//---------------------
+//
+// Arithmetic functions
+//
+//---------------------
+
+template<typename DST, typename SRC1, typename SRC2>
+static inline DST absdiff(SRC1 x, SRC2 y)
+{
+    auto result = x > y? x - y: y - x;
+    return saturate<DST>(result, roundf);
+}
+
+template<typename DST, typename SRC1, typename SRC2>
+static inline DST addWeighted(SRC1 src1, SRC2 src2, float alpha, float beta, float gamma)
+{
+    float dst = src1*alpha + src2*beta + gamma;
+    return saturate<DST>(dst, roundf);
+}
+
+template<typename DST, typename SRC1, typename SRC2>
+static inline DST add(SRC1 x, SRC2 y)
+{
+    return saturate<DST>(x + y, roundf);
+}
+
+template<typename DST, typename SRC1, typename SRC2>
+static inline DST sub(SRC1 x, SRC2 y)
+{
+    return saturate<DST>(x - y, roundf);
+}
+
+template<typename DST, typename SRC1, typename SRC2>
+static inline DST subr(SRC1 x, SRC2 y)
+{
+    return saturate<DST>(y - x, roundf); // reverse: y - x
+}
+
+template<typename DST, typename SRC1, typename SRC2>
+static inline DST mul(SRC1 x, SRC2 y, float scale=1)
+{
+    auto result = scale * x * y;
+    return saturate<DST>(result, rintf);
+}
+
+template<typename DST, typename SRC1, typename SRC2>
+static inline DST div(SRC1 x, SRC2 y, float scale=1)
+{
+    // like OpenCV: returns 0 if y == 0
+    auto result = y? scale * x / y: 0;
+    return saturate<DST>(result, rintf);
+}
+
+template<typename DST, typename SRC1, typename SRC2>
+static inline DST divr(SRC1 x, SRC2 y, float scale=1)
+{
+    auto result = x? scale * y / x: 0; // reverse: y / x
+    return saturate<DST>(result, rintf);
+}
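+
+// A minimal sanity sketch for the helpers above (assuming saturate<> from
+// gfluidutils.hpp clamps to the destination range):
+//
+//     div<uchar>(10, 0)  == 0    // division by zero yields 0, as in cv::divide
+//     mul<uchar>(200, 2) == 255  // 400 saturates to the 8-bit maximum
+//     sub<uchar>(10, 20) == 0    // -10 saturates to the 8-bit minimum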
+
+//---------------------------
+//
+// Fluid kernels: addWeighted
+//
+//---------------------------
+
+template<typename DST, typename SRC1, typename SRC2>
+static void run_addweighted(Buffer &dst, const View &src1, const View &src2,
+                            double alpha, double beta, double gamma)
+{
+    static_assert(std::is_same<SRC1, SRC2>::value, "wrong types");
+
+    const auto *in1 = src1.InLine<SRC1>(0);
+    const auto *in2 = src2.InLine<SRC2>(0);
+          auto *out = dst.OutLine<DST>();
+
+    int width  = dst.length();
+    int chan   = dst.meta().chan;
+    int length = width * chan;
+
+    // NB: assume in/out types are not 64-bit
+    auto _alpha = static_cast<float>( alpha );
+    auto _beta  = static_cast<float>( beta  );
+    auto _gamma = static_cast<float>( gamma );
+
+    for (int l=0; l < length; l++)
+        out[l] = addWeighted<DST>(in1[l], in2[l], _alpha, _beta, _gamma);
+}
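+
+// Worked example (illustrative): with alpha = beta = 0.5 and gamma = 0 this
+// is a 50/50 blend; for uchar inputs 100 and 60 the output pixel is
+// saturate<uchar>(100*0.5f + 60*0.5f + 0.f, roundf) == 80.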
+
+GAPI_FLUID_KERNEL(GFluidAddW, cv::gapi::core::GAddW, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, double alpha, const View &src2,
+                                      double beta, double gamma, int /*dtype*/,
+                        Buffer &dst)
+    {
+        //      DST     SRC1    SRC2    OP               __VA_ARGS__
+        BINARY_(uchar , uchar , uchar , run_addweighted, dst, src1, src2, alpha, beta, gamma);
+        BINARY_(uchar , ushort, ushort, run_addweighted, dst, src1, src2, alpha, beta, gamma);
+        BINARY_(uchar ,  short,  short, run_addweighted, dst, src1, src2, alpha, beta, gamma);
+        BINARY_( short,  short,  short, run_addweighted, dst, src1, src2, alpha, beta, gamma);
+        BINARY_(ushort, ushort, ushort, run_addweighted, dst, src1, src2, alpha, beta, gamma);
+        BINARY_( float, uchar , uchar , run_addweighted, dst, src1, src2, alpha, beta, gamma);
+        BINARY_( float, ushort, ushort, run_addweighted, dst, src1, src2, alpha, beta, gamma);
+        BINARY_( float,  short,  short, run_addweighted, dst, src1, src2, alpha, beta, gamma);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+//--------------------------
+//
+// Fluid kernels: +, -, *, /
+//
+//--------------------------
+
+enum Arithm { ARITHM_ABSDIFF, ARITHM_ADD, ARITHM_SUBTRACT, ARITHM_MULTIPLY, ARITHM_DIVIDE };
+
+template<typename DST, typename SRC1, typename SRC2>
+static void run_arithm(Buffer &dst, const View &src1, const View &src2, Arithm arithm,
+                       double scale=1)
+{
+    static_assert(std::is_same<SRC1, SRC2>::value, "wrong types");
+
+    const auto *in1 = src1.InLine<SRC1>(0);
+    const auto *in2 = src2.InLine<SRC2>(0);
+          auto *out = dst.OutLine<DST>();
+
+    int width  = dst.length();
+    int chan   = dst.meta().chan;
+    int length = width * chan;
+
+    // NB: assume in/out types are not 64-bit
+    float _scale = static_cast<float>( scale );
+
+    switch (arithm)
+    {
+    case ARITHM_ABSDIFF:
+        for (int l=0; l < length; l++)
+            out[l] = absdiff<DST>(in1[l], in2[l]);
+        break;
+    case ARITHM_ADD:
+        for (int l=0; l < length; l++)
+            out[l] = add<DST>(in1[l], in2[l]);
+        break;
+    case ARITHM_SUBTRACT:
+        for (int l=0; l < length; l++)
+            out[l] = sub<DST>(in1[l], in2[l]);
+        break;
+    case ARITHM_MULTIPLY:
+        for (int l=0; l < length; l++)
+            out[l] = mul<DST>(in1[l], in2[l], _scale);
+        break;
+    case ARITHM_DIVIDE:
+        for (int l=0; l < length; l++)
+            out[l] = div<DST>(in1[l], in2[l], _scale);
+        break;
+    default: CV_Error(cv::Error::StsBadArg, "unsupported arithmetic operation");
+    }
+}
+
+GAPI_FLUID_KERNEL(GFluidAdd, cv::gapi::core::GAdd, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, int /*dtype*/, Buffer &dst)
+    {
+        //      DST     SRC1    SRC2    OP          __VA_ARGS__
+        BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_ADD);
+        BINARY_(uchar ,  short,  short, run_arithm, dst, src1, src2, ARITHM_ADD);
+        BINARY_(uchar ,  float,  float, run_arithm, dst, src1, src2, ARITHM_ADD);
+        BINARY_( short,  short,  short, run_arithm, dst, src1, src2, ARITHM_ADD);
+        BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_ADD);
+        BINARY_( float,  short,  short, run_arithm, dst, src1, src2, ARITHM_ADD);
+        BINARY_( float,  float,  float, run_arithm, dst, src1, src2, ARITHM_ADD);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidSub, cv::gapi::core::GSub, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, int /*dtype*/, Buffer &dst)
+    {
+        //      DST     SRC1    SRC2    OP          __VA_ARGS__
+        BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(uchar ,  short,  short, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_(uchar ,  float,  float, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_( short,  short,  short, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_( float,  short,  short, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+        BINARY_( float,  float,  float, run_arithm, dst, src1, src2, ARITHM_SUBTRACT);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidMul, cv::gapi::core::GMul, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, double scale, int /*dtype*/, Buffer &dst)
+    {
+        //      DST     SRC1    SRC2    OP          __VA_ARGS__
+        BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(uchar ,  short,  short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_(uchar ,  float,  float, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_( short,  short,  short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_( float,  short,  short, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+        BINARY_( float,  float,  float, run_arithm, dst, src1, src2, ARITHM_MULTIPLY, scale);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidDiv, cv::gapi::core::GDiv, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, double scale, int /*dtype*/, Buffer &dst)
+    {
+        //      DST     SRC1    SRC2    OP          __VA_ARGS__
+        BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
+        BINARY_(uchar ,  short,  short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
+        BINARY_(uchar ,  float,  float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
+        BINARY_( short,  short,  short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
+        BINARY_( float, uchar , uchar , run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
+        BINARY_( float,  short,  short, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
+        BINARY_( float,  float,  float, run_arithm, dst, src1, src2, ARITHM_DIVIDE, scale);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidAbsDiff, cv::gapi::core::GAbsDiff, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, Buffer &dst)
+    {
+        //      DST     SRC1    SRC2    OP          __VA_ARGS__
+        BINARY_(uchar , uchar , uchar , run_arithm, dst, src1, src2, ARITHM_ABSDIFF);
+        BINARY_(ushort, ushort, ushort, run_arithm, dst, src1, src2, ARITHM_ABSDIFF);
+        BINARY_( short,  short,  short, run_arithm, dst, src1, src2, ARITHM_ABSDIFF);
+        BINARY_( float,  float,  float, run_arithm, dst, src1, src2, ARITHM_ABSDIFF);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+//--------------------------------------
+//
+// Fluid kernels: +, -, *, / with Scalar
+//
+//--------------------------------------
+
+static inline v_uint16x8  v_add_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return x + y; }
+static inline v_uint16x8  v_sub_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return x - y; }
+static inline v_uint16x8 v_subr_16u(const v_uint16x8 &x, const v_uint16x8 &y) { return y - x; }
+
+static inline v_float32x4  v_add_32f(const v_float32x4 &x, const v_float32x4 &y) { return x + y; }
+static inline v_float32x4  v_sub_32f(const v_float32x4 &x, const v_float32x4 &y) { return x - y; }
+static inline v_float32x4 v_subr_32f(const v_float32x4 &x, const v_float32x4 &y) { return y - x; }
+
+static inline int  s_add_8u(uchar x, uchar y) { return x + y; }
+static inline int  s_sub_8u(uchar x, uchar y) { return x - y; }
+static inline int s_subr_8u(uchar x, uchar y) { return y - x; }
+
+static inline float  s_add_32f(float x, float y) { return x + y; }
+static inline float  s_sub_32f(float x, float y) { return x - y; }
+static inline float s_subr_32f(float x, float y) { return y - x; }
+
+// manual SIMD for the important case of 8UC3
+static void run_arithm_s3(uchar out[], const uchar in[], int width, const uchar scalar[],
+                          v_uint16x8 (*v_op)(const v_uint16x8&, const v_uint16x8&),
+                          int (*s_op)(uchar, uchar))
+{
+    int w = 0;
+
+#if CV_SIMD128
+    for (; w <= width-16; w+=16)
+    {
+        v_uint8x16 x, y, z;
+        v_load_deinterleave(&in[3*w], x, y, z);
+
+        v_uint16x8 r0, r1;
+
+        v_expand(x, r0, r1);
+        r0 = v_op(r0, v_setall_u16(scalar[0])); // x + scalar[0]
+        r1 = v_op(r1, v_setall_u16(scalar[0]));
+        x = v_pack(r0, r1);
+
+        v_expand(y, r0, r1);
+        r0 = v_op(r0, v_setall_u16(scalar[1])); // y + scalar[1]
+        r1 = v_op(r1, v_setall_u16(scalar[1]));
+        y = v_pack(r0, r1);
+
+        v_expand(z, r0, r1);
+        r0 = v_op(r0, v_setall_u16(scalar[2])); // z + scalar[2]
+        r1 = v_op(r1, v_setall_u16(scalar[2]));
+        z = v_pack(r0, r1);
+
+        v_store_interleave(&out[3*w], x, y, z);
+    }
+#endif
+    UNUSED(v_op);
+    for (; w < width; w++)
+    {
+        out[3*w    ] = saturate<uchar>( s_op(in[3*w    ], scalar[0]) );
+        out[3*w + 1] = saturate<uchar>( s_op(in[3*w + 1], scalar[1]) );
+        out[3*w + 2] = saturate<uchar>( s_op(in[3*w + 2], scalar[2]) );
+    }
+}
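+
+// Note on the vector path above (explanatory, not from the original sources):
+// each iteration deinterleaves 16 8UC3 pixels into channel planes, widens the
+// 8-bit lanes to 16 bits so the per-channel scalar can be applied with
+// saturating 16-bit arithmetic, and packs back to 8 bits with saturation;
+// the scalar loop then covers the remaining width % 16 pixels.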
+
+// manual SIMD for rounding 32F into 8U, single channel
+static void run_arithm_s1(uchar out[], const float in[], int width, const float scalar[],
+                          v_float32x4 (*v_op)(const v_float32x4&, const v_float32x4&),
+                          float (*s_op)(float, float))
+{
+    int w = 0;
+
+#if CV_SIMD128
+    for (; w <= width-16; w+=16)
+    {
+        v_float32x4 r0, r1, r2, r3;
+        r0 = v_load(&in[w     ]);
+        r1 = v_load(&in[w +  4]);
+        r2 = v_load(&in[w +  8]);
+        r3 = v_load(&in[w + 12]);
+
+        r0 = v_op(r0, v_setall_f32(scalar[0])); // r + scalar[0]
+        r1 = v_op(r1, v_setall_f32(scalar[0]));
+        r2 = v_op(r2, v_setall_f32(scalar[0]));
+        r3 = v_op(r3, v_setall_f32(scalar[0]));
+
+        v_int32x4 i0, i1, i2, i3;
+        i0 = v_round(r0);
+        i1 = v_round(r1);
+        i2 = v_round(r2);
+        i3 = v_round(r3);
+
+        v_uint16x8 us0, us1;
+        us0 = v_pack_u(i0, i1);
+        us1 = v_pack_u(i2, i3);
+
+        v_uint8x16 uc;
+        uc = v_pack(us0, us1);
+
+        v_store(&out[w], uc);
+    }
+#endif
+    UNUSED(v_op);
+    for (; w < width; w++)
+    {
+        out[w] = saturate<uchar>(s_op(in[w], scalar[0]), std::roundf);
+    }
+}
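+
+// Note (an observation about the code above): the vector path rounds with
+// v_round (round to nearest even) while the scalar tail uses std::roundf
+// (halves away from zero), so the two may differ on exact .5 fractions;
+// v_pack_u and v_pack perform the saturating narrowing from int32 to uint8.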
+
+static void run_arithm_s_add3(uchar out[], const uchar in[], int width, const uchar scalar[])
+{
+    run_arithm_s3(out, in, width, scalar, v_add_16u, s_add_8u);
+}
+
+static void run_arithm_s_sub3(uchar out[], const uchar in[], int width, const uchar scalar[])
+{
+    run_arithm_s3(out, in, width, scalar, v_sub_16u, s_sub_8u);
+}
+
+static void run_arithm_s_subr3(uchar out[], const uchar in[], int width, const uchar scalar[])
+{
+    run_arithm_s3(out, in, width, scalar, v_subr_16u, s_subr_8u); // reverse: subr
+}
+
+static void run_arithm_s_add1(uchar out[], const float in[], int width, const float scalar[])
+{
+    run_arithm_s1(out, in, width, scalar, v_add_32f, s_add_32f);
+}
+
+static void run_arithm_s_sub1(uchar out[], const float in[], int width, const float scalar[])
+{
+    run_arithm_s1(out, in, width, scalar, v_sub_32f, s_sub_32f);
+}
+
+static void run_arithm_s_subr1(uchar out[], const float in[], int width, const float scalar[])
+{
+    run_arithm_s1(out, in, width, scalar, v_subr_32f, s_subr_32f); // reverse: subr
+}
+
+// manually unroll the inner loop over channels
+template<typename DST, typename SRC, typename SCALAR, typename FUNC>
+static void run_arithm_s(DST out[], const SRC in[], int width, int chan,
+                         const SCALAR scalar[4], FUNC func)
+{
+    if (chan == 4)
+    {
+        for (int w=0; w < width; w++)
+        {
+            out[4*w + 0] = func(in[4*w + 0], scalar[0]);
+            out[4*w + 1] = func(in[4*w + 1], scalar[1]);
+            out[4*w + 2] = func(in[4*w + 2], scalar[2]);
+            out[4*w + 3] = func(in[4*w + 3], scalar[3]);
+        }
+    }
+    else
+    if (chan == 3)
+    {
+        for (int w=0; w < width; w++)
+        {
+            out[3*w + 0] = func(in[3*w + 0], scalar[0]);
+            out[3*w + 1] = func(in[3*w + 1], scalar[1]);
+            out[3*w + 2] = func(in[3*w + 2], scalar[2]);
+        }
+    }
+    else
+    if (chan == 2)
+    {
+        for (int w=0; w < width; w++)
+        {
+            out[2*w + 0] = func(in[2*w + 0], scalar[0]);
+            out[2*w + 1] = func(in[2*w + 1], scalar[1]);
+        }
+    }
+    else
+    if (chan == 1)
+    {
+        for (int w=0; w < width; w++)
+        {
+            out[w] = func(in[w], scalar[0]);
+        }
+    }
+    else
+        CV_Error(cv::Error::StsBadArg, "unsupported number of channels");
+}
+
+template<typename DST, typename SRC>
+static void run_arithm_s(Buffer &dst, const View &src, const float scalar[4], Arithm arithm,
+                         float scale=1)
+{
+    const auto *in  = src.InLine<SRC>(0);
+          auto *out = dst.OutLine<DST>();
+
+    int width  = dst.length();
+    int chan   = dst.meta().chan;
+
+    // Cast the scalar to the SRC type: if the values survive the cast, the faster SRC-typed paths below apply
+    const SRC myscal[4] = { static_cast<SRC>(scalar[0]), static_cast<SRC>(scalar[1]),
+                            static_cast<SRC>(scalar[2]), static_cast<SRC>(scalar[3]) };
+    bool usemyscal = (myscal[0] == scalar[0]) && (myscal[1] == scalar[1]) &&
+                     (myscal[2] == scalar[2]) && (myscal[3] == scalar[3]);
+
+    switch (arithm)
+    {
+    case ARITHM_ABSDIFF:
+        for (int w=0; w < width; w++)
+            for (int c=0; c < chan; c++)
+                out[chan*w + c] = absdiff<DST>(in[chan*w + c], scalar[c]);
+        break;
+    case ARITHM_ADD:
+        if (usemyscal)
+        {
+            if (std::is_same<DST,uchar>::value &&
+                std::is_same<SRC,uchar>::value &&
+                chan == 3)
+                run_arithm_s_add3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal);
+            else if (std::is_same<DST,uchar>::value &&
+                     std::is_same<SRC,float>::value &&
+                     chan == 1)
+                run_arithm_s_add1((uchar*)out, (const float*)in, width, (const float*)myscal);
+            else
+                run_arithm_s(out, in, width, chan, myscal, add<DST,SRC,SRC>);
+        }
+        else
+            run_arithm_s(out, in, width, chan, scalar, add<DST,SRC,float>);
+        break;
+    case ARITHM_SUBTRACT:
+        if (usemyscal)
+        {
+            if (std::is_same<DST,uchar>::value &&
+                std::is_same<SRC,uchar>::value &&
+                chan == 3)
+                run_arithm_s_sub3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal);
+            else if (std::is_same<DST,uchar>::value &&
+                     std::is_same<SRC,float>::value &&
+                     chan == 1)
+                run_arithm_s_sub1((uchar*)out, (const float*)in, width, (const float*)myscal);
+            else
+                run_arithm_s(out, in, width, chan, myscal, sub<DST,SRC,SRC>);
+        }
+        else
+            run_arithm_s(out, in, width, chan, scalar, sub<DST,SRC,float>);
+        break;
+    // TODO: optimize multiplication and division
+    case ARITHM_MULTIPLY:
+        for (int w=0; w < width; w++)
+            for (int c=0; c < chan; c++)
+                out[chan*w + c] = mul<DST>(in[chan*w + c], scalar[c], scale);
+        break;
+    case ARITHM_DIVIDE:
+        for (int w=0; w < width; w++)
+            for (int c=0; c < chan; c++)
+                out[chan*w + c] = div<DST>(in[chan*w + c], scalar[c], scale);
+        break;
+    default: CV_Error(cv::Error::StsBadArg, "unsupported arithmetic operation");
+    }
+}
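+
+// Example of the usemyscal fast path (illustrative): adding Scalar(10,20,30)
+// to an 8UC3 image survives the cast to uchar, so run_arithm_s_add3 with its
+// SIMD loop is taken; adding Scalar(0.5) does not survive the cast, and the
+// generic per-element loop with the float scalar runs instead.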
+
+template<typename DST, typename SRC>
+static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], Arithm arithm,
+                          float scale=1)
+{
+    const auto *in  = src.InLine<SRC>(0);
+          auto *out = dst.OutLine<DST>();
+
+    int width  = dst.length();
+    int chan   = dst.meta().chan;
+
+    // Cast the scalar to the SRC type: if the values survive the cast, the faster SRC-typed paths below apply
+    const SRC myscal[4] = { static_cast<SRC>(scalar[0]), static_cast<SRC>(scalar[1]),
+                            static_cast<SRC>(scalar[2]), static_cast<SRC>(scalar[3]) };
+    bool usemyscal = (myscal[0] == scalar[0]) && (myscal[1] == scalar[1]) &&
+                     (myscal[2] == scalar[2]) && (myscal[3] == scalar[3]);
+
+    switch (arithm)
+    {
+    case ARITHM_SUBTRACT:
+        if (usemyscal)
+        {
+            if (std::is_same<DST,uchar>::value &&
+                std::is_same<SRC,uchar>::value &&
+                chan == 3)
+                run_arithm_s_subr3((uchar*)out, (const uchar*)in, width, (const uchar*)myscal);
+            else if (std::is_same<DST,uchar>::value &&
+                     std::is_same<SRC,float>::value &&
+                     chan == 1)
+                run_arithm_s_subr1((uchar*)out, (const float*)in, width, (const float*)myscal);
+            else
+                run_arithm_s(out, in, width, chan, myscal, subr<DST,SRC,SRC>);
+        }
+        else
+            run_arithm_s(out, in, width, chan, scalar, subr<DST,SRC,float>);
+        break;
+    // TODO: optimize division
+    case ARITHM_DIVIDE:
+        for (int w=0; w < width; w++)
+            for (int c=0; c < chan; c++)
+                out[chan*w + c] = div<DST>(scalar[c], in[chan*w + c], scale);
+        break;
+    default: CV_Error(cv::Error::StsBadArg, "unsupported arithmetic operation");
+    }
+}
+
+GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, const cv::Scalar &_scalar, Buffer &dst)
+    {
+        const float scalar[4] = {
+            static_cast<float>(_scalar[0]),
+            static_cast<float>(_scalar[1]),
+            static_cast<float>(_scalar[2]),
+            static_cast<float>(_scalar[3])
+        };
+
+        //     DST     SRC     OP            __VA_ARGS__
+        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
+        UNARY_(ushort, ushort, run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
+        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst)
+    {
+        const float scalar[4] = {
+            static_cast<float>(_scalar[0]),
+            static_cast<float>(_scalar[1]),
+            static_cast<float>(_scalar[2]),
+            static_cast<float>(_scalar[3])
+        };
+
+        //     DST     SRC     OP            __VA_ARGS__
+        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_ADD);
+        UNARY_(uchar ,  short, run_arithm_s, dst, src, scalar, ARITHM_ADD);
+        UNARY_(uchar ,  float, run_arithm_s, dst, src, scalar, ARITHM_ADD);
+        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_ADD);
+        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_ADD);
+        UNARY_( float,  short, run_arithm_s, dst, src, scalar, ARITHM_ADD);
+        UNARY_( float,  float, run_arithm_s, dst, src, scalar, ARITHM_ADD);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidSubC, cv::gapi::core::GSubC, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst)
+    {
+        const float scalar[4] = {
+            static_cast<float>(_scalar[0]),
+            static_cast<float>(_scalar[1]),
+            static_cast<float>(_scalar[2]),
+            static_cast<float>(_scalar[3])
+        };
+
+        //     DST     SRC     OP            __VA_ARGS__
+        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar ,  short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar ,  float, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_( float,  short, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_( float,  float, run_arithm_s, dst, src, scalar, ARITHM_SUBTRACT);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidSubRC, cv::gapi::core::GSubRC, false)
+{
+    static const int Window = 1;
+
+    static void run(const cv::Scalar &_scalar, const View &src, int /*dtype*/, Buffer &dst)
+    {
+        const float scalar[4] = {
+            static_cast<float>(_scalar[0]),
+            static_cast<float>(_scalar[1]),
+            static_cast<float>(_scalar[2]),
+            static_cast<float>(_scalar[3])
+        };
+
+        //     DST     SRC     OP             __VA_ARGS__
+        UNARY_(uchar , uchar , run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar ,  short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_(uchar ,  float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_( short,  short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_( float, uchar , run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_( float,  short, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+        UNARY_( float,  float, run_arithm_rs, dst, src, scalar, ARITHM_SUBTRACT);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidMulC, cv::gapi::core::GMulC, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, const cv::Scalar &_scalar, int /*dtype*/, Buffer &dst)
+    {
+        const float scalar[4] = {
+            static_cast<float>(_scalar[0]),
+            static_cast<float>(_scalar[1]),
+            static_cast<float>(_scalar[2]),
+            static_cast<float>(_scalar[3])
+        };
+        const float scale = 1.f;
+
+        //     DST     SRC     OP            __VA_ARGS__
+        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+        UNARY_(uchar ,  short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+        UNARY_(uchar ,  float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+        UNARY_( float,  short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+        UNARY_( float,  float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidMulCOld, cv::gapi::core::GMulCOld, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, double _scalar, int /*dtype*/, Buffer &dst)
+    {
+        const float scalar[4] = {
+            static_cast<float>(_scalar),
+            static_cast<float>(_scalar),
+            static_cast<float>(_scalar),
+            static_cast<float>(_scalar)
+        };
+        const float scale = 1.f;
+
+        //     DST     SRC     OP            __VA_ARGS__
+        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+        UNARY_(uchar ,  short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+        UNARY_(uchar ,  float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+        UNARY_( float,  short, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+        UNARY_( float,  float, run_arithm_s, dst, src, scalar, ARITHM_MULTIPLY, scale);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidDivC, cv::gapi::core::GDivC, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, const cv::Scalar &_scalar, double _scale, int /*dtype*/,
+                    Buffer &dst)
+    {
+        const float scalar[4] = {
+            static_cast<float>(_scalar[0]),
+            static_cast<float>(_scalar[1]),
+            static_cast<float>(_scalar[2]),
+            static_cast<float>(_scalar[3])
+        };
+        const float scale = static_cast<float>(_scale);
+
+        //     DST     SRC     OP            __VA_ARGS__
+        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(uchar ,  short, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(uchar ,  float, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_( float, uchar , run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_( float,  short, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_( float,  float, run_arithm_s, dst, src, scalar, ARITHM_DIVIDE, scale);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidDivRC, cv::gapi::core::GDivRC, false)
+{
+    static const int Window = 1;
+
+    static void run(const cv::Scalar &_scalar, const View &src, double _scale, int /*dtype*/,
+                    Buffer &dst)
+    {
+        const float scalar[4] = {
+            static_cast<float>(_scalar[0]),
+            static_cast<float>(_scalar[1]),
+            static_cast<float>(_scalar[2]),
+            static_cast<float>(_scalar[3])
+        };
+        const float scale = static_cast<float>(_scale);
+
+        //     DST     SRC     OP             __VA_ARGS__
+        UNARY_(uchar , uchar , run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(uchar ,  short, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_(uchar ,  float, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_( short,  short, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_( float, uchar , run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_( float,  short, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+        UNARY_( float,  float, run_arithm_rs, dst, src, scalar, ARITHM_DIVIDE, scale);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+//----------------------------
+//
+// Fluid math kernels: bitwise
+//
+//----------------------------
+
+enum Bitwise { BW_AND, BW_OR, BW_XOR, BW_NOT };
+
+template<typename DST, typename SRC1, typename SRC2>
+static void run_bitwise2(Buffer &dst, const View &src1, const View &src2, Bitwise bitwise)
+{
+    static_assert(std::is_same<DST, SRC1>::value, "wrong types");
+    static_assert(std::is_same<DST, SRC2>::value, "wrong types");
+
+    const auto *in1 = src1.InLine<SRC1>(0);
+    const auto *in2 = src2.InLine<SRC2>(0);
+          auto *out = dst.OutLine<DST>();
+
+    int width  = dst.length();
+    int chan   = dst.meta().chan;
+    int length = width * chan;
+
+    switch (bitwise)
+    {
+    case BW_AND:
+        for (int l=0; l < length; l++)
+            out[l] = in1[l] & in2[l];
+        break;
+    case BW_OR:
+        for (int l=0; l < length; l++)
+            out[l] = in1[l] | in2[l];
+        break;
+    case BW_XOR:
+        for (int l=0; l < length; l++)
+            out[l] = in1[l] ^ in2[l];
+        break;
+    default: CV_Error(cv::Error::StsBadArg, "unsupported bitwise operation");
+    }
+}
+
+template<typename DST, typename SRC>
+static void run_bitwise1(Buffer &dst, const View &src, Bitwise bitwise)
+{
+    static_assert(std::is_same<DST, SRC>::value, "wrong types");
+
+    const auto *in  = src.InLine<SRC>(0);
+          auto *out = dst.OutLine<DST>();
+
+    int width  = dst.length();
+    int chan   = dst.meta().chan;
+    int length = width * chan;
+
+    switch (bitwise)
+    {
+    case BW_NOT:
+        for (int l=0; l < length; l++)
+            out[l] = ~in[l];
+        break;
+    default: CV_Error(cv::Error::StsBadArg, "unsupported bitwise operation");
+    }
+}
+
+GAPI_FLUID_KERNEL(GFluidAnd, cv::gapi::core::GAnd, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, Buffer &dst)
+    {
+
+        //      DST     SRC1    SRC2    OP            __VA_ARGS__
+        BINARY_(uchar , uchar , uchar , run_bitwise2, dst, src1, src2, BW_AND);
+        BINARY_(ushort, ushort, ushort, run_bitwise2, dst, src1, src2, BW_AND);
+        BINARY_( short,  short,  short, run_bitwise2, dst, src1, src2, BW_AND);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidOr, cv::gapi::core::GOr, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, Buffer &dst)
+    {
+
+        //      DST     SRC1    SRC2    OP            __VA_ARGS__
+        BINARY_(uchar , uchar , uchar , run_bitwise2, dst, src1, src2, BW_OR);
+        BINARY_(ushort, ushort, ushort, run_bitwise2, dst, src1, src2, BW_OR);
+        BINARY_( short,  short,  short, run_bitwise2, dst, src1, src2, BW_OR);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidXor, cv::gapi::core::GXor, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, Buffer &dst)
+    {
+
+        //      DST     SRC1    SRC2    OP            __VA_ARGS__
+        BINARY_(uchar , uchar , uchar , run_bitwise2, dst, src1, src2, BW_XOR);
+        BINARY_(ushort, ushort, ushort, run_bitwise2, dst, src1, src2, BW_XOR);
+        BINARY_( short,  short,  short, run_bitwise2, dst, src1, src2, BW_XOR);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidNot, cv::gapi::core::GNot, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, Buffer &dst)
+    {
+        //     DST     SRC     OP            __VA_ARGS__
+        UNARY_(uchar , uchar , run_bitwise1, dst, src, BW_NOT);
+        UNARY_(ushort, ushort, run_bitwise1, dst, src, BW_NOT);
+        UNARY_( short,  short, run_bitwise1, dst, src, BW_NOT);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+//-------------------
+//
+// Fluid kernels: LUT
+//
+//-------------------
+
+GAPI_FLUID_KERNEL(GFluidLUT, cv::gapi::core::GLUT, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, const cv::Mat& lut, Buffer &dst)
+    {
+        GAPI_Assert(CV_8U == dst.meta().depth);
+        GAPI_Assert(CV_8U == src.meta().depth);
+
+        GAPI_DbgAssert(CV_8U == lut.type());
+        GAPI_DbgAssert(256 == lut.cols * lut.rows);
+        GAPI_DbgAssert(dst.length() == src.length());
+        GAPI_DbgAssert(dst.meta().chan == src.meta().chan);
+
+        const auto *in  = src.InLine<uchar>(0);
+              auto *out = dst.OutLine<uchar>();
+
+        int width  = dst.length();
+        int chan   = dst.meta().chan;
+        int length = width * chan;
+
+        for (int l=0; l < length; l++)
+            out[l] = lut.data[ in[l] ];
+    }
+};
+
+//-------------------------
+//
+// Fluid kernels: convertTo
+//
+//-------------------------
+
+template<typename DST, typename SRC>
+static void run_convertto(Buffer &dst, const View &src, double _alpha, double _beta)
+{
+    const auto *in  = src.InLine<SRC>(0);
+          auto *out = dst.OutLine<DST>();
+
+    int width  = dst.length();
+    int chan   = dst.meta().chan;
+    int length = width * chan;
+
+    // NB: don't do this if SRC or DST is 64-bit
+    auto alpha = static_cast<float>( _alpha );
+    auto beta  = static_cast<float>( _beta  );
+
+    // compute faster when alpha and beta are trivial (alpha == 1, beta == 0)
+    if (alpha == 1 && beta == 0)
+    {
+        // manual SIMD where rounding is needed
+        if (std::is_integral<DST>::value && std::is_floating_point<SRC>::value)
+        {
+            GAPI_Assert(( std::is_same<SRC,float>::value ));
+
+            int l = 0; // loop index
+
+        #if CV_SIMD128
+            if (std::is_same<DST,uchar>::value)
+            {
+                for (; l <= length-16; l+=16)
+                {
+                    v_int32x4 i0, i1, i2, i3;
+                    i0 = v_round( v_load( (float*)& in[l     ] ) );
+                    i1 = v_round( v_load( (float*)& in[l +  4] ) );
+                    i2 = v_round( v_load( (float*)& in[l +  8] ) );
+                    i3 = v_round( v_load( (float*)& in[l + 12] ) );
+
+                    v_uint16x8 us0, us1;
+                    us0 = v_pack_u(i0, i1);
+                    us1 = v_pack_u(i2, i3);
+
+                    v_uint8x16 uc;
+                    uc = v_pack(us0, us1);
+                    v_store((uchar*)& out[l], uc);
+                }
+            }
+            if (std::is_same<DST,ushort>::value)
+            {
+                for (; l <= length-8; l+=8)
+                {
+                    v_int32x4 i0, i1;
+                    i0 = v_round( v_load( (float*)& in[l     ] ) );
+                    i1 = v_round( v_load( (float*)& in[l +  4] ) );
+
+                    v_uint16x8 us;
+                    us = v_pack_u(i0, i1);
+                    v_store((ushort*)& out[l], us);
+                }
+            }
+        #endif
+
+            // tail of the SIMD loop
+            for (; l < length; l++)
+            {
+                out[l] = saturate<DST>(in[l], rintf);
+            }
+        }
+        else if (std::is_integral<DST>::value) // here SRC is integral
+        {
+            for (int l=0; l < length; l++)
+            {
+                out[l] = saturate<DST>(in[l]);
+            }
+        }
+        else // DST is floating-point, SRC is any
+        {
+            for (int l=0; l < length; l++)
+            {
+                out[l] = static_cast<DST>(in[l]);
+            }
+        }
+    }
+    else // if alpha or beta is non-trivial
+    {
+        // TODO: optimize if alpha and beta and data are integral
+        for (int l=0; l < length; l++)
+        {
+            out[l] = saturate<DST>(in[l]*alpha + beta, rintf);
+        }
+    }
+}
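+
+// Behaviour sketch (illustrative): converting float to CV_8U with alpha == 1
+// and beta == 0 takes the SIMD rounding path above, e.g. 255.6f saturates to
+// 255 and -3.f to 0; with a non-trivial alpha (say 1/255.f) every element
+// goes through saturate<DST>(in[l]*alpha + beta, rintf) in the generic loop.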
+
+GAPI_FLUID_KERNEL(GFluidConvertTo, cv::gapi::core::GConvertTo, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, int /*rtype*/, double alpha, double beta, Buffer &dst)
+    {
+        //     DST     SRC     OP             __VA_ARGS__
+        UNARY_(uchar , uchar , run_convertto, dst, src, alpha, beta);
+        UNARY_(uchar , ushort, run_convertto, dst, src, alpha, beta);
+        UNARY_(uchar ,  float, run_convertto, dst, src, alpha, beta);
+        UNARY_(ushort, uchar , run_convertto, dst, src, alpha, beta);
+        UNARY_(ushort, ushort, run_convertto, dst, src, alpha, beta);
+        UNARY_(ushort,  float, run_convertto, dst, src, alpha, beta);
+        UNARY_( float, uchar , run_convertto, dst, src, alpha, beta);
+        UNARY_( float, ushort, run_convertto, dst, src, alpha, beta);
+        UNARY_( float,  float, run_convertto, dst, src, alpha, beta);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+//-----------------------------
+//
+// Fluid math kernels: min, max
+//
+//-----------------------------
+
+enum Minmax { MM_MIN, MM_MAX };
+
+template<typename DST, typename SRC1, typename SRC2>
+static void run_minmax(Buffer &dst, const View &src1, const View &src2, Minmax minmax)
+{
+    static_assert(std::is_same<DST, SRC1>::value, "wrong types");
+    static_assert(std::is_same<DST, SRC2>::value, "wrong types");
+
+    const auto *in1 = src1.InLine<SRC1>(0);
+    const auto *in2 = src2.InLine<SRC2>(0);
+          auto *out = dst.OutLine<DST>();
+
+    int width = dst.length();
+    int chan  = dst.meta().chan;
+
+    int length = width * chan;
+
+    switch (minmax)
+    {
+    case MM_MIN:
+        for (int l=0; l < length; l++)
+            out[l] = in1[l] < in2[l]? in1[l]: in2[l];
+        break;
+    case MM_MAX:
+        for (int l=0; l < length; l++)
+            out[l] = in1[l] > in2[l]? in1[l]: in2[l];
+        break;
+    default: CV_Error(cv::Error::StsBadArg, "unsupported min/max operation");
+    }
+}
+
+GAPI_FLUID_KERNEL(GFluidMin, cv::gapi::core::GMin, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, Buffer &dst)
+    {
+        //      DST     SRC1    SRC2    OP          __VA_ARGS__
+        BINARY_(uchar , uchar , uchar , run_minmax, dst, src1, src2, MM_MIN);
+        BINARY_(ushort, ushort, ushort, run_minmax, dst, src1, src2, MM_MIN);
+        BINARY_( short,  short,  short, run_minmax, dst, src1, src2, MM_MIN);
+        BINARY_( float,  float,  float, run_minmax, dst, src1, src2, MM_MIN);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidMax, cv::gapi::core::GMax, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, Buffer &dst)
+    {
+        //      DST     SRC1    SRC2    OP          __VA_ARGS__
+        BINARY_(uchar , uchar , uchar , run_minmax, dst, src1, src2, MM_MAX);
+        BINARY_(ushort, ushort, ushort, run_minmax, dst, src1, src2, MM_MAX);
+        BINARY_( short,  short,  short, run_minmax, dst, src1, src2, MM_MAX);
+        BINARY_( float,  float,  float, run_minmax, dst, src1, src2, MM_MAX);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+//-----------------------
+//
+// Fluid kernels: compare
+//
+//-----------------------
+
+enum Compare { CMP_EQ, CMP_NE, CMP_GE, CMP_GT, CMP_LE, CMP_LT };
+
+template<typename DST, typename SRC1, typename SRC2>
+static void run_cmp(Buffer &dst, const View &src1, const View &src2, Compare compare)
+{
+    static_assert(std::is_same<SRC1, SRC2>::value, "wrong types");
+    static_assert(std::is_same<DST, uchar>::value, "wrong types");
+
+    const auto *in1 = src1.InLine<SRC1>(0);
+    const auto *in2 = src2.InLine<SRC2>(0);
+          auto *out = dst.OutLine<DST>();
+
+    int width = dst.length();
+    int chan  = dst.meta().chan;
+
+    int length = width * chan;
+
+    switch (compare)
+    {
+    case CMP_EQ:
+        for (int l=0; l < length; l++)
+            out[l] = in1[l] == in2[l]? 255: 0;
+        break;
+    case CMP_NE:
+        for (int l=0; l < length; l++)
+            out[l] = in1[l] != in2[l]? 255: 0;
+        break;
+    case CMP_GE:
+        for (int l=0; l < length; l++)
+            out[l] = in1[l] >= in2[l]? 255: 0;
+        break;
+    case CMP_LE:
+        for (int l=0; l < length; l++)
+            out[l] = in1[l] <= in2[l]? 255: 0;
+        break;
+    case CMP_GT:
+        for (int l=0; l < length; l++)
+            out[l] = in1[l] > in2[l]? 255: 0;
+        break;
+    case CMP_LT:
+        for (int l=0; l < length; l++)
+            out[l] = in1[l] < in2[l]? 255: 0;
+        break;
+    default:
+        CV_Error(cv::Error::StsBadArg, "unsupported compare operation");
+    }
+}
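+
+// Note (explanatory): as in cv::compare, the result is a mask image with 255
+// where the predicate holds and 0 elsewhere, which is why DST is statically
+// required to be uchar.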
+
+GAPI_FLUID_KERNEL(GFluidCmpEQ, cv::gapi::core::GCmpEQ, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, Buffer &dst)
+    {
+        //      DST    SRC1    SRC2    OP       __VA_ARGS__
+        BINARY_(uchar, uchar , uchar , run_cmp, dst, src1, src2, CMP_EQ);
+        BINARY_(uchar,  short,  short, run_cmp, dst, src1, src2, CMP_EQ);
+        BINARY_(uchar,  float,  float, run_cmp, dst, src1, src2, CMP_EQ);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidCmpNE, cv::gapi::core::GCmpNE, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, Buffer &dst)
+    {
+        //      DST    SRC1    SRC2    OP       __VA_ARGS__
+        BINARY_(uchar, uchar , uchar , run_cmp, dst, src1, src2, CMP_NE);
+        BINARY_(uchar,  short,  short, run_cmp, dst, src1, src2, CMP_NE);
+        BINARY_(uchar,  float,  float, run_cmp, dst, src1, src2, CMP_NE);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidCmpGE, cv::gapi::core::GCmpGE, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, Buffer &dst)
+    {
+        //      DST    SRC1    SRC2    OP       __VA_ARGS__
+        BINARY_(uchar, uchar , uchar , run_cmp, dst, src1, src2, CMP_GE);
+        BINARY_(uchar,  short,  short, run_cmp, dst, src1, src2, CMP_GE);
+        BINARY_(uchar,  float,  float, run_cmp, dst, src1, src2, CMP_GE);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidCmpGT, cv::gapi::core::GCmpGT, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, Buffer &dst)
+    {
+        //      DST    SRC1    SRC2    OP       __VA_ARGS__
+        BINARY_(uchar, uchar , uchar , run_cmp, dst, src1, src2, CMP_GT);
+        BINARY_(uchar,  short,  short, run_cmp, dst, src1, src2, CMP_GT);
+        BINARY_(uchar,  float,  float, run_cmp, dst, src1, src2, CMP_GT);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidCmpLE, cv::gapi::core::GCmpLE, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, Buffer &dst)
+    {
+        //      DST    SRC1    SRC2    OP       __VA_ARGS__
+        BINARY_(uchar, uchar , uchar , run_cmp, dst, src1, src2, CMP_LE);
+        BINARY_(uchar,  short,  short, run_cmp, dst, src1, src2, CMP_LE);
+        BINARY_(uchar,  float,  float, run_cmp, dst, src1, src2, CMP_LE);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidCmpLT, cv::gapi::core::GCmpLT, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, Buffer &dst)
+    {
+        //      DST    SRC1    SRC2    OP       __VA_ARGS__
+        BINARY_(uchar, uchar , uchar , run_cmp, dst, src1, src2, CMP_LT);
+        BINARY_(uchar,  short,  short, run_cmp, dst, src1, src2, CMP_LT);
+        BINARY_(uchar,  float,  float, run_cmp, dst, src1, src2, CMP_LT);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+//---------------------
+//
+// Compare with GScalar
+//
+//---------------------
+
+template<typename DST, typename SRC, typename SCALAR=double>
+static void run_cmp(DST out[], const SRC in[], int length, Compare compare, SCALAR s)
+{
+    switch (compare)
+    {
+    case CMP_EQ:
+        for (int l=0; l < length; l++)
+            out[l] = in[l] == s? 255: 0;
+        break;
+    case CMP_NE:
+        for (int l=0; l < length; l++)
+            out[l] = in[l] != s? 255: 0;
+        break;
+    case CMP_GE:
+        for (int l=0; l < length; l++)
+            out[l] = in[l] >= s? 255: 0;
+        break;
+    case CMP_LE:
+        for (int l=0; l < length; l++)
+            out[l] = in[l] <= s? 255: 0;
+        break;
+    case CMP_GT:
+        for (int l=0; l < length; l++)
+            out[l] = in[l] > s? 255: 0;
+        break;
+    case CMP_LT:
+        for (int l=0; l < length; l++)
+            out[l] = in[l] < s? 255: 0;
+        break;
+    default:
+        CV_Error(cv::Error::StsBadArg, "unsupported compare operation");
+    }
+}
+
+template<typename DST, typename SRC>
+static void run_cmp(Buffer &dst, const View &src, Compare compare, const cv::Scalar &scalar)
+{
+    static_assert(std::is_same<DST, uchar>::value, "wrong types");
+
+    const auto *in  = src.InLine<SRC>(0);
+          auto *out = dst.OutLine<DST>();
+
+    int width = dst.length();
+    int chan  = dst.meta().chan;
+
+    int length = width * chan;
+
+    // compute faster if the scalar rounds exactly to SRC
+    double d =                   scalar[0]  ;
+    SRC    s = static_cast<SRC>( scalar[0] );
+
+    if (s == d)
+        run_cmp(out, in, length, compare, s);
+    else
+        run_cmp(out, in, length, compare, d);
+}
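+
+// Example of the fast path above (illustrative): comparing a CV_8U image
+// against Scalar(100) round-trips through uchar exactly, so elements are
+// compared in SRC; Scalar(99.5) does not round-trip, so the comparison runs
+// in double instead.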
+
+GAPI_FLUID_KERNEL(GFluidCmpEQScalar, cv::gapi::core::GCmpEQScalar, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, const cv::Scalar &scalar, Buffer &dst)
+    {
+        //     DST    SRC     OP       __VA_ARGS__
+        UNARY_(uchar, uchar , run_cmp, dst, src, CMP_EQ, scalar);
+        UNARY_(uchar,  short, run_cmp, dst, src, CMP_EQ, scalar);
+        UNARY_(uchar,  float, run_cmp, dst, src, CMP_EQ, scalar);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidCmpNEScalar, cv::gapi::core::GCmpNEScalar, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, const cv::Scalar &scalar, Buffer &dst)
+    {
+        //     DST    SRC     OP       __VA_ARGS__
+        UNARY_(uchar, uchar , run_cmp, dst, src, CMP_NE, scalar);
+        UNARY_(uchar,  short, run_cmp, dst, src, CMP_NE, scalar);
+        UNARY_(uchar,  float, run_cmp, dst, src, CMP_NE, scalar);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidCmpGEScalar, cv::gapi::core::GCmpGEScalar, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, const cv::Scalar &scalar, Buffer &dst)
+    {
+        //     DST    SRC     OP       __VA_ARGS__
+        UNARY_(uchar, uchar , run_cmp, dst, src, CMP_GE, scalar);
+        UNARY_(uchar,  short, run_cmp, dst, src, CMP_GE, scalar);
+        UNARY_(uchar,  float, run_cmp, dst, src, CMP_GE, scalar);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidCmpGTScalar, cv::gapi::core::GCmpGTScalar, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, const cv::Scalar &scalar, Buffer &dst)
+    {
+        //     DST    SRC     OP       __VA_ARGS__
+        UNARY_(uchar, uchar , run_cmp, dst, src, CMP_GT, scalar);
+        UNARY_(uchar,  short, run_cmp, dst, src, CMP_GT, scalar);
+        UNARY_(uchar,  float, run_cmp, dst, src, CMP_GT, scalar);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidCmpLEScalar, cv::gapi::core::GCmpLEScalar, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, const cv::Scalar &scalar, Buffer &dst)
+    {
+        //     DST    SRC     OP       __VA_ARGS__
+        UNARY_(uchar, uchar , run_cmp, dst, src, CMP_LE, scalar);
+        UNARY_(uchar,  short, run_cmp, dst, src, CMP_LE, scalar);
+        UNARY_(uchar,  float, run_cmp, dst, src, CMP_LE, scalar);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidCmpLTScalar, cv::gapi::core::GCmpLTScalar, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, const cv::Scalar &scalar, Buffer &dst)
+    {
+        //     DST    SRC     OP       __VA_ARGS__
+        UNARY_(uchar, uchar , run_cmp, dst, src, CMP_LT, scalar);
+        UNARY_(uchar,  short, run_cmp, dst, src, CMP_LT, scalar);
+        UNARY_(uchar,  float, run_cmp, dst, src, CMP_LT, scalar);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+//-------------------------
+//
+// Fluid kernels: threshold
+//
+//-------------------------
+
+template<typename DST, typename SRC>
+static void run_threshold(Buffer &dst, const View &src, const cv::Scalar &thresh,
+                                                        const cv::Scalar &maxval,
+                                                                     int  type)
+{
+    static_assert(std::is_same<DST, SRC>::value, "wrong types");
+
+    const auto *in  = src.InLine<SRC>(0);
+          auto *out = dst.OutLine<DST>();
+
+    int width = dst.length();
+    int chan  = dst.meta().chan;
+
+    int length = width * chan;
+
+    DST thresh_ = saturate<DST>(thresh[0], floord);
+    DST threshd = saturate<DST>(thresh[0], roundd);
+    DST maxvald = saturate<DST>(maxval[0], roundd);
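+    // note: for integer DST, in[l] > thresh is equivalent to
+    // in[l] > floor(thresh), so the comparison value is floored, while the
+    // replacement value written by THRESH_TRUNC is rounded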
+
+    switch (type)
+    {
+    case cv::THRESH_BINARY:
+        for (int l=0; l < length; l++)
+            out[l] = in[l] > thresh_? maxvald: 0;
+        break;
+    case cv::THRESH_BINARY_INV:
+        for (int l=0; l < length; l++)
+            out[l] = in[l] > thresh_? 0: maxvald;
+        break;
+    case cv::THRESH_TRUNC:
+        for (int l=0; l < length; l++)
+            out[l] = in[l] > thresh_? threshd: in[l];
+        break;
+    case cv::THRESH_TOZERO:
+        for (int l=0; l < length; l++)
+            out[l] = in[l] > thresh_? in[l]: 0;
+        break;
+    case cv::THRESH_TOZERO_INV:
+        for (int l=0; l < length; l++)
+            out[l] = in[l] > thresh_? 0: in[l];
+        break;
+    default: CV_Error(cv::Error::StsBadArg, "unsupported threshold type");
+    }
+}
+
+GAPI_FLUID_KERNEL(GFluidThreshold, cv::gapi::core::GThreshold, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, const cv::Scalar &thresh,
+                                     const cv::Scalar &maxval,
+                                                  int  type,
+                        Buffer &dst)
+    {
+        //     DST     SRC     OP             __VA_ARGS__
+        UNARY_(uchar , uchar , run_threshold, dst, src, thresh, maxval, type);
+        UNARY_(ushort, ushort, run_threshold, dst, src, thresh, maxval, type);
+        UNARY_( short,  short, run_threshold, dst, src, thresh, maxval, type);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+//------------------------
+//
+// Fluid kernels: in-range
+//
+//------------------------
+
+static void run_inrange3(uchar out[], const uchar in[], int width,
+                         const uchar lower[], const uchar upper[])
+{
+    int w = 0; // loop index
+
+#if CV_SIMD128
+    for (; w <= width-16; w+=16)
+    {
+        v_uint8x16 i0, i1, i2;
+        v_load_deinterleave(&in[3*w], i0, i1, i2);
+
+        v_uint8x16 o;
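+        // each comparison yields an all-ones or all-zeros lane mask, so
+        // AND-ing the six masks gives 255 exactly where all conditions hold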
+        o = (i0 >= v_setall_u8(lower[0])) & (i0 <= v_setall_u8(upper[0])) &
+            (i1 >= v_setall_u8(lower[1])) & (i1 <= v_setall_u8(upper[1])) &
+            (i2 >= v_setall_u8(lower[2])) & (i2 <= v_setall_u8(upper[2]));
+
+        v_store(&out[w], o);
+    }
+#endif
+
+    for (; w < width; w++)
+    {
+        out[w] = in[3*w  ] >= lower[0] && in[3*w  ] <= upper[0] &&
+                 in[3*w+1] >= lower[1] && in[3*w+1] <= upper[1] &&
+                 in[3*w+2] >= lower[2] && in[3*w+2] <= upper[2] ? 255: 0;
+    }
+}
+
+template<typename DST, typename SRC>
+static void run_inrange(Buffer &dst, const View &src, const cv::Scalar &upperb,
+                                                      const cv::Scalar &lowerb)
+{
+    static_assert(std::is_same<DST, uchar>::value, "wrong types");
+
+    const auto *in  = src.InLine<SRC>(0);
+          auto *out = dst.OutLine<DST>();
+
+    int width = src.length();
+    int chan  = src.meta().chan;
+    GAPI_Assert(dst.meta().chan == 1);
+
+    SRC lower[4], upper[4];
+    for (int c=0; c < chan; c++)
+    {
+        if (std::is_integral<SRC>::value)
+        {
+            // for integral input, in[i] >= lower is equivalent to in[i] >= ceil(lower),
+            // so the compare operations can be optimized by rounding lower/upper
+            lower[c] = saturate<SRC>(lowerb[c],  ceild);
+            upper[c] = saturate<SRC>(upperb[c], floord);
+        }
+        else
+        {
+            // FIXME: the values used in the comparison are floats here, while
+            // they initially have double precision. A float/float comparison
+            // may differ from the float/double one that would formally be correct
+            //
+            // Example: threshold = 1/3 (or 1/10)
+            lower[c] = static_cast<SRC>(lowerb[c]);
+            upper[c] = static_cast<SRC>(upperb[c]);
+        }
+    }
+
+    // manually vectorized code for the important case of RGB/BGR images
+    if (std::is_same<SRC,uchar>::value && chan==3)
+    {
+        run_inrange3((uchar*)out, (const uchar*)in, width,
+                     (const uchar*)lower, (const uchar*)upper);
+        return;
+    }
+
+    // TODO: consider manual vectorization for the multi-channel cases:
+    // modern compilers vectorize the single-channel code perfectly well,
+    // but may need help with de-interleaving the channels of RGB/BGR images etc.
+    switch (chan)
+    {
+    case 1:
+        for (int w=0; w < width; w++)
+            out[w] = in[w] >= lower[0] && in[w] <= upper[0]? 255: 0;
+        break;
+    case 2:
+        for (int w=0; w < width; w++)
+            out[w] = in[2*w  ] >= lower[0] && in[2*w  ] <= upper[0] &&
+                     in[2*w+1] >= lower[1] && in[2*w+1] <= upper[1] ? 255: 0;
+        break;
+    case 3:
+        for (int w=0; w < width; w++)
+            out[w] = in[3*w  ] >= lower[0] && in[3*w  ] <= upper[0] &&
+                     in[3*w+1] >= lower[1] && in[3*w+1] <= upper[1] &&
+                     in[3*w+2] >= lower[2] && in[3*w+2] <= upper[2] ? 255: 0;
+        break;
+    case 4:
+        for (int w=0; w < width; w++)
+            out[w] = in[4*w  ] >= lower[0] && in[4*w  ] <= upper[0] &&
+                     in[4*w+1] >= lower[1] && in[4*w+1] <= upper[1] &&
+                     in[4*w+2] >= lower[2] && in[4*w+2] <= upper[2] &&
+                     in[4*w+3] >= lower[3] && in[4*w+3] <= upper[3] ? 255: 0;
+        break;
+    default: CV_Error(cv::Error::StsBadArg, "unsupported number of channels");
+    }
+}
+
+GAPI_FLUID_KERNEL(GFluidInRange, cv::gapi::core::GInRange, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, const cv::Scalar &lowerb, const cv::Scalar& upperb,
+                        Buffer &dst)
+    {
+        //       DST     SRC    OP           __VA_ARGS__
+        INRANGE_(uchar, uchar , run_inrange, dst, src, upperb, lowerb);
+        INRANGE_(uchar, ushort, run_inrange, dst, src, upperb, lowerb);
+        INRANGE_(uchar,  short, run_inrange, dst, src, upperb, lowerb);
+        INRANGE_(uchar,  float, run_inrange, dst, src, upperb, lowerb);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+//----------------------
+//
+// Fluid kernels: select
+//
+//----------------------
+
+// manually vectorized function for the important case of RGB/BGR images
+static void run_select_row3(int width, uchar out[], uchar in1[], uchar in2[], uchar in3[])
+{
+    int w = 0; // loop index
+
+#if CV_SIMD128
+    for (; w <= width-16; w+=16)
+    {
+        v_uint8x16 a1, b1, c1;
+        v_uint8x16 a2, b2, c2;
+        v_uint8x16 mask;
+        v_uint8x16 a, b, c;
+
+        v_load_deinterleave(&in1[3*w], a1, b1, c1);
+        v_load_deinterleave(&in2[3*w], a2, b2, c2);
+
+        mask = v_load(&in3[w]);
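+        // turn non-zero mask bytes into all-ones lanes: v_select expects
+        // a per-lane boolean mask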
+        mask = mask != v_setzero_u8();
+
+        a = v_select(mask, a1, a2);
+        b = v_select(mask, b1, b2);
+        c = v_select(mask, c1, c2);
+
+        v_store_interleave(&out[3*w], a, b, c);
+    }
+#endif
+
+    for (; w < width; w++)
+    {
+        out[3*w    ] = in3[w]? in1[3*w    ]: in2[3*w    ];
+        out[3*w + 1] = in3[w]? in1[3*w + 1]: in2[3*w + 1];
+        out[3*w + 2] = in3[w]? in1[3*w + 2]: in2[3*w + 2];
+    }
+}
+
+// the chan parameter is a compile-time constant, normally chan=1..4
+template<int chan, typename DST, typename SRC1, typename SRC2, typename SRC3>
+static void run_select_row(int width, DST out[], SRC1 in1[], SRC2 in2[], SRC3 in3[])
+{
+    if (std::is_same<DST,uchar>::value && chan==3)
+    {
+        // manually vectorized function for the important case of RGB/BGR images
+        run_select_row3(width, (uchar*)out, (uchar*)in1, (uchar*)in2, (uchar*)in3);
+        return;
+    }
+
+    // because `chan` is a template parameter, its value is known at compile time,
+    // so modern compilers can efficiently vectorize this loop when chan==1
+    // (when chan>1, compilers may need help with de-interleaving the channels)
+    for (int w=0; w < width; w++)
+    {
+        for (int c=0; c < chan; c++)
+        {
+            out[w*chan + c] = in3[w]? in1[w*chan + c]: in2[w*chan + c];
+        }
+    }
+}
+
+template<typename DST, typename SRC1, typename SRC2, typename SRC3>
+static void run_select(Buffer &dst, const View &src1, const View &src2, const View &src3)
+{
+    static_assert(std::is_same<DST ,  SRC1>::value, "wrong types");
+    static_assert(std::is_same<DST ,  SRC2>::value, "wrong types");
+    static_assert(std::is_same<uchar, SRC3>::value, "wrong types");
+
+    auto *out = dst.OutLine<DST>();
+
+    const auto *in1 = src1.InLine<SRC1>(0);
+    const auto *in2 = src2.InLine<SRC2>(0);
+    const auto *in3 = src3.InLine<SRC3>(0);
+
+    int width = dst.length();
+    int chan  = dst.meta().chan;
+
+    switch (chan)
+    {
+    case 1: run_select_row<1>(width, out, in1, in2, in3); break;
+    case 2: run_select_row<2>(width, out, in1, in2, in3); break;
+    case 3: run_select_row<3>(width, out, in1, in2, in3); break;
+    case 4: run_select_row<4>(width, out, in1, in2, in3); break;
+    default: CV_Error(cv::Error::StsBadArg, "unsupported number of channels");
+    }
+}
+
+GAPI_FLUID_KERNEL(GFluidSelect, cv::gapi::core::GSelect, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, const View &src3, Buffer &dst)
+    {
+        //      DST     SRC1    SRC2    SRC3   OP          __VA_ARGS__
+        SELECT_(uchar , uchar , uchar , uchar, run_select, dst, src1, src2, src3);
+        SELECT_(ushort, ushort, ushort, uchar, run_select, dst, src1, src2, src3);
+        SELECT_( short,  short,  short, uchar, run_select, dst, src1, src2, src3);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+};
+
+//----------------------------------------------------
+//
+// Fluid kernels: split, merge, polar2cart, cart2polar
+//
+//----------------------------------------------------
+
+GAPI_FLUID_KERNEL(GFluidSplit3, cv::gapi::core::GSplit3, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3)
+    {
+        const auto *in   =  src.InLine<uchar>(0);
+              auto *out1 = dst1.OutLine<uchar>();
+              auto *out2 = dst2.OutLine<uchar>();
+              auto *out3 = dst3.OutLine<uchar>();
+
+        GAPI_Assert(3 == src.meta().chan);
+        int width = src.length();
+
+        int w = 0; // loop counter
+
+    #if CV_SIMD128
+        for (; w <= width-16; w+=16)
+        {
+            v_uint8x16 a, b, c;
+            v_load_deinterleave(&in[3*w], a, b, c);
+            v_store(&out1[w], a);
+            v_store(&out2[w], b);
+            v_store(&out3[w], c);
+        }
+    #endif
+
+        for (; w < width; w++)
+        {
+            out1[w] = in[3*w    ];
+            out2[w] = in[3*w + 1];
+            out3[w] = in[3*w + 2];
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidSplit4, cv::gapi::core::GSplit4, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, Buffer &dst1, Buffer &dst2, Buffer &dst3, Buffer &dst4)
+    {
+        const auto *in   =  src.InLine<uchar>(0);
+              auto *out1 = dst1.OutLine<uchar>();
+              auto *out2 = dst2.OutLine<uchar>();
+              auto *out3 = dst3.OutLine<uchar>();
+              auto *out4 = dst4.OutLine<uchar>();
+
+        GAPI_Assert(4 == src.meta().chan);
+        int width = src.length();
+
+        int w = 0; // loop counter
+
+    #if CV_SIMD128
+        for (; w <= width-16; w+=16)
+        {
+            v_uint8x16 a, b, c, d;
+            v_load_deinterleave(&in[4*w], a, b, c, d);
+            v_store(&out1[w], a);
+            v_store(&out2[w], b);
+            v_store(&out3[w], c);
+            v_store(&out4[w], d);
+        }
+    #endif
+
+        for (; w < width; w++)
+        {
+            out1[w] = in[4*w    ];
+            out2[w] = in[4*w + 1];
+            out3[w] = in[4*w + 2];
+            out4[w] = in[4*w + 3];
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidMerge3, cv::gapi::core::GMerge3, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, const View &src3, Buffer &dst)
+    {
+        const auto *in1 = src1.InLine<uchar>(0);
+        const auto *in2 = src2.InLine<uchar>(0);
+        const auto *in3 = src3.InLine<uchar>(0);
+              auto *out = dst.OutLine<uchar>();
+
+        GAPI_Assert(3 == dst.meta().chan);
+        int width = dst.length();
+
+        int w = 0; // loop counter
+
+    #if CV_SIMD128
+        for (; w <= width-16; w+=16)
+        {
+            v_uint8x16 a, b, c;
+            a = v_load(&in1[w]);
+            b = v_load(&in2[w]);
+            c = v_load(&in3[w]);
+            v_store_interleave(&out[3*w], a, b, c);
+        }
+    #endif
+
+        for (; w < width; w++)
+        {
+            out[3*w    ] = in1[w];
+            out[3*w + 1] = in2[w];
+            out[3*w + 2] = in3[w];
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidMerge4, cv::gapi::core::GMerge4, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, const View &src3, const View &src4,
+                    Buffer &dst)
+    {
+        const auto *in1 = src1.InLine<uchar>(0);
+        const auto *in2 = src2.InLine<uchar>(0);
+        const auto *in3 = src3.InLine<uchar>(0);
+        const auto *in4 = src4.InLine<uchar>(0);
+              auto *out = dst.OutLine<uchar>();
+
+        GAPI_Assert(4 == dst.meta().chan);
+        int width = dst.length();
+
+        int w = 0; // loop counter
+
+    #if CV_SIMD128
+        for (; w <= width-16; w+=16)
+        {
+            v_uint8x16 a, b, c, d;
+            a = v_load(&in1[w]);
+            b = v_load(&in2[w]);
+            c = v_load(&in3[w]);
+            d = v_load(&in4[w]);
+            v_store_interleave(&out[4*w], a, b, c, d);
+        }
+    #endif
+
+        for (; w < width; w++)
+        {
+            out[4*w    ] = in1[w];
+            out[4*w + 1] = in2[w];
+            out[4*w + 2] = in3[w];
+            out[4*w + 3] = in4[w];
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidPolarToCart, cv::gapi::core::GPolarToCart, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, bool angleInDegrees,
+                    Buffer &dst1, Buffer &dst2)
+    {
+        GAPI_Assert(src1.meta().depth == CV_32F);
+        GAPI_Assert(src2.meta().depth == CV_32F);
+        GAPI_Assert(dst1.meta().depth == CV_32F);
+        GAPI_Assert(dst2.meta().depth == CV_32F);
+
+        const auto * in1 = src1.InLine<float>(0);
+        const auto * in2 = src2.InLine<float>(0);
+              auto *out1 = dst1.OutLine<float>();
+              auto *out2 = dst2.OutLine<float>();
+
+        int width = src1.length();
+        int chan  = src2.meta().chan;
+        int length = width * chan;
+
+        // SIMD: relies on compiler auto-vectorization
+        for (int l=0; l < length; l++)
+        {
+            float angle = angleInDegrees?
+                          in2[l] * static_cast<float>(CV_PI / 180):
+                          in2[l];
+            float magnitude = in1[l];
+            float x = magnitude * std::cos(angle);
+            float y = magnitude * std::sin(angle);
+            out1[l] = x;
+            out2[l] = y;
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidCartToPolar, cv::gapi::core::GCartToPolar, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src1, const View &src2, bool angleInDegrees,
+                    Buffer &dst1, Buffer &dst2)
+    {
+        GAPI_Assert(src1.meta().depth == CV_32F);
+        GAPI_Assert(src2.meta().depth == CV_32F);
+        GAPI_Assert(dst1.meta().depth == CV_32F);
+        GAPI_Assert(dst2.meta().depth == CV_32F);
+
+        const auto * in1 = src1.InLine<float>(0);
+        const auto * in2 = src2.InLine<float>(0);
+              auto *out1 = dst1.OutLine<float>();
+              auto *out2 = dst2.OutLine<float>();
+
+        int width = src1.length();
+        int chan  = src2.meta().chan;
+        int length = width * chan;
+
+        // SIMD: relies on compiler auto-vectorization
+        for (int l=0; l < length; l++)
+        {
+            float x = in1[l];
+            float y = in2[l];
+            float magnitude = std::hypot(y, x);
+            float angle_rad = std::atan2(y, x);
+            float angle = angleInDegrees?
+                          angle_rad * static_cast<float>(180 / CV_PI):
+                          angle_rad;
+            out1[l] = magnitude;
+            out2[l] = angle;
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidPhase, cv::gapi::core::GPhase, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src_x,
+                    const View &src_y,
+                    bool angleInDegrees,
+                    Buffer &dst)
+    {
+        const auto w = dst.length() * dst.meta().chan;
+        if (src_x.meta().depth == CV_32F && src_y.meta().depth == CV_32F)
+        {
+            hal::fastAtan32f(src_y.InLine<float>(0),
+                             src_x.InLine<float>(0),
+                             dst.OutLine<float>(),
+                             w,
+                             angleInDegrees);
+        }
+        else if (src_x.meta().depth == CV_64F && src_y.meta().depth == CV_64F)
+        {
+            hal::fastAtan64f(src_y.InLine<double>(0),
+                             src_x.InLine<double>(0),
+                             dst.OutLine<double>(),
+                             w,
+                             angleInDegrees);
+        } else GAPI_Assert(false && "Phase supports 32F/64F input only!");
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidResize, cv::gapi::core::GResize, true)
+{
+    static const int Window = 1;
+    static const auto Kind = GFluidKernel::Kind::Resize;
+
+    constexpr static const int INTER_RESIZE_COEF_BITS = 11;
+    constexpr static const int INTER_RESIZE_COEF_SCALE = 1 << INTER_RESIZE_COEF_BITS;
+    constexpr static const short ONE = INTER_RESIZE_COEF_SCALE;
+
+    struct ResizeUnit
+    {
+        short alpha0;
+        short alpha1;
+        int   s0;
+        int   s1;
+    };
+
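+    // maps an output coordinate to the two source lines/columns it
+    // interpolates between; the +/-0.5 terms align pixel centers, as in
+    // cv::resize with INTER_LINEAR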
+    static ResizeUnit map(double ratio, int start, int max, int outCoord)
+    {
+        float f = static_cast<float>((outCoord + 0.5f) * ratio - 0.5f);
+        int s = cvFloor(f);
+        f -= s;
+
+        ResizeUnit ru;
+
+        ru.s0 = std::max(s - start, 0);
+        ru.s1 = ((f == 0.0) || s + 1 >= max) ? s - start : s - start + 1;
+
+        ru.alpha0 = saturate_cast<short>((1.0f - f) * INTER_RESIZE_COEF_SCALE);
+        ru.alpha1 = saturate_cast<short>((f) * INTER_RESIZE_COEF_SCALE);
+
+        return ru;
+    }
+
+    static void initScratch(const cv::GMatDesc& in,
+                            cv::Size outSz, double /*fx*/, double /*fy*/, int /*interp*/,
+                            cv::gapi::fluid::Buffer &scratch)
+    {
+        CV_Assert(in.depth == CV_8U && in.chan == 3);
+
+        cv::Size scratch_size{static_cast<int>(outSz.width * sizeof(ResizeUnit)), 1};
+
+        cv::GMatDesc desc;
+        desc.chan  = 1;
+        desc.depth = CV_8U;
+        desc.size  = to_own(scratch_size);
+
+        cv::gapi::fluid::Buffer buffer(desc);
+        scratch = std::move(buffer);
+
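+        // the horizontal mapping is identical for every output row, so it is
+        // computed once here and cached in the scratch buffer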
+        ResizeUnit* mapX = scratch.OutLine<ResizeUnit>();
+        double hRatio = (double)in.size.width / outSz.width;
+
+        for (int x = 0, w = outSz.width; x < w; x++)
+        {
+            mapX[x] = map(hRatio, 0, in.size.width, x);
+        }
+    }
+
+    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/)
+    {}
+
+    static void run(const cv::gapi::fluid::View& in, cv::Size /*sz*/, double /*fx*/, double /*fy*/, int /*interp*/,
+                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch)
+    {
+        double vRatio = (double)in.meta().size.height / out.meta().size.height;
+        auto mapY = map(vRatio, in.y(), in.meta().size.height, out.y());
+
+        auto beta0 = mapY.alpha0;
+        auto beta1 = mapY.alpha1;
+
+        const auto src0 = in.InLine <unsigned char>(mapY.s0);
+        const auto src1 = in.InLine <unsigned char>(mapY.s1);
+
+        auto dst = out.OutLine<unsigned char>();
+
+        ResizeUnit* mapX = scratch.OutLine<ResizeUnit>();
+
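+        // fixed-point arithmetic below: alpha/beta are weights scaled by
+        // INTER_RESIZE_COEF_SCALE = 2^11; ">> 4" trims the horizontal sums,
+        // ">> 16" trims the vertical products, and the final "+ 2 ... >> 2"
+        // rounds the result to an 8-bit pixel value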
+        for (int x = 0; x < out.length(); x++)
+        {
+            short alpha0 = mapX[x].alpha0;
+            short alpha1 = mapX[x].alpha1;
+            int sx0 = mapX[x].s0;
+            int sx1 = mapX[x].s1;
+
+            int res00 = src0[3*sx0    ]*alpha0 + src0[3*(sx1)    ]*alpha1;
+            int res10 = src1[3*sx0    ]*alpha0 + src1[3*(sx1)    ]*alpha1;
+
+            int res01 = src0[3*sx0 + 1]*alpha0 + src0[3*(sx1) + 1]*alpha1;
+            int res11 = src1[3*sx0 + 1]*alpha0 + src1[3*(sx1) + 1]*alpha1;
+
+            int res02 = src0[3*sx0 + 2]*alpha0 + src0[3*(sx1) + 2]*alpha1;
+            int res12 = src1[3*sx0 + 2]*alpha0 + src1[3*(sx1) + 2]*alpha1;
+
+            dst[3*x    ] = uchar(( ((beta0 * (res00 >> 4)) >> 16) + ((beta1 * (res10 >> 4)) >> 16) + 2)>>2);
+            dst[3*x + 1] = uchar(( ((beta0 * (res01 >> 4)) >> 16) + ((beta1 * (res11 >> 4)) >> 16) + 2)>>2);
+            dst[3*x + 2] = uchar(( ((beta0 * (res02 >> 4)) >> 16) + ((beta1 * (res12 >> 4)) >> 16) + 2)>>2);
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidSqrt, cv::gapi::core::GSqrt, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &in, Buffer &out)
+    {
+        const auto w = out.length() * out.meta().chan;
+        if (in.meta().depth == CV_32F)
+        {
+            hal::sqrt32f(in.InLine<float>(0),
+                         out.OutLine<float>(0),
+                         w);
+        }
+        else if (in.meta().depth == CV_64F)
+        {
+            hal::sqrt64f(in.InLine<double>(0),
+                         out.OutLine<double>(0),
+                         w);
+        } else GAPI_Assert(false && "Sqrt supports 32F/64F input only!");
+    }
+};
+
+} // namespace fluid
+} // namespace gapi
+} // namespace cv
+
+cv::gapi::GKernelPackage cv::gapi::core::fluid::kernels()
+{
+    using namespace cv::gapi::fluid;
+
+    return cv::gapi::kernels
+     <       GFluidAdd
+            ,GFluidSub
+            ,GFluidMul
+            ,GFluidDiv
+            ,GFluidAbsDiff
+            ,GFluidAnd
+            ,GFluidOr
+            ,GFluidXor
+            ,GFluidMin
+            ,GFluidMax
+            ,GFluidCmpGT
+            ,GFluidCmpGE
+            ,GFluidCmpLE
+            ,GFluidCmpLT
+            ,GFluidCmpEQ
+            ,GFluidCmpNE
+            ,GFluidAddW
+            ,GFluidNot
+            ,GFluidLUT
+            ,GFluidConvertTo
+            ,GFluidSplit3
+            ,GFluidSplit4
+            ,GFluidMerge3
+            ,GFluidMerge4
+            ,GFluidSelect
+            ,GFluidPolarToCart
+            ,GFluidCartToPolar
+            ,GFluidPhase
+            ,GFluidAddC
+            ,GFluidSubC
+            ,GFluidSubRC
+            ,GFluidMulC
+            ,GFluidMulCOld
+            ,GFluidDivC
+            ,GFluidDivRC
+            ,GFluidAbsDiffC
+            ,GFluidCmpGTScalar
+            ,GFluidCmpGEScalar
+            ,GFluidCmpLEScalar
+            ,GFluidCmpLTScalar
+            ,GFluidCmpEQScalar
+            ,GFluidCmpNEScalar
+            ,GFluidThreshold
+            ,GFluidInRange
+            ,GFluidResize
+            ,GFluidSqrt
+        #if 0
+            ,GFluidMean        -- not fluid
+            ,GFluidSum         -- not fluid
+            ,GFluidNormL1      -- not fluid
+            ,GFluidNormL2      -- not fluid
+            ,GFluidNormInf     -- not fluid
+            ,GFluidIntegral    -- not fluid
+            ,GFluidThresholdOT -- not fluid
+            ,GFluidResize      -- not fluid (?)
+            ,GFluidRemap       -- not fluid
+            ,GFluidFlip        -- not fluid
+            ,GFluidCrop        -- not fluid
+            ,GFluidConcatHor
+            ,GFluidConcatVert  -- not fluid
+        #endif
+        >();
+}
+
+#endif // !defined(GAPI_STANDALONE)
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc.cpp
new file mode 100644 (file)
index 0000000..e2e4c4f
--- /dev/null
@@ -0,0 +1,1338 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+#if !defined(GAPI_STANDALONE)
+
+#include "precomp.hpp"
+
+#include "opencv2/gapi/own/assert.hpp"
+#include "opencv2/core/traits.hpp"
+#include "opencv2/imgproc/types_c.h"
+
+#include "opencv2/gapi/core.hpp"
+#include "opencv2/gapi/imgproc.hpp"
+
+#include "opencv2/gapi/own/types.hpp"
+
+#include "opencv2/gapi/fluid/gfluidbuffer.hpp"
+#include "opencv2/gapi/fluid/gfluidkernel.hpp"
+#include "opencv2/gapi/fluid/imgproc.hpp"
+
+#include "gfluidbuffer_priv.hpp"
+#include "gfluidbackend.hpp"
+#include "gfluidutils.hpp"
+
+#include "gfluidimgproc_func.hpp"
+
+#include "opencv2/imgproc/hal/hal.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+
+#include <cmath>
+#include <cstdlib>
+
+namespace cv {
+namespace gapi {
+namespace fluid {
+
+//----------------------------------
+//
+// Fluid kernels: RGB2Gray, BGR2Gray
+//
+//----------------------------------
+
+// Y' = 0.299*R' + 0.587*G' + 0.114*B'
+// U' = (B' - Y')*0.492
+// V' = (R' - Y')*0.877
+static const float coef_rgb2yuv_bt601[5] = {0.299f, 0.587f, 0.114f, 0.492f, 0.877f};
+
+// R' = Y' + 1.140*V'
+// G' = Y' - 0.394*U' - 0.581*V'
+// B' = Y' + 2.032*U'
+static const float coef_yuv2rgb_bt601[4] = {1.140f, -0.394f, -0.581f, 2.032f};
+
+static void run_rgb2gray(Buffer &dst, const View &src, float coef_r, float coef_g, float coef_b)
+{
+    GAPI_Assert(src.meta().depth == CV_8U);
+    GAPI_Assert(dst.meta().depth == CV_8U);
+    GAPI_Assert(src.meta().chan == 3);
+    GAPI_Assert(dst.meta().chan == 1);
+    GAPI_Assert(src.length() == dst.length());
+
+    GAPI_Assert(coef_r < 1 && coef_g < 1 && coef_b < 1);
+    GAPI_Assert(std::abs(coef_r + coef_g + coef_b - 1) < 0.001);
+
+    const auto *in  = src.InLine<uchar>(0);
+          auto *out = dst.OutLine<uchar>();
+
+    int width = dst.length();
+
+    run_rgb2gray_impl(out, in, width, coef_r, coef_g, coef_b);
+}
+
+GAPI_FLUID_KERNEL(GFluidRGB2GrayCustom, cv::gapi::imgproc::GRGB2GrayCustom, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, float coef_r, float coef_g, float coef_b, Buffer &dst)
+    {
+        run_rgb2gray(dst, src, coef_r, coef_g, coef_b);
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidRGB2Gray, cv::gapi::imgproc::GRGB2Gray, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, Buffer &dst)
+    {
+        float coef_r = coef_rgb2yuv_bt601[0];
+        float coef_g = coef_rgb2yuv_bt601[1];
+        float coef_b = coef_rgb2yuv_bt601[2];
+        run_rgb2gray(dst, src, coef_r, coef_g, coef_b);
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidBGR2Gray, cv::gapi::imgproc::GBGR2Gray, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, Buffer &dst)
+    {
+        float coef_r = coef_rgb2yuv_bt601[0];
+        float coef_g = coef_rgb2yuv_bt601[1];
+        float coef_b = coef_rgb2yuv_bt601[2];
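+        // for BGR input the R and B coefficients are swapped, so that each
+        // weight is applied to the matching channel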
+        run_rgb2gray(dst, src, coef_b, coef_g, coef_r);
+    }
+};
+
+//--------------------------------------
+//
+// Fluid kernels: RGB-to-YUV, YUV-to-RGB
+//
+//--------------------------------------
+
+static void run_rgb2yuv(Buffer &dst, const View &src, const float coef[5])
+{
+    GAPI_Assert(src.meta().depth == CV_8U);
+    GAPI_Assert(dst.meta().depth == CV_8U);
+    GAPI_Assert(src.meta().chan == 3);
+    GAPI_Assert(dst.meta().chan == 3);
+    GAPI_Assert(src.length() == dst.length());
+
+    const auto *in  = src.InLine<uchar>(0);
+          auto *out = dst.OutLine<uchar>();
+
+    int width = dst.length();
+
+    run_rgb2yuv_impl(out, in, width, coef);
+}
+
+static void run_yuv2rgb(Buffer &dst, const View &src, const float coef[4])
+{
+    GAPI_Assert(src.meta().depth == CV_8U);
+    GAPI_Assert(dst.meta().depth == CV_8U);
+    GAPI_Assert(src.meta().chan == 3);
+    GAPI_Assert(dst.meta().chan == 3);
+    GAPI_Assert(src.length() == dst.length());
+
+    const auto *in  = src.InLine<uchar>(0);
+          auto *out = dst.OutLine<uchar>();
+
+    int width = dst.length();
+
+    run_yuv2rgb_impl(out, in, width, coef);
+}
+
+GAPI_FLUID_KERNEL(GFluidRGB2YUV, cv::gapi::imgproc::GRGB2YUV, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, Buffer &dst)
+    {
+        run_rgb2yuv(dst, src, coef_rgb2yuv_bt601);
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidYUV2RGB, cv::gapi::imgproc::GYUV2RGB, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, Buffer &dst)
+    {
+        run_yuv2rgb(dst, src, coef_yuv2rgb_bt601);
+    }
+};
+
+//--------------------------------------
+//
+// Fluid kernels: RGB-to-Lab, BGR-to-LUV
+//
+//--------------------------------------
+
+enum LabLUV { LL_Lab, LL_LUV };
+
+#define LabLuv_reference 0  // 1 = use the reference RGB/BGR to LUV/Lab code, 0 = use cv::hal
+
+#if LabLuv_reference
+
+// inverse gamma-correction (linearization) for sRGB; the inverse transform uses exponent 2.4
+static inline float f_gamma(float x)
+{
+    return x <= 0.04045f ? x*(1.f/12.92f) : std::pow((x + 0.055f)*(1/1.055f), 2.4f);
+}
+
+// saturate into interval [0, 1]
+static inline float clip01(float value)
+{
+    return value < 0? 0:
+           value > 1? 1:
+           value;
+}
+
+static inline void f_rgb2xyz(float  R, float  G, float  B,
+                             float& X, float& Y, float& Z)
+{
+    X = clip01(0.412453f*R + 0.357580f*G + 0.180423f*B);
+    Y = clip01(0.212671f*R + 0.715160f*G + 0.072169f*B);
+    Z = clip01(0.019334f*R + 0.119193f*G + 0.950227f*B);
+}
+
+static inline void f_xyz2lab(float  X, float  Y, float  Z,
+                             float& L, float& a, float& b)
+{
+    // CIE XYZ values of reference white point for D65 illuminant
+    static const float Xn = 0.950456f, Yn = 1.f, Zn = 1.088754f;
+
+    // Other coefficients below:
+    // 7.787f    = (29/3)^3/(29*4)
+    // 0.008856f = (6/29)^3
+    // 903.3     = (29/3)^3
+
+    float x = X/Xn, y = Y/Yn, z = Z/Zn;
+
+    auto f = [](float t){ return t>0.008856f? std::cbrt(t): (7.787f*t + 16.f/116.f); };
+
+    float fx = f(x), fy = f(y), fz = f(z);
+
+    L = y > 0.008856f ? (116.f*std::cbrt(y) - 16.f) : (903.3f * y);
+    a = 500.f * (fx - fy);
+    b = 200.f * (fy - fz);
+}
+
+static inline void f_xyz2luv(float  X, float  Y, float  Z,
+                             float& L, float& u, float& v)
+{
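+    // u'n, v'n: chromaticity coordinates of the D65 reference white point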
+    static const float un = 0.19793943f, vn = 0.46831096f;
+
+    float u1 = 4*X / (X + 15*Y + 3*Z);
+    float v1 = 9*Y / (X + 15*Y + 3*Z);
+
+    L = Y > 0.008856f ? (116.f*std::cbrt(Y) - 16.f) : (903.3f * Y);
+    u = 13*L * (u1 - un);
+    v = 13*L * (v1 - vn);
+}
+
+template<LabLUV labluv, int blue=0>
+static void run_rgb2labluv_reference(uchar out[], const uchar in[], int width)
+{
+    for (int w=0; w < width; w++)
+    {
+        float R, G, B;
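+        // note: `2^blue` is integer XOR, selecting the channel opposite to
+        // blue (index 2 when blue==0, index 0 when blue==2)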
+        B = in[3*w +    blue ] / 255.f;
+        G = in[3*w +    1    ] / 255.f;
+        R = in[3*w + (2^blue)] / 255.f;
+
+        B = f_gamma( B );
+        G = f_gamma( G );
+        R = f_gamma( R );
+
+        float X, Y, Z;
+        f_rgb2xyz(R, G, B, X, Y, Z);
+
+        // compile-time `if`
+        if (LL_Lab == labluv)
+        {
+            float L, a, b;
+            f_xyz2lab(X, Y, Z, L, a, b);
+
+            out[3*w    ] = saturate<uchar>(L * 255.f/100, roundf);
+            out[3*w + 1] = saturate<uchar>(a + 128, roundf);
+            out[3*w + 2] = saturate<uchar>(b + 128, roundf);
+        }
+        else if (LL_LUV == labluv)
+        {
+            float L, u, v;
+            f_xyz2luv(X, Y, Z, L, u, v);
+
+            out[3*w    ] = saturate<uchar>( L        * 255.f/100, roundf);
+            out[3*w + 1] = saturate<uchar>((u + 134) * 255.f/354, roundf);
+            out[3*w + 2] = saturate<uchar>((v + 140) * 255.f/262, roundf);
+        }
+        else
+            CV_Error(cv::Error::StsBadArg, "unsupported color conversion");
+    }
+}
+
+#endif  // LabLuv_reference
+
+// compile-time parameters: output format (Lab/LUV),
+// and position of blue channel in BGR/RGB (0 or 2)
+template<LabLUV labluv, int blue=0>
+static void run_rgb2labluv(Buffer &dst, const View &src)
+{
+    GAPI_Assert(src.meta().depth == CV_8U);
+    GAPI_Assert(dst.meta().depth == CV_8U);
+    GAPI_Assert(src.meta().chan == 3);
+    GAPI_Assert(dst.meta().chan == 3);
+    GAPI_Assert(src.length() == dst.length());
+
+    const auto *in  = src.InLine<uchar>(0);
+          auto *out = dst.OutLine<uchar>();
+
+    int width = dst.length();
+
+#if LabLuv_reference
+    run_rgb2labluv_reference<labluv, blue>(out, in, width);
+#else
+    uchar *dst_data = out;
+    const uchar *src_data = in;
+    size_t src_step = width;
+    size_t dst_step = width;
+    int height = 1;
+    int depth = CV_8U;
+    int scn = 3;
+    bool swapBlue = (blue == 2);
+    bool isLab = (LL_Lab == labluv);
+    bool srgb = true;
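+    // note: with height == 1 the step values only matter for advancing to a
+    // next row, so their exact value is effectively irrelevant here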
+    cv::hal::cvtBGRtoLab(src_data, src_step, dst_data, dst_step,
+               width, height, depth, scn, swapBlue, isLab, srgb);
+#endif
+}
+
+GAPI_FLUID_KERNEL(GFluidRGB2Lab, cv::gapi::imgproc::GRGB2Lab, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, Buffer &dst)
+    {
+        static const int blue = 2; // RGB: 0=red, 1=green, 2=blue
+        run_rgb2labluv<LL_Lab, blue>(dst, src);
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidBGR2LUV, cv::gapi::imgproc::GBGR2LUV, false)
+{
+    static const int Window = 1;
+
+    static void run(const View &src, Buffer &dst)
+    {
+        static const int blue = 0; // BGR: 0=blue, 1=green, 2=red
+        run_rgb2labluv<LL_LUV, blue>(dst, src);
+    }
+};
+
+//-------------------------------
+//
+// Fluid kernels: blur, boxFilter
+//
+//-------------------------------
+
+static const int maxKernelSize = 9;
+
+template<typename DST, typename SRC>
+static void run_boxfilter(Buffer &dst, const View &src, const cv::Size &kernelSize,
+                          const cv::Point& /* anchor */, bool normalize)
+{
+    GAPI_Assert(kernelSize.width <= maxKernelSize);
+    GAPI_Assert(kernelSize.width == kernelSize.height);
+
+    int kernel = kernelSize.width;
+    int border = (kernel - 1) / 2;
+
+    const SRC *in[ maxKernelSize ];
+          DST *out;
+
+    for (int i=0; i < kernel; i++)
+    {
+        in[i] = src.InLine<SRC>(i - border);
+    }
+
+    out = dst.OutLine<DST>();
+
+    int width = dst.length();
+    int chan  = dst.meta().chan;
+
+    GAPI_DbgAssert(chan <= 4);
+
+    for (int w=0; w < width; w++)
+    {
+        float sum[4] = {0, 0, 0, 0};
+
+        for (int i=0; i < kernel; i++)
+        {
+            for (int j=0; j < kernel; j++)
+            {
+                for (int c=0; c < chan; c++)
+                    sum[c] += in[i][(w + j - border)*chan + c];
+            }
+        }
+
+        for (int c=0; c < chan; c++)
+        {
+            float result = normalize? sum[c]/(kernel * kernel) : sum[c];
+
+            out[w*chan + c] = saturate<DST>(result, rintf);
+        }
+    }
+}
+
+GAPI_FLUID_KERNEL(GFluidBlur, cv::gapi::imgproc::GBlur, false)
+{
+    static const int Window = 3;
+
+    static void run(const View &src, const cv::Size& kernelSize, const cv::Point& anchor,
+                    int /* borderType */, const cv::Scalar& /* borderValue */, Buffer &dst)
+    {
+        // TODO: support sizes 3, 5, 7, 9, ...
+        GAPI_Assert(kernelSize.width  == 3 && kernelSize.height == 3);
+
+        // TODO: support non-trivial anchor
+        GAPI_Assert(anchor.x == -1 && anchor.y == -1);
+
+        static const bool normalize = true;
+
+        //     DST     SRC     OP             __VA_ARGS__
+        UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
+        UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
+        UNARY_( short,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+
+    static Border getBorder(const cv::GMatDesc& /* src */,
+                            const cv::Size    & /* kernelSize */,
+                            const cv::Point   & /* anchor */,
+                                      int          borderType,
+                            const cv::Scalar  &    borderValue)
+    {
+        return { borderType, borderValue};
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidBoxFilter, cv::gapi::imgproc::GBoxFilter, false)
+{
+    static const int Window = 3;
+
+    static void run(const     View  &    src,
+                              int     /* ddepth */,
+                    const cv::Size  &    kernelSize,
+                    const cv::Point &   anchor,
+                              bool       normalize,
+                              int     /* borderType */,
+                    const cv::Scalar& /* borderValue */,
+                              Buffer&    dst)
+    {
+        // TODO: support sizes 3, 5, 7, 9, ...
+        GAPI_Assert(kernelSize.width  == 3 && kernelSize.height == 3);
+
+        // TODO: support non-trivial anchor
+        GAPI_Assert(anchor.x == -1 && anchor.y == -1);
+
+        //     DST     SRC     OP             __VA_ARGS__
+        UNARY_(uchar , uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
+        UNARY_(ushort, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
+        UNARY_( short,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
+        UNARY_( float, uchar , run_boxfilter, dst, src, kernelSize, anchor, normalize);
+        UNARY_( float, ushort, run_boxfilter, dst, src, kernelSize, anchor, normalize);
+        UNARY_( float,  short, run_boxfilter, dst, src, kernelSize, anchor, normalize);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+
+    static Border getBorder(const cv::GMatDesc& /* src */,
+                                      int       /* ddepth */,
+                            const cv::Size    & /* kernelSize */,
+                            const cv::Point   & /* anchor */,
+                                      bool      /* normalize */,
+                                      int          borderType,
+                            const cv::Scalar  &    borderValue)
+    {
+        return { borderType, borderValue};
+    }
+};
+
+//-------------------------
+//
+// Fluid kernels: sepFilter
+//
+//-------------------------
+
+template<typename T>
+static void getKernel(T k[], const cv::Mat& kernel)
+{
+    GAPI_Assert(kernel.channels() == 1);
+
+    int depth = CV_MAT_DEPTH(kernel.type());
+    int cols = kernel.cols;
+    int rows = kernel.rows;
+
+    switch ( depth )
+    {
+    case CV_8U:
+        for (int h=0; h < rows; h++)
+        for (int w=0; w < cols; w++)
+            k[h*cols + w] = static_cast<T>( kernel.at<uchar>(h, w) );
+        break;
+    case CV_16U:
+        for (int h=0; h < rows; h++)
+        for (int w=0; w < cols; w++)
+            k[h*cols + w] = static_cast<T>( kernel.at<ushort>(h, w) );
+        break;
+    case CV_16S:
+        for (int h=0; h < rows; h++)
+        for (int w=0; w < cols; w++)
+            k[h*cols + w] = static_cast<T>( kernel.at<short>(h, w) );
+        break;
+    case CV_32F:
+        for (int h=0; h < rows; h++)
+        for (int w=0; w < cols; w++)
+            k[h*cols + w] = static_cast<T>( kernel.at<float>(h, w) );
+        break;
+    default: CV_Error(cv::Error::StsBadArg, "unsupported kernel type");
+    }
+}
+
+template<typename DST, typename SRC>
+static void run_sepfilter(Buffer& dst, const View& src,
+                          const float kx[], int kxLen,
+                          const float ky[], int kyLen,
+                          const cv::Point& /* anchor */,
+                          float delta=0)
+{
+    static const int maxLines = 9;
+    GAPI_Assert(kyLen <= maxLines);
+
+    const SRC *in[ maxLines ];
+          DST *out;
+
+    int border = (kyLen - 1) / 2;
+    for (int i=0; i < kyLen; i++)
+    {
+        in[i] = src.InLine<SRC>(i - border);
+    }
+
+    out = dst.OutLine<DST>();
+
+    int width = dst.length();
+    int chan  = dst.meta().chan;
+
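+    // separable filtering: for each output pixel, convolve each of the kyLen
+    // input rows horizontally with kx, then combine the row sums with ky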
+    for (int w=0; w < width; w++)
+    {
+        // TODO: make this loop innermost
+        for (int c=0; c < chan; c++)
+        {
+            float sum=0;
+
+            for (int i=0; i < kyLen; i++)
+            {
+                float sumi=0;
+
+                for (int j=0; j < kxLen; j++)
+                {
+                    sumi += in[i][(w + j - border)*chan + c] * kx[j];
+                }
+
+                sum += sumi * ky[i];
+            }
+
+            float result = sum + delta;
+
+            out[w*chan + c] = saturate<DST>(result, rintf);
+        }
+    }
+}
+
+GAPI_FLUID_KERNEL(GFluidSepFilter, cv::gapi::imgproc::GSepFilter, true)
+{
+    static const int Window = 3;
+
+    static void run(const     View&      src,
+                              int     /* ddepth */,
+                    const cv::Mat&       kernX,
+                    const cv::Mat&       kernY,
+                    const cv::Point&     anchor,
+                    const cv::Scalar&    delta_,
+                              int     /* borderType */,
+                    const cv::Scalar& /* borderValue */,
+                              Buffer&    dst,
+                              Buffer&    scratch)
+    {
+        // TODO: support non-trivial anchors
+        GAPI_Assert(anchor.x == -1 && anchor.y == -1);
+
+        // TODO: support kernel heights 3, 5, 7, 9, ...
+        GAPI_Assert((kernY.rows == 1 || kernY.cols == 1)  && (kernY.cols * kernY.rows == 3));
+        GAPI_Assert((kernX.rows == 1 || kernX.cols == 1));
+
+        int kxLen = kernX.rows * kernX.cols;
+        int kyLen = kernY.rows * kernY.cols;
+
+        float *kx = scratch.OutLine<float>();
+        float *ky = kx + kxLen;
+
+        float delta = static_cast<float>(delta_[0]);
+
+        //     DST     SRC     OP             __VA_ARGS__
+        UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
+        UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
+        UNARY_( short,  short, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
+        UNARY_( float,  float, run_sepfilter, dst, src, kx, kxLen, ky, kyLen, anchor, delta);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+
+    static void initScratch(const GMatDesc& /* in */,
+                                  int       /* ddepth */,
+                            const Mat     &    kernX,
+                            const Mat     &    kernY,
+                            const Point   & /* anchor */,
+                            const Scalar  & /* delta */,
+                                  int       /* borderType */,
+                            const Scalar  & /* borderValue */,
+                                  Buffer  &    scratch)
+    {
+        int kxLen = kernX.rows * kernX.cols;
+        int kyLen = kernY.rows * kernY.cols;
+
+        cv::gapi::own::Size bufsize(kxLen + kyLen, 1);
+        GMatDesc bufdesc = {CV_32F, 1, bufsize};
+        Buffer buffer(bufdesc);
+        scratch = std::move(buffer);
+
+        // FIXME: move to resetScratch stage ?
+        float *kx = scratch.OutLine<float>();
+        float *ky = kx + kxLen;
+        getKernel(kx, kernX);
+        getKernel(ky, kernY);
+    }
+
+    static void resetScratch(Buffer& /* scratch */)
+    {
+    }
+
+    static Border getBorder(const cv::GMatDesc& /* src */,
+                                      int       /* ddepth */,
+                            const cv::Mat&      /* kernX */,
+                            const cv::Mat&      /* kernY */,
+                            const cv::Point&    /* anchor */,
+                            const cv::Scalar&   /* delta */,
+                                      int          borderType,
+                            const cv::Scalar&      borderValue)
+    {
+        return { borderType, borderValue};
+    }
+};
+
+//----------------------------
+//
+// Fluid kernels: gaussianBlur
+//
+//----------------------------
+
+GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
+{
+    // TODO: support kernel height 3, 5, 7, 9, ...
+    static const int Window = 3;
+
+    static void run(const     View  &    src,
+                    const cv::Size  &    ksize,
+                              double  /* sigmaX */,
+                              double  /* sigmaY */,
+                              int     /* borderType */,
+                    const cv::Scalar& /* borderValue */,
+                              Buffer&    dst,
+                              Buffer&    scratch)
+    {
+        GAPI_Assert(ksize.height == 3);
+
+        int kxsize = ksize.width;
+        int kysize = ksize.height;
+
+        auto *kx = scratch.OutLine<float>(); // cached kernX data
+        auto *ky = kx + kxsize;              // cached kernY data
+
+        auto  anchor = cv::Point(-1, -1);
+        float delta = 0.f;
+
+        //     DST     SRC     OP             __VA_ARGS__
+        UNARY_(uchar , uchar , run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
+        UNARY_(ushort, ushort, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
+        UNARY_( short,  short, run_sepfilter, dst, src, kx, kxsize, ky, kysize, anchor, delta);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+
+    static void initScratch(const GMatDesc& /* in */,
+                            const cv::Size &   ksize,
+                                  double       sigmaX,
+                                  double       sigmaY,
+                                  int       /* borderType */,
+                            const cv::Scalar  & /* borderValue */,
+                                  Buffer  &    scratch)
+    {
+        int kxsize = ksize.width;
+        int kysize = ksize.height;
+
+        cv::gapi::own::Size bufsize(kxsize + kysize, 1);
+        GMatDesc bufdesc = {CV_32F, 1, bufsize};
+        Buffer buffer(bufdesc);
+        scratch = std::move(buffer);
+
+        // FIXME: fill buffer at resetScratch stage?
+
+        if (sigmaX == 0)
+            sigmaX = 0.3 * ((kxsize - 1)/2. - 1) + 0.8;
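+            // (the default-sigma formula documented for cv::getGaussianKernel)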
+
+        if (sigmaY == 0)
+            sigmaY = sigmaX;
+
+        Mat kernX = getGaussianKernel(kxsize, sigmaX, CV_32F);
+
+        Mat kernY = kernX;
+        if (sigmaY != sigmaX)
+            kernY = getGaussianKernel(kysize, sigmaY, CV_32F);
+
+        auto *kx = scratch.OutLine<float>();
+        auto *ky = kx + kxsize;
+
+        getKernel(kx, kernX);
+        getKernel(ky, kernY);
+    }
+
+    static void resetScratch(Buffer& /* scratch */)
+    {
+    }
+
+    static Border getBorder(const cv::GMatDesc& /* src */,
+                            const cv::Size    & /* ksize */,
+                                      double    /* sigmaX */,
+                                      double    /* sigmaY */,
+                                      int          borderType,
+                            const cv::Scalar  &    borderValue)
+    {
+        return { borderType, borderValue};
+    }
+};
+
+//---------------------
+//
+// Fluid kernels: Sobel
+//
+//---------------------
+
+template<typename DST, typename SRC>
+static void run_sobel(Buffer& dst,
+                const View  & src,
+                const float   kx[],
+                const float   ky[],
+                      int     ksize,
+                      float   scale,  // default: 1
+                      float   delta,  // default: 0
+                      float  *buf[])
+{
+    static const int kmax = 11;
+    GAPI_Assert(ksize <= kmax);
+
+    const SRC *in[ kmax ];
+          DST *out;
+
+    int border = (ksize - 1) / 2;
+    for (int i=0; i < ksize; i++)
+    {
+        in[i] = src.InLine<SRC>(i - border);
+    }
+
+    out = dst.OutLine<DST>();
+
+    int width = dst.length();
+    int chan  = dst.meta().chan;
+
+    GAPI_DbgAssert(ksize == 3);
+//  float buf[3][width * chan];
+
+    int y  = dst.y();
+    int y0 = dst.priv().writeStart();
+//  int y1 = dst.priv().writeEnd();
+
+    run_sobel_row(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0);
+}
+
+GAPI_FLUID_KERNEL(GFluidSobel, cv::gapi::imgproc::GSobel, true)
+{
+    static const int Window = 3;
+
+    static void run(const     View  &    src,
+                              int     /* ddepth */,
+                              int     /* dx */,
+                              int     /* dy */,
+                              int        ksize,
+                              double    _scale,
+                              double    _delta,
+                              int     /* borderType */,
+                    const cv::Scalar& /* borderValue */,
+                              Buffer&    dst,
+                              Buffer&    scratch)
+    {
+        // TODO: support kernel height 3, 5, 7, 9, ...
+        GAPI_Assert(ksize == 3 || ksize == FILTER_SCHARR);
+
+        int ksz = (ksize == FILTER_SCHARR)? 3: ksize;
+
+        auto *kx = scratch.OutLine<float>();
+        auto *ky = kx + ksz;
+
+        int width = dst.meta().size.width;
+        int chan  = dst.meta().chan;
+
+        float *buf[3];
+        buf[0] = ky + ksz;
+        buf[1] = buf[0] + width*chan;
+        buf[2] = buf[1] + width*chan;
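+        // buf[] gives run_sobel_row three width*chan working rows; the idea
+        // is to cache per-row results and reuse them as the window slides down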
+
+        auto scale = static_cast<float>(_scale);
+        auto delta = static_cast<float>(_delta);
+
+        //     DST     SRC     OP         __VA_ARGS__
+        UNARY_(uchar , uchar , run_sobel, dst, src, kx, ky, ksz, scale, delta, buf);
+        UNARY_(ushort, ushort, run_sobel, dst, src, kx, ky, ksz, scale, delta, buf);
+        UNARY_( short, uchar , run_sobel, dst, src, kx, ky, ksz, scale, delta, buf);
+        UNARY_( short, ushort, run_sobel, dst, src, kx, ky, ksz, scale, delta, buf);
+        UNARY_( short,  short, run_sobel, dst, src, kx, ky, ksz, scale, delta, buf);
+        UNARY_( float, uchar , run_sobel, dst, src, kx, ky, ksz, scale, delta, buf);
+        UNARY_( float, ushort, run_sobel, dst, src, kx, ky, ksz, scale, delta, buf);
+        UNARY_( float,  short, run_sobel, dst, src, kx, ky, ksz, scale, delta, buf);
+        UNARY_( float,  float, run_sobel, dst, src, kx, ky, ksz, scale, delta, buf);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+
+    static void initScratch(const GMatDesc&    in,
+                                  int       /* ddepth */,
+                                  int          dx,
+                                  int          dy,
+                                  int          ksize,
+                                  double    /* scale */,
+                                  double    /* delta */,
+                                  int       /* borderType */,
+                            const Scalar  & /* borderValue */,
+                                  Buffer  &    scratch)
+    {
+        // TODO: support kernel height 3, 5, 7, 9, ...
+        GAPI_Assert(ksize == 3 || ksize == FILTER_SCHARR);
+        int ksz = (ksize == FILTER_SCHARR) ? 3 : ksize;
+
+        int width = in.size.width;
+        int chan  = in.chan;
+
+        int buflen = ksz + ksz            // kernels: kx, ky
+                   + ksz * width * chan;  // working buffers
+
+        cv::gapi::own::Size bufsize(buflen, 1);
+        GMatDesc bufdesc = {CV_32F, 1, bufsize};
+        Buffer buffer(bufdesc);
+        scratch = std::move(buffer);
+
+        auto *kx = scratch.OutLine<float>();
+        auto *ky = kx + ksz;
+
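+        // the wrapping Mats use ksz (3 when ksize == FILTER_SCHARR), while
+        // getDerivKernels still receives the original ksize so that
+        // FILTER_SCHARR selects the Scharr kernels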
+        Mat kxmat(1, ksz, CV_32FC1, kx);
+        Mat kymat(ksz, 1, CV_32FC1, ky);
+        getDerivKernels(kxmat, kymat, dx, dy, ksize);
+    }
+
+    static void resetScratch(Buffer& /* scratch */)
+    {
+    }
+
+    static Border getBorder(const cv::GMatDesc& /* src */,
+                                      int       /* ddepth */,
+                                      int       /* dx */,
+                                      int       /* dy */,
+                                      int       /* ksize */,
+                                      double    /* scale */,
+                                      double    /* delta */,
+                                      int          borderType,
+                            const cv::Scalar  &    borderValue)
+    {
+        return {borderType, borderValue};
+    }
+};
+
+//------------------------
+//
+// Fluid kernels: filter2D
+//
+//------------------------
+
+template<typename DST, typename SRC>
+static void run_filter2d(Buffer& dst, const View& src,
+                         const float k[], int k_rows, int k_cols,
+                         const cv::Point& /* anchor */,
+                         float delta=0)
+{
+    static const int maxLines = 9;
+    GAPI_Assert(k_rows <= maxLines);
+
+    const SRC *in[ maxLines ];
+          DST *out;
+
+    int border_x = (k_cols - 1) / 2;
+    int border_y = (k_rows - 1) / 2;
+
+    for (int i=0; i < k_rows; i++)
+    {
+        in[i] = src.InLine<SRC>(i - border_y);
+    }
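+    // NB: InLine<T>(i) returns a pointer to the i-th input row relative to
+    // the current one (negative = rows above); the Fluid framework keeps
+    // Window rows of context available in the View (Window = 3 below)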
+
+    out = dst.OutLine<DST>();
+
+    int width = dst.length();
+    int chan  = dst.meta().chan;
+
+    for (int w=0; w < width; w++)
+    {
+        // TODO: make this cycle innermost
+        for (int c=0; c < chan; c++)
+        {
+            float sum = 0;
+
+            for (int i=0; i < k_rows; i++)
+            for (int j=0; j < k_cols; j++)
+            {
+                sum += in[i][(w + j - border_x)*chan + c] * k[k_cols*i + j];
+            }
+
+            float result = sum + delta;
+
+            out[w*chan + c] = saturate<DST>(result, rintf);
+        }
+    }
+}
+
+GAPI_FLUID_KERNEL(GFluidFilter2D, cv::gapi::imgproc::GFilter2D, true)
+{
+    static const int Window = 3;
+
+    static void run(const     View  &    src,
+                              int     /* ddepth */,
+                    const cv::Mat   &    kernel,
+                    const cv::Point &    anchor,
+                    const cv::Scalar&    delta_,
+                              int     /* borderType */,
+                    const cv::Scalar& /* borderValue */,
+                              Buffer&    dst,
+                              Buffer&    scratch)
+    {
+        // TODO: support non-trivial anchors
+        GAPI_Assert(anchor.x == -1 && anchor.y == -1);
+
+        // TODO: support kernel heights 3, 5, 7, 9, ...
+        GAPI_Assert(kernel.rows == 3 && kernel.cols == 3);
+
+        float delta = static_cast<float>(delta_[0]);
+
+        int k_rows = kernel.rows;
+        int k_cols = kernel.cols;
+        const float *k = scratch.OutLine<float>(); // copy of kernel.data
+
+        //     DST     SRC     OP            __VA_ARGS__
+        UNARY_(uchar , uchar , run_filter2d, dst, src, k, k_rows, k_cols, anchor, delta);
+        UNARY_(ushort, ushort, run_filter2d, dst, src, k, k_rows, k_cols, anchor, delta);
+        UNARY_( short,  short, run_filter2d, dst, src, k, k_rows, k_cols, anchor, delta);
+        UNARY_( float, uchar , run_filter2d, dst, src, k, k_rows, k_cols, anchor, delta);
+        UNARY_( float, ushort, run_filter2d, dst, src, k, k_rows, k_cols, anchor, delta);
+        UNARY_( float,  short, run_filter2d, dst, src, k, k_rows, k_cols, anchor, delta);
+        UNARY_( float,  float, run_filter2d, dst, src, k, k_rows, k_cols, anchor, delta);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+
+    static void initScratch(const cv::GMatDesc& /* in */,
+                                      int       /* ddepth */,
+                            const cv::Mat     &    kernel,
+                            const cv::Point   & /* anchor */,
+                            const cv::Scalar  & /* delta */,
+                                      int       /* borderType */,
+                            const cv::Scalar  & /* borderValue */,
+                                      Buffer  &    scratch)
+    {
+        cv::gapi::own::Size bufsize(kernel.rows * kernel.cols, 1);
+        GMatDesc bufdesc = {CV_32F, 1, bufsize};
+        Buffer buffer(bufdesc);
+        scratch = std::move(buffer);
+
+        // FIXME: move to resetScratch stage ?
+        float *data = scratch.OutLine<float>();
+        getKernel(data, kernel);
+    }
+
+    static void resetScratch(Buffer& /* scratch */)
+    {
+    }
+
+    static Border getBorder(const cv::GMatDesc& /* src */,
+                                      int       /* ddepth */,
+                            const cv::Mat&      /* kernel */,
+                            const cv::Point&    /* anchor */,
+                            const cv::Scalar&   /* delta */,
+                                      int          borderType,
+                            const cv::Scalar&      borderValue)
+    {
+        return { borderType, borderValue};
+    }
+};
+
+//-----------------------------
+//
+// Fluid kernels: erode, dilate
+//
+//-----------------------------
+
+enum Morphology { M_ERODE, M_DILATE };
+
+template<typename DST, typename SRC>
+static void run_morphology(          Buffer&    dst,
+                           const     View  &    src,
+                           const     uchar      k[],
+                                     int        k_rows,
+                                     int        k_cols,
+                           const cv::Point & /* anchor */,
+                                     Morphology morphology)
+{
+    static const int maxLines = 9;
+    GAPI_Assert(k_rows <= maxLines);
+
+    const SRC *in[ maxLines ];
+          DST *out;
+
+    int border_x = (k_cols - 1) / 2;
+    int border_y = (k_rows - 1) / 2;
+
+    for (int i=0; i < k_rows; i++)
+    {
+        in[i] = src.InLine<SRC>(i - border_y);
+    }
+
+    out = dst.OutLine<DST>();
+
+    int width = dst.length();
+    int chan  = dst.meta().chan;
+
+    for (int w=0; w < width; w++)
+    {
+        // TODO: make this cycle innermost
+        for (int c=0; c < chan; c++)
+        {
+            SRC result=0;
+            if (M_ERODE == morphology)
+            {
+                result = std::numeric_limits<SRC>::max();
+            }
+            else if (M_DILATE == morphology)
+            {
+                result = std::numeric_limits<SRC>::min();
+            }
+            else
+                CV_Error(cv::Error::StsBadArg, "unsupported morphology operation");
+
+            for (int i=0; i < k_rows; i++)
+            for (int j=0; j < k_cols; j++)
+            {
+                if ( k[k_cols*i + j] )
+                {
+                    if (M_ERODE == morphology)
+                    {
+                        result = std::min(result, in[i][(w + j - border_x)*chan + c]);
+                    }
+                    else if (M_DILATE == morphology)
+                    {
+                        result = std::max(result, in[i][(w + j - border_x)*chan + c]);
+                    }
+                    else
+                        CV_Error(cv::Error::StsBadArg, "unsupported morphology operation");
+                }
+            }
+
+            out[w*chan + c] = saturate<DST>(result, rintf);
+        }
+    }
+}
+
+GAPI_FLUID_KERNEL(GFluidErode, cv::gapi::imgproc::GErode, true)
+{
+    static const int Window = 3;
+
+    static void run(const     View  &    src,
+                    const cv::Mat   &    kernel,
+                    const cv::Point &    anchor,
+                              int        iterations,
+                              int     /* borderType */,
+                    const cv::Scalar& /* borderValue */,
+                              Buffer&    dst,
+                              Buffer&    scratch)
+    {
+        // TODO: support non-trivial anchors
+        GAPI_Assert(anchor.x == -1 && anchor.y == -1);
+
+        // TODO: support kernel heights 3, 5, 7, 9, ...
+        GAPI_Assert(kernel.rows == 3 && kernel.cols == 3);
+
+        // TODO: support iterations > 1
+        GAPI_Assert(iterations == 1);
+
+        int k_rows = kernel.rows;
+        int k_cols = kernel.cols;
+
+        auto *k = scratch.OutLine<uchar>(); // copy of kernel.data
+
+        //     DST     SRC     OP              __VA_ARGS__
+        UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
+        UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
+        UNARY_( short,  short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_ERODE);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+
+    static void initScratch(const GMatDesc  & /* in */,
+                            const Mat       &    kernel,
+                            const Point     & /* anchor */,
+                                  int         /* iterations */,
+                                  int         /* borderType */,
+                            const cv::Scalar& /* borderValue */,
+                                  Buffer    &    scratch)
+    {
+        int k_rows = kernel.rows;
+        int k_cols = kernel.cols;
+
+        cv::gapi::own::Size bufsize(k_rows * k_cols, 1);
+        GMatDesc bufdesc = {CV_8U, 1, bufsize};
+        Buffer buffer(bufdesc);
+        scratch = std::move(buffer);
+
+        // FIXME: move to resetScratch stage ?
+        auto *k = scratch.OutLine<uchar>();
+        getKernel(k, kernel);
+    }
+
+    static void resetScratch(Buffer& /* scratch */)
+    {
+    }
+
+    static Border getBorder(const cv::GMatDesc& /* src */,
+                            const cv::Mat   &   /* kernel */,
+                            const cv::Point &   /* anchor */,
+                                      int       /* iterations */,
+                                      int          borderType,
+                            const cv::Scalar&      borderValue)
+    {
+    #if 1
+        // TODO: saturate borderValue to image type in general case (not only maximal border)
+        GAPI_Assert(borderType == cv::BORDER_CONSTANT && borderValue[0] == DBL_MAX);
+        return { borderType, cv::gapi::own::Scalar::all(INT_MAX) };
+    #else
+        return { borderType, borderValue };
+    #endif
+    }
+};
+
+GAPI_FLUID_KERNEL(GFluidDilate, cv::gapi::imgproc::GDilate, true)
+{
+    static const int Window = 3;
+
+    static void run(const     View  &    src,
+                    const cv::Mat   &    kernel,
+                    const cv::Point &    anchor,
+                              int        iterations,
+                              int     /* borderType */,
+                    const cv::Scalar& /* borderValue */,
+                              Buffer&    dst,
+                              Buffer&    scratch)
+    {
+        // TODO: support non-trivial anchors
+        GAPI_Assert(anchor.x == -1 && anchor.y == -1);
+
+        // TODO: support kernel heights 3, 5, 7, 9, ...
+        GAPI_Assert(kernel.rows == 3 && kernel.cols == 3);
+
+        // TODO: support iterations > 1
+        GAPI_Assert(iterations == 1);
+
+        int k_rows = kernel.rows;
+        int k_cols = kernel.cols;
+
+        auto *k = scratch.OutLine<uchar>(); // copy of kernel.data
+
+        //     DST     SRC     OP              __VA_ARGS__
+        UNARY_(uchar , uchar , run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
+        UNARY_(ushort, ushort, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
+        UNARY_( short,  short, run_morphology, dst, src, k, k_rows, k_cols, anchor, M_DILATE);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+
+    static void initScratch(const GMatDesc  & /* in */,
+                            const Mat       &    kernel,
+                            const Point     & /* anchor */,
+                                  int         /* iterations */,
+                                  int         /* borderType */,
+                            const cv::Scalar& /* borderValue */,
+                                  Buffer    &    scratch)
+    {
+        int k_rows = kernel.rows;
+        int k_cols = kernel.cols;
+
+        cv::gapi::own::Size bufsize(k_rows * k_cols, 1);
+        GMatDesc bufdesc = {CV_8U, 1, bufsize};
+        Buffer buffer(bufdesc);
+        scratch = std::move(buffer);
+
+        // FIXME: move to resetScratch stage ?
+        auto *k = scratch.OutLine<uchar>();
+        getKernel(k, kernel);
+    }
+
+    static void resetScratch(Buffer& /* scratch */)
+    {
+    }
+
+    static Border getBorder(const cv::GMatDesc& /* src */,
+                            const cv::Mat   &   /* kernel */,
+                            const cv::Point &   /* anchor */,
+                                      int       /* iterations */,
+                                      int       borderType,
+                            const cv::Scalar&   borderValue)
+    {
+    #if 1
+        // TODO: fix borderValue for Dilate in general case (not only minimal border)
+        GAPI_Assert(borderType == cv::BORDER_CONSTANT && borderValue[0] == DBL_MAX);
+        return { borderType, cv::gapi::own::Scalar::all(INT_MIN) };
+    #else
+        return { borderType, borderValue };
+    #endif
+    }
+};
+
+//--------------------------
+//
+// Fluid kernels: medianBlur
+//
+//--------------------------
+
+template<typename DST, typename SRC>
+static void run_medianblur(      Buffer& dst,
+                           const View  & src,
+                                 int     ksize)
+{
+    static const int kmax = 9;
+    GAPI_Assert(ksize <= kmax);
+
+    const SRC *in[ kmax ];
+          DST *out;
+
+    int border = (ksize - 1) / 2;
+
+    for (int i=0; i < ksize; i++)
+    {
+        in[i] = src.InLine<SRC>(i - border);
+    }
+
+    out = dst.OutLine<DST>(0);
+
+    int width = dst.length();
+    int chan  = dst.meta().chan;
+
+    for (int w=0; w < width; w++)
+    {
+        // TODO: make this cycle innermost
+        for (int c=0; c < chan; c++)
+        {
+            SRC neighbours[kmax * kmax];
+
+            for (int i=0; i < ksize; i++)
+            for (int j=0; j < ksize; j++)
+            {
+                neighbours[i*ksize + j] = in[i][(w + j - border)*chan + c];
+            }
+
+            int length = ksize * ksize;
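+            // nth_element partially sorts so the median ends up at index
+            // length/2, in average O(ksize^2) time per pixel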
+            std::nth_element(neighbours, neighbours + length/2, neighbours + length);
+
+            out[w*chan + c] = saturate<DST>(neighbours[length/2], rintf);
+        }
+    }
+}
+
+GAPI_FLUID_KERNEL(GFluidMedianBlur, cv::gapi::imgproc::GMedianBlur, false)
+{
+    static const int Window = 3;
+
+    static void run(const View  & src,
+                          int     ksize,
+                          Buffer& dst)
+    {
+        // TODO: support kernel sizes: 3, 5, 7, 9, ...
+        GAPI_Assert(ksize == 3);
+
+        //     DST     SRC     OP              __VA_ARGS__
+        UNARY_(uchar , uchar , run_medianblur, dst, src, ksize);
+        UNARY_(ushort, ushort, run_medianblur, dst, src, ksize);
+        UNARY_( short,  short, run_medianblur, dst, src, ksize);
+
+        CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
+    }
+
+    static Border getBorder(const cv::GMatDesc& /* src */,
+                                      int       /* ksize */)
+    {
+        int  borderType  = cv::BORDER_REPLICATE;
+        auto borderValue = cv::Scalar();
+        return { borderType, borderValue };
+    }
+};
+
+} // namespace fluid
+} // namespace gapi
+} // namespace cv
+
+cv::gapi::GKernelPackage cv::gapi::imgproc::fluid::kernels()
+{
+    using namespace cv::gapi::fluid;
+
+    return cv::gapi::kernels
+    <   GFluidBGR2Gray
+      , GFluidRGB2Gray
+      , GFluidRGB2GrayCustom
+      , GFluidRGB2YUV
+      , GFluidYUV2RGB
+      , GFluidRGB2Lab
+      , GFluidBGR2LUV
+      , GFluidBlur
+      , GFluidSepFilter
+      , GFluidBoxFilter
+      , GFluidFilter2D
+      , GFluidErode
+      , GFluidDilate
+      , GFluidMedianBlur
+      , GFluidGaussBlur
+      , GFluidSobel
+    #if 0
+      , GFluidCanny        -- not fluid (?)
+      , GFluidEqualizeHist -- not fluid
+    #endif
+    >();
+}
+
+#endif // !defined(GAPI_STANDALONE)
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
new file mode 100644 (file)
index 0000000..9b21790
--- /dev/null
@@ -0,0 +1,93 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+#if !defined(GAPI_STANDALONE)
+
+#include "gfluidimgproc_func.hpp"
+#include "gfluidimgproc_func.simd.hpp"
+#include "backends/fluid/gfluidimgproc_func.simd_declarations.hpp"
+
+#include "gfluidutils.hpp"
+
+#include "opencv2/core/cvdef.h"
+#include "opencv2/core/hal/intrin.hpp"
+
+#include <cmath>
+#include <cstdlib>
+
+#ifdef __GNUC__
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wstrict-overflow"
+#endif
+
+namespace cv {
+namespace gapi {
+namespace fluid {
+
+//----------------------------------
+//
+// Fluid kernels: RGB2Gray, BGR2Gray
+//
+//----------------------------------
+
+void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
+                       float coef_r, float coef_g, float coef_b)
+{
+    CV_CPU_DISPATCH(run_rgb2gray_impl,
+        (out, in, width, coef_r, coef_g, coef_b),
+        CV_CPU_DISPATCH_MODES_ALL);
+}
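+
+// NB: CV_CPU_DISPATCH selects, at run time, the best available compiled
+// variant of the function (e.g. AVX2 vs. the SSE baseline); the variants
+// are compiled from gfluidimgproc_func.simd.hpp, once per dispatch mode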
+
+//--------------------------------------
+//
+// Fluid kernels: RGB-to-YUV, YUV-to-RGB
+//
+//--------------------------------------
+
+void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5])
+{
+    CV_CPU_DISPATCH(run_rgb2yuv_impl, (out, in, width, coef), CV_CPU_DISPATCH_MODES_ALL);
+}
+
+void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4])
+{
+    CV_CPU_DISPATCH(run_yuv2rgb_impl, (out, in, width, coef), CV_CPU_DISPATCH_MODES_ALL);
+}
+
+//---------------------
+//
+// Fluid kernels: Sobel
+//
+//---------------------
+
+#define RUN_SOBEL_ROW(DST, SRC)                                          \
+void run_sobel_row(DST out[], const SRC *in[], int width, int chan,      \
+                   const float kx[], const float ky[], int border,       \
+                   float scale, float delta, float *buf[],               \
+                   int y, int y0)                                        \
+{                                                                        \
+    CV_CPU_DISPATCH(run_sobel_row,                                       \
+        (out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0), \
+        CV_CPU_DISPATCH_MODES_ALL);                                      \
+}
+
+RUN_SOBEL_ROW(uchar , uchar )
+RUN_SOBEL_ROW(ushort, ushort)
+RUN_SOBEL_ROW( short, uchar )
+RUN_SOBEL_ROW( short, ushort)
+RUN_SOBEL_ROW( short,  short)
+RUN_SOBEL_ROW( float, uchar )
+RUN_SOBEL_ROW( float, ushort)
+RUN_SOBEL_ROW( float,  short)
+RUN_SOBEL_ROW( float,  float)
+
+#undef RUN_SOBEL_ROW
+
+} // namespace fluid
+} // namespace gapi
+} // namespace cv
+
+#endif // !defined(GAPI_STANDALONE)
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
new file mode 100644 (file)
index 0000000..1b6f1b8
--- /dev/null
@@ -0,0 +1,64 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+#pragma once
+
+#if !defined(GAPI_STANDALONE)
+
+#include "opencv2/core.hpp"
+
+namespace cv {
+namespace gapi {
+namespace fluid {
+
+//----------------------------------
+//
+// Fluid kernels: RGB2Gray, BGR2Gray
+//
+//----------------------------------
+
+void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
+                       float coef_r, float coef_g, float coef_b);
+
+//--------------------------------------
+//
+// Fluid kernels: RGB-to-YUV, YUV-to-RGB
+//
+//--------------------------------------
+
+void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5]);
+
+void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);
+
+//---------------------
+//
+// Fluid kernels: Sobel
+//
+//---------------------
+
+#define RUN_SOBEL_ROW(DST, SRC)                                     \
+void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
+                   const float kx[], const float ky[], int border,  \
+                   float scale, float delta, float *buf[],          \
+                   int y, int y0);
+
+RUN_SOBEL_ROW(uchar , uchar )
+RUN_SOBEL_ROW(ushort, ushort)
+RUN_SOBEL_ROW( short, uchar )
+RUN_SOBEL_ROW( short, ushort)
+RUN_SOBEL_ROW( short,  short)
+RUN_SOBEL_ROW( float, uchar )
+RUN_SOBEL_ROW( float, ushort)
+RUN_SOBEL_ROW( float,  short)
+RUN_SOBEL_ROW( float,  float)
+
+#undef RUN_SOBEL_ROW
+
+}  // namespace fluid
+}  // namespace gapi
+}  // namespace cv
+
+#endif // !defined(GAPI_STANDALONE)
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
new file mode 100644 (file)
index 0000000..c87be08
--- /dev/null
@@ -0,0 +1,562 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+// NB: this *.hpp is deliberately included multiple times (once per CPU
+// optimization/dispatch mode), so do NOT add #pragma once here!
+
+#if !defined(GAPI_STANDALONE)
+
+#include "opencv2/gapi/own/saturate.hpp"
+
+#include "opencv2/core.hpp"
+#include "opencv2/core/hal/intrin.hpp"
+
+#include <cstdint>
+
+#ifdef __GNUC__
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wstrict-overflow"
+#endif
+
+using cv::gapi::own::saturate;
+
+namespace cv {
+namespace gapi {
+namespace fluid {
+
+CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN
+
+//----------------------------------
+//
+// Fluid kernels: RGB2Gray, BGR2Gray
+//
+//----------------------------------
+
+void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
+                       float coef_r, float coef_g, float coef_b);
+
+//--------------------------------------
+//
+// Fluid kernels: RGB-to-YUV, YUV-to-RGB
+//
+//--------------------------------------
+
+void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5]);
+
+void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4]);
+
+//---------------------
+//
+// Fluid kernels: Sobel
+//
+//---------------------
+
+#define RUN_SOBEL_ROW(DST, SRC)                                     \
+void run_sobel_row(DST out[], const SRC *in[], int width, int chan, \
+                  const float kx[], const float ky[], int border,   \
+                  float scale, float delta, float *buf[],           \
+                  int y, int y0);
+
+RUN_SOBEL_ROW(uchar , uchar )
+RUN_SOBEL_ROW(ushort, ushort)
+RUN_SOBEL_ROW( short, uchar )
+RUN_SOBEL_ROW( short, ushort)
+RUN_SOBEL_ROW( short,  short)
+RUN_SOBEL_ROW( float, uchar )
+RUN_SOBEL_ROW( float, ushort)
+RUN_SOBEL_ROW( float,  short)
+RUN_SOBEL_ROW( float,  float)
+
+#undef RUN_SOBEL_ROW
+
+//----------------------------------------------------------------------
+
+#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+//----------------------------------
+//
+// Fluid kernels: RGB2Gray, BGR2Gray
+//
+//----------------------------------
+
+void run_rgb2gray_impl(uchar out[], const uchar in[], int width,
+                       float coef_r, float coef_g, float coef_b)
+{
+    // assume:
+    // - each coefficient is less than 1
+    // - their sum is (almost exactly) 1, so the quantized sum below
+    //   falls in [USHRT_MAX, unity] -- see the asserts
+
+    constexpr int unity = 1 << 16;  // Q0.0.16 inside ushort
+    ushort rc = static_cast<ushort>(coef_r * unity + 0.5f);
+    ushort gc = static_cast<ushort>(coef_g * unity + 0.5f);
+    ushort bc = static_cast<ushort>(coef_b * unity + 0.5f);
+
+    GAPI_Assert(rc + gc + bc <= unity);
+    GAPI_Assert(rc + gc + bc >= USHRT_MAX);
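+    // e.g. the common BT.601 weights (an illustration only): coef_r=0.299f,
+    // coef_g=0.587f, coef_b=0.114f quantize to rc=19595, gc=38470, bc=7471,
+    // whose sum is exactly 65536 == unity, so both asserts above hold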
+
+#if CV_SIMD
+    constexpr int nlanes = v_uint8::nlanes;
+    if (width >= nlanes)
+    {
+        for (int w=0; w < width; )
+        {
+            // process main part of pixels row
+            for ( ; w <= width - nlanes; w += nlanes)
+            {
+                v_uint8 r, g, b;
+                v_load_deinterleave(&in[3*w], r, g, b);
+
+                v_uint16 r0, r1, g0, g1, b0, b1;
+                v_expand(r, r0, r1);
+                v_expand(g, g0, g1);
+                v_expand(b, b0, b1);
+
+                v_uint16 y0, y1;
+                static const ushort half = 1 << 7; // Q0.8.8
+                y0 = (v_mul_hi(r0 << 8, vx_setall_u16(rc)) +
+                      v_mul_hi(g0 << 8, vx_setall_u16(gc)) +
+                      v_mul_hi(b0 << 8, vx_setall_u16(bc)) +
+                                        vx_setall_u16(half)) >> 8;
+                y1 = (v_mul_hi(r1 << 8, vx_setall_u16(rc)) +
+                      v_mul_hi(g1 << 8, vx_setall_u16(gc)) +
+                      v_mul_hi(b1 << 8, vx_setall_u16(bc)) +
+                                        vx_setall_u16(half)) >> 8;
+
+                v_uint8 y;
+                y = v_pack(y0, y1);
+                v_store(&out[w], y);
+            }
+
+            // process tail (if any)
+            if (w < width)
+            {
+                GAPI_DbgAssert(width - nlanes >= 0);
+                w = width - nlanes;
+            }
+        }
+
+        return;
+    }
+#endif
+
+    for (int w=0; w < width; w++)
+    {
+        uchar r = in[3*w    ];
+        uchar g = in[3*w + 1];
+        uchar b = in[3*w + 2];
+
+        static const int half = 1 << 15;  // Q0.0.16
+        ushort y = (r*rc + b*bc + g*gc + half) >> 16;
+        out[w] = static_cast<uchar>(y);
+    }
+}
+
+//--------------------------------------
+//
+// Fluid kernels: RGB-to-YUV, YUV-to-RGB
+//
+//--------------------------------------
+
+void run_rgb2yuv_impl(uchar out[], const uchar in[], int width, const float coef[5])
+{
+    ushort c0 = static_cast<ushort>(coef[0]*(1 << 16) + 0.5f);  // Q0.0.16 unsigned
+    ushort c1 = static_cast<ushort>(coef[1]*(1 << 16) + 0.5f);
+    ushort c2 = static_cast<ushort>(coef[2]*(1 << 16) + 0.5f);
+    short c3 = static_cast<short>(coef[3]*(1 << 12) + 0.5f);    // Q1.0.12 signed
+    short c4 = static_cast<short>(coef[4]*(1 << 12) + 0.5f);
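+    // (fixed-point notation Q<s>.<i>.<f>: s sign bits, i integer bits,
+    //  f fractional bits; e.g. Q1.8.7 is a signed 16-bit value scaled
+    //  by 1 << 7)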
+
+    int w = 0;
+
+#if CV_SIMD
+    static const int nlanes = v_uint8::nlanes;
+    for ( ; w <= width - nlanes; w += nlanes)
+    {
+        v_uint8 r, g, b;
+        v_load_deinterleave(&in[3*w], r, g, b);
+
+        v_uint16 _r0, _r1, _g0, _g1, _b0, _b1;
+        v_expand(r, _r0, _r1);
+        v_expand(g, _g0, _g1);
+        v_expand(b, _b0, _b1);
+
+        _r0 = _r0 << 7;                         // Q0.9.7 unsigned
+        _r1 = _r1 << 7;
+        _g0 = _g0 << 7;
+        _g1 = _g1 << 7;
+        _b0 = _b0 << 7;
+        _b1 = _b1 << 7;
+
+        v_uint16 _y0, _y1;
+        _y0 = v_mul_hi(vx_setall_u16(c0), _r0)  // Q0.9.7
+            + v_mul_hi(vx_setall_u16(c1), _g0)
+            + v_mul_hi(vx_setall_u16(c2), _b0);
+        _y1 = v_mul_hi(vx_setall_u16(c0), _r1)
+            + v_mul_hi(vx_setall_u16(c1), _g1)
+            + v_mul_hi(vx_setall_u16(c2), _b1);
+
+        v_int16 r0, r1, b0, b1, y0, y1;
+        r0 = v_reinterpret_as_s16(_r0);         // Q1.8.7 signed
+        r1 = v_reinterpret_as_s16(_r1);
+        b0 = v_reinterpret_as_s16(_b0);
+        b1 = v_reinterpret_as_s16(_b1);
+        y0 = v_reinterpret_as_s16(_y0);
+        y1 = v_reinterpret_as_s16(_y1);
+
+        v_int16 u0, u1, v0, v1;
+        u0 = v_mul_hi(vx_setall_s16(c3), b0 - y0);  // Q1.12.3
+        u1 = v_mul_hi(vx_setall_s16(c3), b1 - y1);
+        v0 = v_mul_hi(vx_setall_s16(c4), r0 - y0);
+        v1 = v_mul_hi(vx_setall_s16(c4), r1 - y1);
+
+        v_uint8 y, u, v;
+        y = v_pack((_y0 + vx_setall_u16(1 << 6)) >> 7,
+                   (_y1 + vx_setall_u16(1 << 6)) >> 7);
+        u = v_pack_u((u0 + vx_setall_s16(257 << 2)) >> 3,  // 257 << 2 = 128.5 * (1 << 3)
+                     (u1 + vx_setall_s16(257 << 2)) >> 3);
+        v = v_pack_u((v0 + vx_setall_s16(257 << 2)) >> 3,
+                     (v1 + vx_setall_s16(257 << 2)) >> 3);
+
+        v_store_interleave(&out[3*w], y, u, v);
+    }
+#endif
+
+    for ( ; w < width; w++)
+    {
+        short r = in[3*w    ] << 7;                            // Q1.8.7 signed
+        short g = in[3*w + 1] << 7;
+        short b = in[3*w + 2] << 7;
+        short y = (c0*r + c1*g + c2*b) >> 16;                  // Q1.8.7
+        short u =  c3*(b - y) >> 16;                           // Q1.12.3
+        short v =  c4*(r - y) >> 16;
+        out[3*w    ] = static_cast<uchar>((y              + (1 << 6)) >> 7);
+        out[3*w + 1] =    saturate<uchar>((u + (128 << 3) + (1 << 2)) >> 3);
+        out[3*w + 2] =    saturate<uchar>((v + (128 << 3) + (1 << 2)) >> 3);
+    }
+}
+
+void run_yuv2rgb_impl(uchar out[], const uchar in[], int width, const float coef[4])
+{
+    short c0 = static_cast<short>(coef[0] * (1 << 12) + 0.5f);  // Q1.3.12
+    short c1 = static_cast<short>(coef[1] * (1 << 12) + 0.5f);
+    short c2 = static_cast<short>(coef[2] * (1 << 12) + 0.5f);
+    short c3 = static_cast<short>(coef[3] * (1 << 12) + 0.5f);
+
+    int w = 0;
+
+#if CV_SIMD
+    static const int nlanes = v_uint8::nlanes;
+    for ( ; w <= width - nlanes; w += nlanes)
+    {
+        v_uint8 y, u, v;
+        v_load_deinterleave(&in[3*w], y, u, v);
+
+        v_uint16 _y0, _y1, _u0, _u1, _v0, _v1;
+        v_expand(y, _y0, _y1);
+        v_expand(u, _u0, _u1);
+        v_expand(v, _v0, _v1);
+
+        v_int16 y0, y1, u0, u1, v0, v1;
+        y0 = v_reinterpret_as_s16(_y0);
+        y1 = v_reinterpret_as_s16(_y1);
+        u0 = v_reinterpret_as_s16(_u0);
+        u1 = v_reinterpret_as_s16(_u1);
+        v0 = v_reinterpret_as_s16(_v0);
+        v1 = v_reinterpret_as_s16(_v1);
+
+        y0 =  y0 << 3;                              // Q1.12.3
+        y1 =  y1 << 3;
+        u0 = (u0 - vx_setall_s16(128)) << 7;        // Q1.8.7
+        u1 = (u1 - vx_setall_s16(128)) << 7;
+        v0 = (v0 - vx_setall_s16(128)) << 7;
+        v1 = (v1 - vx_setall_s16(128)) << 7;
+
+        v_int16 r0, r1, g0, g1, b0, b1;
+        r0 = y0 + v_mul_hi(vx_setall_s16(c0), v0);  // Q1.12.3
+        r1 = y1 + v_mul_hi(vx_setall_s16(c0), v1);
+        g0 = y0 + v_mul_hi(vx_setall_s16(c1), u0)
+                + v_mul_hi(vx_setall_s16(c2), v0);
+        g1 = y1 + v_mul_hi(vx_setall_s16(c1), u1)
+                + v_mul_hi(vx_setall_s16(c2), v1);
+        b0 = y0 + v_mul_hi(vx_setall_s16(c3), u0);
+        b1 = y1 + v_mul_hi(vx_setall_s16(c3), u1);
+
+        v_uint8 r, g, b;
+        r = v_pack_u((r0 + vx_setall_s16(1 << 2)) >> 3,
+                     (r1 + vx_setall_s16(1 << 2)) >> 3);
+        g = v_pack_u((g0 + vx_setall_s16(1 << 2)) >> 3,
+                     (g1 + vx_setall_s16(1 << 2)) >> 3);
+        b = v_pack_u((b0 + vx_setall_s16(1 << 2)) >> 3,
+                     (b1 + vx_setall_s16(1 << 2)) >> 3);
+
+        v_store_interleave(&out[3*w], r, g, b);
+    }
+#endif
+
+    for ( ; w < width; w++)
+    {
+        short y =  in[3*w    ]        << 3;  // Q1.12.3
+        short u = (in[3*w + 1] - 128) << 7;  // Q1.8.7
+        short v = (in[3*w + 2] - 128) << 7;
+        short r = y + (        c0*v  >> 16); // Q1.12.3
+        short g = y + ((c1*u + c2*v) >> 16);
+        short b = y + ((c3*u       ) >> 16);
+        out[3*w    ] = saturate<uchar>((r + (1 << 2)) >> 3);
+        out[3*w + 1] = saturate<uchar>((g + (1 << 2)) >> 3);
+        out[3*w + 2] = saturate<uchar>((b + (1 << 2)) >> 3);
+    }
+}
+
+//---------------------
+//
+// Fluid kernels: Sobel
+//
+//---------------------
+
+// Sobel 3x3: vertical pass
+template<bool noscale, typename DST>
+static void run_sobel3x3_vert(DST out[], int length, const float ky[],
+                float scale, float delta, const int r[], float *buf[])
+{
+    float ky0 = ky[0],
+          ky1 = ky[1],
+          ky2 = ky[2];
+
+    int r0 = r[0],
+        r1 = r[1],
+        r2 = r[2];
+
+#if CV_SIMD
+    // for floating-point output, manual vectorization may be no better
+    // than the compiler's auto-vectorization
+#define EXPLICIT_SIMD_32F 0  // 1=vectorize 32f case explicitly, 0=don't
+#if     EXPLICIT_SIMD_32F
+    if (std::is_same<DST, float>::value && length >= v_int16::nlanes)
+    {
+        constexpr static int nlanes = v_float32::nlanes;
+
+        for (int l=0; l < length; )
+        {
+            for (; l <= length - nlanes; l += nlanes)
+            {
+                v_float32 sum = vx_load(&buf[r0][l]) * vx_setall_f32(ky0);
+                    sum = v_fma(vx_load(&buf[r1][l]),  vx_setall_f32(ky1), sum);
+                    sum = v_fma(vx_load(&buf[r2][l]),  vx_setall_f32(ky2), sum);
+
+                if (!noscale)
+                {
+                    sum = v_fma(sum, vx_setall_f32(scale), vx_setall_f32(delta));
+                }
+
+                v_store(reinterpret_cast<float*>(&out[l]), sum);
+            }
+
+            if (l < length)
+            {
+                // tail: recalculate last pixels
+                GAPI_DbgAssert(length >= nlanes);
+                l = length - nlanes;
+            }
+        }
+
+        return;
+    }
+#endif
+
+    if ((std::is_same<DST, short>::value || std::is_same<DST, ushort>::value)
+        && length >= v_int16::nlanes)
+    {
+        constexpr static int nlanes = v_int16::nlanes;
+
+        for (int l=0; l < length; )
+        {
+            for (; l <= length - nlanes; l += nlanes)
+            {
+                v_float32 sum0 = vx_load(&buf[r0][l])            * vx_setall_f32(ky0);
+                    sum0 = v_fma(vx_load(&buf[r1][l]),             vx_setall_f32(ky1), sum0);
+                    sum0 = v_fma(vx_load(&buf[r2][l]),             vx_setall_f32(ky2), sum0);
+
+                v_float32 sum1 = vx_load(&buf[r0][l + nlanes/2]) * vx_setall_f32(ky0);
+                    sum1 = v_fma(vx_load(&buf[r1][l + nlanes/2]),  vx_setall_f32(ky1), sum1);
+                    sum1 = v_fma(vx_load(&buf[r2][l + nlanes/2]),  vx_setall_f32(ky2), sum1);
+
+                if (!noscale)
+                {
+                    sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
+                    sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
+                }
+
+                v_int32 isum0 = v_round(sum0),
+                        isum1 = v_round(sum1);
+
+                if (std::is_same<DST, short>::value)
+                {
+                    // signed short
+                    v_int16 res = v_pack(isum0, isum1);
+                    v_store(reinterpret_cast<short*>(&out[l]), res);
+                } else
+                {
+                    // unsigned short
+                    v_uint16 res = v_pack_u(isum0, isum1);
+                    v_store(reinterpret_cast<ushort*>(&out[l]), res);
+                }
+            }
+
+            if (l < length)
+            {
+                // tail: recalculate last pixels
+                GAPI_DbgAssert(length >= nlanes);
+                l = length - nlanes;
+            }
+        }
+
+        return;
+    }
+
+    if (std::is_same<DST, uchar>::value && length >= v_uint8::nlanes)
+    {
+        constexpr static int nlanes = v_uint8::nlanes;
+
+        for (int l=0; l < length; )
+        {
+            for (; l <= length - nlanes; l += nlanes)
+            {
+                v_float32 sum0 = vx_load(&buf[r0][l])              * vx_setall_f32(ky0);
+                    sum0 = v_fma(vx_load(&buf[r1][l]),               vx_setall_f32(ky1), sum0);
+                    sum0 = v_fma(vx_load(&buf[r2][l]),               vx_setall_f32(ky2), sum0);
+
+                v_float32 sum1 = vx_load(&buf[r0][l +   nlanes/4]) * vx_setall_f32(ky0);
+                    sum1 = v_fma(vx_load(&buf[r1][l +   nlanes/4]),  vx_setall_f32(ky1), sum1);
+                    sum1 = v_fma(vx_load(&buf[r2][l +   nlanes/4]),  vx_setall_f32(ky2), sum1);
+
+                v_float32 sum2 = vx_load(&buf[r0][l + 2*nlanes/4]) * vx_setall_f32(ky0);
+                    sum2 = v_fma(vx_load(&buf[r1][l + 2*nlanes/4]),  vx_setall_f32(ky1), sum2);
+                    sum2 = v_fma(vx_load(&buf[r2][l + 2*nlanes/4]),  vx_setall_f32(ky2), sum2);
+
+                v_float32 sum3 = vx_load(&buf[r0][l + 3*nlanes/4]) * vx_setall_f32(ky0);
+                    sum3 = v_fma(vx_load(&buf[r1][l + 3*nlanes/4]),  vx_setall_f32(ky1), sum3);
+                    sum3 = v_fma(vx_load(&buf[r2][l + 3*nlanes/4]),  vx_setall_f32(ky2), sum3);
+
+                if (!noscale)
+                {
+                    sum0 = v_fma(sum0, vx_setall_f32(scale), vx_setall_f32(delta));
+                    sum1 = v_fma(sum1, vx_setall_f32(scale), vx_setall_f32(delta));
+                    sum2 = v_fma(sum2, vx_setall_f32(scale), vx_setall_f32(delta));
+                    sum3 = v_fma(sum3, vx_setall_f32(scale), vx_setall_f32(delta));
+                }
+
+                v_int32 isum0 = v_round(sum0),
+                        isum1 = v_round(sum1),
+                        isum2 = v_round(sum2),
+                        isum3 = v_round(sum3);
+
+                v_int16 ires0 = v_pack(isum0, isum1),
+                        ires1 = v_pack(isum2, isum3);
+
+                v_uint8 res = v_pack_u(ires0, ires1);
+                v_store(reinterpret_cast<uchar*>(&out[l]), res);
+            }
+
+            if (l < length)
+            {
+                // tail: recalculate last pixels
+                GAPI_DbgAssert(length >= nlanes);
+                l = length - nlanes;
+            }
+        }
+
+        return;
+    }
+#endif
+
+    // reference code
+    for (int l=0; l < length; l++)
+    {
+        float sum = buf[r0][l]*ky0 + buf[r1][l]*ky1 + buf[r2][l]*ky2;
+
+        if (!noscale)
+        {
+            sum = sum*scale + delta;
+        }
+
+        out[l] = cv::gapi::own::saturate<DST>(sum, rintf);
+    }
+}
+
+template<typename DST, typename SRC>
+static void run_sobel_impl(DST out[], const SRC *in[], int width, int chan,
+                           const float kx[], const float ky[], int border,
+                           float scale, float delta, float *buf[],
+                           int y, int y0)
+{
+    int r[3];
+    r[0] = (y - y0)     % 3;  // buf[r[0]]: previous
+    r[1] = (y - y0 + 1) % 3;  //            this
+    r[2] = (y - y0 + 2) % 3;  //            next row
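+    // e.g. as y-y0 grows 0,1,2,... the roles rotate: {0,1,2}, {1,2,0},
+    // {2,0,1}, ... so two of the three rows are already convolved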
+
+    int length = width * chan;
+
+    // horizontal pass
+
+    // a full horizontal pass is needed only for the very 1st row of the ROI;
+    // for the 2nd and further rows it is enough to convolve only the "next"
+    // row, as the buffers filled by previous calls to this kernel can be
+    // reused (note that Fluid processes rows consecutively)
+    int k0 = (y == y0)? 0: 2;
+
+    for (int k = k0; k < 3; k++)
+    {
+        //                             previous, this , next pixel
+        const SRC *s[3] = {in[k] - border*chan , in[k], in[k] + border*chan};
+
+        // rely on the compiler's auto-vectorization
+        for (int l=0; l < length; l++)
+        {
+            buf[r[k]][l] = s[0][l]*kx[0] + s[1][l]*kx[1] + s[2][l]*kx[2];
+        }
+    }
+
+    // vertical pass
+    if (scale == 1 && delta == 0)
+    {
+        constexpr static bool noscale = true;  // omit scaling
+        run_sobel3x3_vert<noscale, DST>(out, length, ky, scale, delta, r, buf);
+    } else
+    {
+        constexpr static bool noscale = false;  // do scaling
+        run_sobel3x3_vert<noscale, DST>(out, length, ky, scale, delta, r, buf);
+    }
+}
+
+#define RUN_SOBEL_ROW(DST, SRC)                                                    \
+void run_sobel_row(DST out[], const SRC *in[], int width, int chan,                \
+                   const float kx[], const float ky[], int border,                 \
+                   float scale, float delta, float *buf[],                         \
+                   int y, int y0)                                                  \
+{                                                                                  \
+    run_sobel_impl(out, in, width, chan, kx, ky, border, scale, delta, buf, y, y0); \
+}
+
+RUN_SOBEL_ROW(uchar , uchar )
+RUN_SOBEL_ROW(ushort, ushort)
+RUN_SOBEL_ROW( short, uchar )
+RUN_SOBEL_ROW( short, ushort)
+RUN_SOBEL_ROW( short,  short)
+RUN_SOBEL_ROW( float, uchar )
+RUN_SOBEL_ROW( float, ushort)
+RUN_SOBEL_ROW( float,  short)
+RUN_SOBEL_ROW( float,  float)
+
+#undef RUN_SOBEL_ROW
+
+#endif  // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+
+CV_CPU_OPTIMIZATION_NAMESPACE_END
+
+}  // namespace fluid
+}  // namespace gapi
+}  // namespace cv
+
+#endif // !defined(GAPI_STANDALONE)
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidutils.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/fluid/gfluidutils.hpp
new file mode 100644 (file)
index 0000000..a38b2f1
--- /dev/null
@@ -0,0 +1,93 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef GFLUIDUTILS_HPP
+#define GFLUIDUTILS_HPP
+
+#include <limits>
+#include <type_traits>
+#include <opencv2/gapi/util/compiler_hints.hpp> //UNUSED
+#include <opencv2/gapi/own/saturate.hpp>
+
+namespace cv {
+namespace gapi {
+namespace fluid {
+
+using cv::gapi::own::saturate;
+using cv::gapi::own::ceild;
+using cv::gapi::own::floord;
+using cv::gapi::own::roundd;
+using cv::gapi::own::rintd;
+
+//--------------------------------
+//
+// Macros for mapping of data types
+//
+//--------------------------------
+
+#define UNARY_(DST, SRC, OP, ...)                         \
+    if (cv::DataType<DST>::depth == dst.meta().depth &&   \
+        cv::DataType<SRC>::depth == src.meta().depth)     \
+    {                                                     \
+        GAPI_DbgAssert(dst.length() == src.length());       \
+        GAPI_DbgAssert(dst.meta().chan == src.meta().chan); \
+                                                          \
+        OP<DST, SRC>(__VA_ARGS__);                        \
+        return;                                           \
+    }
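+
+// a usage sketch (hypothetical operation run_my_op, mirroring the run()
+// methods of the Fluid kernels that include this header): try each
+// supported (DST, SRC) pair and fall through to CV_Error if none matches:
+//
+//     UNARY_(uchar, uchar, run_my_op, dst, src);  // dispatches and returns
+//     CV_Error(cv::Error::StsBadArg, "unsupported combination of types");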
+
+// special unary operation: dst is always an 8UC1 image
+#define INRANGE_(DST, SRC, OP, ...)                       \
+    if (cv::DataType<DST>::depth == dst.meta().depth &&   \
+        cv::DataType<SRC>::depth == src.meta().depth)     \
+    {                                                     \
+        GAPI_DbgAssert(dst.length() == src.length());       \
+        GAPI_DbgAssert(dst.meta().chan == 1);               \
+                                                          \
+        OP<DST, SRC>(__VA_ARGS__);                        \
+        return;                                           \
+    }
+
+#define BINARY_(DST, SRC1, SRC2, OP, ...)                  \
+    if (cv::DataType<DST>::depth == dst.meta().depth &&    \
+        cv::DataType<SRC1>::depth == src1.meta().depth &&  \
+        cv::DataType<SRC2>::depth == src2.meta().depth)    \
+    {                                                      \
+        GAPI_DbgAssert(dst.length() == src1.length());       \
+        GAPI_DbgAssert(dst.length() == src2.length());       \
+                                                           \
+        GAPI_DbgAssert(dst.meta().chan == src1.meta().chan); \
+        GAPI_DbgAssert(dst.meta().chan == src2.meta().chan); \
+                                                           \
+        OP<DST, SRC1, SRC2>(__VA_ARGS__);                  \
+        return;                                            \
+    }
+
+// special ternary operation: src3 has only one channel
+#define SELECT_(DST, SRC1, SRC2, SRC3, OP, ...)            \
+    if (cv::DataType<DST>::depth == dst.meta().depth &&    \
+        cv::DataType<SRC1>::depth == src1.meta().depth &&  \
+        cv::DataType<SRC2>::depth == src2.meta().depth &&  \
+        cv::DataType<SRC3>::depth == src3.meta().depth)    \
+    {                                                      \
+        GAPI_DbgAssert(dst.length() == src1.length());       \
+        GAPI_DbgAssert(dst.length() == src2.length());       \
+        GAPI_DbgAssert(dst.length() == src3.length());       \
+                                                           \
+        GAPI_DbgAssert(dst.meta().chan == src1.meta().chan); \
+        GAPI_DbgAssert(dst.meta().chan == src2.meta().chan); \
+        GAPI_DbgAssert(              1 == src3.meta().chan); \
+                                                           \
+        OP<DST, SRC1, SRC2, SRC3>(__VA_ARGS__);            \
+        return;                                            \
+    }
+
+} // namespace fluid
+} // namespace gapi
+} // namespace cv
+
+#endif // GFLUIDUTILS_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpubackend.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpubackend.cpp
new file mode 100644 (file)
index 0000000..eda6a5f
--- /dev/null
@@ -0,0 +1,226 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <functional>
+#include <unordered_set>
+
+#include <ade/util/algorithm.hpp>
+
+#include <ade/util/range.hpp>
+#include <ade/util/zip_range.hpp>
+#include <ade/util/chain_range.hpp>
+
+#include <ade/typed_graph.hpp>
+
+#include "opencv2/gapi/gcommon.hpp"
+#include "opencv2/gapi/util/any.hpp"
+#include "opencv2/gapi/gtype_traits.hpp"
+
+#include "compiler/gobjref.hpp"
+#include "compiler/gmodel.hpp"
+
+#include "backends/gpu/ggpubackend.hpp"
+#include "backends/gpu/ggpuimgproc.hpp"
+#include "backends/gpu/ggpucore.hpp"
+
+#include "api/gbackend_priv.hpp" // FIXME: Make it part of Backend SDK!
+
+// FIXME: Is there a way to take a typed graph (our GModel)
+// and create a new typed graph _ATOP_ of it (by extending it with a couple
+// of new types)?
+// Alternatively, is there a way to compose typed graphs?
+//
+// If not, we need to introduce that!
+using GGPUModel = ade::TypedGraph
+    < cv::gimpl::Unit
+    , cv::gimpl::Protocol
+    >;
+
+// FIXME: Same issue with Typed and ConstTyped
+using GConstGGPUModel = ade::ConstTypedGraph
+    < cv::gimpl::Unit
+    , cv::gimpl::Protocol
+    >;
+
+namespace
+{
+    class GGPUBackendImpl final: public cv::gapi::GBackend::Priv
+    {
+        virtual void unpackKernel(ade::Graph            &graph,
+                                  const ade::NodeHandle &op_node,
+                                  const cv::GKernelImpl &impl) override
+        {
+            GGPUModel gm(graph);
+            auto gpu_impl = cv::util::any_cast<cv::GGPUKernel>(impl.opaque);
+            gm.metadata(op_node).set(cv::gimpl::Unit{gpu_impl});
+        }
+
+        virtual EPtr compile(const ade::Graph &graph,
+                             const cv::GCompileArgs &,
+                             const std::vector<ade::NodeHandle> &nodes) const override
+        {
+            return EPtr{new cv::gimpl::GGPUExecutable(graph, nodes)};
+        }
+   };
+}
+
+cv::gapi::GBackend cv::gapi::gpu::backend()
+{
+    static cv::gapi::GBackend this_backend(std::make_shared<GGPUBackendImpl>());
+    return this_backend;
+}
+
+// GGPUExecutable implementation //////////////////////////////////////////////
+cv::gimpl::GGPUExecutable::GGPUExecutable(const ade::Graph &g,
+                                          const std::vector<ade::NodeHandle> &nodes)
+    : m_g(g), m_gm(m_g)
+{
+    // Convert list of operations (which is topologically sorted already)
+    // into an execution script.
+    for (auto &nh : nodes)
+    {
+        switch (m_gm.metadata(nh).get<NodeType>().t)
+        {
+        case NodeType::OP: m_script.push_back({nh, GModel::collectOutputMeta(m_gm, nh)}); break;
+        case NodeType::DATA:
+        {
+            m_dataNodes.push_back(nh);
+            const auto &desc = m_gm.metadata(nh).get<Data>();
+            if (desc.storage == Data::Storage::CONST)
+            {
+                auto rc = RcDesc{desc.rc, desc.shape, desc.ctor};
+                magazine::bindInArg(m_res, rc, m_gm.metadata(nh).get<ConstValue>().arg);
+            }
+            //preallocate internal Mats in advance
+            if (desc.storage == Data::Storage::INTERNAL && desc.shape == GShape::GMAT)
+            {
+                const auto mat_desc = util::get<cv::GMatDesc>(desc.meta);
+                const auto type = CV_MAKETYPE(mat_desc.depth, mat_desc.chan);
+                // NB: UMat::create takes (rows, cols), i.e. height first
+                m_res.slot<cv::UMat>()[desc.rc].create(mat_desc.size.height, mat_desc.size.width, type);
+            }
+            break;
+        }
+        default: util::throw_error(std::logic_error("Unsupported NodeType type"));
+        }
+    }
+}
+
+// FIXME: Document what it does
+cv::GArg cv::gimpl::GGPUExecutable::packArg(const GArg &arg)
+{
+    // No API placeholders allowed at this point
+    // FIXME: this check has to be done somewhere in compilation stage.
+    GAPI_Assert(   arg.kind != cv::detail::ArgKind::GMAT
+              && arg.kind != cv::detail::ArgKind::GSCALAR
+              && arg.kind != cv::detail::ArgKind::GARRAY);
+
+    if (arg.kind != cv::detail::ArgKind::GOBJREF)
+    {
+        // All other cases - pass as-is, with no transformations to GArg contents.
+        return arg;
+    }
+    GAPI_Assert(arg.kind == cv::detail::ArgKind::GOBJREF);
+
+    // Wrap associated CPU object (either host or an internal one)
+    // FIXME: object can be moved out!!! GExecutor faced that.
+    const cv::gimpl::RcDesc &ref = arg.get<cv::gimpl::RcDesc>();
+    switch (ref.shape)
+    {
+    case GShape::GMAT:    return GArg(m_res.slot<cv::UMat>()[ref.id]);
+    case GShape::GSCALAR: return GArg(m_res.slot<cv::gapi::own::Scalar>()[ref.id]);
+    // Note: .at() is intentional for GArray as the object MUST already be there
+    //   (constructed by either bindIn/Out or resetInternal)
+    case GShape::GARRAY:  return GArg(m_res.slot<cv::detail::VectorRef>().at(ref.id));
+    default:
+        util::throw_error(std::logic_error("Unsupported GShape type"));
+        break;
+    }
+}
+
+void cv::gimpl::GGPUExecutable::run(std::vector<InObj>  &&input_objs,
+                                    std::vector<OutObj> &&output_objs)
+{
+    // Update resources with run-time information - what this Island
+    // has received from user (or from another Island, or mix...)
+    // FIXME: Check input/output objects against GIsland protocol
+
+    for (auto& it : input_objs)   magazine::bindInArg (m_res, it.first, it.second, true);
+    for (auto& it : output_objs)  magazine::bindOutArg(m_res, it.first, it.second, true);
+
+    // Initialize (reset) internal data nodes with user structures
+    // before processing a frame (no need to do it for external data structures)
+    GModel::ConstGraph gm(m_g);
+    for (auto nh : m_dataNodes)
+    {
+        const auto &desc = gm.metadata(nh).get<Data>();
+
+        if (   desc.storage == Data::Storage::INTERNAL
+            && !util::holds_alternative<util::monostate>(desc.ctor))
+        {
+            // FIXME: Note that compile-time constant data objects (like
+            // a value-initialized GArray<T>) also satisfy this condition
+            // and should be excluded, but now we just don't support it
+            magazine::resetInternalData(m_res, desc);
+        }
+    }
+
+    // GPU backend execution is not rocket science at all:
+    // simply invoke our kernels in the proper order.
+    GConstGGPUModel gcm(m_g);
+    for (auto &op_info : m_script)
+    {
+        const auto &op = m_gm.metadata(op_info.nh).get<Op>();
+
+        // Obtain our real execution unit
+        // TODO: Should kernels be copyable?
+        GGPUKernel k = gcm.metadata(op_info.nh).get<Unit>().k;
+
+        // Initialize kernel's execution context:
+        // - Input parameters
+        GGPUContext context;
+        context.m_args.reserve(op.args.size());
+
+        using namespace std::placeholders;
+        ade::util::transform(op.args,
+                          std::back_inserter(context.m_args),
+                          std::bind(&GGPUExecutable::packArg, this, _1));
+
+        // - Output parameters.
+        // FIXME: pre-allocate internal Mats, etc, according to the known meta
+        for (const auto &out_it : ade::util::indexed(op.outs))
+        {
+            // FIXME: Can the same GArg type resolution mechanism be reused here?
+            const auto out_port  = ade::util::index(out_it);
+            const auto out_desc  = ade::util::value(out_it);
+            context.m_results[out_port] = magazine::getObjPtr(m_res, out_desc, true);
+        }
+
+        // Now trigger the executable unit
+        k.apply(context);
+
+        for (const auto &out_it : ade::util::indexed(op_info.expected_out_metas))
+        {
+            const auto out_index      = ade::util::index(out_it);
+            const auto expected_meta  = ade::util::value(out_it);
+            const auto out_meta       = descr_of(context.m_results[out_index]);
+
+            if (expected_meta != out_meta)
+            {
+                util::throw_error
+                    (std::logic_error
+                     ("Output meta doesn't "
+                      "coincide with the generated meta\n"
+                      "Expected: " + ade::util::to_string(expected_meta) + "\n"
+                      "Actual  : " + ade::util::to_string(out_meta)));
+            }
+        }
+    } // for(m_script)
+
+    for (auto &it : output_objs) magazine::writeBack(m_res, it.first, it.second, true);
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpubackend.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpubackend.hpp
new file mode 100644 (file)
index 0000000..1fb128d
--- /dev/null
@@ -0,0 +1,72 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GGPUBACKEND_HPP
+#define OPENCV_GAPI_GGPUBACKEND_HPP
+
+#include <map>                // map
+#include <unordered_map>      // unordered_map
+#include <tuple>              // tuple
+#include <ade/util/algorithm.hpp> // type_list_index
+
+#include "opencv2/gapi/garg.hpp"
+#include "opencv2/gapi/gproto.hpp"
+#include "opencv2/gapi/gpu/ggpukernel.hpp"
+
+
+#include "api/gapi_priv.hpp"
+#include "backends/common/gbackend.hpp"
+#include "compiler/gislandmodel.hpp"
+
+namespace cv { namespace gimpl {
+
+struct Unit
+{
+    static const char *name() { return "GPUKernel"; }
+    GGPUKernel k;
+};
+
+class GGPUExecutable final: public GIslandExecutable
+{
+    const ade::Graph &m_g;
+    GModel::ConstGraph m_gm;
+
+    struct OperationInfo
+    {
+        ade::NodeHandle nh;
+        GMetaArgs expected_out_metas;
+    };
+
+    // Execution script, currently absolutely naive
+    std::vector<OperationInfo> m_script;
+    // List of all resources in graph (both internal and external)
+    std::vector<ade::NodeHandle> m_dataNodes;
+
+    // Actual data of all resources in graph (both internal and external)
+    Mag m_res;
+    GArg packArg(const GArg &arg);
+
+public:
+    GGPUExecutable(const ade::Graph                   &graph,
+                   const std::vector<ade::NodeHandle> &nodes);
+
+    virtual inline bool canReshape() const override { return false; }
+    virtual inline void reshape(ade::Graph&, const GCompileArgs&) override
+    {
+        // FIXME: GPU plugin is in fact reshapeable (as it was initially,
+        // even before outMeta() has been introduced), so this limitation
+        // should be dropped.
+        util::throw_error(std::logic_error("GGPUExecutable::reshape() should never be called"));
+    }
+
+    virtual void run(std::vector<InObj>  &&input_objs,
+                     std::vector<OutObj> &&output_objs) override;
+};
+
+}}
+
+#endif // OPENCV_GAPI_GGPUBACKEND_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpucore.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpucore.cpp
new file mode 100644 (file)
index 0000000..a1ee6a1
--- /dev/null
@@ -0,0 +1,582 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include "opencv2/gapi/core.hpp"
+#include "opencv2/gapi/gpu/core.hpp"
+#include "backends/gpu/ggpucore.hpp"
+
+GAPI_GPU_KERNEL(GGPUAdd, cv::gapi::core::GAdd)
+{
+    static void run(const cv::UMat& a, const cv::UMat& b, int dtype, cv::UMat& out)
+    {
+        cv::add(a, b, out, cv::noArray(), dtype);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUAddC, cv::gapi::core::GAddC)
+{
+    static void run(const cv::UMat& a, const cv::Scalar& b, int dtype, cv::UMat& out)
+    {
+        cv::add(a, b, out, cv::noArray(), dtype);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUSub, cv::gapi::core::GSub)
+{
+    static void run(const cv::UMat& a, const cv::UMat& b, int dtype, cv::UMat& out)
+    {
+        cv::subtract(a, b, out, cv::noArray(), dtype);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUSubC, cv::gapi::core::GSubC)
+{
+    static void run(const cv::UMat& a, const cv::Scalar& b, int dtype, cv::UMat& out)
+    {
+        cv::subtract(a, b, out, cv::noArray(), dtype);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUSubRC, cv::gapi::core::GSubRC)
+{
+    static void run(const cv::Scalar& a, const cv::UMat& b, int dtype, cv::UMat& out)
+    {
+        cv::subtract(a, b, out, cv::noArray(), dtype);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUMul, cv::gapi::core::GMul)
+{
+    static void run(const cv::UMat& a, const cv::UMat& b, double scale, int dtype, cv::UMat& out)
+    {
+        cv::multiply(a, b, out, scale, dtype);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUMulCOld, cv::gapi::core::GMulCOld)
+{
+    static void run(const cv::UMat& a, double b, int dtype, cv::UMat& out)
+    {
+        cv::multiply(a, b, out, 1, dtype);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUMulC, cv::gapi::core::GMulC)
+{
+    static void run(const cv::UMat& a, const cv::Scalar& b, int dtype, cv::UMat& out)
+    {
+        cv::multiply(a, b, out, 1, dtype);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUDiv, cv::gapi::core::GDiv)
+{
+    static void run(const cv::UMat& a, const cv::UMat& b, double scale, int dtype, cv::UMat& out)
+    {
+        cv::divide(a, b, out, scale, dtype);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUDivC, cv::gapi::core::GDivC)
+{
+    static void run(const cv::UMat& a, const cv::Scalar& b, double scale, int dtype, cv::UMat& out)
+    {
+        cv::divide(a, b, out, scale, dtype);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUDivRC, cv::gapi::core::GDivRC)
+{
+    static void run(const cv::Scalar& a, const cv::UMat& b, double scale, int dtype, cv::UMat& out)
+    {
+        cv::divide(a, b, out, scale, dtype);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUMask, cv::gapi::core::GMask)
+{
+    static void run(const cv::UMat& in, const cv::UMat& mask, cv::UMat& out)
+    {
+        out = cv::UMat::zeros(in.size(), in.type());
+        in.copyTo(out, mask);
+    }
+};
+
+
+GAPI_GPU_KERNEL(GGPUMean, cv::gapi::core::GMean)
+{
+    static void run(const cv::UMat& in, cv::Scalar& out)
+    {
+        out = cv::mean(in);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUPolarToCart, cv::gapi::core::GPolarToCart)
+{
+    static void run(const cv::UMat& magn, const cv::UMat& angle, bool angleInDegrees, cv::UMat& outx, cv::UMat& outy)
+    {
+        cv::polarToCart(magn, angle, outx, outy, angleInDegrees);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCartToPolar, cv::gapi::core::GCartToPolar)
+{
+    static void run(const cv::UMat& x, const cv::UMat& y, bool angleInDegrees, cv::UMat& outmagn, cv::UMat& outangle)
+    {
+        cv::cartToPolar(x, y, outmagn, outangle, angleInDegrees);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCmpGT, cv::gapi::core::GCmpGT)
+{
+    static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_GT);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCmpGE, cv::gapi::core::GCmpGE)
+{
+    static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_GE);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCmpLE, cv::gapi::core::GCmpLE)
+{
+    static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_LE);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCmpLT, cv::gapi::core::GCmpLT)
+{
+    static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_LT);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCmpEQ, cv::gapi::core::GCmpEQ)
+{
+    static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_EQ);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCmpNE, cv::gapi::core::GCmpNE)
+{
+    static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_NE);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCmpGTScalar, cv::gapi::core::GCmpGTScalar)
+{
+    static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_GT);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCmpGEScalar, cv::gapi::core::GCmpGEScalar)
+{
+    static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_GE);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCmpLEScalar, cv::gapi::core::GCmpLEScalar)
+{
+    static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_LE);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCmpLTScalar, cv::gapi::core::GCmpLTScalar)
+{
+    static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_LT);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCmpEQScalar, cv::gapi::core::GCmpEQScalar)
+{
+    static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_EQ);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCmpNEScalar, cv::gapi::core::GCmpNEScalar)
+{
+    static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out)
+    {
+        cv::compare(a, b, out, cv::CMP_NE);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUAnd, cv::gapi::core::GAnd)
+{
+    static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out)
+    {
+        cv::bitwise_and(a, b, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUAndS, cv::gapi::core::GAndS)
+{
+    static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out)
+    {
+        cv::bitwise_and(a, b, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUOr, cv::gapi::core::GOr)
+{
+    static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out)
+    {
+        cv::bitwise_or(a, b, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUOrS, cv::gapi::core::GOrS)
+{
+    static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out)
+    {
+        cv::bitwise_or(a, b, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUXor, cv::gapi::core::GXor)
+{
+    static void run(const cv::UMat& a, const cv::UMat& b, cv::UMat& out)
+    {
+        cv::bitwise_xor(a, b, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUXorS, cv::gapi::core::GXorS)
+{
+    static void run(const cv::UMat& a, const cv::Scalar& b, cv::UMat& out)
+    {
+        cv::bitwise_xor(a, b, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUNot, cv::gapi::core::GNot)
+{
+    static void run(const cv::UMat& a, cv::UMat& out)
+    {
+        cv::bitwise_not(a, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUSelect, cv::gapi::core::GSelect)
+{
+    static void run(const cv::UMat& src1, const cv::UMat& src2, const cv::UMat& mask, cv::UMat& out)
+    {
+        src2.copyTo(out);
+        src1.copyTo(out, mask);
+    }
+};
+
+////TODO: doesn't compile with UMat
+//GAPI_GPU_KERNEL(GGPUMin, cv::gapi::core::GMin)
+//{
+//    static void run(const cv::UMat& in1, const cv::UMat& in2, cv::UMat& out)
+//    {
+//        out = cv::min(in1, in2);
+//    }
+//};
+//
+////TODO: doesn't compile with UMat
+//GAPI_GPU_KERNEL(GGPUMax, cv::gapi::core::GMax)
+//{
+//    static void run(const cv::UMat& in1, const cv::UMat& in2, cv::UMat& out)
+//    {
+//        out = cv::max(in1, in2);
+//    }
+//};
+
+
+GAPI_GPU_KERNEL(GGPUAbsDiff, cv::gapi::core::GAbsDiff)
+{
+    static void run(const cv::UMat& in1, const cv::UMat& in2, cv::UMat& out)
+    {
+        cv::absdiff(in1, in2, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUAbsDiffC, cv::gapi::core::GAbsDiffC)
+{
+    static void run(const cv::UMat& in1, const cv::Scalar& in2, cv::UMat& out)
+    {
+        cv::absdiff(in1, in2, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUSum, cv::gapi::core::GSum)
+{
+    static void run(const cv::UMat& in, cv::Scalar& out)
+    {
+        out = cv::sum(in);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUAddW, cv::gapi::core::GAddW)
+{
+    static void run(const cv::UMat& in1, double alpha, const cv::UMat& in2, double beta, double gamma, int dtype, cv::UMat& out)
+    {
+        cv::addWeighted(in1, alpha, in2, beta, gamma, out, dtype);
+    }
+};
+
+
+GAPI_GPU_KERNEL(GGPUNormL1, cv::gapi::core::GNormL1)
+{
+    static void run(const cv::UMat& in, cv::Scalar& out)
+    {
+        out = cv::norm(in, cv::NORM_L1);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUNormL2, cv::gapi::core::GNormL2)
+{
+    static void run(const cv::UMat& in, cv::Scalar& out)
+    {
+        out = cv::norm(in, cv::NORM_L2);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUNormInf, cv::gapi::core::GNormInf)
+{
+    static void run(const cv::UMat& in, cv::Scalar& out)
+    {
+        out = cv::norm(in, cv::NORM_INF);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUIntegral, cv::gapi::core::GIntegral)
+{
+    static void run(const cv::UMat& in, int sdepth, int sqdepth, cv::UMat& out, cv::UMat& outSq)
+    {
+        cv::integral(in, out, outSq, sdepth, sqdepth);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUThreshold, cv::gapi::core::GThreshold)
+{
+    static void run(const cv::UMat& in, const cv::Scalar& a, const cv::Scalar& b, int type, cv::UMat& out)
+    {
+        cv::threshold(in, out, a.val[0], b.val[0], type);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUThresholdOT, cv::gapi::core::GThresholdOT)
+{
+    static void run(const cv::UMat& in, const cv::Scalar& b, int type, cv::UMat& out, cv::Scalar& outScalar)
+    {
+        outScalar = cv::threshold(in, out, b.val[0], b.val[0], type);
+    }
+};
+
+
+GAPI_GPU_KERNEL(GGPUInRange, cv::gapi::core::GInRange)
+{
+    static void run(const cv::UMat& in, const cv::Scalar& low, const cv::Scalar& up, cv::UMat& out)
+    {
+        cv::inRange(in, low, up, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUSplit3, cv::gapi::core::GSplit3)
+{
+    static void run(const cv::UMat& in, cv::UMat &m1, cv::UMat &m2, cv::UMat &m3)
+    {
+        std::vector<cv::UMat> outMats = {m1, m2, m3};
+        cv::split(in, outMats);
+
+        // Write back FIXME: Write a helper or avoid this nonsense completely!
+        m1 = outMats[0];
+        m2 = outMats[1];
+        m3 = outMats[2];
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUSplit4, cv::gapi::core::GSplit4)
+{
+    static void run(const cv::UMat& in, cv::UMat &m1, cv::UMat &m2, cv::UMat &m3, cv::UMat &m4)
+    {
+        std::vector<cv::UMat> outMats = {m1, m2, m3, m4};
+        cv::split(in, outMats);
+
+        // Write back FIXME: Write a helper or avoid this nonsense completely!
+        m1 = outMats[0];
+        m2 = outMats[1];
+        m3 = outMats[2];
+        m4 = outMats[3];
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUMerge3, cv::gapi::core::GMerge3)
+{
+    static void run(const cv::UMat& in1, const cv::UMat& in2, const cv::UMat& in3, cv::UMat &out)
+    {
+        std::vector<cv::UMat> inMats = {in1, in2, in3};
+        cv::merge(inMats, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUMerge4, cv::gapi::core::GMerge4)
+{
+    static void run(const cv::UMat& in1, const cv::UMat& in2, const cv::UMat& in3, const cv::UMat& in4, cv::UMat &out)
+    {
+        std::vector<cv::UMat> inMats = {in1, in2, in3, in4};
+        cv::merge(inMats, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUResize, cv::gapi::core::GResize)
+{
+    static void run(const cv::UMat& in, cv::Size sz, double fx, double fy, int interp, cv::UMat &out)
+    {
+        cv::resize(in, out, sz, fx, fy, interp);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPURemap, cv::gapi::core::GRemap)
+{
+    static void run(const cv::UMat& in, const cv::Mat& x, const cv::Mat& y, int a, int b, cv::Scalar s, cv::UMat& out)
+    {
+        cv::remap(in, out, x, y, a, b, s);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUFlip, cv::gapi::core::GFlip)
+{
+    static void run(const cv::UMat& in, int code, cv::UMat& out)
+    {
+        cv::flip(in, out, code);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCrop, cv::gapi::core::GCrop)
+{
+    static void run(const cv::UMat& in, cv::Rect rect, cv::UMat& out)
+    {
+        cv::UMat(in, rect).copyTo(out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUConcatHor, cv::gapi::core::GConcatHor)
+{
+    static void run(const cv::UMat& in1, const cv::UMat& in2, cv::UMat& out)
+    {
+        cv::hconcat(in1, in2, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUConcatVert, cv::gapi::core::GConcatVert)
+{
+    static void run(const cv::UMat& in1, const cv::UMat& in2, cv::UMat& out)
+    {
+        cv::vconcat(in1, in2, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPULUT, cv::gapi::core::GLUT)
+{
+    static void run(const cv::UMat& in, const cv::Mat& lut, cv::UMat& out)
+    {
+        cv::LUT(in, lut, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUConvertTo, cv::gapi::core::GConvertTo)
+{
+    static void run(const cv::UMat& in, int rtype, double alpha, double beta, cv::UMat& out)
+    {
+        in.convertTo(out, rtype, alpha, beta);
+    }
+};
+
+cv::gapi::GKernelPackage cv::gapi::core::gpu::kernels()
+{
+    static auto pkg = cv::gapi::kernels
+        <  GGPUAdd
+         , GGPUAddC
+         , GGPUSub
+         , GGPUSubC
+         , GGPUSubRC
+         , GGPUMul
+         , GGPUMulC
+         , GGPUMulCOld
+         , GGPUDiv
+         , GGPUDivC
+         , GGPUDivRC
+         , GGPUMean
+         , GGPUMask
+         , GGPUPolarToCart
+         , GGPUCartToPolar
+         , GGPUCmpGT
+         , GGPUCmpGE
+         , GGPUCmpLE
+         , GGPUCmpLT
+         , GGPUCmpEQ
+         , GGPUCmpNE
+         , GGPUCmpGTScalar
+         , GGPUCmpGEScalar
+         , GGPUCmpLEScalar
+         , GGPUCmpLTScalar
+         , GGPUCmpEQScalar
+         , GGPUCmpNEScalar
+         , GGPUAnd
+         , GGPUAndS
+         , GGPUOr
+         , GGPUOrS
+         , GGPUXor
+         , GGPUXorS
+         , GGPUNot
+         , GGPUSelect
+         //, GGPUMin
+         //, GGPUMax
+         , GGPUAbsDiff
+         , GGPUAbsDiffC
+         , GGPUSum
+         , GGPUAddW
+         , GGPUNormL1
+         , GGPUNormL2
+         , GGPUNormInf
+         , GGPUIntegral
+         , GGPUThreshold
+         , GGPUThresholdOT
+         , GGPUInRange
+         , GGPUSplit3
+         , GGPUSplit4
+         , GGPUResize
+         , GGPUMerge3
+         , GGPUMerge4
+         , GGPURemap
+         , GGPUFlip
+         , GGPUCrop
+         , GGPUConcatHor
+         , GGPUConcatVert
+         , GGPULUT
+         , GGPUConvertTo
+         >();
+    return pkg;
+}
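
The package above is a drop-in replacement for the CPU one. A sketch (assuming the default CPU packages from this snapshot) of preferring GPU implementations where both exist, using the same combine()/unite_policy mechanics GCompiler applies to user packages later in this patch:

    #include "opencv2/gapi/cpu/core.hpp" // cv::gapi::core::cpu::kernels()
    #include "opencv2/gapi/gpu/core.hpp" // cv::gapi::core::gpu::kernels()

    // With REPLACE, entries of the second package win on conflicting kernel ids
    auto gpu_over_cpu = cv::gapi::combine(cv::gapi::core::cpu::kernels(),
                                          cv::gapi::core::gpu::kernels(),
                                          cv::unite_policy::REPLACE);
    // Pass it on: graph.apply(..., cv::compile_args(gpu_over_cpu));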
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpucore.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpucore.hpp
new file mode 100644 (file)
index 0000000..47cbfa6
--- /dev/null
@@ -0,0 +1,24 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GGPUCORE_HPP
+#define OPENCV_GAPI_GGPUCORE_HPP
+
+#include <map>
+#include <string>
+
+#include "opencv2/gapi/gpu/ggpukernel.hpp"
+
+namespace cv { namespace gimpl {
+
+// NB: This is what a "Kernel Package" from the original Wiki doc should be.
+void loadGPUCore(std::map<std::string, cv::GGPUKernel> &kmap);
+
+}
+}
+
+#endif // OPENCV_GAPI_GGPUCORE_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpuimgproc.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpuimgproc.cpp
new file mode 100644 (file)
index 0000000..9b7aca1
--- /dev/null
@@ -0,0 +1,277 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include "opencv2/gapi/imgproc.hpp"
+#include "opencv2/gapi/gpu/imgproc.hpp"
+#include "backends/gpu/ggpuimgproc.hpp"
+
+
+GAPI_GPU_KERNEL(GGPUSepFilter, cv::gapi::imgproc::GSepFilter)
+{
+    static void run(const cv::UMat& in, int ddepth, const cv::Mat& kernX, const cv::Mat& kernY, const cv::Point& anchor, const cv::Scalar& delta,
+                    int border, const cv::Scalar& bordVal, cv::UMat &out)
+    {
+        if( border == cv::BORDER_CONSTANT )
+        {
+            cv::UMat temp_in;
+            int width_add = (kernY.cols - 1) / 2;
+            int height_add =  (kernX.rows - 1) / 2;
+            cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, border, bordVal);
+            cv::Rect rect = cv::Rect(width_add, height_add, in.cols, in.rows); // cv::Rect takes the x (column) offset first
+            cv::sepFilter2D(temp_in(rect), out, ddepth, kernX, kernY, anchor, delta.val[0], border);
+        }
+        else
+            cv::sepFilter2D(in, out, ddepth, kernX, kernY, anchor, delta.val[0], border);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUBoxFilter, cv::gapi::imgproc::GBoxFilter)
+{
+    static void run(const cv::UMat& in, int ddepth, const cv::Size& ksize, const cv::Point& anchor, bool normalize, int borderType, const cv::Scalar& bordVal, cv::UMat &out)
+    {
+        if( borderType == cv::BORDER_CONSTANT )
+        {
+            cv::UMat temp_in;
+            int width_add = (ksize.width - 1) / 2;
+            int height_add =  (ksize.height - 1) / 2;
+            cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, borderType, bordVal);
+            cv::Rect rect = cv::Rect(width_add, height_add, in.cols, in.rows);
+            cv::boxFilter(temp_in(rect), out, ddepth, ksize, anchor, normalize, borderType);
+        }
+        else
+            cv::boxFilter(in, out, ddepth, ksize, anchor, normalize, borderType);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUBlur, cv::gapi::imgproc::GBlur)
+{
+    static void run(const cv::UMat& in, const cv::Size& ksize, const cv::Point& anchor, int borderType, const cv::Scalar& bordVal, cv::UMat &out)
+    {
+        if( borderType == cv::BORDER_CONSTANT )
+        {
+            cv::UMat temp_in;
+            int width_add = (ksize.width - 1) / 2;
+            int height_add =  (ksize.height - 1) / 2;
+            cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, borderType, bordVal);
+            cv::Rect rect = cv::Rect(width_add, height_add, in.cols, in.rows);
+            cv::blur(temp_in(rect), out, ksize, anchor, borderType);
+        }
+        else
+            cv::blur(in, out, ksize, anchor, borderType);
+    }
+};
+
+
+GAPI_GPU_KERNEL(GGPUFilter2D, cv::gapi::imgproc::GFilter2D)
+{
+    static void run(const cv::UMat& in, int ddepth, const cv::Mat& k, const cv::Point& anchor, const cv::Scalar& delta, int border,
+                    const cv::Scalar& bordVal, cv::UMat &out)
+    {
+        if( border == cv::BORDER_CONSTANT )
+        {
+            cv::UMat temp_in;
+            int width_add = (k.cols - 1) / 2;
+            int height_add =  (k.rows - 1) / 2;
+            cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, border, bordVal );
+            cv::Rect rect = cv::Rect(width_add, height_add, in.cols, in.rows);
+            cv::filter2D(temp_in(rect), out, ddepth, k, anchor, delta.val[0], border);
+        }
+        else
+            cv::filter2D(in, out, ddepth, k, anchor, delta.val[0], border);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUGaussBlur, cv::gapi::imgproc::GGaussBlur)
+{
+    static void run(const cv::UMat& in, const cv::Size& ksize, double sigmaX, double sigmaY, int borderType, const cv::Scalar& bordVal, cv::UMat &out)
+    {
+        if( borderType == cv::BORDER_CONSTANT )
+        {
+            cv::UMat temp_in;
+            int width_add = (ksize.width - 1) / 2;
+            int height_add =  (ksize.height - 1) / 2;
+            cv::copyMakeBorder(in, temp_in, height_add, height_add, width_add, width_add, borderType, bordVal );
+            cv::Rect rect = cv::Rect(width_add, height_add, in.cols, in.rows);
+            cv::GaussianBlur(temp_in(rect), out, ksize, sigmaX, sigmaY, borderType);
+        }
+        else
+            cv::GaussianBlur(in, out, ksize, sigmaX, sigmaY, borderType);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUMedianBlur, cv::gapi::imgproc::GMedianBlur)
+{
+    static void run(const cv::UMat& in, int ksize, cv::UMat &out)
+    {
+        cv::medianBlur(in, out, ksize);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUErode, cv::gapi::imgproc::GErode)
+{
+    static void run(const cv::UMat& in, const cv::Mat& kernel, const cv::Point& anchor, int iterations, int borderType, const cv::Scalar& borderValue, cv::UMat &out)
+    {
+        cv::erode(in, out, kernel, anchor, iterations, borderType, borderValue);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUDilate, cv::gapi::imgproc::GDilate)
+{
+    static void run(const cv::UMat& in, const cv::Mat& kernel, const cv::Point& anchor, int iterations, int borderType, const cv::Scalar& borderValue, cv::UMat &out)
+    {
+        cv::dilate(in, out, kernel, anchor, iterations, borderType, borderValue);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUSobel, cv::gapi::imgproc::GSobel)
+{
+    static void run(const cv::UMat& in, int ddepth, int dx, int dy, int ksize, double scale, double delta, int borderType,
+                    const cv::Scalar& bordVal, cv::UMat &out)
+    {
+        if( borderType == cv::BORDER_CONSTANT )
+        {
+            cv::UMat temp_in;
+            int add = (ksize - 1) / 2;
+            cv::copyMakeBorder(in, temp_in, add, add, add, add, borderType, bordVal );
+            cv::Rect rect = cv::Rect(add, add, in.cols, in.rows);
+            cv::Sobel(temp_in(rect), out, ddepth, dx, dy, ksize, scale, delta, borderType);
+        }
+        else
+            cv::Sobel(in, out, ddepth, dx, dy, ksize, scale, delta, borderType);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUEqualizeHist, cv::gapi::imgproc::GEqHist)
+{
+    static void run(const cv::UMat& in, cv::UMat &out)
+    {
+        cv::equalizeHist(in, out);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUCanny, cv::gapi::imgproc::GCanny)
+{
+    static void run(const cv::UMat& in, double thr1, double thr2, int apSize, bool l2gradient, cv::UMat &out)
+    {
+        cv::Canny(in, out, thr1, thr2, apSize, l2gradient);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPURGB2YUV, cv::gapi::imgproc::GRGB2YUV)
+{
+    static void run(const cv::UMat& in, cv::UMat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_RGB2YUV);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUYUV2RGB, cv::gapi::imgproc::GYUV2RGB)
+{
+    static void run(const cv::UMat& in, cv::UMat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_YUV2RGB);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPURGB2Lab, cv::gapi::imgproc::GRGB2Lab)
+{
+    static void run(const cv::UMat& in, cv::UMat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_RGB2Lab);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUBGR2LUV, cv::gapi::imgproc::GBGR2LUV)
+{
+    static void run(const cv::UMat& in, cv::UMat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_BGR2Luv);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUBGR2YUV, cv::gapi::imgproc::GBGR2YUV)
+{
+    static void run(const cv::UMat& in, cv::UMat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_BGR2YUV);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPULUV2BGR, cv::gapi::imgproc::GLUV2BGR)
+{
+    static void run(const cv::UMat& in, cv::UMat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_Luv2BGR);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUYUV2BGR, cv::gapi::imgproc::GYUV2BGR)
+{
+    static void run(const cv::UMat& in, cv::UMat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_YUV2BGR);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPURGB2Gray, cv::gapi::imgproc::GRGB2Gray)
+{
+    static void run(const cv::UMat& in, cv::UMat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_RGB2GRAY);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPUBGR2Gray, cv::gapi::imgproc::GBGR2Gray)
+{
+    static void run(const cv::UMat& in, cv::UMat &out)
+    {
+        cv::cvtColor(in, out, cv::COLOR_BGR2GRAY);
+    }
+};
+
+GAPI_GPU_KERNEL(GGPURGB2GrayCustom, cv::gapi::imgproc::GRGB2GrayCustom)
+{
+    //TODO: avoid copy
+    static void run(const cv::UMat& in, float rY, float bY, float gY, cv::UMat &out)
+    {
+        cv::Mat planes[3];
+        cv::split(in.getMat(cv::ACCESS_READ), planes);
+        cv::Mat tmp_out = (planes[0]*rY + planes[1]*bY + planes[2]*gY);
+        tmp_out.copyTo(out);
+    }
+};
+
+
+cv::gapi::GKernelPackage cv::gapi::imgproc::gpu::kernels()
+{
+    static auto pkg = cv::gapi::kernels
+        < GGPUFilter2D
+        , GGPUSepFilter
+        , GGPUBoxFilter
+        , GGPUBlur
+        , GGPUGaussBlur
+        , GGPUMedianBlur
+        , GGPUErode
+        , GGPUDilate
+        , GGPUSobel
+        , GGPUCanny
+        , GGPUEqualizeHist
+        , GGPURGB2YUV
+        , GGPUYUV2RGB
+        , GGPURGB2Lab
+        , GGPUBGR2LUV
+        , GGPUBGR2YUV
+        , GGPUYUV2BGR
+        , GGPULUV2BGR
+        , GGPUBGR2Gray
+        , GGPURGB2Gray
+        , GGPURGB2GrayCustom
+        >();
+    return pkg;
+}
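
Core and imgproc GPU packages are typically merged into one, mirroring getKernelPackage() in gcompiler.cpp below (a sketch, not part of this change):

    #include "opencv2/gapi/gpu/core.hpp"
    #include "opencv2/gapi/gpu/imgproc.hpp"

    // KEEP retains entries from both packages (their kernel ids do not overlap)
    auto gpu_pkg = cv::gapi::combine(cv::gapi::core::gpu::kernels(),
                                     cv::gapi::imgproc::gpu::kernels(),
                                     cv::unite_policy::KEEP);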
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpuimgproc.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpuimgproc.hpp
new file mode 100644 (file)
index 0000000..cd2e324
--- /dev/null
@@ -0,0 +1,23 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GGPUIMGPROC_HPP
+#define OPENCV_GAPI_GGPUIMGPROC_HPP
+
+#include <map>
+#include <string>
+
+#include "opencv2/gapi/gpu/ggpukernel.hpp"
+
+namespace cv { namespace gimpl {
+
+// NB: This is what a "Kernel Package" from the original Wiki doc should be.
+void loadGPUImgProc(std::map<std::string, cv::GGPUKernel> &kmap);
+
+}}
+
+#endif // OPENCV_GAPI_GGPUIMGPROC_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpukernel.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/backends/gpu/ggpukernel.cpp
new file mode 100644 (file)
index 0000000..87e2aa9
--- /dev/null
@@ -0,0 +1,50 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include <cassert>
+
+#include "opencv2/gapi/gpu/ggpukernel.hpp"
+
+const cv::UMat& cv::GGPUContext::inMat(int input)
+{
+    return (inArg<cv::UMat>(input));
+}
+
+cv::UMat& cv::GGPUContext::outMatR(int output)
+{
+    return (*(util::get<cv::UMat*>(m_results.at(output))));
+}
+
+const cv::gapi::own::Scalar& cv::GGPUContext::inVal(int input)
+{
+    return inArg<cv::gapi::own::Scalar>(input);
+}
+
+cv::gapi::own::Scalar& cv::GGPUContext::outValR(int output)
+{
+    return *util::get<cv::gapi::own::Scalar*>(m_results.at(output));
+}
+
+cv::detail::VectorRef& cv::GGPUContext::outVecRef(int output)
+{
+    return util::get<cv::detail::VectorRef>(m_results.at(output));
+}
+
+cv::GGPUKernel::GGPUKernel()
+{
+}
+
+cv::GGPUKernel::GGPUKernel(const GGPUKernel::F &f)
+    : m_f(f)
+{
+}
+
+void cv::GGPUKernel::apply(GGPUContext &ctx)
+{
+    CV_Assert(m_f);
+    m_f(ctx);
+}
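
GGPUContext unpacks arguments so that a kernel's static run() sees plain cv::UMat / cv::Scalar values. Defining a GPU implementation for a new operation follows the same pattern as the wrappers above; a sketch, where GMyInvert and its id string are hypothetical names:

    #include "opencv2/gapi/gkernel.hpp"
    #include "opencv2/gapi/gpu/ggpukernel.hpp"

    // Hypothetical operation: declares the graph-level interface
    G_TYPED_KERNEL(GMyInvert, <cv::GMat(cv::GMat)>, "sample.custom.invert")
    {
        static cv::GMatDesc outMeta(cv::GMatDesc in) { return in; }
    };

    // Its GPU implementation: run() receives cv::UMat, so cv::bitwise_not
    // dispatches through OpenCV's transparent OpenCL path where available
    GAPI_GPU_KERNEL(GGPUMyInvert, GMyInvert)
    {
        static void run(const cv::UMat& in, cv::UMat& out)
        {
            cv::bitwise_not(in, out);
        }
    };
    // Shipped to the compiler via cv::gapi::kernels<GGPUMyInvert>()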
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/README.md b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/README.md
new file mode 100644 (file)
index 0000000..995aa39
--- /dev/null
@@ -0,0 +1 @@
+This directory contains G-API graph compiler logic.
\ No newline at end of file
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiled.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiled.cpp
new file mode 100644 (file)
index 0000000..876575d
--- /dev/null
@@ -0,0 +1,157 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <ade/graph.hpp>
+
+#include "opencv2/gapi/gproto.hpp" // descr_of
+#include "opencv2/gapi/gcompiled.hpp"
+
+#include "compiler/gcompiled_priv.hpp"
+#include "backends/common/gbackend.hpp"
+
+// GCompiled private implementation ////////////////////////////////////////////
+void cv::GCompiled::Priv::setup(const GMetaArgs &_metaArgs,
+                                const GMetaArgs &_outMetas,
+                                std::unique_ptr<cv::gimpl::GExecutor> &&_pE)
+{
+    m_metas    = _metaArgs;
+    m_outMetas = _outMetas;
+    m_exec     = std::move(_pE);
+}
+
+bool cv::GCompiled::Priv::isEmpty() const
+{
+    return !m_exec;
+}
+
+void cv::GCompiled::Priv::run(cv::gimpl::GRuntimeArgs &&args)
+{
+    // Strip away types since ADE knows nothing about them;
+    // args will be consumed by the specific GBackendExecutables
+    checkArgs(args);
+    m_exec->run(std::move(args));
+}
+
+const cv::GMetaArgs& cv::GCompiled::Priv::metas() const
+{
+    return m_metas;
+}
+
+const cv::GMetaArgs& cv::GCompiled::Priv::outMetas() const
+{
+    return m_outMetas;
+}
+
+void cv::GCompiled::Priv::checkArgs(const cv::gimpl::GRuntimeArgs &args) const
+{
+    const auto runtime_metas = descr_of(args.inObjs);
+    if (runtime_metas != m_metas)
+    {
+        util::throw_error(std::logic_error("This object was compiled "
+                                           "for different metadata!"));
+        // FIXME: Add details on what is actually wrong
+    }
+}
+
+bool cv::GCompiled::Priv::canReshape() const
+{
+    GAPI_Assert(m_exec);
+    return m_exec->canReshape();
+}
+
+void cv::GCompiled::Priv::reshape(const GMetaArgs& inMetas, const GCompileArgs& args)
+{
+    GAPI_Assert(m_exec);
+    m_exec->reshape(inMetas, args);
+    m_metas = inMetas;
+}
+
+const cv::gimpl::GModel::Graph& cv::GCompiled::Priv::model() const
+{
+    GAPI_Assert(nullptr != m_exec);
+    return m_exec->model();
+}
+
+// GCompiled public implementation /////////////////////////////////////////////
+cv::GCompiled::GCompiled()
+    : m_priv(new Priv())
+{
+}
+
+cv::GCompiled::operator bool() const
+{
+    return !m_priv->isEmpty();
+}
+
+void cv::GCompiled::operator() (GRunArgs &&ins, GRunArgsP &&outs)
+{
+    // FIXME: Check that <outs> matches the protocol
+    m_priv->run(cv::gimpl::GRuntimeArgs{std::move(ins),std::move(outs)});
+}
+
+#if !defined(GAPI_STANDALONE)
+void cv::GCompiled::operator ()(cv::Mat in, cv::Mat &out)
+{
+    (*this)(cv::gin(in), cv::gout(out));
+}
+
+void cv::GCompiled::operator() (cv::Mat in, cv::Scalar &out)
+{
+    (*this)(cv::gin(in), cv::gout(out));
+}
+
+void cv::GCompiled::operator() (cv::Mat in1, cv::Mat in2, cv::Mat &out)
+{
+    (*this)(cv::gin(in1, in2), cv::gout(out));
+}
+
+void cv::GCompiled::operator() (cv::Mat in1, cv::Mat in2, cv::Scalar &out)
+{
+    (*this)(cv::gin(in1, in2), cv::gout(out));
+}
+
+void cv::GCompiled::operator ()(const std::vector<cv::Mat> &ins,
+                                const std::vector<cv::Mat> &outs)
+{
+    GRunArgs call_ins;
+    GRunArgsP call_outs;
+
+    // Make a temporary copy of vector outs - cv::Mat copies are shallow anyway
+    auto tmp = outs;
+    for (const cv::Mat &m : ins) { call_ins.emplace_back(m);   }
+    for (      cv::Mat &m : tmp) { call_outs.emplace_back(&m); }
+
+    (*this)(std::move(call_ins), std::move(call_outs));
+}
+#endif // !defined(GAPI_STANDALONE)
+
+const cv::GMetaArgs& cv::GCompiled::metas() const
+{
+    return m_priv->metas();
+}
+
+const cv::GMetaArgs& cv::GCompiled::outMetas() const
+{
+    return m_priv->outMetas();
+}
+
+cv::GCompiled::Priv& cv::GCompiled::priv()
+{
+    return *m_priv;
+}
+
+bool cv::GCompiled::canReshape() const
+{
+    return m_priv->canReshape();
+}
+
+void cv::GCompiled::reshape(const GMetaArgs& inMetas, const GCompileArgs& args)
+{
+    m_priv->reshape(inMetas, args);
+}
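
The point of GCompiled is compile-once/run-many. A sketch (assuming GComputation::compile() accepts metadata followed by compile arguments, as exercised in the G-API tests):

    cv::GMat in;
    cv::GComputation graph(in, cv::gapi::bitwise_not(in));

    cv::Mat frame = cv::Mat::zeros(480, 640, CV_8UC3), result;
    // Compile once for this input format...
    auto compiled = graph.compile(cv::descr_of(frame),
                                  cv::compile_args(cv::gapi::core::gpu::kernels()));
    // ...then call repeatedly; checkArgs() re-validates metadata on every run
    for (int i = 0; i < 100; i++)
        compiled(frame, result);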
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiled_priv.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiled_priv.hpp
new file mode 100644 (file)
index 0000000..e616b2b
--- /dev/null
@@ -0,0 +1,61 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCOMPILED_PRIV_HPP
+#define OPENCV_GAPI_GCOMPILED_PRIV_HPP
+
+#include <memory> // unique_ptr
+
+#include "opencv2/gapi/util/optional.hpp"
+#include "compiler/gmodel.hpp"
+#include "executor/gexecutor.hpp"
+
+// NB: BTW, GCompiled is the only "public API" class whose
+// private part (implementation) is hosted in the "compiler/" module.
+//
+// This file is here just to keep ADE hidden from the top-level APIs.
+//
+// As the thing becomes more complex, the appropriate API and implementation
+// parts will be moved to the api/ and compiler/ modules, respectively.
+
+namespace cv {
+
+namespace gimpl
+{
+    struct GRuntimeArgs;
+} // namespace gimpl
+
+// FIXME: GAPI_EXPORTS is here only due to tests and Windows linker issues
+class GAPI_EXPORTS GCompiled::Priv
+{
+    // NB: For now, a GCompiled keeps the original ade::Graph alive.
+    // If we want to go autonomous, we might need to do something about this.
+    GMetaArgs  m_metas;    // passed by user
+    GMetaArgs  m_outMetas; // inferred by compiler
+    std::unique_ptr<cv::gimpl::GExecutor> m_exec;
+
+    void checkArgs(const cv::gimpl::GRuntimeArgs &args) const;
+
+public:
+    void setup(const GMetaArgs &metaArgs,
+               const GMetaArgs &outMetas,
+               std::unique_ptr<cv::gimpl::GExecutor> &&pE);
+    bool isEmpty() const;
+
+    bool canReshape() const;
+    void reshape(const GMetaArgs& inMetas, const GCompileArgs &args);
+
+    void run(cv::gimpl::GRuntimeArgs &&args);
+    const GMetaArgs& metas() const;
+    const GMetaArgs& outMetas() const;
+
+    const cv::gimpl::GModel::Graph& model() const;
+};
+
+}
+
+#endif // OPENCV_GAPI_GCOMPILED_PRIV_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiler.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiler.cpp
new file mode 100644 (file)
index 0000000..32ce8e3
--- /dev/null
@@ -0,0 +1,281 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <vector>
+#include <stack>
+#include <unordered_map>
+
+#include <ade/util/algorithm.hpp>      // any_of
+#include <ade/util/zip_range.hpp>      // zip_range, indexed
+
+#include <ade/graph.hpp>
+#include <ade/passes/check_cycles.hpp>
+
+#include "api/gcomputation_priv.hpp"
+#include "api/gnode_priv.hpp"   // FIXME: why it is here?
+#include "api/gproto_priv.hpp"  // FIXME: why it is here?
+#include "api/gcall_priv.hpp"   // FIXME: why it is here?
+#include "api/gapi_priv.hpp"    // FIXME: why it is here?
+#include "api/gbackend_priv.hpp" // Backend basic API (newInstance, etc)
+
+#include "compiler/gmodel.hpp"
+#include "compiler/gmodelbuilder.hpp"
+#include "compiler/gcompiler.hpp"
+#include "compiler/gcompiled_priv.hpp"
+#include "compiler/passes/passes.hpp"
+
+#include "executor/gexecutor.hpp"
+#include "backends/common/gbackend.hpp"
+
+// <FIXME:>
+#if !defined(GAPI_STANDALONE)
+#include "opencv2/gapi/cpu/core.hpp"    // Also directly refer to Core
+#include "opencv2/gapi/cpu/imgproc.hpp" // ...and Imgproc kernel implementations
+#endif // !defined(GAPI_STANDALONE)
+// </FIXME:>
+
+#include "opencv2/gapi/gcompoundkernel.hpp" // compound::backend()
+
+#include "logger.hpp"
+
+namespace
+{
+    cv::gapi::GKernelPackage getKernelPackage(cv::GCompileArgs &args)
+    {
+        static auto ocv_pkg =
+#if !defined(GAPI_STANDALONE)
+            combine(cv::gapi::core::cpu::kernels(),
+                    cv::gapi::imgproc::cpu::kernels(),
+                    cv::unite_policy::KEEP);
+#else
+            cv::gapi::GKernelPackage();
+#endif // !defined(GAPI_STANDALONE)
+        auto user_pkg = cv::gimpl::getCompileArg<cv::gapi::GKernelPackage>(args);
+        return combine(ocv_pkg, user_pkg.value_or(cv::gapi::GKernelPackage{}), cv::unite_policy::REPLACE);
+    }
+
+    cv::util::optional<std::string> getGraphDumpDirectory(cv::GCompileArgs& args)
+    {
+        auto dump_info = cv::gimpl::getCompileArg<cv::graph_dump_path>(args);
+        if (!dump_info.has_value())
+        {
+            const char* path = std::getenv("GRAPH_DUMP_PATH");
+            return path
+                ? cv::util::make_optional(std::string(path))
+                : cv::util::optional<std::string>();
+        }
+        else
+        {
+            return cv::util::make_optional(dump_info.value().m_dump_path);
+        }
+    }
+} // anonymous namespace
+
+
+// GCompiler implementation ////////////////////////////////////////////////////
+
+cv::gimpl::GCompiler::GCompiler(const cv::GComputation &c,
+                                GMetaArgs              &&metas,
+                                GCompileArgs           &&args)
+    : m_c(c), m_metas(std::move(metas)), m_args(std::move(args))
+{
+    using namespace std::placeholders;
+    m_all_kernels       = getKernelPackage(m_args);
+    auto lookup_order   = getCompileArg<gapi::GLookupOrder>(m_args).value_or(gapi::GLookupOrder());
+    auto dump_path      = getGraphDumpDirectory(m_args);
+
+    m_e.addPassStage("init");
+    m_e.addPass("init", "check_cycles",  ade::passes::CheckCycles());
+    m_e.addPass("init", "expand_kernels",  std::bind(passes::expandKernels, _1,
+                                                     m_all_kernels)); // NB: package is copied
+    m_e.addPass("init", "topo_sort",     ade::passes::TopologicalSort());
+    m_e.addPass("init", "init_islands",  passes::initIslands);
+    m_e.addPass("init", "check_islands", passes::checkIslands);
+    // TODO:
+    // - Check basic graph validity (i.e., all inputs are connected)
+    // - Complex dependencies (i.e. parent-child) unrolling
+    // - etc, etc, etc
+
+    // Remove GCompoundBackend to avoid calling setupBackend() with it in the list
+    m_all_kernels.remove(cv::gapi::compound::backend());
+    m_e.addPass("init", "resolve_kernels", std::bind(passes::resolveKernels, _1,
+                                                     std::ref(m_all_kernels), // NB: and not copied here
+                                                     lookup_order));
+
+    m_e.addPass("init", "check_islands_content", passes::checkIslandsContent);
+    m_e.addPassStage("meta");
+    m_e.addPass("meta", "initialize",   std::bind(passes::initMeta, _1, std::ref(m_metas)));
+    m_e.addPass("meta", "propagate",    std::bind(passes::inferMeta, _1, false));
+    m_e.addPass("meta", "finalize",     passes::storeResultingMeta);
+    // moved to another stage, FIXME: two dumps?
+    //    m_e.addPass("meta", "dump_dot",     passes::dumpDotStdout);
+
+    // Special stage for backend-specific transformations
+    // FIXME: document passes hierarchy and order for backend developers
+    m_e.addPassStage("transform");
+
+    m_e.addPassStage("exec");
+    m_e.addPass("exec", "fuse_islands",     passes::fuseIslands);
+    m_e.addPass("exec", "sync_islands",     passes::syncIslandTags);
+
+    if (dump_path.has_value())
+    {
+        m_e.addPass("exec", "dump_dot", std::bind(passes::dumpGraph, _1,
+                                                  dump_path.value()));
+    }
+
+    // Process backends at the last moment (after all G-API passes are added).
+    ade::ExecutionEngineSetupContext ectx(m_e);
+    auto backends = m_all_kernels.backends();
+    for (auto &b : backends)
+    {
+        b.priv().addBackendPasses(ectx);
+    }
+}
+
+void cv::gimpl::GCompiler::validateInputMeta()
+{
+    if (m_metas.size() != m_c.priv().m_ins.size())
+    {
+        util::throw_error(std::logic_error
+                    ("COMPILE: GComputation interface / metadata mismatch! "
+                     "(expected " + std::to_string(m_c.priv().m_ins.size()) + ", "
+                     "got " + std::to_string(m_metas.size()) + " meta arguments)"));
+    }
+
+    const auto meta_matches = [](const GMetaArg &meta, const GProtoArg &proto) {
+        switch (proto.index())
+        {
+        // FIXME: Auto-generate methods like this from traits:
+        case GProtoArg::index_of<cv::GMat>():
+            return util::holds_alternative<cv::GMatDesc>(meta);
+
+        case GProtoArg::index_of<cv::GScalar>():
+            return util::holds_alternative<cv::GScalarDesc>(meta);
+
+        case GProtoArg::index_of<cv::detail::GArrayU>():
+            return util::holds_alternative<cv::GArrayDesc>(meta);
+
+        default:
+            GAPI_Assert(false);
+        }
+        return false; // should never happen
+    };
+
+    for (const auto &meta_arg_idx : ade::util::indexed(ade::util::zip(m_metas, m_c.priv().m_ins)))
+    {
+        const auto &meta  = std::get<0>(ade::util::value(meta_arg_idx));
+        const auto &proto = std::get<1>(ade::util::value(meta_arg_idx));
+
+        if (!meta_matches(meta, proto))
+        {
+            const auto index  = ade::util::index(meta_arg_idx);
+            util::throw_error(std::logic_error
+                        ("GComputation object type / metadata descriptor mismatch "
+                         "(argument " + std::to_string(index) + ")"));
+            // FIXME: report what we've got and what we've expected
+        }
+    }
+    // All checks are ok
+}
+
+void cv::gimpl::GCompiler::validateOutProtoArgs()
+{
+    for (const auto &out_pos : ade::util::indexed(m_c.priv().m_outs))
+    {
+        const auto &node = proto::origin_of(ade::util::value(out_pos)).node;
+        if (node.shape() != cv::GNode::NodeShape::CALL)
+        {
+            auto pos = ade::util::index(out_pos);
+            util::throw_error(std::logic_error
+                        ("Computation output " + std::to_string(pos) +
+                         " is not a result of any operation"));
+        }
+    }
+}
+
+cv::gimpl::GCompiler::GPtr cv::gimpl::GCompiler::generateGraph()
+{
+    validateInputMeta();
+    validateOutProtoArgs();
+
+    // Generate ADE graph from expression-based computation
+    std::unique_ptr<ade::Graph> pG(new ade::Graph);
+    ade::Graph& g = *pG;
+
+    GModel::Graph gm(g);
+    cv::gimpl::GModel::init(gm);
+    cv::gimpl::GModelBuilder builder(g);
+    auto proto_slots = builder.put(m_c.priv().m_ins, m_c.priv().m_outs);
+    GAPI_LOG_INFO(NULL, "Generated graph: " << g.nodes().size() << " nodes" << std::endl);
+
+    // Store Computation's protocol in metadata
+    Protocol p;
+    std::tie(p.inputs, p.outputs, p.in_nhs, p.out_nhs) = proto_slots;
+    gm.metadata().set(p);
+
+    return pG;
+}
+
+void cv::gimpl::GCompiler::runPasses(ade::Graph &g)
+{
+    m_e.runPasses(g);
+    GAPI_LOG_INFO(NULL, "All compiler passes are successful");
+}
+
+void cv::gimpl::GCompiler::compileIslands(ade::Graph &g)
+{
+    GModel::Graph gm(g);
+    std::shared_ptr<ade::Graph> gptr(gm.metadata().get<IslandModel>().model);
+    GIslandModel::Graph gim(*gptr);
+
+    // Run topological sort on GIslandModel first
+    auto pass_ctx = ade::passes::PassContext{*gptr};
+    ade::passes::TopologicalSort{}(pass_ctx);
+
+    // Now compile islands
+    GIslandModel::compileIslands(gim, g, m_args);
+}
+
+cv::GCompiled cv::gimpl::GCompiler::produceCompiled(GPtr &&pg)
+{
+    // This is the final compilation step. Here:
+    // - An instance of GExecutor is created. Depending on the platform,
+    //   build configuration, etc., a GExecutor may be:
+    //   - a naive single-thread graph interpreter;
+    //   - a std::thread-based thing;
+    //   - a TBB-based thing, etc.
+    // - All this stuff is wrapped into a GCompiled object and returned
+    //   to the user.
+
+    // Note: this happens in the last pass ("compile_islands"):
+    // - Each GIsland of GIslandModel instantiates its own,
+    //   backend-specific executable object
+    //   - Every backend gets a subgraph to execute, and builds
+    //     an execution plan for it (backend-specific execution)
+    // ...before the call to produceCompiled().
+
+    const auto &outMetas = GModel::ConstGraph(*pg).metadata()
+        .get<OutputMeta>().outMeta;
+    std::unique_ptr<GExecutor> pE(new GExecutor(std::move(pg)));
+    // FIXME: select which executor will be actually used,
+    // make GExecutor abstract.
+
+    GCompiled compiled;
+    compiled.priv().setup(m_metas, outMetas, std::move(pE));
+    return compiled;
+}
+
+cv::GCompiled cv::gimpl::GCompiler::compile()
+{
+    std::unique_ptr<ade::Graph> pG = generateGraph();
+    runPasses(*pG);
+    compileIslands(*pG);
+    return produceCompiled(std::move(pG));
+}
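
The dump_dot pass wired up above can be triggered without rebuilding, via the compile argument handled by getGraphDumpDirectory() or its environment-variable fallback (the file name below is illustrative):

    // In code, as a compile argument:
    graph.apply(input, output, cv::compile_args(cv::graph_dump_path{"gmodel.dot"}));

    // Or externally, with no code changes:
    //   GRAPH_DUMP_PATH=gmodel.dot ./my_app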
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiler.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gcompiler.hpp
new file mode 100644 (file)
index 0000000..b369c14
--- /dev/null
@@ -0,0 +1,51 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GCOMPILER_HPP
+#define OPENCV_GAPI_GCOMPILER_HPP
+
+
+#include "opencv2/gapi/gcommon.hpp"
+#include "opencv2/gapi/gkernel.hpp"
+#include "opencv2/gapi/gcomputation.hpp"
+
+#include <ade/execution_engine/execution_engine.hpp>
+
+namespace cv { namespace gimpl {
+
+// FIXME: exported for internal tests only!
+class GAPI_EXPORTS GCompiler
+{
+    const GComputation&      m_c;
+    const GMetaArgs          m_metas;
+    GCompileArgs             m_args;
+    ade::ExecutionEngine     m_e;
+
+    cv::gapi::GKernelPackage m_all_kernels;
+
+    void validateInputMeta();
+    void validateOutProtoArgs();
+
+public:
+    explicit GCompiler(const GComputation &c,
+                             GMetaArgs    &&metas,
+                             GCompileArgs &&args);
+
+    // The method which does everything...
+    GCompiled compile();
+
+    // But is actually composed of this:
+    using GPtr = std::unique_ptr<ade::Graph>;
+    GPtr       generateGraph();               // Unroll GComputation into a GModel
+    void       runPasses(ade::Graph &g);      // Apply all G-API passes on a GModel
+    void       compileIslands(ade::Graph &g); // Instantiate GIslandExecutables in GIslandModel
+    GCompiled  produceCompiled(GPtr &&pg);    // Produce GCompiled from processed GModel
+};
+
+}}
+
+#endif // OPENCV_GAPI_GCOMPILER_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gislandmodel.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gislandmodel.cpp
new file mode 100644 (file)
index 0000000..8e20302
--- /dev/null
@@ -0,0 +1,289 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <sstream>
+#include <unordered_set>
+#include <unordered_map>
+
+#include <ade/util/checked_cast.hpp>
+
+#include "api/gbackend_priv.hpp" // GBackend::Priv().compile()
+#include "compiler/gmodel.hpp"
+#include "compiler/gislandmodel.hpp"
+#include "logger.hpp"    // GAPI_LOG
+
+namespace cv { namespace gimpl {
+
+GIsland::GIsland(const gapi::GBackend &bknd,
+                 ade::NodeHandle op,
+                 util::optional<std::string> &&user_tag)
+    : m_backend(bknd)
+    , m_user_tag(std::move(user_tag))
+{
+    m_all.insert(op);
+    m_in_ops.insert(op);
+    m_out_ops.insert(op);
+}
+
+// _-prefixed parameters because of gcc 4.8 warnings on ARM
+GIsland::GIsland(const gapi::GBackend &_bknd,
+                 node_set &&_all,
+                 node_set &&_in_ops,
+                 node_set &&_out_ops,
+                 util::optional<std::string> &&_user_tag)
+    : m_backend(_bknd)
+    , m_all(std::move(_all))
+    , m_in_ops(std::move(_in_ops))
+    , m_out_ops(std::move(_out_ops))
+    , m_user_tag(std::move(_user_tag))
+{
+}
+
+const GIsland::node_set& GIsland::contents() const
+{
+    return m_all;
+}
+
+const GIsland::node_set& GIsland::in_ops() const
+{
+    return m_in_ops;
+}
+
+const GIsland::node_set& GIsland::out_ops() const
+{
+    return m_out_ops;
+}
+
+gapi::GBackend GIsland::backend() const
+{
+    return m_backend;
+}
+
+bool GIsland::is_user_specified() const
+{
+    return m_user_tag.has_value();
+}
+
+void GIsland::debug() const
+{
+    std::stringstream stream;
+    stream << name() << " {{\n  input ops: ";
+    for (const auto& nh : m_in_ops) stream << nh << "; ";
+    stream << "\n  output ops: ";
+    for (const auto& nh : m_out_ops) stream << nh << "; ";
+    stream << "\n  contents: ";
+    for (const auto& nh : m_all) stream << nh << "; ";
+    stream << "\n}}" << std::endl;
+    GAPI_LOG_INFO(NULL, stream.str());
+}
+
+GIsland::node_set GIsland::consumers(const ade::Graph &g,
+                                     const ade::NodeHandle &slot_nh) const
+{
+    GIslandModel::ConstGraph gim(g);
+    auto data_nh = gim.metadata(slot_nh).get<DataSlot>().original_data_node;
+    GIsland::node_set result;
+    for (const auto& in_op : m_in_ops)
+    {
+        auto it = std::find(in_op->inNodes().begin(),
+                            in_op->inNodes().end(),
+                            data_nh);
+        if (it != in_op->inNodes().end())
+            result.insert(in_op);
+    }
+    return result;
+}
+
+ade::NodeHandle GIsland::producer(const ade::Graph &g,
+                                  const ade::NodeHandle &slot_nh) const
+{
+    GIslandModel::ConstGraph gim(g);
+    auto data_nh = gim.metadata(slot_nh).get<DataSlot>().original_data_node;
+    for (const auto& out_op : m_out_ops)
+    {
+        auto it = std::find(out_op->outNodes().begin(),
+                            out_op->outNodes().end(),
+                            data_nh);
+        if (it != out_op->outNodes().end())
+            return out_op;
+    }
+    // Consistency: a GIsland asked for the producer() of slot_nh should
+    // always have the appropriate GModel node handle in its m_out_ops set.
+    GAPI_Assert(false);
+    return ade::NodeHandle();
+}
+
+std::string GIsland::name() const
+{
+    if (is_user_specified())
+        return m_user_tag.value();
+
+    std::stringstream ss;
+    ss << "island_#" << std::hex << static_cast<const void*>(this);
+    return ss.str();
+}
+
+void GIslandModel::generateInitial(GIslandModel::Graph &g,
+                                   const ade::Graph &src_graph)
+{
+    const GModel::ConstGraph src_g(src_graph);
+
+    // Initially GIslandModel is a 1:1 projection from GModel:
+    // 1) Every GModel::OP becomes a separate GIslandModel::FusedIsland;
+    // 2) Every GModel::DATA becomes GIslandModel::DataSlot;
+    // 3) Single-operation FusedIslands are connected with DataSlots in the
+    //    same way as OPs and DATA (edges with the same metadata)
+
+    using node_set = std::unordered_set
+        < ade::NodeHandle
+        , ade::HandleHasher<ade::Node>
+        >;
+    using node_map = std::unordered_map
+        < ade::NodeHandle
+        , ade::NodeHandle
+        , ade::HandleHasher<ade::Node>
+        >;
+
+    node_set all_operations;
+    node_map data_to_slot;
+
+    // First, list all operations and create DataSlots in <g>
+    for (auto src_nh : src_g.nodes())
+    {
+        switch (src_g.metadata(src_nh).get<NodeType>().t)
+        {
+        case NodeType::OP:   all_operations.insert(src_nh);                break;
+        case NodeType::DATA: data_to_slot[src_nh] = mkSlotNode(g, src_nh); break;
+        default: GAPI_Assert(false); break;
+        }
+    } // for (src_g.nodes)
+
+    // Now put single-op islands and connect them with DataSlots
+    for (auto src_op_nh : all_operations)
+    {
+        auto nh = mkIslandNode(g, src_g.metadata(src_op_nh).get<Op>().backend, src_op_nh, src_graph);
+        for (auto in_edge : src_op_nh->inEdges())
+        {
+            auto src_data_nh = in_edge->srcNode();
+            auto isl_slot_nh = data_to_slot.at(src_data_nh);
+            g.link(isl_slot_nh, nh); // no other data stored yet
+        }
+        for (auto out_edge : src_op_nh->outEdges())
+        {
+            auto dst_data_nh = out_edge->dstNode();
+            auto isl_slot_nh = data_to_slot.at(dst_data_nh);
+            g.link(nh, isl_slot_nh);
+        }
+    } // for(all_operations)
+}
+
+ade::NodeHandle GIslandModel::mkSlotNode(Graph &g, const ade::NodeHandle &data_nh)
+{
+    auto nh = g.createNode();
+    g.metadata(nh).set(DataSlot{data_nh});
+    g.metadata(nh).set(NodeKind{NodeKind::SLOT});
+    return nh;
+}
+
+ade::NodeHandle GIslandModel::mkIslandNode(Graph &g, const gapi::GBackend& bknd, const ade::NodeHandle &op_nh, const ade::Graph &orig_g)
+{
+    const GModel::ConstGraph src_g(orig_g);
+    util::optional<std::string> user_tag;
+    if (src_g.metadata(op_nh).contains<Island>())
+    {
+        user_tag = util::make_optional(src_g.metadata(op_nh).get<Island>().island);
+    }
+
+    auto nh = g.createNode();
+    std::shared_ptr<GIsland> island(new GIsland(bknd, op_nh, std::move(user_tag)));
+    g.metadata(nh).set(FusedIsland{std::move(island)});
+    g.metadata(nh).set(NodeKind{NodeKind::ISLAND});
+    return nh;
+}
+
+ade::NodeHandle GIslandModel::mkIslandNode(Graph &g, std::shared_ptr<GIsland>&& isl)
+{
+    ade::NodeHandle nh = g.createNode();
+    g.metadata(nh).set(cv::gimpl::NodeKind{cv::gimpl::NodeKind::ISLAND});
+    g.metadata(nh).set<cv::gimpl::FusedIsland>({std::move(isl)});
+    return nh;
+}
+
+void GIslandModel::syncIslandTags(Graph &g, ade::Graph &orig_g)
+{
+    GModel::Graph gm(orig_g);
+    for (auto nh : g.nodes())
+    {
+        if (NodeKind::ISLAND == g.metadata(nh).get<NodeKind>().k)
+        {
+            auto island = g.metadata(nh).get<FusedIsland>().object;
+            auto isl_tag = island->name();
+            for (const auto& orig_nh_inside : island->contents())
+            {
+                gm.metadata(orig_nh_inside).set(Island{isl_tag});
+            }
+        }
+    }
+}
+
+void GIslandModel::compileIslands(Graph &g, const ade::Graph &orig_g, const GCompileArgs &args)
+{
+    GModel::ConstGraph gm(orig_g);
+
+    auto original_sorted = gm.metadata().get<ade::passes::TopologicalSortData>();
+    for (auto nh : g.nodes())
+    {
+        if (NodeKind::ISLAND == g.metadata(nh).get<NodeKind>().k)
+        {
+            auto island_obj = g.metadata(nh).get<FusedIsland>().object;
+            auto island_ops = island_obj->contents();
+
+            std::vector<ade::NodeHandle> topo_sorted_list;
+            ade::util::copy_if(original_sorted.nodes(),
+                               std::back_inserter(topo_sorted_list),
+                               [&](ade::NodeHandle sorted_nh) {
+                                   return ade::util::contains(island_ops, sorted_nh);
+                               });
+
+            auto island_exe = island_obj->backend().priv()
+                .compile(orig_g, args, topo_sorted_list);
+            GAPI_Assert(nullptr != island_exe);
+            g.metadata(nh).set(IslandExec{std::move(island_exe)});
+        }
+    }
+}
+
+ade::NodeHandle GIslandModel::producerOf(const ConstGraph &g, ade::NodeHandle &data_nh)
+{
+    for (auto nh : g.nodes())
+    {
+        // find a data slot...
+        if (NodeKind::SLOT == g.metadata(nh).get<NodeKind>().k)
+        {
+            // which is associated with the given data object...
+            if (data_nh == g.metadata(nh).get<DataSlot>().original_data_node)
+            {
+                // which probably has a producer...
+                if (0u != nh->inNodes().size())
+                {
+                    // ...then the answer is that producer
+                    return nh->inNodes().front();
+                }
+                else return ade::NodeHandle(); // input data object?
+                                               // return empty to break the cycle
+            }
+        }
+    }
+    // No appropriate data slot found - probably, the object has been
+    // optimized out during fusion
+    return ade::NodeHandle();
+}
+
+} // namespace gimpl
+} // namespace cv
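
User-specified island tags (GIsland::m_user_tag) originate at graph construction time; a sketch, assuming cv::gapi::island() is the public tagging entry point in this snapshot:

    cv::GMat in;
    cv::GMat tmp = cv::gapi::bitwise_not(in);
    cv::GMat out = cv::gapi::bitwise_not(tmp);
    // Tag the subgraph between <in> and <out>; the name surfaces here as
    // a user-specified GIsland and is kept intact during fusion
    cv::gapi::island("user_part", cv::GIn(in), cv::GOut(out));
    cv::GComputation graph(in, out);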
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gislandmodel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gislandmodel.hpp
new file mode 100644 (file)
index 0000000..03b42ff
--- /dev/null
@@ -0,0 +1,187 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GISLANDMODEL_HPP
+#define OPENCV_GAPI_GISLANDMODEL_HPP
+
+#include <unordered_set>
+#include <memory>        // shared_ptr
+
+#include <ade/graph.hpp>
+#include <ade/typed_graph.hpp>
+#include <ade/passes/topological_sort.hpp>
+
+#include "opencv2/gapi/util/optional.hpp"
+#include "opencv2/gapi/gkernel.hpp"
+
+#include "compiler/gobjref.hpp"
+
+namespace cv { namespace gimpl {
+
+
+// FIXME: GAPI_EXPORTS only because of tests!
+class GAPI_EXPORTS GIsland
+{
+public:
+    using node_set = std::unordered_set
+         < ade::NodeHandle
+         , ade::HandleHasher<ade::Node>
+         >;
+
+    // Initial constructor (constructs a single-op Island)
+    GIsland(const gapi::GBackend &bknd,
+            ade::NodeHandle op,
+            util::optional<std::string>&& user_tag);
+
+    // Merged constructor
+    GIsland(const gapi::GBackend &bknd,
+            node_set &&all,
+            node_set &&in_ops,
+            node_set &&out_ops,
+            util::optional<std::string>&& user_tag);
+
+    const node_set& contents() const;
+    const node_set& in_ops() const;
+    const node_set& out_ops() const;
+
+    std::string name() const;
+    gapi::GBackend backend() const;
+
+    /**
+     * Returns all GModel operation node handles which are _reading_
+     * from a GModel data object associated with (wrapped in) the given
+     * Slot object.
+     *
+     * @param g an ade::Graph with GIslandModel information inside
+     * @param slot_nh Slot object node handle of interest
+     * @return a set of GModel operation node handles
+     */
+    node_set consumers(const ade::Graph &g,
+                       const ade::NodeHandle &slot_nh) const;
+
+    /**
+     * Returns a GModel operation node handle which is _writing_
+     * to a GModel data object associated with (wrapped in) the given
+     * Slot object.
+     *
+     * @param g an ade::Graph with GIslandModel information inside
+     * @param slot_nh Slot object node handle of interest
+     * @return a node handle of original GModel
+     */
+    ade::NodeHandle producer(const ade::Graph &g,
+                             const ade::NodeHandle &slot_nh) const;
+
+    void debug() const;
+    bool is_user_specified() const;
+
+protected:
+    gapi::GBackend m_backend; // backend which handles this Island execution
+
+    node_set m_all;     // everything (data + operations) within an island
+    node_set m_in_ops;  // operations island begins with
+    node_set m_out_ops; // operations island ends with
+
+    // holds the island name IF it was specified by the user; empty for internal (inferred) islands
+    util::optional<std::string> m_user_tag;
+};
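+
+// Example (an illustrative sketch; `g` is the island-model ade::Graph, and
+// `isl_nh` and `slot_nh` are hypothetical node handles in it):
+//
+//     GIslandModel::Graph gim(g);
+//     auto island  = gim.metadata(isl_nh).get<FusedIsland>().object;
+//     auto readers = island->consumers(g, slot_nh); // ops reading the slot
+//     auto writer  = island->producer (g, slot_nh); // op writing the slot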
+
+
+
+// GIslandExecutable - a backend-specific thing which executes
+// contents of an Island
+// * Is instantiated by the last step of the Islands fusion procedure;
+// * Is orchestrated by a GExecutor instance.
+//
+class GIslandExecutable
+{
+public:
+    using InObj  = std::pair<RcDesc, cv::GRunArg>;
+    using OutObj = std::pair<RcDesc, cv::GRunArgP>;
+
+    // FIXME: for now, run() requires the full input vector to be available.
+    // Actually, parts of the subgraph may execute even if not all data
+    // slots are in place.
+    // TODO: Add partial execution capabilities
+    virtual void run(std::vector<InObj>  &&input_objs,
+                     std::vector<OutObj> &&output_objs) = 0;
+
+    virtual bool canReshape() const = 0;
+    virtual void reshape(ade::Graph& g, const GCompileArgs& args) = 0;
+
+    virtual ~GIslandExecutable() = default;
+};
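+
+// Example (a minimal sketch of what a backend would provide; the class name
+// is hypothetical, real implementations live in the backends):
+//
+//     class MyIslandExecutable final: public GIslandExecutable
+//     {
+//         virtual void run(std::vector<InObj>  &&input_objs,
+//                          std::vector<OutObj> &&output_objs) override
+//         {
+//             // unpack GRunArgs by RcDesc, run kernels, fill GRunArgPs
+//         }
+//         virtual bool canReshape() const override { return false; }
+//         virtual void reshape(ade::Graph&, const GCompileArgs&) override {}
+//     };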
+
+
+
+// Couldn't reuse NodeType here - FIXME unify (move meta to a shared place)
+struct NodeKind
+{
+    static const char *name() { return "NodeKind"; }
+    enum { ISLAND, SLOT} k;
+};
+
+// FIXME: Rename to Island (as soon as current GModel::Island is renamed
+// to IslandTag).
+struct FusedIsland
+{
+    static const char *name() { return "FusedIsland"; }
+    std::shared_ptr<GIsland> object;
+};
+
+struct DataSlot
+{
+    static const char *name() { return "DataSlot"; }
+    ade::NodeHandle original_data_node; // direct link to GModel
+};
+
+struct IslandExec
+{
+    static const char *name() { return "IslandExecutable"; }
+    std::shared_ptr<GIslandExecutable> object;
+};
+
+namespace GIslandModel
+{
+    using Graph = ade::TypedGraph
+        < NodeKind
+        , FusedIsland
+        , DataSlot
+        , IslandExec
+        , ade::passes::TopologicalSortData
+        >;
+
+    // FIXME: derive from TypedGraph
+    using ConstGraph = ade::ConstTypedGraph
+        < NodeKind
+        , FusedIsland
+        , DataSlot
+        , IslandExec
+        , ade::passes::TopologicalSortData
+        >;
+
+    // Top-level function
+    void generateInitial(Graph &g, const ade::Graph &src_g);
+    // "Building blocks"
+    ade::NodeHandle mkSlotNode(Graph &g, const ade::NodeHandle &data_nh);
+    ade::NodeHandle mkIslandNode(Graph &g, const gapi::GBackend &bknd, const ade::NodeHandle &op_nh, const ade::Graph &orig_g);
+    ade::NodeHandle mkIslandNode(Graph &g, std::shared_ptr<GIsland>&& isl);
+
+    // GIslandModel API
+    void syncIslandTags(Graph &g, ade::Graph &orig_g);
+    void compileIslands(Graph &g, const ade::Graph &orig_g, const GCompileArgs &args);
+
+    // Debug routines
+    // producerOf - returns an Island handle which produces given data object
+    //     from the original model (! don't mix with DataSlot)
+    // FIXME: GAPI_EXPORTS because of tests only!
+    ade::NodeHandle GAPI_EXPORTS producerOf(const ConstGraph &g, ade::NodeHandle &data_nh);
+
+} // namespace GIslandModel
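+
+// Example (an illustrative sketch of reading a generated island model;
+// `g` is an ade::Graph filled by GIslandModel::generateInitial()):
+//
+//     GIslandModel::ConstGraph gim(g);
+//     for (auto nh : gim.nodes())
+//     {
+//         if (NodeKind::ISLAND == gim.metadata(nh).get<NodeKind>().k)
+//         {
+//             auto island = gim.metadata(nh).get<FusedIsland>().object;
+//             // island->contents() holds the original GModel nodes
+//         }
+//         // NodeKind::SLOT nodes carry DataSlot::original_data_node
+//     }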
+
+}} // namespace cv::gimpl
+
+#endif // OPENCV_GAPI_GISLANDMODEL_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodel.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodel.cpp
new file mode 100644 (file)
index 0000000..4b24552
--- /dev/null
@@ -0,0 +1,247 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <string>
+#include <sstream> // used in GModel::log
+
+
+#include <ade/util/zip_range.hpp>   // util::indexed
+#include <ade/util/checked_cast.hpp>
+
+#include "opencv2/gapi/gproto.hpp"
+#include "api/gnode_priv.hpp"
+#include "compiler/gobjref.hpp"
+#include "compiler/gmodel.hpp"
+
+namespace cv { namespace gimpl {
+
+ade::NodeHandle GModel::mkOpNode(GModel::Graph &g, const GKernel &k, const std::vector<GArg> &args, const std::string &island)
+{
+    ade::NodeHandle op_h = g.createNode();
+    g.metadata(op_h).set(NodeType{NodeType::OP});
+    //These extra empty {} are to please GCC (-Wmissing-field-initializers)
+    g.metadata(op_h).set(Op{k, args, {}, {}, {}});
+    if (!island.empty())
+        g.metadata(op_h).set(Island{island});
+    return op_h;
+}
+
+ade::NodeHandle GModel::mkDataNode(GModel::Graph &g, const GOrigin& origin)
+{
+    ade::NodeHandle op_h = g.createNode();
+    const auto id = g.metadata().get<DataObjectCounter>().GetNewId(origin.shape);
+    g.metadata(op_h).set(NodeType{NodeType::DATA});
+
+    GMetaArg meta;
+    Data::Storage storage = Data::Storage::INTERNAL; // By default, all objects are marked INTERNAL
+
+    if (origin.node.shape() == GNode::NodeShape::CONST_BOUNDED)
+    {
+        auto value = value_of(origin);
+        meta       = descr_of(value);
+        storage    = Data::Storage::CONST;
+        g.metadata(op_h).set(ConstValue{value});
+    }
+    g.metadata(op_h).set(Data{origin.shape, id, meta, origin.ctor, storage});
+    return op_h;
+}
+
+void GModel::linkIn(Graph &g, ade::NodeHandle opH, ade::NodeHandle objH, std::size_t in_port)
+{
+    // Check if input is already connected
+    for (const auto& in_e : opH->inEdges())
+    {
+        GAPI_Assert(g.metadata(in_e).get<Input>().port != in_port);
+    }
+
+    auto &op = g.metadata(opH).get<Op>();
+    auto &gm = g.metadata(objH).get<Data>();
+
+    // FIXME: check validity using kernel prototype
+    GAPI_Assert(in_port < op.args.size());
+
+    ade::EdgeHandle eh = g.link(objH, opH);
+    g.metadata(eh).set(Input{in_port});
+
+    // Replace an API object with a REF (G* -> GOBJREF)
+    op.args[in_port] = cv::GArg(RcDesc{gm.rc, gm.shape, {}});
+}
+
+void GModel::linkOut(Graph &g, ade::NodeHandle opH, ade::NodeHandle objH, std::size_t out_port)
+{
+    // FIXME: check validity using kernel prototype
+
+    // Check if output is already connected
+    for (const auto& out_e : opH->outEdges())
+    {
+        GAPI_Assert(g.metadata(out_e).get<Output>().port != out_port);
+    }
+
+    auto &op = g.metadata(opH).get<Op>();
+    auto &gm = g.metadata(objH).get<Data>();
+
+    GAPI_Assert(objH->inNodes().size() == 0u);
+
+    ade::EdgeHandle eh = g.link(opH, objH);
+    g.metadata(eh).set(Output{out_port});
+
+    // TODO: outs must be allocated according to kernel protocol!
+    const auto storage_with_port = ade::util::checked_cast<std::size_t>(out_port+1);
+    const auto min_out_size = std::max(op.outs.size(), storage_with_port);
+    op.outs.resize(min_out_size, RcDesc{-1,GShape::GMAT,{}}); // FIXME: Invalid shape instead?
+    op.outs[out_port] = RcDesc{gm.rc, gm.shape, {}};
+}
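+
+// Example (an illustrative sketch of how the primitives above compose;
+// `graph`, `k`, `args`, `in_origin` and `out_origin` are hypothetical):
+//
+//     GModel::Graph gm(graph);
+//     GModel::init(gm);                                 // DataObjectCounter
+//     auto in_h  = GModel::mkDataNode(gm, in_origin);
+//     auto op_h  = GModel::mkOpNode  (gm, k, args, ""); // no island tag
+//     auto out_h = GModel::mkDataNode(gm, out_origin);
+//     GModel::linkIn (gm, op_h, in_h,  0u);             // data -> op, port 0
+//     GModel::linkOut(gm, op_h, out_h, 0u);             // op -> data, port 0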
+
+std::vector<ade::NodeHandle> GModel::orderedInputs(Graph &g, ade::NodeHandle nh)
+{
+    std::vector<ade::NodeHandle> sorted_in_nhs(nh->inEdges().size());
+    for (const auto& in_eh : nh->inEdges())
+    {
+        const auto port = g.metadata(in_eh).get<cv::gimpl::Input>().port;
+        GAPI_Assert(port < sorted_in_nhs.size());
+        sorted_in_nhs[port] = in_eh->srcNode();
+    }
+    return sorted_in_nhs;
+}
+
+std::vector<ade::NodeHandle> GModel::orderedOutputs(Graph &g, ade::NodeHandle nh)
+{
+    std::vector<ade::NodeHandle> sorted_out_nhs(nh->outEdges().size());
+    for (const auto& out_eh : nh->outEdges())
+    {
+        const auto port = g.metadata(out_eh).get<cv::gimpl::Output>().port;
+        GAPI_Assert(port < sorted_out_nhs.size());
+        sorted_out_nhs[port] = out_eh->dstNode();
+    }
+    return sorted_out_nhs;
+}
+
+void GModel::init(Graph& g)
+{
+    g.metadata().set(DataObjectCounter());
+}
+
+void GModel::log(Graph &g, ade::NodeHandle nh, std::string &&msg, ade::NodeHandle updater)
+{
+    std::string s = std::move(msg);
+    if (updater != nullptr)
+    {
+        std::stringstream fmt;
+        fmt << " (via " << updater << ")";
+        s += fmt.str();
+    }
+
+    if (g.metadata(nh).contains<Journal>())
+    {
+        g.metadata(nh).get<Journal>().messages.push_back(s);
+    }
+    else
+    {
+        g.metadata(nh).set(Journal{{s}});
+    }
+}
+
+// FIXME:
+// Unify with GModel::log(.. ade::NodeHandle ..)
+void GModel::log(Graph &g, ade::EdgeHandle eh, std::string &&msg, ade::NodeHandle updater)
+{
+    std::string s = std::move(msg);
+    if (updater != nullptr)
+    {
+        std::stringstream fmt;
+        fmt << " (via " << updater << ")";
+        s += fmt.str();
+    }
+
+    if (g.metadata(eh).contains<Journal>())
+    {
+        g.metadata(eh).get<Journal>().messages.push_back(s);
+    }
+    else
+    {
+        g.metadata(eh).set(Journal{{s}});
+    }
+}
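+
+// Example (illustrative): passes may journal their decisions; the message
+// then shows up next to the node (or edge) in the .dot dump:
+//
+//     GModel::log(g, nh, "Assigned to island 'isl0'");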
+
+ade::NodeHandle GModel::detail::dataNodeOf(const ConstGraph &g, const GOrigin &origin)
+{
+    // FIXME: Does it still work with graph transformations, e.g. redirectWriter()??
+    return g.metadata().get<Layout>().object_nodes.at(origin);
+}
+
+void GModel::redirectReaders(Graph &g, ade::NodeHandle from, ade::NodeHandle to)
+{
+    std::vector<ade::EdgeHandle> ehh(from->outEdges().begin(), from->outEdges().end());
+    for (auto e : ehh)
+    {
+        auto dst = e->dstNode();
+        auto input = g.metadata(e).get<Input>();
+        g.erase(e);
+        linkIn(g, dst, to, input.port);
+    }
+}
+
+void GModel::redirectWriter(Graph &g, ade::NodeHandle from, ade::NodeHandle to)
+{
+    GAPI_Assert(from->inEdges().size() == 1);
+    auto e = from->inEdges().front();
+    auto op = e->srcNode();
+    auto output = g.metadata(e).get<Output>();
+    g.erase(e);
+    linkOut(g, op, to, output.port);
+}
+
+GMetaArgs GModel::collectInputMeta(GModel::ConstGraph cg, ade::NodeHandle node)
+{
+    GAPI_Assert(cg.metadata(node).get<NodeType>().t == NodeType::OP);
+    GMetaArgs in_meta_args(cg.metadata(node).get<Op>().args.size());
+
+    for (const auto &e : node->inEdges())
+    {
+        const auto& in_data = cg.metadata(e->srcNode()).get<Data>();
+        in_meta_args[cg.metadata(e).get<Input>().port] = in_data.meta;
+    }
+
+    return in_meta_args;
+}
+
+
+ade::EdgeHandle GModel::getInEdgeByPort(const GModel::ConstGraph& cg,
+                                        const ade::NodeHandle&    nh,
+                                              std::size_t         in_port)
+{
+    auto inEdges = nh->inEdges();
+    const auto& edge = ade::util::find_if(inEdges, [&](ade::EdgeHandle eh) {
+        return cg.metadata(eh).get<Input>().port == in_port;
+    });
+    GAPI_Assert(edge != inEdges.end());
+    return *edge;
+}
+
+GMetaArgs GModel::collectOutputMeta(GModel::ConstGraph cg, ade::NodeHandle node)
+{
+    GAPI_Assert(cg.metadata(node).get<NodeType>().t == NodeType::OP);
+    GMetaArgs out_meta_args(cg.metadata(node).get<Op>().outs.size());
+
+    for (const auto &e : node->outEdges())
+    {
+        const auto& out_data = cg.metadata(e->dstNode()).get<Data>();
+        out_meta_args[cg.metadata(e).get<Output>().port] = out_data.meta;
+    }
+
+    return out_meta_args;
+}
+
+bool GModel::isActive(const GModel::Graph &cg, const cv::gapi::GBackend &backend)
+{
+    return ade::util::contains(cg.metadata().get<ActiveBackends>().backends,
+                               backend);
+}
+
+}} // cv::gimpl
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodel.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodel.hpp
new file mode 100644 (file)
index 0000000..003519b
--- /dev/null
@@ -0,0 +1,251 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GMODEL_HPP
+#define OPENCV_GAPI_GMODEL_HPP
+
+#include <memory>           // shared_ptr
+#include <unordered_map>
+#include <functional>       // std::function
+
+#include <ade/graph.hpp>
+#include <ade/typed_graph.hpp>
+#include <ade/passes/topological_sort.hpp>
+
+// /!\ ATTENTION:
+//
+// No API includes like GMat, GNode, GCall here!
+// This part of the system is API-unaware by its design.
+//
+
+#include "opencv2/gapi/garg.hpp"
+#include "opencv2/gapi/gkernel.hpp"
+#include "api/gapi_priv.hpp"   // GShape
+#include "api/gproto_priv.hpp" // origin_of
+#include "backends/common/gbackend.hpp"
+
+#include "compiler/gobjref.hpp"
+#include "compiler/gislandmodel.hpp"
+
+namespace cv { namespace gimpl {
+
+// TODO: Document all metadata types
+
+struct NodeType
+{
+    static const char *name() { return "NodeType"; }
+    enum { OP, DATA } t;
+};
+
+struct Input
+{
+    static const char *name() { return "Input"; }
+    std::size_t port;
+};
+
+struct Output
+{
+    static const char *name() { return "Output"; }
+    std::size_t port;
+};
+
+struct Op
+{
+    static const char *name() { return "Op"; }
+    cv::GKernel         k;
+    std::vector<GArg>   args; // TODO: Introduce a new type for internal args?
+    std::vector<RcDesc> outs; // TODO: Introduce a new type for resource references
+
+    cv::gapi::GBackend  backend;
+    util::any           opaque;
+};
+
+struct Data
+{
+    static const char *name() { return "Data"; }
+
+    // FIXME: This is a _pure_ duplication of RcDesc now! (except storage)
+    GShape   shape; // FIXME: Probably to be replaced by GMetaArg?
+    int      rc;
+    GMetaArg meta;
+    HostCtor ctor;  // T-specific helper to deal with unknown types in our code
+    // FIXME: Why rc+shape+meta is not represented as RcDesc here?
+
+    enum class Storage
+    {
+        INTERNAL,   // data object is not listed in GComputation protocol
+        INPUT,      // data object is listed in GComputation protocol as Input
+        OUTPUT,     // data object is listed in GComputation protocol as Output
+        CONST,      // data object is constant
+    };
+    Storage storage;
+};
+
+struct ConstValue
+{
+    static const char *name() { return "ConstValue"; }
+    GRunArg arg;
+};
+
+// This metadata is valid for both DATA and OP kinds of nodes
+// FIXME: Rename to IslandTag
+struct Island
+{
+    static const char *name() { return "Island"; }
+    std::string island; // can be set by user, otherwise is set by fusion
+};
+
+struct Protocol
+{
+    static const char *name() { return "Protocol"; }
+    // TODO: Replace the whole thing with a "Protocol" object
+    std::vector<RcDesc> inputs;
+    std::vector<RcDesc> outputs;
+
+    std::vector<ade::NodeHandle> in_nhs;
+    std::vector<ade::NodeHandle> out_nhs;
+};
+
+struct OutputMeta
+{
+    static const char *name() { return "OutputMeta"; }
+    GMetaArgs outMeta;
+};
+
+struct Journal
+{
+    static const char *name() { return "Journal"; }
+    std::vector<std::string> messages;
+};
+
+// The mapping between user-side GMat/GScalar/... objects
+// and their corresponding nodes. Can optionally be stored in the graph
+// (NOT used by the compiler or backends, introspection purposes
+// only)
+struct Layout
+{
+    static const char *name() { return "Layout"; }
+    GOriginMap<ade::NodeHandle> object_nodes;
+};
+
+// Unique data object counter (per-type)
+class DataObjectCounter
+{
+public:
+    static const char* name() { return "DataObjectCounter"; }
+    int GetNewId(GShape shape) { return m_next_data_id[shape]++; }
+private:
+    std::unordered_map<cv::GShape, int> m_next_data_id;
+};
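+
+// Example (illustrative, hypothetical values) -- IDs are unique only
+// within a shape:
+//
+//     DataObjectCounter c;
+//     c.GetNewId(GShape::GMAT);    // == 0
+//     c.GetNewId(GShape::GMAT);    // == 1
+//     c.GetNewId(GShape::GSCALAR); // == 0 again, counters are per-shape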
+
+// A projected graph of Islands (generated from graph of Operations)
+struct IslandModel
+{
+    static const char* name() { return "IslandModel"; }
+    std::shared_ptr<ade::Graph> model;
+};
+
+// List of backends selected for current graph execution
+struct ActiveBackends
+{
+    static const char *name() { return "ActiveBackends"; }
+    std::unordered_set<cv::gapi::GBackend> backends;
+};
+
+namespace GModel
+{
+    using Graph = ade::TypedGraph
+        < NodeType
+        , Input
+        , Output
+        , Op
+        , Data
+        , ConstValue
+        , Island
+        , Protocol
+        , OutputMeta
+        , Journal
+        , ade::passes::TopologicalSortData
+        , DataObjectCounter
+        , Layout
+        , IslandModel
+        , ActiveBackends
+        >;
+
+    // FIXME: How to define it based on GModel???
+    using ConstGraph = ade::ConstTypedGraph
+        < NodeType
+        , Input
+        , Output
+        , Op
+        , Data
+        , ConstValue
+        , Island
+        , Protocol
+        , OutputMeta
+        , Journal
+        , ade::passes::TopologicalSortData
+        , DataObjectCounter
+        , Layout
+        , IslandModel
+        , ActiveBackends
+        >;
+
+    // User should initialize graph before using it
+    // GAPI_EXPORTS for tests
+    GAPI_EXPORTS void init (Graph& g);
+
+    ade::NodeHandle mkOpNode(Graph &g, const GKernel &k, const std::vector<GArg>& args, const std::string &island);
+
+    // FIXME: change it to take GMeta instead of GShape?
+    ade::NodeHandle mkDataNode(Graph &g, const GOrigin& origin);
+
+    // Adds a string message to a node. Any node can be the subject of a log;
+    // messages then appear in the dumped .dot file.
+    void log(Graph &g, ade::NodeHandle op, std::string &&message, ade::NodeHandle updater = ade::NodeHandle());
+    void log(Graph &g, ade::EdgeHandle op, std::string &&message, ade::NodeHandle updater = ade::NodeHandle());
+
+    void linkIn   (Graph &g, ade::NodeHandle op,     ade::NodeHandle obj, std::size_t in_port);
+    void linkOut  (Graph &g, ade::NodeHandle op,     ade::NodeHandle obj, std::size_t out_port);
+
+    // FIXME: Align this GModel API properly, it is a mess now
+    namespace detail
+    {
+        // FIXME: GAPI_EXPORTS only because of tests!!!
+        GAPI_EXPORTS ade::NodeHandle dataNodeOf(const ConstGraph& g, const GOrigin &origin);
+    }
+    template<typename T> inline ade::NodeHandle dataNodeOf(const ConstGraph& g, T &&t)
+    {
+        return detail::dataNodeOf(g, cv::gimpl::proto::origin_of(GProtoArg{t}));
+    }
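+    //
+    // Example (illustrative): given a cv::GMat `m` used when the graph was
+    // built (and the Layout metadata stored), its DATA node can be found as:
+    //
+    //     ade::NodeHandle nh = GModel::dataNodeOf(cg, m);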
+
+    void redirectReaders(Graph &g, ade::NodeHandle from, ade::NodeHandle to);
+    void redirectWriter (Graph &g, ade::NodeHandle from, ade::NodeHandle to);
+
+    std::vector<ade::NodeHandle> orderedInputs (Graph &g, ade::NodeHandle nh);
+    std::vector<ade::NodeHandle> orderedOutputs(Graph &g, ade::NodeHandle nh);
+
+    // Returns input meta array for given op node
+    // Array is sparse, as metadata for non-gapi input objects is empty
+    // TODO:
+    // Cover with tests!!
+    GMetaArgs collectInputMeta(GModel::ConstGraph cg, ade::NodeHandle node);
+    GMetaArgs collectOutputMeta(GModel::ConstGraph cg, ade::NodeHandle node);
+
+    ade::EdgeHandle getInEdgeByPort(const GModel::ConstGraph& cg, const ade::NodeHandle& nh, std::size_t in_port);
+
+    // Returns true if the given backend participates in the execution
+    bool isActive(const GModel::Graph &cg, const cv::gapi::GBackend &backend);
+} // namespace GModel
+
+
+}} // namespace cv::gimpl
+
+#endif // OPENCV_GAPI_GMODEL_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodelbuilder.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodelbuilder.cpp
new file mode 100644 (file)
index 0000000..c9b2fbb
--- /dev/null
@@ -0,0 +1,305 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+////////////////////////////////////////////////////////////////////////////////
+//
+//    FIXME: "I personally hate this file"
+//                                        - Dmitry
+//
+////////////////////////////////////////////////////////////////////////////////
+#include "precomp.hpp"
+
+#include <utility>              // tuple
+#include <stack>                // stack
+#include <vector>               // vector
+#include <unordered_set>        // unordered_set
+#include <type_traits>          // is_same
+
+#include <ade/util/zip_range.hpp>   // util::indexed
+
+#include "api/gapi_priv.hpp"    // GOrigin
+#include "api/gproto_priv.hpp"  // descriptor_of and other GProtoArg-related
+#include "api/gcall_priv.hpp"
+#include "api/gnode_priv.hpp"
+
+#include "compiler/gmodelbuilder.hpp"
+
+namespace {
+
+
+// TODO: move to helpers and cover with internal tests?
+template<typename T> struct GVisited
+{
+    typedef std::unordered_set<T> VTs;
+
+    bool visited(const T& t) const { return m_visited.find(t) != m_visited.end(); }
+    void visit  (const T& t)       { m_visited.insert(t); }
+    const VTs& visited()     const { return m_visited; }
+
+private:
+    VTs m_visited;
+};
+
+template<typename T, typename U = T> struct GVisitedTracker: protected GVisited<T>
+{
+    typedef std::vector<U> TUs;
+
+    void  visit(const T& t, const U& u) { GVisited<T>::visit(t); m_tracked.push_back(u); }
+    const TUs& tracked() const          { return m_tracked; }
+    using GVisited<T>::visited;
+
+private:
+    TUs m_tracked;
+};
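+
+// Example (illustrative) -- the tracker records visit order, which is how
+// unrollExpr() below keeps operations in their discovery order:
+//
+//     GVisitedTracker<const GNode::Priv*, cv::GNode> ops;
+//     if (!ops.visited(&node.priv()))
+//         ops.visit(&node.priv(), node);
+//     // ops.tracked() now lists GNodes in the order they were visited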
+
+} // namespace
+
+
+cv::gimpl::Unrolled cv::gimpl::unrollExpr(const GProtoArgs &ins,
+                                          const GProtoArgs &outs)
+{
+    // FIXME: Who's gonna check if ins/outs are not EMPTY?
+    // FIXME: operator== for GObjects? (test if the same object or not)
+    using GObjId = const cv::GOrigin*;
+
+    GVisitedTracker<const GNode::Priv*, cv::GNode> ops;
+    GVisited<GObjId> reached_sources;
+    cv::GOriginSet   origins;
+
+    // Cache input argument objects for a faster look-up
+    // While the only reliable way to identify a Data object is Origin
+    // (multiple data objects may refer to the same Origin as a result of
+    // multiple yield() calls), input objects can be uniquely identified
+    // by their `priv` address. Here we rely on this to verify if the expression
+    // we unroll actually matches the protocol specified to us by the user.
+    std::unordered_set<GObjId> in_objs_p;
+    for (const auto& in_obj : ins)
+    {
+        // Objects are guaranteed to remain alive while this method
+        // is working, so it is safe to keep pointers here and below
+        in_objs_p.insert(&proto::origin_of(in_obj));
+    }
+
+    // Recursive expression traversal
+    std::stack<cv::GProtoArg> data_objs(std::deque<cv::GProtoArg>(outs.begin(), outs.end()));
+    while (!data_objs.empty())
+    {
+        const auto  obj   = data_objs.top();
+        const auto &obj_p = proto::origin_of(obj);
+        data_objs.pop();
+
+        const auto &origin = obj_p;
+        origins.insert(origin); // TODO: Put Object description here later on
+
+        // If this Object is listed in the protocol, don't dive deeper (even
+        // if it is in fact a result of an operation). Our computation is
+        // bounded by this data slot, so terminate this recursion path early.
+        if (in_objs_p.find(&obj_p) != in_objs_p.end())
+        {
+            reached_sources.visit(&obj_p);
+            continue;
+        }
+
+        const cv::GNode &node = origin.node;
+        switch (node.shape())
+        {
+        case cv::GNode::NodeShape::EMPTY:
+            // TODO: Own exception type?
+            util::throw_error(std::logic_error("Empty node reached!"));
+            break;
+
+        case cv::GNode::NodeShape::PARAM:
+        case cv::GNode::NodeShape::CONST_BOUNDED:
+            // No preceding operation to this data object - so the data object is either a GComputation
+            // parameter or a constant (compile-time) value
+            // Record it to check if protocol matches expression tree later
+            if (!reached_sources.visited(&obj_p))
+                reached_sources.visit(&obj_p);
+            break;
+
+        case cv::GNode::NodeShape::CALL:
+            if (!ops.visited(&node.priv()))
+            {
+                // This operation hasn't been visited yet - mark it so,
+                // then add its operands to stack to continue recursion.
+                ops.visit(&node.priv(), node);
+
+                const cv::GCall         call   = origin.node.call();
+                const cv::GCall::Priv&  call_p = call.priv();
+
+                // Put the output object descriptions of the node
+                // so that they are not lost if they are not consumed by other operations
+                for (const auto &it : ade::util::indexed(call_p.m_k.outShapes))
+                {
+                    std::size_t port  = ade::util::index(it);
+                    GShape shape      = ade::util::value(it);
+
+                    GOrigin org { shape, node, port};
+                    origins.insert(org);
+                }
+
+                for (const auto &arg : call_p.m_args)
+                {
+                    if (proto::is_dynamic(arg))
+                    {
+                        data_objs.push(proto::rewrap(arg)); // Dive deeper
+                    }
+                }
+            }
+            break;
+
+        default:
+            // Unsupported node shape
+            GAPI_Assert(false);
+            break;
+        }
+    }
+
+    // Check if protocol mentions data_objs which weren't reached during traversal
+    const auto missing_reached_sources = [&reached_sources](GObjId p) {
+        return reached_sources.visited().find(p) == reached_sources.visited().end();
+    };
+    if (ade::util::any_of(in_objs_p, missing_reached_sources))
+    {
+        // TODO: Own exception type or a return code?
+      util::throw_error(std::logic_error("Data object listed in Protocol "
+                                     "wasn\'t reached during unroll"));
+    }
+
+    // Check if there are endpoint (parameter) data_objs which are not listed in the protocol
+    const auto missing_in_proto = [&in_objs_p](GObjId p) {
+        return p->node.shape() != cv::GNode::NodeShape::CONST_BOUNDED &&
+               in_objs_p.find(p) == in_objs_p.end();
+    };
+    if (ade::util::any_of(reached_sources.visited(), missing_in_proto))
+    {
+        // TODO: Own exception type or a return code?
+      util::throw_error(std::logic_error("Data object reached during unroll "
+                                     "wasn\'t found in Protocol"));
+    }
+
+    return cv::gimpl::Unrolled{ops.tracked(), origins};
+}
+
+
+cv::gimpl::GModelBuilder::GModelBuilder(ade::Graph &g)
+    : m_g(g)
+{
+}
+
+cv::gimpl::GModelBuilder::ProtoSlots
+cv::gimpl::GModelBuilder::put(const GProtoArgs &ins, const GProtoArgs &outs)
+{
+    const auto unrolled = cv::gimpl::unrollExpr(ins, outs);
+
+    // First, put all operations and its arguments into graph.
+    for (const auto &op_expr_node : unrolled.all_ops)
+    {
+        GAPI_Assert(op_expr_node.shape() == GNode::NodeShape::CALL);
+        const GCall&        call    = op_expr_node.call();
+        const GCall::Priv&  call_p  = call.priv();
+        ade::NodeHandle     call_h  = put_OpNode(op_expr_node);
+
+        for (const auto &it : ade::util::indexed(call_p.m_args))
+        {
+            const auto  in_port = ade::util::index(it);
+            const auto& in_arg  = ade::util::value(it);
+
+            if (proto::is_dynamic(in_arg))
+            {
+                ade::NodeHandle data_h = put_DataNode(proto::origin_of(in_arg));
+                cv::gimpl::GModel::linkIn(m_g, call_h, data_h, in_port);
+            }
+        }
+    }
+
+    // Then iterate via all "origins", instantiate (if not yet) Data graph nodes
+    // and connect these nodes with their producers in graph
+    for (const auto &origin : unrolled.all_data)
+    {
+        const cv::GNode& prod = origin.node;
+        GAPI_Assert(prod.shape() != cv::GNode::NodeShape::EMPTY);
+
+        ade::NodeHandle data_h = put_DataNode(origin);
+        if (prod.shape() == cv::GNode::NodeShape::CALL)
+        {
+            ade::NodeHandle call_h = put_OpNode(prod);
+            cv::gimpl::GModel::linkOut(m_g, call_h, data_h, origin.port);
+        }
+    }
+
+    // Mark graph data nodes as INPUTs and OUTPUTs respectively (according to the protocol)
+    for (const auto &arg : ins)
+    {
+        ade::NodeHandle nh = put_DataNode(proto::origin_of(arg));
+        m_g.metadata(nh).get<Data>().storage = Data::Storage::INPUT;
+    }
+    for (const auto &arg : outs)
+    {
+        ade::NodeHandle nh = put_DataNode(proto::origin_of(arg));
+        m_g.metadata(nh).get<Data>().storage = Data::Storage::OUTPUT;
+    }
+
+    // And, finally, store data object layout in meta
+    m_g.metadata().set(Layout{m_graph_data});
+
+    // After graph is generated, specify which data objects are actually
+    // computation entry/exit points.
+    using NodeDescr = std::pair<std::vector<RcDesc>,
+                                std::vector<ade::NodeHandle> >;
+
+    const auto get_proto_slots = [&](const GProtoArgs &proto) -> NodeDescr
+    {
+        NodeDescr slots;
+
+        slots.first.reserve(proto.size());
+        slots.second.reserve(proto.size());
+
+        for (const auto &arg : proto)
+        {
+            ade::NodeHandle nh = put_DataNode(proto::origin_of(arg));
+            const auto &desc = m_g.metadata(nh).get<Data>();
+            //These extra empty {} are to please GCC (-Wmissing-field-initializers)
+            slots.first.push_back(RcDesc{desc.rc, desc.shape, {}});
+            slots.second.push_back(nh);
+        }
+        return slots;
+    };
+
+    auto in_slots  = get_proto_slots(ins);
+    auto out_slots = get_proto_slots(outs);
+    return ProtoSlots{in_slots.first,  out_slots.first,
+                      in_slots.second, out_slots.second};
+}
+
+ade::NodeHandle cv::gimpl::GModelBuilder::put_OpNode(const cv::GNode &node)
+{
+    const auto& node_p = node.priv();
+    const auto  it     = m_graph_ops.find(&node_p);
+    if (it == m_graph_ops.end())
+    {
+        GAPI_Assert(node.shape() == GNode::NodeShape::CALL);
+        const auto &call_p = node.call().priv();
+        auto nh = cv::gimpl::GModel::mkOpNode(m_g, call_p.m_k, call_p.m_args, node_p.m_island);
+        m_graph_ops[&node_p] = nh;
+        return nh;
+    }
+    else return it->second;
+}
+
+// FIXME: rename to get_DataNode (and same for Op)
+ade::NodeHandle cv::gimpl::GModelBuilder::put_DataNode(const GOrigin &origin)
+{
+    const auto it = m_graph_data.find(origin);
+    if (it == m_graph_data.end())
+    {
+        auto nh = cv::gimpl::GModel::mkDataNode(m_g, origin);
+        m_graph_data[origin] = nh;
+        return nh;
+    }
+    else return it->second;
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodelbuilder.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gmodelbuilder.hpp
new file mode 100644 (file)
index 0000000..ce12c7e
--- /dev/null
@@ -0,0 +1,77 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GMODEL_BUILDER_HPP
+#define OPENCV_GAPI_GMODEL_BUILDER_HPP
+
+#include <map>
+#include <unordered_map>
+
+#include "opencv2/gapi/gproto.hpp"
+#include "opencv2/gapi/gcall.hpp"
+
+#include "api/gapi_priv.hpp"
+#include "api/gnode.hpp"
+#include "compiler/gmodel.hpp"
+
+namespace cv { namespace gimpl {
+
+struct Unrolled
+{
+    std::vector<cv::GNode> all_ops;
+    GOriginSet             all_data;
+
+    // NB.: Right now, as G-API operates with GMats only and
+    // GMats have no type or dimensions (when a computation is built),
+    // we track only origins (data links) with no additional meta.
+};
+
+// FIXME: GAPI_EXPORTS only because of tests!!!
+GAPI_EXPORTS Unrolled unrollExpr(const GProtoArgs &ins, const GProtoArgs &outs);
+
+// This class generates an ADE graph with G-API specific metadata
+// to represent user-specified computation in terms of graph model
+//
+// Resulting graph is built according to the following rules:
+// - Every operation is a node
+// - Every dynamic object (GMat) is a node
+// - Edges between nodes represent producer/consumer relationships
+//   between operations and data objects.
+// FIXME: GAPI_EXPORTS only because of tests!!!
+class GAPI_EXPORTS GModelBuilder
+{
+    GModel::Graph m_g;
+
+    // Mappings of G-API user framework entities to ADE node handles
+    std::unordered_map<const cv::GNode::Priv*, ade::NodeHandle> m_graph_ops;
+    GOriginMap<ade::NodeHandle> m_graph_data;
+
+    // Internal methods for mapping APIs into ADE during put()
+    ade::NodeHandle put_OpNode(const cv::GNode &node);
+    ade::NodeHandle put_DataNode(const cv::GOrigin &origin);
+
+public:
+    explicit GModelBuilder(ade::Graph &g);
+
+    // TODO: replace GMat with a generic type
+    // TODO: Cover with tests! (as the rest of internal stuff)
+    // FIXME: Calling this method multiple times is currently UB
+    // TODO: add a semantic link between "ints" returned and in-model data IDs.
+    typedef std::tuple<std::vector<RcDesc>,
+                       std::vector<RcDesc>,
+                       std::vector<ade::NodeHandle>,
+                       std::vector<ade::NodeHandle> > ProtoSlots;
+
+    ProtoSlots put(const GProtoArgs &ins, const GProtoArgs &outs);
+
+protected:
+    ade::NodeHandle opNode(cv::GMat gmat);
+};
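+
+// Example (an illustrative sketch; `ins`/`outs` are the GProtoArgs of a
+// GComputation's protocol):
+//
+//     ade::Graph graph;
+//     cv::gimpl::GModel::Graph gm(graph);
+//     cv::gimpl::GModel::init(gm);      // required before put()
+//     cv::gimpl::GModelBuilder builder(graph);
+//     auto slots = builder.put(ins, outs);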
+
+}} // namespace cv::gimpl
+
+#endif // OPENCV_GAPI_GMODEL_BUILDER_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gobjref.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/gobjref.hpp
new file mode 100644 (file)
index 0000000..be365c9
--- /dev/null
@@ -0,0 +1,50 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GMATREF_HPP
+#define OPENCV_GAPI_GMATREF_HPP
+
+#include "opencv2/gapi/util/variant.hpp"
+#include "opencv2/gapi/garg.hpp"
+
+#include "api/gapi_priv.hpp" // GShape, HostCtor
+
+namespace cv
+{
+
+namespace gimpl
+{
+    struct RcDesc
+    {
+        int      id;      // id is unique but local to shape
+        GShape   shape;   // pair <id,shape> IS the unique ID
+        HostCtor ctor;    // FIXME: is it really used here? Or in <Data>?
+
+        bool operator==(const RcDesc &rhs) const
+        {
+            // FIXME: ctor is not checked (should be?)
+            return id == rhs.id && shape == rhs.shape;
+        }
+
+        bool operator< (const RcDesc &rhs) const
+        {
+            return (id == rhs.id) ? shape < rhs.shape : id < rhs.id;
+        }
+    };
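+
+    // Example (illustrative): objects of different shapes may share ids
+    // but compare unequal:
+    //
+    //     RcDesc a{0, GShape::GMAT,    {}};
+    //     RcDesc b{0, GShape::GSCALAR, {}};
+    //     assert(!(a == b)); // same id, different shape => different objects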
+} // gimpl
+
+namespace detail
+{
+    template<> struct GTypeTraits<cv::gimpl::RcDesc>
+    {
+        static constexpr const ArgKind kind = ArgKind::GOBJREF;
+    };
+}
+
+} // cv
+
+#endif // OPENCV_GAPI_GMATREF_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/dump_dot.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/dump_dot.cpp
new file mode 100644 (file)
index 0000000..8741089
--- /dev/null
@@ -0,0 +1,223 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <iostream>                              // cout
+#include <sstream>                               // stringstream
+#include <fstream>                               // ofstream
+
+#include <ade/passes/check_cycles.hpp>
+
+#include "opencv2/gapi/gproto.hpp"
+#include "compiler/gmodel.hpp"
+#include "compiler/gislandmodel.hpp"
+#include "compiler/passes/passes.hpp"
+
+namespace cv { namespace gimpl { namespace passes {
+
+// TODO: FIXME: Ideally all this low-level stuff with accessing ADE APIs directly
+// should be encapsulated somewhere in GModel, so here we'd operate not
+// with raw nodes and edges, but with the Operations and Data they produce/consume.
+void dumpDot(const ade::Graph &g, std::ostream& os)
+{
+    GModel::ConstGraph gr(g);
+
+    const std::unordered_map<cv::GShape, std::string> data_labels = {
+        {cv::GShape::GMAT,    "GMat"},
+        {cv::GShape::GSCALAR, "GScalar"},
+        {cv::GShape::GARRAY,  "GArray"},
+    };
+
+    auto format_op_label  = [&gr](ade::NodeHandle nh) -> std::string {
+        std::stringstream ss;
+        const cv::GKernel k = gr.metadata(nh).get<Op>().k;
+        ss << k.name << "_" << nh;
+        return ss.str();
+    };
+
+    auto format_op  = [&format_op_label](ade::NodeHandle nh) -> std::string {
+        return "\"" + format_op_label(nh) + "\"";
+    };
+
+    auto format_obj = [&gr, &data_labels](ade::NodeHandle nh) -> std::string {
+        std::stringstream ss;
+        const auto &data = gr.metadata(nh).get<Data>();
+        ss << data_labels.at(data.shape) << "_" << data.rc;
+        return ss.str();
+    };
+
+    auto format_log = [&gr](ade::NodeHandle nh, const std::string &obj_name) {
+        std::stringstream ss;
+        const auto &msgs = gr.metadata(nh).get<Journal>().messages;
+        ss << "xlabel=\"";
+        if (!obj_name.empty()) { ss << "*** " << obj_name << " ***:\n"; };
+        for (const auto &msg : msgs) { ss << msg << "\n"; }
+        ss << "\"";
+        return ss.str();
+    };
+
+    // FIXME:
+    // Unify with format_log
+    auto format_log_e = [&gr](ade::EdgeHandle nh) {
+        std::stringstream ss;
+        const auto &msgs = gr.metadata(nh).get<Journal>().messages;
+        for (const auto &msg : msgs) { ss << "\n" << msg; }
+        return ss.str();
+    };
+
+    auto sorted = gr.metadata().get<ade::passes::TopologicalSortData>();
+
+    os << "digraph GAPI_Computation {\n";
+
+    // Prior to dumping the graph itself, list Data and Op nodes individually
+    // and put type information in labels.
+    // Also prepare list of nodes in islands, if any
+    std::map<std::string, std::vector<std::string> > islands;
+    for (auto &nh : sorted.nodes())
+    {
+        const auto node_type = gr.metadata(nh).get<NodeType>().t;
+        if (NodeType::DATA == node_type)
+        {
+            const auto obj_data = gr.metadata(nh).get<Data>();
+            const auto obj_name = format_obj(nh);
+
+            os << obj_name << " [label=\"" << obj_name << "\n" << obj_data.meta << "\"";
+            if (gr.metadata(nh).contains<Journal>()) { os << ", " << format_log(nh, obj_name); }
+            os << " ]\n";
+
+            if (gr.metadata(nh).contains<Island>())
+                islands[gr.metadata(nh).get<Island>().island].push_back(obj_name);
+        }
+        else if (NodeType::OP == gr.metadata(nh).get<NodeType>().t)
+        {
+            const auto obj_name       = format_op(nh);
+            const auto obj_name_label = format_op_label(nh);
+
+            os << obj_name << " [label=\"" << obj_name_label << "\"";
+            if (gr.metadata(nh).contains<Journal>()) { os << ", " << format_log(nh, obj_name_label); }
+            os << " ]\n";
+
+            if (gr.metadata(nh).contains<Island>())
+                islands[gr.metadata(nh).get<Island>().island].push_back(obj_name);
+        }
+    }
+
+    // Then, dump Islands (only nodes, operations and data, without links)
+    for (const auto &isl : islands)
+    {
+        os << "subgraph \"cluster " + isl.first << "\" {\n";
+        for(auto isl_node : isl.second) os << isl_node << ";\n";
+        os << "label=\"" << isl.first << "\";";
+        os << "}\n";
+    }
+
+    // Now dump the graph
+    for (auto &nh : sorted.nodes())
+    {
+        // FIXME: Alan Kay probably hates me.
+        switch (gr.metadata(nh).get<NodeType>().t)
+        {
+        case NodeType::DATA:
+        {
+            const auto obj_name = format_obj(nh);
+            for (const auto &eh : nh->outEdges())
+            {
+                os << obj_name << " -> " << format_op(eh->dstNode())
+                   << " [ label = \"in_port: "
+                   << gr.metadata(eh).get<Input>().port;
+                   if (gr.metadata(eh).contains<Journal>()) { os << format_log_e(eh); }
+                   os << "\" ] \n";
+            }
+        }
+        break;
+        case NodeType::OP:
+        {
+            for (const auto &eh : nh->outEdges())
+            {
+                os << format_op(nh) << " -> " << format_obj(eh->dstNode())
+                   << " [ label = \"out_port: "
+                   << gr.metadata(eh).get<Output>().port
+                   << " \" ]; \n";
+            }
+        }
+        break;
+        default: GAPI_Assert(false);
+        }
+    }
+
+    // And finally dump a GIslandModel (not connected with GModel directly,
+    // but projected in the same .dot file side-by-side)
+    auto pIG = gr.metadata().get<IslandModel>().model;
+    GIslandModel::Graph gim(*pIG);
+    for (auto nh : gim.nodes())
+    {
+        switch (gim.metadata(nh).get<NodeKind>().k)
+        {
+        case NodeKind::ISLAND:
+            {
+                const auto island   = gim.metadata(nh).get<FusedIsland>().object;
+                const auto isl_name = "\"" + island->name() + "\"";
+                for (auto out_nh : nh->outNodes())
+                {
+                    os << isl_name << " -> \"slot:"
+                       << format_obj(gim.metadata(out_nh).get<DataSlot>()
+                                     .original_data_node)
+                       << "\"\n";
+                }
+            }
+            break;
+
+        case NodeKind::SLOT:
+            {
+                const auto obj_name = format_obj(gim.metadata(nh).get<DataSlot>()
+                                                 .original_data_node);
+                for (auto cons_nh : nh->outNodes())
+                {
+                    os << "\"slot:" << obj_name << "\" -> \""
+                       << gim.metadata(cons_nh).get<FusedIsland>().object->name()
+                       << "\"\n";
+                }
+            }
+            break;
+
+        default:
+            GAPI_Assert(false);
+            break;
+        }
+    }
+
+    os << "}" << std::endl;
+}
+
+void dumpDot(ade::passes::PassContext &ctx, std::ostream& os)
+{
+    dumpDot(ctx.graph, os);
+}
+
+void dumpDotStdout(ade::passes::PassContext &ctx)
+{
+    dumpDot(ctx, std::cout);
+}
+
+void dumpDotToFile(ade::passes::PassContext &ctx, const std::string& dump_path)
+{
+    std::ofstream dump_file(dump_path);
+
+    if (dump_file.is_open())
+    {
+        dumpDot(ctx, dump_file);
+        dump_file << std::endl;
+    }
+}
+
+void dumpGraph(ade::passes::PassContext &ctx, const std::string& dump_path)
+{
+    dump_path.empty() ? dumpDotStdout(ctx) : dumpDotToFile(ctx, dump_path);
+}
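+
+// Example (an illustrative, hand-written fragment of the emitted DOT; real
+// operation names combine the kernel name with the ade node handle value):
+//
+//     digraph GAPI_Computation {
+//     GMat_0 [label="GMat_0\n<meta>" ]
+//     "add_0x..." [label="add_0x..."]
+//     GMat_0 -> "add_0x..." [ label = "in_port: 0" ]
+//     "add_0x..." -> GMat_1 [ label = "out_port: 0 " ];
+//     }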
+
+}}} // cv::gimpl::passes
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/exec.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/exec.cpp
new file mode 100644 (file)
index 0000000..7119e34
--- /dev/null
@@ -0,0 +1,641 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <string>
+#include <list> // list
+#include <iomanip>  // setw, etc
+#include <fstream> // ofstream
+#include <memory>
+#include <functional>
+
+#include <ade/util/algorithm.hpp>   // contains
+#include <ade/util/chain_range.hpp> // chain
+
+#include "opencv2/gapi/util/optional.hpp"  // util::optional
+#include "logger.hpp"    // GAPI_LOG
+
+#include "compiler/gmodel.hpp"
+#include "compiler/gislandmodel.hpp"
+#include "compiler/passes/passes.hpp"
+#include "compiler/passes/helpers.hpp"
+#include "compiler/transactions.hpp"
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// N.B.
+// Merge is a binary operation (LHS `Merge` RHS) where LHS may be arbitrary
+//
+// After every merge, the procedure starts from the beginning (in the topological
+// order), thus trying to merge the next "unmerged" island into the latest merged one.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Uncomment to dump more info on merge process
+// FIXME: make it user-configurable run-time option
+// #define DEBUG_MERGE
+
+namespace cv
+{
+namespace gimpl
+{
+namespace
+{
+    bool fusionIsTrivial(const ade::Graph &src_graph)
+    {
+        // Fusion is considered trivial if there is only one
+        // active backend and there are no user-defined islands
+        // FIXME:
+        // Also check the cases backend can't handle
+        // (e.g. a GScalar connecting two fluid ops should split the graph)
+        const GModel::ConstGraph g(src_graph);
+        const auto& active_backends = g.metadata().get<ActiveBackends>().backends;
+        return active_backends.size() == 1 &&
+                ade::util::all_of(g.nodes(), [&](ade::NodeHandle nh) {
+            return !g.metadata(nh).contains<Island>();
+        });
+    }
+
+    void fuseTrivial(GIslandModel::Graph &g, const ade::Graph &src_graph)
+    {
+        const GModel::ConstGraph src_g(src_graph);
+
+        const auto& backend = *src_g.metadata().get<ActiveBackends>().backends.cbegin();
+        const auto& proto = src_g.metadata().get<Protocol>();
+        GIsland::node_set all, in_ops, out_ops;
+
+        all.insert(src_g.nodes().begin(), src_g.nodes().end());
+
+        for (const auto nh : proto.in_nhs)
+        {
+            all.erase(nh);
+            in_ops.insert(nh->outNodes().begin(), nh->outNodes().end());
+        }
+        for (const auto nh : proto.out_nhs)
+        {
+            all.erase(nh);
+            out_ops.insert(nh->inNodes().begin(), nh->inNodes().end());
+        }
+
+        auto isl = std::make_shared<GIsland>(backend,
+                                             std::move(all),
+                                             std::move(in_ops),
+                                             std::move(out_ops),
+                                             util::optional<std::string>{});
+
+        auto ih = GIslandModel::mkIslandNode(g, std::move(isl));
+
+        for (const auto nh : proto.in_nhs)
+        {
+            auto slot = GIslandModel::mkSlotNode(g, nh);
+            g.link(slot, ih);
+        }
+        for (const auto nh : proto.out_nhs)
+        {
+            auto slot = GIslandModel::mkSlotNode(g, nh);
+            g.link(ih, slot);
+        }
+    }
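+
+    // Example (illustrative): for a protocol {in: GMat_0, out: GMat_1}
+    // wrapped around a single active backend, trivial fusion produces an
+    // island model of the form:
+    //
+    //     [slot: GMat_0] -> (island: all ops) -> [slot: GMat_1]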
+
+    struct MergeContext
+    {
+        using CycleCausers = std::pair< std::shared_ptr<GIsland>,
+                                        std::shared_ptr<GIsland> >;
+
+        struct CycleHasher final
+        {
+            std::size_t operator()(const CycleCausers& p) const
+            {
+                std::size_t a = std::hash<GIsland*>()(p.first.get());
+                std::size_t b = std::hash<GIsland*>()(p.second.get());
+                return a ^ (b << 1);
+            }
+        };
+
+        // Set of Islands pairs which cause a cycle if merged.
+        // Every new merge produces a new Island, and if Islands were
+        // merged (and thus dropped from GIslandModel), the objects may
+        // still be alive as included into this set.
+        std::unordered_set<CycleCausers, CycleHasher> cycle_causers;
+    };
+
+    bool canMerge(const GIslandModel::Graph &g,
+                  const ade::NodeHandle a_nh,
+                  const ade::NodeHandle /*slot_nh*/,
+                  const ade::NodeHandle b_nh,
+                  const MergeContext &ctx = MergeContext())
+    {
+        auto a_ptr = g.metadata(a_nh).get<FusedIsland>().object;
+        auto b_ptr = g.metadata(b_nh).get<FusedIsland>().object;
+        GAPI_Assert(a_ptr.get());
+        GAPI_Assert(b_ptr.get());
+
+        // Islands with different affinity can't be merged
+        if (a_ptr->backend() != b_ptr->backend())
+            return false;
+
+        // Islands which cause a cycle can't be merged as well
+        // (since the flag is set, the procedure already tried to
+        // merge these islands in the past)
+        if (ade::util::contains(ctx.cycle_causers, std::make_pair(a_ptr, b_ptr))||
+            ade::util::contains(ctx.cycle_causers, std::make_pair(b_ptr, a_ptr)))
+            return false;
+
+        // There may be user-defined islands. Initially user-defined
+        // islands are also built from single operations and then merged
+        // by this procedure, but there are some exceptions.
+        // User-specified island can't be merged to an internal island
+        if (   ( a_ptr->is_user_specified() && !b_ptr->is_user_specified())
+            || (!a_ptr->is_user_specified() &&  b_ptr->is_user_specified()))
+        {
+            return false;
+        }
+        else if (a_ptr->is_user_specified() && b_ptr->is_user_specified())
+        {
+            // These islands are _different_ user-specified Islands
+            // FIXME: today it may only differ by name
+            if (a_ptr->name() != b_ptr->name())
+                return false;
+        }
+
+        // FIXME: add a backend-specified merge checker
+        return true;
+    }
+
+    inline bool isProducedBy(const ade::NodeHandle &slot,
+                             const ade::NodeHandle &island)
+    {
+        // A data slot may have only 0 or 1 producer
+        if (slot->inNodes().size() == 0)
+            return false;
+
+        return slot->inNodes().front() == island;
+    }
+
+    inline bool isConsumedBy(const ade::NodeHandle &slot,
+                             const ade::NodeHandle &island)
+    {
+        auto it = std::find_if(slot->outNodes().begin(),
+                               slot->outNodes().end(),
+                               [&](const ade::NodeHandle &nh) {
+                                   return nh == island;
+                               });
+        return it != slot->outNodes().end();
+    }
+
+    /**
+     * Find a candidate Island for merge for the given Island nh.
+     *
+     * @param g Island Model where merge occurs
+     * @param nh GIsland node, either LHS or RHS of probable merge
+     * @param ctx Merge context, may contain some cached stuff to avoid
+     *      double/triple/etc checking
+     * @return Tuple of Island handle, Data slot handle (which connects them),
+     *      and a position of found handle with respect to nh (IN/OUT)
+     */
+    std::tuple<ade::NodeHandle, ade::NodeHandle, Direction>
+    findCandidate(const GIslandModel::Graph &g,
+                  ade::NodeHandle nh,
+                  const MergeContext &ctx = MergeContext())
+    {
+        using namespace std::placeholders;
+
+        // Find a first matching candidate GIsland for merge
+        // among inputs
+        for (const auto& input_data_nh : nh->inNodes())
+        {
+            if (input_data_nh->inNodes().size() != 0)
+            {
+                // Data node must have a single producer only
+                GAPI_DbgAssert(input_data_nh->inNodes().size() == 1);
+                auto input_data_prod_nh = input_data_nh->inNodes().front();
+                if (canMerge(g, input_data_prod_nh, input_data_nh, nh, ctx))
+                    return std::make_tuple(input_data_prod_nh,
+                                           input_data_nh,
+                                           Direction::In);
+            }
+        } // for(inNodes)
+
+        // Ok, now try to find it among the outputs
+        for (const auto& output_data_nh : nh->outNodes())
+        {
+            auto mergeTest = [&](ade::NodeHandle cons_nh) -> bool {
+                return canMerge(g, nh, output_data_nh, cons_nh, ctx);
+            };
+            auto cand_it = std::find_if(output_data_nh->outNodes().begin(),
+                                        output_data_nh->outNodes().end(),
+                                        mergeTest);
+            if (cand_it != output_data_nh->outNodes().end())
+                return std::make_tuple(*cand_it,
+                                       output_data_nh,
+                                       Direction::Out);
+        } // for(outNodes)
+        // Empty handle, no good candidates
+        return std::make_tuple(ade::NodeHandle(),
+                               ade::NodeHandle(),
+                               Direction::Invalid);
+    }
+
+    // A cancellable merge of two GIslands, "a" and "b", connected via "slot"
+    class MergeAction
+    {
+        ade::Graph &m_g;
+        const ade::Graph &m_orig_g;
+        GIslandModel::Graph m_gim;
+        ade::NodeHandle m_prod;
+        ade::NodeHandle m_slot;
+        ade::NodeHandle m_cons;
+
+        Change::List m_changes;
+
+        struct MergeObjects
+        {
+            using NS = GIsland::node_set;
+            NS all;      // same as in GIsland
+            NS in_ops;   // same as in GIsland
+            NS out_ops;  // same as in GIsland
+            NS opt_interim_slots;    // can be dropped (optimized out)
+            NS non_opt_interim_slots;// can't be dropped (extern. link)
+        };
+        MergeObjects identify() const;
+
+    public:
+        MergeAction(ade::Graph &g,
+                    const ade::Graph &orig_g,
+                    ade::NodeHandle prod,
+                    ade::NodeHandle slot,
+                    ade::NodeHandle cons)
+            : m_g(g)
+            , m_orig_g(orig_g)
+            , m_gim(GIslandModel::Graph(m_g))
+            , m_prod(prod)
+            , m_slot(slot)
+            , m_cons(cons)
+        {
+        }
+
+        void tryMerge(); // Try to merge islands Prod and Cons
+        void rollback(); // Roll-back changes if merge has been done but broke the model
+        void commit();   // Fix changes in the model after successful merge
+    };
+
+    // The merge procedure is a replacement of two Islands, Prod and Cons,
+    // connected via !Slot!, with a new Island which contains all Prod
+    // nodes + all Cons nodes, reconnected in the graph properly:
+    //
+    // Merge(Prod, !Slot!, Cons)
+    //
+    //                                  [Slot 2]
+    //                                    :
+    //                                    v
+    //     ... [Slot 0] -> Prod -> !Slot! -> Cons -> [Slot 3] -> ...
+    //     ... [Slot 1] -'           :           '-> [Slot 4] -> ...
+    //                               V
+    //                              ...
+    // results into:
+    //
+    //     ... [Slot 0] -> Merged  -> [Slot 3] ...
+    //     ... [Slot 1] :         :-> [Slot 4] ...
+    //     ... [Slot 2] '         '-> !Slot! ...
+    //
+    // The rules are the following:
+    // 1) All Prod input slots become Merged input slots;
+    //    - Example: Slot 0 Slot 1
+    // 2) Any Cons input slots which come from Islands different to Prod
+    //    also become Merged input slots;
+    //    - Example: Slot 2
+    // 3) All Cons output slots become Merged output slots;
+    //    - Example: Slot 3, Slot 4
+    // 4) All Prod output slots which are not consumed by Cons
+    //    also become Merged output slots;
+    //    - (not shown on the example)
+    // 5) If the !Slot! which connects Prod and Cons is consumed
+    //    exclusively by Cons, it is optimized out (dropped) from the model;
+    // 6) If the !Slot! is used by consumers other than Cons, it
+    //    becomes an extra output of Merged;
+    // 7) !Slot! may not be the only connection between Prod and Cons,
+    //    but as a result of the merge operation, all such connections
+    //    should be handled as described for !Slot!.
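+    //
+    // The rule numbers (1)..(7) above are referenced from the code of
+    // MergeAction::identify() below.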
+
+    MergeAction::MergeObjects MergeAction::identify() const
+    {
+        auto lhs = m_gim.metadata(m_prod).get<FusedIsland>().object;
+        auto rhs = m_gim.metadata(m_cons).get<FusedIsland>().object;
+
+        GIsland::node_set interim_slots;
+
+        GIsland::node_set merged_all(lhs->contents());
+        merged_all.insert(rhs->contents().begin(), rhs->contents().end());
+
+        GIsland::node_set merged_in_ops(lhs->in_ops());     // (1)
+        for (auto cons_in_slot_nh : m_cons->inNodes())      // (2)
+        {
+            if (isProducedBy(cons_in_slot_nh, m_prod))
+            {
+                interim_slots.insert(cons_in_slot_nh);
+                // at this point, interim_slots is not in sync with merged_all
+                // (merged_all will be updated with the interim_slots
+                // later, see below).
+            }
+            else
+            {
+                const auto extra_in_ops = rhs->consumers(m_g, cons_in_slot_nh);
+                merged_in_ops.insert(extra_in_ops.begin(), extra_in_ops.end());
+            }
+        }
+
+        GIsland::node_set merged_out_ops(rhs->out_ops());   // (3)
+        for (auto prod_out_slot_nh : m_prod->outNodes())    // (4)
+        {
+            if (!isConsumedBy(prod_out_slot_nh, m_cons))
+            {
+                merged_out_ops.insert(lhs->producer(m_g, prod_out_slot_nh));
+            }
+        }
+
+        // (5,6,7)
+        GIsland::node_set opt_interim_slots;
+        GIsland::node_set non_opt_interim_slots;
+
+        auto is_non_opt = [&](const ade::NodeHandle &slot_nh) {
+            // If the data object associated with this slot is a part
+            // of the GComputation _output_ protocol, it can't be optimized out
+            const auto data_nh = m_gim.metadata(slot_nh).get<DataSlot>().original_data_node;
+            const auto &data = GModel::ConstGraph(m_orig_g).metadata(data_nh).get<Data>();
+            if (data.storage == Data::Storage::OUTPUT)
+                return true;
+
+            // Otherwise, a non-optimizable data slot is one consumed
+            // by some island other than "cons"
+            const auto it = std::find_if(slot_nh->outNodes().begin(),
+                                         slot_nh->outNodes().end(),
+                                         [&](ade::NodeHandle &&nh)
+                                         {return nh != m_cons;});
+            return it != slot_nh->outNodes().end();
+        };
+        for (auto slot_nh : interim_slots)
+        {
+            // Put all intermediate data nodes (both those optimized out
+            // and those which are not) into the Island contents.
+            merged_all.insert(m_gim.metadata(slot_nh)
+                              .get<DataSlot>().original_data_node);
+
+            GIsland::node_set &dst = is_non_opt(slot_nh)
+                ? non_opt_interim_slots // there are consumers other than m_cons
+                : opt_interim_slots;    // there are no consumers other than m_cons
+            dst.insert(slot_nh);
+        }
+
+        // (4+6).
+        // BTW, (4) could be "All Prod output slots read NOT ONLY by Cons"
+        for (auto non_opt_slot_nh : non_opt_interim_slots)
+        {
+            merged_out_ops.insert(lhs->producer(m_g, non_opt_slot_nh));
+        }
+        return MergeObjects{
+            merged_all, merged_in_ops, merged_out_ops,
+            opt_interim_slots, non_opt_interim_slots
+        };
+    }
+
+    // FIXME(DM): Probably this procedure will be refactored dramatically one day...
+    void MergeAction::tryMerge()
+    {
+        // _: Understand the contents and I/O connections of a new merged Island
+        MergeObjects mo = identify();
+        auto lhs_obj = m_gim.metadata(m_prod).get<FusedIsland>().object;
+        auto rhs_obj = m_gim.metadata(m_cons).get<FusedIsland>().object;
+        GAPI_Assert(   ( lhs_obj->is_user_specified() &&  rhs_obj->is_user_specified())
+                    || (!lhs_obj->is_user_specified() && !rhs_obj->is_user_specified()));
+        cv::util::optional<std::string> maybe_user_tag;
+        if (lhs_obj->is_user_specified() && rhs_obj->is_user_specified())
+        {
+            GAPI_Assert(lhs_obj->name() == rhs_obj->name());
+            maybe_user_tag = cv::util::make_optional(lhs_obj->name());
+        }
+
+        // A: Create a new Island and add it to the graph
+        auto backend = m_gim.metadata(m_prod).get<FusedIsland>()
+            .object->backend();
+        auto merged = std::make_shared<GIsland>(backend,
+                                                std::move(mo.all),
+                                                std::move(mo.in_ops),
+                                                std::move(mo.out_ops),
+                                                std::move(maybe_user_tag));
+        // FIXME: move this debugging to some user-controllable log-level
+#ifdef DEBUG_MERGE
+        merged->debug();
+#endif
+        auto new_nh = GIslandModel::mkIslandNode(m_gim, std::move(merged));
+        m_changes.enqueue<Change::NodeCreated>(new_nh);
+
+        // B: Disconnect all Prod's input Slots from Prod,
+        //    connect them to Merged
+        std::vector<ade::EdgeHandle> input_edges(m_prod->inEdges().begin(),
+                                                 m_prod->inEdges().end());
+        for (auto in_edge : input_edges)
+        {
+            m_changes.enqueue<Change::NewLink>(m_g, in_edge->srcNode(), new_nh);
+            m_changes.enqueue<Change::DropLink>(m_g, m_prod, in_edge);
+        }
+
+        // C: Disconnect all Cons' output Slots from Cons,
+        //    connect them to Merged
+        std::vector<ade::EdgeHandle> output_edges(m_cons->outEdges().begin(),
+                                                  m_cons->outEdges().end());
+        for (auto out_edge : output_edges)
+        {
+            m_changes.enqueue<Change::NewLink>(m_g, new_nh, out_edge->dstNode());
+            m_changes.enqueue<Change::DropLink>(m_g, m_cons, out_edge);
+        }
+
+        // D: Process the intermediate slots (between Prod and Cons).
+        // D/1 - Those which are optimized out are just removed from the model
+        for (auto opt_slot_nh : mo.opt_interim_slots)
+        {
+            GAPI_Assert(1      == opt_slot_nh->inNodes().size() );
+            GAPI_Assert(m_prod == opt_slot_nh->inNodes().front());
+
+            std::vector<ade::EdgeHandle> edges_to_drop;
+            ade::util::copy(ade::util::chain(opt_slot_nh->inEdges(),
+                                             opt_slot_nh->outEdges()),
+                            std::back_inserter(edges_to_drop));
+            for (auto eh : edges_to_drop)
+            {
+                m_changes.enqueue<Change::DropLink>(m_g, opt_slot_nh, eh);
+            }
+            m_changes.enqueue<Change::DropNode>(opt_slot_nh);
+        }
+        // D/2 - Those which are used externally are connected to new nh
+        //       as outputs.
+        for (auto non_opt_slot_nh : mo.non_opt_interim_slots)
+        {
+            GAPI_Assert(1      == non_opt_slot_nh->inNodes().size() );
+            GAPI_Assert(m_prod == non_opt_slot_nh->inNodes().front());
+            m_changes.enqueue<Change::DropLink>(m_g, non_opt_slot_nh,
+                                                non_opt_slot_nh->inEdges().front());
+
+            std::vector<ade::EdgeHandle> edges_to_probably_drop
+                (non_opt_slot_nh->outEdges().begin(),
+                 non_opt_slot_nh->outEdges().end());
+            for (auto eh : edges_to_probably_drop)
+            {
+                if (eh->dstNode() == m_cons)
+                {
+                    // drop only the edges to m_cons, as there are other consumers
+                    m_changes.enqueue<Change::DropLink>(m_g, non_opt_slot_nh, eh);
+                }
+            }
+            m_changes.enqueue<Change::NewLink>(m_g, new_nh, non_opt_slot_nh);
+        }
+
+        // E. All Prod's output edges which are directly related to the merge
+        //    (i.e. connected to Cons) were processed at step (D).
+        //    Relink the remaining output links
+        std::vector<ade::EdgeHandle>  prod_extra_out_edges
+            (m_prod->outEdges().begin(),
+             m_prod->outEdges().end());
+        for (auto extra_out : prod_extra_out_edges)
+        {
+            m_changes.enqueue<Change::NewLink>(m_g, new_nh, extra_out->dstNode());
+            m_changes.enqueue<Change::DropLink>(m_g, m_prod, extra_out);
+        }
+
+        // F. All Cons' input edges which are directly related to the merge
+        //    (i.e. connected to Prod) were processed at step (D) as well;
+        //    the remaining ones should become the Merged island's input edges
+        std::vector<ade::EdgeHandle> cons_extra_in_edges
+            (m_cons->inEdges().begin(),
+             m_cons->inEdges().end());
+        for (auto extra_in : cons_extra_in_edges)
+        {
+            m_changes.enqueue<Change::NewLink>(m_g, extra_in->srcNode(), new_nh);
+            m_changes.enqueue<Change::DropLink>(m_g, m_cons, extra_in);
+        }
+
+        // G. Finally, drop the original Island nodes. DropNode() does
+        //    the sanity check for us (both nodes should have 0 edges).
+        m_changes.enqueue<Change::DropNode>(m_prod);
+        m_changes.enqueue<Change::DropNode>(m_cons);
+    }
+
+    void MergeAction::rollback()
+    {
+        m_changes.rollback(m_g);
+    }
+    void MergeAction::commit()
+    {
+        m_changes.commit(m_g);
+    }
+
+#ifdef DEBUG_MERGE
+    void merge_debug(const ade::Graph &g, int iteration)
+    {
+        std::stringstream filename;
+        filename << "fusion_" << static_cast<const void*>(&g)
+                 << "_" << std::setw(4) << std::setfill('0') << iteration
+                 << ".dot";
+        std::ofstream ofs(filename.str());
+        passes::dumpDot(g, ofs);
+    }
+#endif
+
+    void fuseGeneral(ade::Graph& im, const ade::Graph& g)
+    {
+        GIslandModel::Graph gim(im);
+        MergeContext mc;
+
+        bool there_was_a_merge = false;
+        std::size_t iteration = 0u;
+        do
+        {
+            there_was_a_merge = false;
+
+            // FIXME: move this debugging to some user-controllable log level
+    #ifdef DEBUG_MERGE
+            GAPI_LOG_INFO(NULL, "Before next merge attempt " << iteration << "...");
+            merge_debug(g, iteration);
+    #endif
+            iteration++;
+            auto sorted = pass_helpers::topoSort(im);
+            for (auto nh : sorted)
+            {
+                if (NodeKind::ISLAND == gim.metadata(nh).get<NodeKind>().k)
+                {
+                    ade::NodeHandle cand_nh;
+                    ade::NodeHandle cand_slot;
+                    Direction dir = Direction::Invalid;
+                    std::tie(cand_nh, cand_slot, dir) = findCandidate(gim, nh, mc);
+                    if (cand_nh != nullptr && dir != Direction::Invalid)
+                    {
+                        auto lhs_nh = (dir == Direction::In  ? cand_nh : nh);
+                        auto rhs_nh = (dir == Direction::Out ? cand_nh : nh);
+
+                        auto l_obj = gim.metadata(lhs_nh).get<FusedIsland>().object;
+                        auto r_obj = gim.metadata(rhs_nh).get<FusedIsland>().object;
+                        GAPI_LOG_INFO(NULL, r_obj->name() << " can be merged into " << l_obj->name());
+                        // Try to do a merge. If the merge was successful, check if the
+                        // graph has cycles (cycles are prohibited at this point).
+                        // If there are cycles, roll back the merge and mark this pair
+                        // of Islands with a special tag - "cycle-causing".
+                        MergeAction action(im, g, lhs_nh, cand_slot, rhs_nh);
+                        action.tryMerge();
+                        if (pass_helpers::hasCycles(im))
+                        {
+                            GAPI_LOG_INFO(NULL,
+                                          "merge(" << l_obj->name() << "," << r_obj->name() <<
+                                          ") caused cycle, rolling back...");
+                            action.rollback();
+                            // don't try to merge these two islands next time (findCandidate will use that)
+                            mc.cycle_causers.insert({l_obj, r_obj});
+                        }
+                        else
+                        {
+                            GAPI_LOG_INFO(NULL,
+                                          "merge(" << l_obj->name() << "," << r_obj->name() <<
+                                          ") was successful!");
+                            action.commit();
+    #ifdef DEBUG_MERGE
+                            GIslandModel::syncIslandTags(gim, g);
+    #endif
+                            there_was_a_merge = true;
+                            break; // start do{}while from the beginning
+                        }
+                    } // if(can merge)
+                } // if(ISLAND)
+            } // for(all nodes)
+        }
+        while (there_was_a_merge);
+    }
+}  // anonymous namespace
+
+void passes::fuseIslands(ade::passes::PassContext &ctx)
+{
+    std::shared_ptr<ade::Graph> gptr(new ade::Graph);
+    GIslandModel::Graph gim(*gptr);
+
+    if (fusionIsTrivial(ctx.graph))
+    {
+        fuseTrivial(gim, ctx.graph);
+    }
+    else
+    {
+        GIslandModel::generateInitial(gim, ctx.graph);
+        fuseGeneral(*gptr.get(), ctx.graph);
+    }
+    GModel::Graph(ctx.graph).metadata().set(IslandModel{std::move(gptr)});
+}
+
+void passes::syncIslandTags(ade::passes::PassContext &ctx)
+{
+    GModel::Graph gm(ctx.graph);
+    std::shared_ptr<ade::Graph> gptr(gm.metadata().get<IslandModel>().model);
+    GIslandModel::Graph gim(*gptr);
+    GIslandModel::syncIslandTags(gim, ctx.graph);
+}
+}} // namespace cv::gimpl
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/helpers.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/helpers.cpp
new file mode 100644 (file)
index 0000000..60bf36a
--- /dev/null
@@ -0,0 +1,122 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <algorithm>     // copy
+#include <unordered_map>
+#include <unordered_set>
+
+#include <ade/util/filter_range.hpp>
+
+#include "opencv2/gapi/own/assert.hpp" // GAPI_Assert
+#include "compiler/passes/helpers.hpp"
+
+namespace {
+namespace Cycles
+{
+    // FIXME: This code is taken directly from ADE.
+    // export a bool(ade::Graph) function with pass instead
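+    //
+    // Classic DFS coloring: "visiting" marks a node currently on the DFS
+    // stack (grey), "visited" marks a fully processed one (black); a back
+    // edge into a "visiting" node indicates a cycle.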
+    enum class TraverseState
+    {
+        visiting,
+        visited,
+    };
+    using state_t = std::unordered_map<ade::Node*, TraverseState>;
+
+    bool inline checkCycle(state_t& state, const ade::NodeHandle& node)
+    {
+        GAPI_Assert(nullptr != node);
+        state[node.get()] = TraverseState::visiting;
+        for (auto adj: node->outNodes())
+        {
+            auto it = state.find(adj.get());
+            if (state.end() == it) // not visited
+            {
+                // FIXME: use std::stack instead of on-stack recursion
+                if (checkCycle(state, adj))
+                {
+                    return true; // detected! (deeper frame)
+                }
+            }
+            else if (TraverseState::visiting == it->second)
+            {
+                return true; // detected! (this frame)
+            }
+        }
+        state[node.get()] = TraverseState::visited;
+        return false; // not detected
+    }
+
+    bool inline hasCycles(const ade::Graph &graph)
+    {
+        state_t state;
+        bool detected = false;
+        for (auto node: graph.nodes())
+        {
+            if (state.end() == state.find(node.get()))
+            {
+                // not yet visited during recursion
+                detected |= checkCycle(state, node);
+                if (detected) break;
+            }
+        }
+        return detected;
+    }
+} // namespace Cycles
+
+namespace TopoSort
+{
+    using sorted_t = std::vector<ade::NodeHandle>;
+    using visited_t = std::unordered_set<ade::Node*>;
+
+    struct NonEmpty final
+    {
+        bool operator()(const ade::NodeHandle& node) const
+        {
+            return nullptr != node;
+        }
+    };
+
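+    // Recursive DFS post-order: a node is appended to "sorted" only after
+    // all of its producers have been visited, which yields a topological
+    // order of the graph.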
+    void inline visit(sorted_t& sorted, visited_t& visited, const ade::NodeHandle& node)
+    {
+        if (visited.end() == visited.find(node.get()))
+        {
+            for (auto adj: node->inNodes())
+            {
+                visit(sorted, visited, adj);
+            }
+            sorted.push_back(node);
+            visited.insert(node.get());
+        }
+    }
+
+    sorted_t inline topoSort(const ade::Graph &g)
+    {
+        sorted_t sorted;
+        visited_t visited;
+        for (auto node: g.nodes())
+        {
+            visit(sorted, visited, node);
+        }
+
+        auto r = ade::util::filter<NonEmpty>(ade::util::toRange(sorted));
+        return sorted_t(r.begin(), r.end());
+    }
+} // namespace TopoSort
+
+} // anonymous namespace
+
+bool cv::gimpl::pass_helpers::hasCycles(const ade::Graph &g)
+{
+    return Cycles::hasCycles(g);
+}
+
+std::vector<ade::NodeHandle> cv::gimpl::pass_helpers::topoSort(const ade::Graph &g)
+{
+    return TopoSort::topoSort(g);
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/helpers.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/helpers.hpp
new file mode 100644 (file)
index 0000000..3aa18e6
--- /dev/null
@@ -0,0 +1,31 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_COMPILER_PASSES_HELPERS_HPP
+#define OPENCV_GAPI_COMPILER_PASSES_HELPERS_HPP
+
+// FIXME: DROP THIS and REUSE ADE utilities
+// (which serve as passes already but are not exposed as standalone functions)
+
+#include <vector>
+
+#include <ade/passes/pass_base.hpp>
+#include <ade/node.hpp> // FIXME: Forward declarations instead?
+#include <ade/graph.hpp>
+
+namespace cv {
+namespace gimpl {
+namespace pass_helpers {
+
+bool hasCycles(const ade::Graph &graph);
+std::vector<ade::NodeHandle> topoSort(const ade::Graph &graph);
+
+} // namespace pass_helpers
+} // namespace gimpl
+} // namespace cv
+
+#endif // OPENCV_GAPI_COMPILER_PASSES_HELPERS_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/islands.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/islands.cpp
new file mode 100644 (file)
index 0000000..942f738
--- /dev/null
@@ -0,0 +1,233 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <sstream>
+#include <stack>
+#include <ade/util/chain_range.hpp>
+#include <ade/graph.hpp>
+
+#include "compiler/gmodel.hpp"
+#include "compiler/passes/passes.hpp"
+
+namespace
+{
+    bool is_within_same_island(const cv::gimpl::GModel::Graph &gr,
+                               const ade::NodeHandle          &dataNode,
+                               const std::string              &island)
+    {
+        // A data node is within the same island as its reader node
+        // if and only if data object's producer island (if there's a producer)
+        // is the same as the specified one.
+        //
+        // An object may have only a single producer, but multiple consumers,
+        // and these consumers may be assigned to different Islands.
+        // Since "initIslands" traversal direction is op-to-args, i.e. reverse,
+        // a single Data object may be visited twice during Islands initialization.
+        //
+        // In this case, Data object is part of Island A if and only if:
+        // - Data object's producer is part of Island A,
+        // - AND any of Data object's consumers is part of Island A.
+        //
+        //   Op["island0"] --> Data[ ? ] --> Op["island0"]
+        //                       :
+        //                       '---------> Op["island1"]
+        //
+        // In the above example, Data object is assigned to "island0" as
+        // it is surrounded by operations assigned to "island0"
+
+        using namespace cv::gimpl;
+
+        if (   gr.metadata(dataNode).contains<Island>()
+            && gr.metadata(dataNode).get<Island>().island != island)
+            return false;
+
+        if (dataNode->inNodes().empty())
+            return false;
+
+        GAPI_Assert(dataNode->inNodes().size() == 1u);
+        const auto prod_h = dataNode->inNodes().front();
+
+        // FIXME: ADE should have something like get_or<> or get<>(default)
+        GAPI_Assert(gr.metadata(prod_h).get<NodeType>().t == NodeType::OP);
+        return     (   gr.metadata(prod_h).contains<Island>()
+                    && gr.metadata(prod_h).get<Island>().island == island)
+                    && (ade::util::any_of(dataNode->outNodes(), [&](ade::NodeHandle cons_h)
+                    {
+                        return (   gr.metadata(cons_h).contains<Island>()
+                                && gr.metadata(cons_h).get<Island>().island == island);
+                    }));
+    }
+} // anonymous namespace
+
+// Initially, only Operations have the Island tag. This pass adds the
+// Island tag to all data objects within an Island.
+// A data object is considered within an Island if and only if
+// its reader and writer are assigned to this Island (see above).
+void cv::gimpl::passes::initIslands(ade::passes::PassContext &ctx)
+{
+    GModel::Graph gr(ctx.graph);
+    for (const auto &nh : gr.nodes())
+    {
+        if (gr.metadata(nh).get<NodeType>().t == NodeType::OP)
+        {
+            if (gr.metadata(nh).contains<Island>())
+            {
+                const auto island = gr.metadata(nh).get<Island>().island;
+
+                // It is enough to check only input nodes
+                for (const auto &in_data_node : nh->inNodes())
+                {
+                    if (is_within_same_island(gr, in_data_node, island))
+                    {
+                        gr.metadata(in_data_node).set(Island{island});
+                    }
+                } // for(in_data_node)
+            } // if (contains<Island>)
+        } // if (OP)
+    } // for (nodes)
+}
+
+// There must not be multiple (disconnected) islands with the same name.
+// This may occur if the user assigns the same island name to multiple
+// ranges in the graph.
+// FIXME: How could it be avoided at an earlier stage?
+void cv::gimpl::passes::checkIslands(ade::passes::PassContext &ctx)
+{
+    GModel::ConstGraph gr(ctx.graph);
+
+    // The algorithm is the following:
+    //
+    // 1. Put all Tagged nodes (both Operations and Data) into a set
+    // 2. Initialize Visited set as (empty)
+    // 3. Initialize Traversal stack as (empty)
+    // 4. Initialize Islands map (String -> Integer) as (empty)
+    // 5. For every Tagged node from a set
+    //    a. Skip if it is Visited
+    //    b. For every input/output node:
+    //       * if it is tagged with the same island:
+    //         - add it to the Traversal stack
+    //         - remove it from the Tagged nodes set if it is there
+    //    c. While (stack is not empty):
+    //       - Take a node from Stack
+    //       - Repeat (b)
+    //    d. Increment Islands map [this island] by 1
+    //
+    //
+    // If any Island's counter is more than 1, that island is a disjoint
+    // one (i.e. there are two islands with the same name).
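+    //
+    // Example: if the user tags two disconnected ranges of the graph with
+    // the same name via cv::gapi::island(), the flood-fill below completes
+    // twice for that name, its counter becomes 2, and the check fails.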
+
+    using node_set = std::unordered_set
+         < ade::NodeHandle
+         , ade::HandleHasher<ade::Node>
+         >;
+    node_set tagged_nodes;
+    node_set visited_tagged_nodes;
+    std::unordered_map<std::string, int> island_counters;
+
+    for (const auto &nh : gr.nodes())
+    {
+        if (gr.metadata(nh).contains<Island>())
+        {
+            tagged_nodes.insert(nh);
+            island_counters[gr.metadata(nh).get<Island>().island] = 0;
+        }
+    }
+
+    // NB: tagged_nodes itself is not modified during the traversal;
+    // visited nodes are tracked in the separate visited_tagged_nodes set
+    for (const auto &tagged_nh : tagged_nodes)
+    {
+        if (visited_tagged_nodes.end() != ade::util::find(visited_tagged_nodes, tagged_nh))
+            continue;
+
+        // Run the recursive traversal process as described in 5/a-d.
+        // This process is like a flood-fill traversal for an island.
+        // If two distinct successful flood-fills happen for the same island
+        // name, there are two islands with this name.
+        std::stack<ade::NodeHandle> stack;
+        stack.push(tagged_nh);
+
+        while (!stack.empty())
+        {
+            const auto this_nh = stack.top();
+            stack.pop();
+
+            // Since _this_ node is visited, it is a part of the processed island,
+            // so mark it as visited to skip it in the other traversals
+            visited_tagged_nodes.insert(this_nh);
+
+            GAPI_DbgAssert(gr.metadata(this_nh).contains<Island>());
+            GAPI_DbgAssert(gr.metadata(this_nh  ).get<Island>().island
+                        == gr.metadata(tagged_nh).get<Island>().island);
+            const auto &this_island = gr.metadata(this_nh).get<Island>().island;
+
+            for (const auto neighbor_nh : ade::util::chain(this_nh->inNodes(), this_nh->outNodes()))
+            {
+                if (   gr.metadata(neighbor_nh).contains<Island>()
+                    && gr.metadata(neighbor_nh).get<Island>().island == this_island
+                    && !visited_tagged_nodes.count(neighbor_nh))
+                {
+                    stack.push(neighbor_nh);
+                }
+            } // for (neighbor)
+        } // while (stack)
+
+        // Flood-fill is over, now increment island counter for this island
+        island_counters[gr.metadata(tagged_nh).get<Island>().island]++;
+    } // for(tagged)
+
+    bool check_failed = false;
+    std::stringstream ss;
+    for (const auto &ic : island_counters)
+    {
+        GAPI_Assert(ic.second > 0);
+        if (ic.second > 1)
+        {
+            check_failed = true;
+            ss << "\"" << ic.first << "\"(" << ic.second << ") ";
+        }
+    }
+    if (check_failed)
+    {
+        util::throw_error
+            (std::logic_error("There are multiple distinct islands "
+                              "with the same name: [" + ss.str() + "], "
+                              "please check your cv::gapi::island() parameters!"));
+    }
+}
+
+void cv::gimpl::passes::checkIslandsContent(ade::passes::PassContext &ctx)
+{
+    GModel::ConstGraph gr(ctx.graph);
+    std::unordered_map<std::string, cv::gapi::GBackend> backends_of_islands;
+    for (const auto& nh : gr.nodes())
+    {
+        if (NodeType::OP == gr.metadata(nh).get<NodeType>().t &&
+            gr.metadata(nh).contains<Island>())
+        {
+            const auto island      = gr.metadata(nh).get<Island>().island;
+            auto island_backend_it = backends_of_islands.find(island);
+            const auto& op         = gr.metadata(nh).get<Op>();
+
+            if (island_backend_it != backends_of_islands.end())
+            {
+                // Check that the backend of the operation coincides with the
+                // backend of the island. The island's backend is determined by
+                // the backend of the first operation from this island
+                if (island_backend_it->second != op.backend)
+                {
+                    util::throw_error(std::logic_error(island + " contains kernel " + op.k.name +
+                                                       " with a different backend"));
+                }
+            }
+            else
+            {
+                backends_of_islands.emplace(island, op.backend);
+            }
+        }
+    }
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/kernels.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/kernels.cpp
new file mode 100644 (file)
index 0000000..2703149
--- /dev/null
@@ -0,0 +1,157 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <ade/util/zip_range.hpp>   // util::indexed
+#include <ade/graph.hpp>
+#include <ade/passes/check_cycles.hpp>
+
+#include "opencv2/gapi/gcompoundkernel.hpp" // compound::backend()
+
+#include "compiler/gmodel.hpp"
+#include "compiler/passes/passes.hpp"
+
+#include "api/gbackend_priv.hpp"
+#include "backends/common/gbackend.hpp"
+#include "compiler/gmodelbuilder.hpp"
+#include "logger.hpp"    // GAPI_LOG
+
+namespace
+{
+    struct ImplInfo
+    {
+        cv::GKernelImpl impl;
+        cv::GArgs       in_args;
+    };
+
+    // Generally, the algorithm is the following:
+    //
+    // 1. Get the GCompoundKernel implementation
+    // 2. Create a GCompoundContext
+    // 3. Run the GCompoundKernel with the GCompoundContext
+    // 4. Build a subgraph from the GCompoundKernel inputs/outputs
+    // 5. Replace the compound node with the subgraph
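+    //
+    // Step 1 is done by the caller (see expandKernels() below); the
+    // expand() function implements steps 2..5.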
+
+    void expand(ade::Graph& g, ade::NodeHandle nh, const ImplInfo& impl_info)
+    {
+        cv::gimpl::GModel::Graph gr(g);
+        auto compound_impl = cv::util::any_cast<cv::detail::GCompoundKernel>(impl_info.impl.opaque);
+
+        // GCompoundContext instantiates its own objects
+        // in accordance with the RcDescs from in_args
+        cv::detail::GCompoundContext context(impl_info.in_args);
+        compound_impl.apply(context);
+
+        cv::GProtoArgs ins, outs;
+        ins.reserve(context.m_args.size());
+        outs.reserve(context.m_results.size());
+
+        // Inputs may be of non-dynamic types.
+        // Such inputs are not used when building the graph
+        for (const auto& arg : context.m_args)
+        {
+            if (cv::gimpl::proto::is_dynamic(arg))
+            {
+                ins.emplace_back(cv::gimpl::proto::rewrap(arg));
+            }
+        }
+
+        ade::util::transform(context.m_results, std::back_inserter(outs), &cv::gimpl::proto::rewrap);
+
+        cv::gimpl::GModelBuilder builder(g);
+
+        // Build the subgraph which will replace the compound node
+        const auto& proto_slots = builder.put(ins, outs);
+
+        const auto& in_nhs  = std::get<2>(proto_slots);
+        const auto& out_nhs = std::get<3>(proto_slots);
+
+        auto sorted_in_nhs  = cv::gimpl::GModel::orderedInputs(gr, nh);
+        auto sorted_out_nhs = cv::gimpl::GModel::orderedOutputs(gr, nh);
+
+        // Reconnect the expanded kernels from graph data objects
+        // to subgraph data objects, then drop those graph data objects
+        for (const auto& it : ade::util::zip(in_nhs, sorted_in_nhs))
+        {
+            const auto& subgr_in_nh = std::get<0>(it);
+            const auto& comp_in_nh  = std::get<1>(it);
+
+            cv::gimpl::GModel::redirectReaders(gr, subgr_in_nh, comp_in_nh);
+            gr.erase(subgr_in_nh);
+        }
+
+        gr.erase(nh);
+
+        for (const auto& it : ade::util::zip(out_nhs, sorted_out_nhs))
+        {
+            const auto& subgr_out_nh = std::get<0>(it);
+            const auto& comp_out_nh  = std::get<1>(it);
+
+            cv::gimpl::GModel::redirectWriter(gr, subgr_out_nh, comp_out_nh);
+            gr.erase(subgr_out_nh);
+        }
+    }
+}
+// This pass, given the kernel package, selects a kernel implementation
+// for every operation in the graph
+void cv::gimpl::passes::resolveKernels(ade::passes::PassContext   &ctx,
+                                       const gapi::GKernelPackage &kernels,
+                                       const gapi::GLookupOrder   &order)
+{
+    std::unordered_set<cv::gapi::GBackend> active_backends;
+
+    GModel::Graph gr(ctx.graph);
+    for (const auto &nh : gr.nodes())
+    {
+        if (gr.metadata(nh).get<NodeType>().t == NodeType::OP)
+        {
+            auto &op = gr.metadata(nh).get<Op>();
+            cv::gapi::GBackend selected_backend;
+            cv::GKernelImpl    selected_impl;
+            std::tie(selected_backend, selected_impl)
+                = kernels.lookup(op.k.name, order);
+
+            selected_backend.priv().unpackKernel(ctx.graph, nh, selected_impl);
+            op.backend = selected_backend;
+            active_backends.insert(selected_backend);
+        }
+    }
+    gr.metadata().set(ActiveBackends{active_backends});
+}
+
+void cv::gimpl::passes::expandKernels(ade::passes::PassContext &ctx, const gapi::GKernelPackage &kernels)
+{
+    GModel::Graph gr(ctx.graph);
+
+    // Repeat the loop while there are compound kernels.
+    // Restart the procedure after every successful unrolling
+    bool has_compound_kernel = true;
+    while (has_compound_kernel)
+    {
+        has_compound_kernel = false;
+        for (const auto& nh : gr.nodes())
+        {
+            if (gr.metadata(nh).get<NodeType>().t == NodeType::OP)
+            {
+                const auto& op = gr.metadata(nh).get<Op>();
+
+                cv::gapi::GBackend selected_backend;
+                cv::GKernelImpl    selected_impl;
+                std::tie(selected_backend, selected_impl) = kernels.lookup(op.k.name);
+
+                if (selected_backend == cv::gapi::compound::backend())
+                {
+                    has_compound_kernel = true;
+                    expand(ctx.graph, nh, ImplInfo{selected_impl, op.args});
+                    break;
+                }
+            }
+        }
+    }
+    GAPI_LOG_INFO(NULL, "Final graph: " << ctx.graph.nodes().size() << " nodes" << std::endl);
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/meta.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/meta.cpp
new file mode 100644 (file)
index 0000000..528d84c
--- /dev/null
@@ -0,0 +1,125 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <ade/util/zip_range.hpp>   // util::indexed
+#include <ade/graph.hpp>
+#include <ade/passes/check_cycles.hpp>
+
+#include "compiler/gmodel.hpp"
+#include "compiler/passes/passes.hpp"
+#include "logger.hpp"    // GAPI_LOG
+
+
+// Iterate over all nodes and initialize meta of objects taken from the
+// outside (i.e., computation input/output arguments)
+void cv::gimpl::passes::initMeta(ade::passes::PassContext &ctx, const GMetaArgs &metas)
+{
+    GModel::Graph gr(ctx.graph);
+
+    const auto &proto = gr.metadata().get<Protocol>();
+
+    for (const auto& it : ade::util::indexed(proto.in_nhs))
+    {
+        auto& data = gr.metadata(ade::util::value(it)).get<Data>();
+        data.meta = metas.at(ade::util::index(it));
+    }
+}
+
+// Iterate over all operations in the topological order, trigger kernels
+// validate() function, update output objects metadata.
+void cv::gimpl::passes::inferMeta(ade::passes::PassContext &ctx, bool meta_is_initialized)
+{
+    // FIXME: ADE pass dependency on topo_sort?
+    // FIXME: ADE pass dependency on initMeta?
+    GModel::Graph gr(ctx.graph);
+
+    const auto sorted = gr.metadata().get<ade::passes::TopologicalSortData>();
+    for (const auto &nh : sorted.nodes())
+    {
+        if (gr.metadata(nh).get<NodeType>().t == NodeType::OP)
+        {
+            const auto& op = gr.metadata(nh).get<Op>();
+            GAPI_Assert(op.k.outMeta != nullptr);
+
+            // Prepare the operation's input metadata vector.
+            // Note that its size is usually different from nh.inEdges.size(),
+            // and its element count is equal to the operation's argument count.
+            GMetaArgs input_meta_args(op.args.size());
+
+            // Iterate through the input edges, update input_meta_args's slots
+            // appropriately. Not all of them will be updated (see the note above).
+            GAPI_Assert(nh->inEdges().size() > 0);
+            for (const auto &in_eh : nh->inEdges())
+            {
+                const auto& input_port = gr.metadata(in_eh).get<Input>().port;
+                const auto& input_nh   = in_eh->srcNode();
+                GAPI_Assert(gr.metadata(input_nh).get<NodeType>().t == NodeType::DATA);
+
+                const auto& input_meta = gr.metadata(input_nh).get<Data>().meta;
+                if (util::holds_alternative<util::monostate>(input_meta))
+                {
+                    // No meta in an input argument - a fatal error
+                    // (note the graph is traversed here in topological order)
+                    util::throw_error(std::logic_error("Fatal: input object's metadata "
+                                                       "not found!"));
+                    // FIXME: Add more details!!!
+                }
+                input_meta_args.at(input_port) = input_meta;
+            }
+            // Now ask the kernel for its output meta.
+            // The resulting out_metas may have a larger size than op.outs, since
+            // some outputs could stay unused (unconnected)
+            const auto& out_metas = op.k.outMeta(input_meta_args, op.args);
+
+            // Walk through operation's outputs, update meta of output objects
+            // appropriately
+            GAPI_Assert(nh->outEdges().size() > 0);
+            for (const auto &out_eh : nh->outEdges())
+            {
+                const auto &output_port = gr.metadata(out_eh).get<Output>().port;
+                const auto &output_nh   = out_eh->dstNode();
+                GAPI_Assert(gr.metadata(output_nh).get<NodeType>().t == NodeType::DATA);
+
+                auto       &output_meta = gr.metadata(output_nh).get<Data>().meta;
+                if (!meta_is_initialized && !util::holds_alternative<util::monostate>(output_meta))
+                {
+                    GAPI_LOG_INFO(NULL,
+                                  "!!! Output object has an initialized meta - "
+                                  "how is it possible today?" << std::endl);
+                    if (output_meta != out_metas.at(output_port))
+                    {
+                        util::throw_error(std::logic_error("Fatal: meta mismatch"));
+                        // FIXME: New exception type?
+                        // FIXME: More details!
+                    }
+                }
+                // Store meta in graph
+                output_meta = out_metas.at(output_port);
+            }
+        } // if(OP)
+    } // for(sorted)
+}
+
+// After all metadata in the graph is inferred, store a vector of the inferred
+// metas for the computation's output values.
+void cv::gimpl::passes::storeResultingMeta(ade::passes::PassContext &ctx)
+{
+    GModel::Graph gr(ctx.graph);
+
+    const auto &proto = gr.metadata().get<Protocol>();
+    GMetaArgs output_metas(proto.out_nhs.size());
+
+    for (const auto& it : ade::util::indexed(proto.out_nhs))
+    {
+        auto& data = gr.metadata(ade::util::value(it)).get<Data>();
+        output_metas[ade::util::index(it)] = data.meta;
+    }
+
+    gr.metadata().set(OutputMeta{output_metas});
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/passes.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/passes/passes.hpp
new file mode 100644 (file)
index 0000000..14f6acd
--- /dev/null
@@ -0,0 +1,58 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_COMPILER_PASSES_HPP
+#define OPENCV_GAPI_COMPILER_PASSES_HPP
+
+#include <ostream>
+#include <ade/passes/pass_base.hpp>
+
+#include "opencv2/gapi/garg.hpp"
+#include "opencv2/gapi/gcommon.hpp"
+
+// Forward declarations - external
+namespace ade {
+    class Graph;
+
+    namespace passes {
+        struct PassContext;
+    }
+}
+
+namespace cv {
+
+namespace gimpl { namespace passes {
+
+void dumpDot(const ade::Graph &g, std::ostream& os);
+void dumpDot(ade::passes::PassContext &ctx, std::ostream& os);
+void dumpDotStdout(ade::passes::PassContext &ctx);
+void dumpGraph(ade::passes::PassContext     &ctx, const std::string& dump_path);
+void dumpDotToFile(ade::passes::PassContext &ctx, const std::string& dump_path);
+
+void initIslands(ade::passes::PassContext &ctx);
+void checkIslands(ade::passes::PassContext &ctx);
+void checkIslandsContent(ade::passes::PassContext &ctx);
+
+void initMeta(ade::passes::PassContext &ctx, const GMetaArgs &metas);
+void inferMeta(ade::passes::PassContext &ctx, bool meta_is_initialized);
+void storeResultingMeta(ade::passes::PassContext &ctx);
+
+void expandKernels(ade::passes::PassContext &ctx,
+                   const gapi::GKernelPackage& kernels);
+
+void resolveKernels(ade::passes::PassContext       &ctx,
+                    const gapi::GKernelPackage &kernels,
+                    const gapi::GLookupOrder   &order);
+
+void fuseIslands(ade::passes::PassContext &ctx);
+void syncIslandTags(ade::passes::PassContext &ctx);
+
+}} // namespace gimpl::passes
+
+} // namespace cv
+
+#endif // OPENCV_GAPI_COMPILER_PASSES_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/transactions.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/compiler/transactions.hpp
new file mode 100644 (file)
index 0000000..54af8a6
--- /dev/null
@@ -0,0 +1,147 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_COMPILER_TRANSACTIONS_HPP
+#define OPENCV_GAPI_COMPILER_TRANSACTIONS_HPP
+
+#include <algorithm> // find_if
+#include <functional>
+#include <list>
+
+#include <ade/graph.hpp>
+
+#include "opencv2/gapi/own/assert.hpp"
+
+enum class Direction: int {Invalid, In, Out};
+
+////////////////////////////////////////////////////////////////////////////
+////
+// TODO: Probably it can be moved to ADE
+
+namespace Change
+{
+    struct Base
+    {
+        virtual void commit  (ade::Graph & ) {};
+        virtual void rollback(ade::Graph & ) {};
+        virtual ~Base() = default;
+    };
+
+    class NodeCreated final: public Base
+    {
+        ade::NodeHandle m_node;
+    public:
+        explicit NodeCreated(const ade::NodeHandle &nh) : m_node(nh) {}
+        virtual void rollback(ade::Graph &g) override { g.erase(m_node); }
+    };
+
+    // NB: Drops all metadata stored in the EdgeHandle;
+    // it is not restored even on rollback
+
+    // FIXME: either add a way for users to preserve meta manually
+    // or extend ADE to manipulate with meta such way
+    class DropLink final: public Base
+    {
+        ade::NodeHandle m_node;
+        Direction       m_dir;
+
+        ade::NodeHandle m_sibling;
+
+    public:
+        DropLink(ade::Graph &g,
+                 const ade::NodeHandle &node,
+                 const ade::EdgeHandle &edge)
+            : m_node(node), m_dir(node == edge->srcNode()
+                                  ? Direction::Out
+                                  : Direction::In)
+        {
+            m_sibling = (m_dir == Direction::In
+                         ? edge->srcNode()
+                         : edge->dstNode());
+            g.erase(edge);
+        }
+
+        virtual void rollback(ade::Graph &g) override
+        {
+            switch(m_dir)
+            {
+            case Direction::In:  g.link(m_sibling, m_node); break;
+            case Direction::Out: g.link(m_node, m_sibling); break;
+            default: GAPI_Assert(false);
+            }
+        }
+    };
+
+    class NewLink final: public Base
+    {
+        ade::EdgeHandle m_edge;
+
+    public:
+        NewLink(ade::Graph &g,
+                  const ade::NodeHandle &prod,
+                  const ade::NodeHandle &cons)
+            : m_edge(g.link(prod, cons))
+        {
+        }
+
+        virtual void rollback(ade::Graph &g) override
+        {
+            g.erase(m_edge);
+        }
+    };
+
+    class DropNode final: public Base
+    {
+        ade::NodeHandle m_node;
+
+    public:
+        explicit DropNode(const ade::NodeHandle &nh)
+            : m_node(nh)
+        {
+            // According to the semantics, the node must be disconnected
+            // manually before it is dropped
+            GAPI_Assert(m_node->inEdges().size()  == 0);
+            GAPI_Assert(m_node->outEdges().size() == 0);
+        }
+
+        virtual void commit(ade::Graph &g) override
+        {
+            g.erase(m_node);
+        }
+    };
+
+    class List
+    {
+        std::list< std::unique_ptr<Base> > m_changes;
+
+    public:
+        template<typename T, typename ...Args>
+        void enqueue(Args&&... args)
+        {
+            std::unique_ptr<Base> p(new T(args...));
+            m_changes.push_back(std::move(p));
+        }
+
+        void commit(ade::Graph &g)
+        {
+            // Commit changes in the forward order
+            for (auto& ch : m_changes) ch->commit(g);
+        }
+
+        void rollback(ade::Graph &g)
+        {
+            // Rollback changes in the reverse order
+            for (auto it = m_changes.rbegin(); it != m_changes.rend(); ++it)
+            {
+                (*it)->rollback(g);
+            }
+        }
+    };
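+
+    // A minimal usage sketch (with hypothetical handles "prod_nh", "cons_nh"
+    // and an already-disconnected "old_nh"): changes are enqueued as they
+    // are applied to the graph and then either fixed or undone all at once:
+    //
+    //     Change::List changes;
+    //     changes.enqueue<Change::NewLink >(g, prod_nh, cons_nh);
+    //     changes.enqueue<Change::DropNode>(old_nh);
+    //     if (ok) changes.commit  (g);  // e.g. when the merge succeeded
+    //     else    changes.rollback(g);  // undo in the reverse order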
+} // namespace Change
+////////////////////////////////////////////////////////////////////////////
+
+#endif // OPENCV_GAPI_COMPILER_TRANSACTIONS_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/executor/gexecutor.cpp b/inference-engine/thirdparty/fluid/modules/gapi/src/executor/gexecutor.cpp
new file mode 100644 (file)
index 0000000..f117c06
--- /dev/null
@@ -0,0 +1,244 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "precomp.hpp"
+
+#include <iostream>
+
+#include <ade/util/zip_range.hpp>
+
+#include "opencv2/gapi/opencv_includes.hpp"
+#include "executor/gexecutor.hpp"
+#include "compiler/passes/passes.hpp"
+
+cv::gimpl::GExecutor::GExecutor(std::unique_ptr<ade::Graph> &&g_model)
+    : m_orig_graph(std::move(g_model))
+    , m_island_graph(GModel::Graph(*m_orig_graph).metadata()
+                     .get<IslandModel>().model)
+    , m_gm(*m_orig_graph)
+    , m_gim(*m_island_graph)
+{
+    // NB: Right now GIslandModel is acyclic, so for a naive execution,
+    // simple unrolling to a list of triggers is enough
+
+    // The naive execution model is similar to the current CPU (OpenCV)
+    // plugin execution model:
+    // 1. Allocate all internal resources first (NB - CPU plugin doesn't do it)
+    // 2. Put input/output GComputation arguments to the storage
+    // 3. For every Island, prepare vectors of input/output parameter descs
+    // 4. Iterate over a list of operations (sorted in the topological order)
+    // 5. For every operation, form a list of input/output data objects
+    // 6. Run GIslandExecutable
+    // 7. writeBack
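+    //
+    // The (N) markers in the code below refer to the steps of this list.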
+
+    auto sorted = m_gim.metadata().get<ade::passes::TopologicalSortData>();
+    for (auto nh : sorted.nodes())
+    {
+        switch (m_gim.metadata(nh).get<NodeKind>().k)
+        {
+        case NodeKind::ISLAND:
+            {
+                std::vector<RcDesc> input_rcs;
+                std::vector<RcDesc> output_rcs;
+                input_rcs.reserve(nh->inNodes().size());
+                output_rcs.reserve(nh->outNodes().size());
+
+                auto xtract = [&](ade::NodeHandle slot_nh, std::vector<RcDesc> &vec) {
+                    const auto orig_data_nh
+                        = m_gim.metadata(slot_nh).get<DataSlot>().original_data_node;
+                    const auto &orig_data_info
+                        = m_gm.metadata(orig_data_nh).get<Data>();
+                    vec.emplace_back(RcDesc{ orig_data_info.rc
+                                           , orig_data_info.shape
+                                           , orig_data_info.ctor});
+                };
+                // (3)
+                for (auto in_slot_nh  : nh->inNodes())  xtract(in_slot_nh,  input_rcs);
+                for (auto out_slot_nh : nh->outNodes()) xtract(out_slot_nh, output_rcs);
+
+                m_ops.emplace_back(OpDesc{ std::move(input_rcs)
+                                         , std::move(output_rcs)
+                                         , m_gim.metadata(nh).get<IslandExec>().object});
+            }
+            break;
+
+        case NodeKind::SLOT:
+            {
+                const auto orig_data_nh
+                    = m_gim.metadata(nh).get<DataSlot>().original_data_node;
+                // (1)
+                initResource(orig_data_nh);
+                m_slots.emplace_back(DataDesc{nh, orig_data_nh});
+            }
+            break;
+
+        default:
+            GAPI_Assert(false);
+            break;
+        } // switch(kind)
+    } // for(gim nodes)
+}
+
+void cv::gimpl::GExecutor::initResource(const ade::NodeHandle &orig_nh)
+{
+    const Data &d = m_gm.metadata(orig_nh).get<Data>();
+
+    if (   d.storage != Data::Storage::INTERNAL
+        && d.storage != Data::Storage::CONST)
+        return;
+
+    // INTERNAL+CONST only! No need to allocate/reset output objects,
+    // as they are bound externally (e.g. already in m_res)
+
+    switch (d.shape)
+    {
+    case GShape::GMAT:
+        {
+            const auto desc = util::get<cv::GMatDesc>(d.meta);
+            const auto type = CV_MAKETYPE(desc.depth, desc.chan);
+            m_res.slot<cv::gapi::own::Mat>()[d.rc].create(desc.size, type);
+        }
+        break;
+
+    case GShape::GSCALAR:
+        if (d.storage == Data::Storage::CONST)
+        {
+            auto rc = RcDesc{d.rc, d.shape, d.ctor};
+            magazine::bindInArg(m_res, rc, m_gm.metadata(orig_nh).get<ConstValue>().arg);
+        }
+        break;
+
+    case GShape::GARRAY:
+        // Constructed on Reset, do nothing here
+        break;
+
+    default:
+        GAPI_Assert(false);
+    }
+}
+
+void cv::gimpl::GExecutor::run(cv::gimpl::GRuntimeArgs &&args)
+{
+    // (2)
+    const auto proto = m_gm.metadata().get<Protocol>();
+
+    // Basic check if input/output arguments are correct
+    // FIXME: Move to GCompiled (do once for all GExecutors)
+    if (proto.inputs.size() != args.inObjs.size()) // TODO: Also check types
+    {
+        util::throw_error(std::logic_error
+                          ("Computation's input protocol doesn\'t "
+                           "match actual arguments!"));
+    }
+    if (proto.outputs.size() != args.outObjs.size()) // TODO: Also check types
+    {
+        util::throw_error(std::logic_error
+                          ("Computation's output protocol doesn\'t "
+                           "match actual arguments!"));
+    }
+
+    namespace util = ade::util;
+
+    // Ensure that the output Mat parameters are correctly allocated
+    for (auto index : util::iota(proto.out_nhs.size()) )     //FIXME: avoid copy of NodeHandle and GRunRsltComp ?
+    {
+        auto& nh = proto.out_nhs.at(index);
+        const Data &d = m_gm.metadata(nh).get<Data>();
+        if (d.shape == GShape::GMAT)
+        {
+            using cv::util::get;
+            const auto desc = get<cv::GMatDesc>(d.meta);
+            const auto type = CV_MAKETYPE(desc.depth, desc.chan);
+
+#if !defined(GAPI_STANDALONE)
+            // Building as part of OpenCV - follow the OpenCV behavior:
+            // if the output buffer is not big enough to hold the result, reallocate it
+            auto& out_mat   = *get<cv::Mat*>(args.outObjs.at(index));
+            out_mat.create(cv::gapi::own::to_ocv(desc.size), type);
+#else
+            // Building standalone - the output buffer should always exist
+            // and _exactly_ match our inferred metadata
+            auto& out_mat   = *get<cv::gapi::own::Mat*>(args.outObjs.at(index));
+            GAPI_Assert(   out_mat.type() == type
+                        && out_mat.data   != nullptr
+                        && out_mat.rows   == desc.size.height
+                        && out_mat.cols   == desc.size.width);
+#endif // !defined(GAPI_STANDALONE)
+        }
+    }
+    // Update storage with user-passed objects
+    for (auto it : ade::util::zip(ade::util::toRange(proto.inputs),
+                                  ade::util::toRange(args.inObjs)))
+    {
+        magazine::bindInArg(m_res, std::get<0>(it), std::get<1>(it));
+    }
+    for (auto it : ade::util::zip(ade::util::toRange(proto.outputs),
+                                  ade::util::toRange(args.outObjs)))
+    {
+        magazine::bindOutArg(m_res, std::get<0>(it), std::get<1>(it));
+    }
+
+    // Reset internal data
+    for (auto &sd : m_slots)
+    {
+        const auto& data = m_gm.metadata(sd.data_nh).get<Data>();
+        magazine::resetInternalData(m_res, data);
+    }
+
+    // Run the script
+    for (auto &op : m_ops)
+    {
+        // (5)
+        using InObj  = GIslandExecutable::InObj;
+        using OutObj = GIslandExecutable::OutObj;
+        std::vector<InObj>  in_objs;
+        std::vector<OutObj> out_objs;
+        in_objs.reserve (op.in_objects.size());
+        out_objs.reserve(op.out_objects.size());
+
+        for (const auto &rc : op.in_objects)
+        {
+            in_objs.emplace_back(InObj{rc, magazine::getArg(m_res, rc)});
+        }
+        for (const auto &rc : op.out_objects)
+        {
+            out_objs.emplace_back(OutObj{rc, magazine::getObjPtr(m_res, rc)});
+        }
+
+        // (6)
+        op.isl_exec->run(std::move(in_objs), std::move(out_objs));
+    }
+
+    // (7)
+    for (auto it : ade::util::zip(ade::util::toRange(proto.outputs),
+                                  ade::util::toRange(args.outObjs)))
+    {
+        magazine::writeBack(m_res, std::get<0>(it), std::get<1>(it));
+    }
+}
+
+const cv::gimpl::GModel::Graph& cv::gimpl::GExecutor::model() const
+{
+    return m_gm;
+}
+
+bool cv::gimpl::GExecutor::canReshape() const
+{
+    // FIXME: Introduce proper reshaping support on GExecutor level
+    // for all cases!
+    return (m_ops.size() == 1) && m_ops[0].isl_exec->canReshape();
+}
+
+void cv::gimpl::GExecutor::reshape(const GMetaArgs& inMetas, const GCompileArgs& args)
+{
+    GAPI_Assert(canReshape());
+    auto& g = *m_orig_graph.get();
+    ade::passes::PassContext ctx{g};
+    passes::initMeta(ctx, inMetas);
+    passes::inferMeta(ctx, true);
+    m_ops[0].isl_exec->reshape(g, args);
+}
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/executor/gexecutor.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/executor/gexecutor.hpp
new file mode 100644 (file)
index 0000000..e4128ba
--- /dev/null
@@ -0,0 +1,97 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_GEXECUTOR_HPP
+#define OPENCV_GAPI_GEXECUTOR_HPP
+
+#include <memory> // unique_ptr, shared_ptr
+
+#include <utility> // tuple, required by magazine
+#include <unordered_map> // required by magazine
+
+#include <ade/graph.hpp>
+
+#include "backends/common/gbackend.hpp"
+
+namespace cv {
+namespace gimpl {
+
+// Graph-level executor interface.
+//
+// This class specifies the API for a "super-executor" which orchestrates
+// the overall Island graph execution.
+//
+// Every Island (subgraph) execution is delegated to a particular
+// backend and is done opaquely to the GExecutor.
+//
+// Inputs to a GExecutor instance are:
+// - GIslandModel - a high-level graph model which may be seen as a
+//   "procedure" to execute.
+//   - GModel - a low-level graph of operations (from which a GIslandModel
+//     is projected)
+// - GComputation runtime arguments - vectors of input/output objects
+//
+// Every GExecutor is responsible for
+// a. Maintaining non-island (intermediate) data objects within graph
+// b. Providing GIslandExecutables with input/output data according to
+//    their protocols
+// c. Triggering execution of GIslandExecutables when task/data dependencies
+//    are met.
+//
+// By default G-API stores all data on host, and cross-Island
+// exchange happens via host buffers (and CV data objects).
+//
+// Today's exchange data objects are:
+// - cv::Mat               - for image buffers
+// - cv::Scalar            - for single values (with up to four components inside)
+// - cv::detail::VectorRef - an untyped wrapper over std::vector<T>
+//
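+// A minimal usage sketch (illustrative only; `model` and `run_args` are
+// placeholder names, and graph construction is assumed to happen in the
+// compilation pipeline, not shown here):
+//
+//     std::unique_ptr<ade::Graph> model = ...;  // compiled island graph
+//     cv::gimpl::GExecutor exec(std::move(model));
+//     exec.run(std::move(run_args));            // run_args: GRuntimeArgs
+//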
+
+class GExecutor
+{
+protected:
+    std::unique_ptr<ade::Graph> m_orig_graph;
+    std::shared_ptr<ade::Graph> m_island_graph;
+
+    cv::gimpl::GModel::Graph       m_gm;  // FIXME: make const?
+    cv::gimpl::GIslandModel::Graph m_gim; // FIXME: make const?
+
+    // FIXME: Naive executor details live here for now,
+    // but should eventually be moved to another place
+    struct OpDesc
+    {
+        std::vector<RcDesc> in_objects;
+        std::vector<RcDesc> out_objects;
+        std::shared_ptr<GIslandExecutable> isl_exec;
+    };
+    std::vector<OpDesc> m_ops;
+
+    struct DataDesc
+    {
+        ade::NodeHandle slot_nh;
+        ade::NodeHandle data_nh;
+    };
+    std::vector<DataDesc> m_slots;
+
+    Mag m_res;
+
+    void initResource(const ade::NodeHandle &orig_nh); // FIXME: shouldn't it be RcDesc?
+
+public:
+    explicit GExecutor(std::unique_ptr<ade::Graph> &&g_model);
+    void run(cv::gimpl::GRuntimeArgs &&args);
+
+    bool canReshape() const;
+    void reshape(const GMetaArgs& inMetas, const GCompileArgs& args);
+
+    const GModel::Graph& model() const; // FIXME: make it ConstGraph?
+};
+
+} // namespace gimpl
+} // namespace cv
+
+#endif // OPENCV_GAPI_GEXECUTOR_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/logger.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/logger.hpp
new file mode 100644 (file)
index 0000000..ff4c759
--- /dev/null
@@ -0,0 +1,22 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef __OPENCV_GAPI_LOGGER_HPP__
+#define __OPENCV_GAPI_LOGGER_HPP__
+
+#if !defined(GAPI_STANDALONE)
+#  include "opencv2/core/cvdef.h"
+#  include "opencv2/core/utils/logger.hpp"
+#  define GAPI_LOG_INFO(tag, ...)    CV_LOG_INFO(tag, __VA_ARGS__)
+#  define GAPI_LOG_WARNING(tag, ...) CV_LOG_WARNING(tag, __VA_ARGS__)
+#else
+#  define GAPI_LOG_INFO(tag, ...)
+#  define GAPI_LOG_WARNING(tag, ...)
+#endif //  !defined(GAPI_STANDALONE)
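+
+// Illustrative usage (a sketch; `islands` is a placeholder variable). The
+// message argument is streamed, following OpenCV's CV_LOG_* macros, and the
+// calls compile to nothing in the standalone build:
+//
+//     GAPI_LOG_INFO(NULL, "Partitioned graph into " << islands << " islands");
+//     GAPI_LOG_WARNING(NULL, "Unsupported layout, falling back to default");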
+
+
+#endif // __OPENCV_GAPI_LOGGER_HPP__
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/src/precomp.hpp b/inference-engine/thirdparty/fluid/modules/gapi/src/precomp.hpp
new file mode 100644 (file)
index 0000000..eebe9d8
--- /dev/null
@@ -0,0 +1,21 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef __OPENCV_GAPI_PRECOMP_HPP__
+#define __OPENCV_GAPI_PRECOMP_HPP__
+
+#if !defined(GAPI_STANDALONE)
+#  include "opencv2/core.hpp"
+#  include "opencv2/imgproc.hpp"
+#  include "opencv2/gapi/core.hpp"
+#  include "opencv2/gapi/imgproc.hpp"
+#endif //  !defined(GAPI_STANDALONE)
+
+#include "opencv2/gapi.hpp"
+#include "opencv2/gapi/gkernel.hpp"
+
+#endif // __OPENCV_GAPI_PRECOMP_HPP__
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_compoundkernel_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_compoundkernel_tests.cpp
new file mode 100644 (file)
index 0000000..1f5de7a
--- /dev/null
@@ -0,0 +1,500 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+// FIXME: move out from Common
+
+#include "test_precomp.hpp"
+#include "opencv2/gapi/cpu/core.hpp"
+
+#include <ade/util/algorithm.hpp>
+
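+// The kernels below exercise G-API compound kernels: a compound kernel is
+// declared like any other operation (G_TYPED_KERNEL), but its implementation
+// (GAPI_COMPOUND_KERNEL) defines expand(), which rewrites the operation into
+// a subgraph of other G-API operations instead of supplying backend code.
+// A minimal sketch of the pattern (mirroring GCompoundAddCImpl below;
+// MyAddCImpl is a placeholder name):
+//
+//     GAPI_COMPOUND_KERNEL(MyAddCImpl, GCompoundAddC)
+//     {
+//         static cv::GMat expand(cv::GMat in, cv::GScalar s)
+//         {
+//             return cv::gapi::addC(in, s); // expand into an existing core op
+//         }
+//     };
+//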
+namespace opencv_test
+{
+namespace
+{
+    G_TYPED_KERNEL(GCompoundDoubleAddC, <GMat(GMat, GScalar)>, "org.opencv.test.compound_double_addC")
+    {
+        static GMatDesc outMeta(GMatDesc in, GScalarDesc) { return in; }
+    };
+
+    GAPI_COMPOUND_KERNEL(GCompoundDoubleAddCImpl, GCompoundDoubleAddC)
+    {
+        static GMat expand(cv::GMat in, cv::GScalar s)
+        {
+            return cv::gapi::addC(cv::gapi::addC(in, s), s);
+        }
+    };
+
+    G_TYPED_KERNEL(GCompoundAddC, <GMat(GMat, GScalar)>, "org.opencv.test.compound_addC")
+    {
+        static GMatDesc outMeta(GMatDesc in, GScalarDesc) { return in; }
+    };
+
+    GAPI_COMPOUND_KERNEL(GCompoundAddCImpl, GCompoundAddC)
+    {
+        static GMat expand(cv::GMat in, cv::GScalar s)
+        {
+            return cv::gapi::addC(in, s);
+        }
+    };
+
+    using GMat3 = std::tuple<GMat,GMat,GMat>;
+    using GMat2 = std::tuple<GMat,GMat>;
+
+    G_TYPED_KERNEL_M(GCompoundMergeWithSplit, <GMat3(GMat, GMat, GMat)>, "org.opencv.test.compound_merge_split")
+    {
+        static std::tuple<GMatDesc,GMatDesc,GMatDesc> outMeta(GMatDesc a, GMatDesc b, GMatDesc c)
+        {
+            return std::make_tuple(a, b, c);
+        }
+    };
+
+    GAPI_COMPOUND_KERNEL(GCompoundMergeWithSplitImpl, GCompoundMergeWithSplit)
+    {
+        static GMat3 expand(cv::GMat a, cv::GMat b, cv::GMat c)
+        {
+            return cv::gapi::split3(cv::gapi::merge3(a, b, c));
+        }
+    };
+
+    G_TYPED_KERNEL(GCompoundAddWithAddC, <GMat(GMat, GMat, GScalar)>, "org.opencv.test.compound_add_with_addc")
+    {
+        static GMatDesc outMeta(GMatDesc in, GMatDesc, GScalarDesc)
+        {
+            return in;
+        }
+    };
+
+    GAPI_COMPOUND_KERNEL(GCompoundAddWithAddCImpl, GCompoundAddWithAddC)
+    {
+        static GMat expand(cv::GMat in1, cv::GMat in2, cv::GScalar s)
+        {
+            return cv::gapi::addC(cv::gapi::add(in1, in2), s);
+        }
+    };
+
+    G_TYPED_KERNEL_M(GCompoundSplitWithAdd, <GMat2(GMat)>, "org.opencv.test.compound_split_with_add")
+    {
+        static std::tuple<GMatDesc, GMatDesc> outMeta(GMatDesc in)
+        {
+            const auto out_depth = in.depth;
+            const auto out_desc  = in.withType(out_depth, 1);
+            return std::make_tuple(out_desc, out_desc);
+        }
+    };
+
+    GAPI_COMPOUND_KERNEL(GCompoundSplitWithAddImpl, GCompoundSplitWithAdd)
+    {
+        static GMat2 expand(cv::GMat in)
+        {
+            cv::GMat a, b, c;
+            std::tie(a, b, c) = cv::gapi::split3(in);
+            return std::make_tuple(cv::gapi::add(a, b), c);
+        }
+    };
+
+    G_TYPED_KERNEL_M(GCompoundParallelAddC, <GMat2(GMat, GScalar)>, "org.opencv.test.compound_parallel_addc")
+    {
+        static std::tuple<GMatDesc, GMatDesc> outMeta(GMatDesc in, GScalarDesc)
+        {
+            return std::make_tuple(in, in);
+        }
+    };
+
+    GAPI_COMPOUND_KERNEL(GCompoundParallelAddCImpl, GCompoundParallelAddC)
+    {
+        static GMat2 expand(cv::GMat in, cv::GScalar s)
+        {
+            return std::make_tuple(cv::gapi::addC(in, s), cv::gapi::addC(in, s));
+        }
+    };
+
+    GAPI_COMPOUND_KERNEL(GCompoundAddImpl, cv::gapi::core::GAdd)
+    {
+        static GMat expand(cv::GMat in1, cv::GMat in2, int)
+        {
+            return cv::gapi::sub(cv::gapi::sub(in1, in2), in2);
+        }
+    };
+
+    G_TYPED_KERNEL(GCompoundAddWithAddCWithDoubleAddC, <GMat(GMat, GMat, GScalar)>, "org.opencv.test.compound_add_with_addC_with_double_addC")
+    {
+        static GMatDesc outMeta(GMatDesc in, GMatDesc, GScalarDesc)
+        {
+            return in;
+        }
+    };
+
+    GAPI_COMPOUND_KERNEL(GCompoundAddWithAddCWithDoubleAddCImpl, GCompoundAddWithAddCWithDoubleAddC)
+    {
+        static GMat expand(cv::GMat in1, cv::GMat in2, cv::GScalar s)
+        {
+            return GCompoundDoubleAddC::on(GCompoundAddWithAddC::on(in1, in2, s), s);
+        }
+    };
+
+    using GDoubleArray = cv::GArray<double>;
+    G_TYPED_KERNEL(GNegateArray, <GDoubleArray(GDoubleArray)>, "org.opencv.test.negate_array")
+    {
+        static GArrayDesc outMeta(const GArrayDesc&) { return empty_array_desc(); }
+    };
+
+    GAPI_OCV_KERNEL(GNegateArrayImpl, GNegateArray)
+    {
+        static void run(const std::vector<double>& in, std::vector<double>& out)
+        {
+            ade::util::transform(in, std::back_inserter(out), std::negate<double>());
+        }
+    };
+
+    G_TYPED_KERNEL(GMaxInArray, <GScalar(GDoubleArray)>, "org.opencv.test.max_in_array")
+    {
+        static GScalarDesc outMeta(const GArrayDesc&) { return empty_scalar_desc(); }
+    };
+
+    GAPI_OCV_KERNEL(GMaxInArrayImpl, GMaxInArray)
+    {
+        static void run(const std::vector<double>& in, cv::Scalar& out)
+        {
+            out = *std::max_element(in.begin(), in.end());
+        }
+    };
+
+    G_TYPED_KERNEL(GCompoundMaxInArray, <GScalar(GDoubleArray)>, "org.opencv.test.compound_max_in_array")
+    {
+        static GScalarDesc outMeta(const GArrayDesc&) { return empty_scalar_desc(); }
+    };
+
+    GAPI_COMPOUND_KERNEL(GCompoundMaxInArrayImpl, GCompoundMaxInArray)
+    {
+        static GScalar expand(GDoubleArray in)
+        {
+            return GMaxInArray::on(in);
+        }
+    };
+
+    G_TYPED_KERNEL(GCompoundNegateArray, <GDoubleArray(GDoubleArray)>, "org.opencv.test.compound_negate_array")
+    {
+        static GArrayDesc outMeta(const GArrayDesc&) { return empty_array_desc(); }
+    };
+
+    GAPI_COMPOUND_KERNEL(GCompoundNegateArrayImpl, GCompoundNegateArray)
+    {
+        static GDoubleArray expand(GDoubleArray in)
+        {
+            return GNegateArray::on(in);
+        }
+    };
+
+    G_TYPED_KERNEL(SetDiagKernel, <GMat(GMat, GDoubleArray)>, "org.opencv.test.set_diag_kernel")
+    {
+        static GMatDesc outMeta(GMatDesc in, GArrayDesc) { return in; }
+    };
+
+    void setDiag(cv::Mat& in, const std::vector<double>& diag)
+    {
+        GAPI_Assert(in.rows == static_cast<int>(diag.size()));
+        GAPI_Assert(in.cols == static_cast<int>(diag.size()));
+        for (int i = 0; i < in.rows; ++i)
+        {
+            in.at<uchar>(i, i) = static_cast<uchar>(diag[i]);
+        }
+    }
+
+    GAPI_OCV_KERNEL(SetDiagKernelImpl, SetDiagKernel)
+    {
+        static void run(const cv::Mat& in, const std::vector<double>& v, cv::Mat& out)
+        {
+            in.copyTo(out);
+            setDiag(out, v);
+        }
+    };
+
+    G_TYPED_KERNEL(GCompoundGMatGArrayGMat, <GMat(GMat, GDoubleArray, GMat)>, "org.opencv.test.compound_gmat_garray_gmat")
+    {
+        static GMatDesc outMeta(GMatDesc in, GArrayDesc, GMatDesc) { return in; }
+    };
+
+    GAPI_COMPOUND_KERNEL(GCompoundGMatGArrayGMatImpl, GCompoundGMatGArrayGMat)
+    {
+        static GMat expand(GMat a, GDoubleArray b, GMat c)
+        {
+            return SetDiagKernel::on(cv::gapi::add(a, c), b);
+        }
+    };
+
+} // namespace
+
+// FIXME: avoid cv::gapi::combine() that unites custom and default kernels together
+TEST(GCompoundKernel, ReplaceDefaultKernel)
+{
+    cv::GMat in1, in2;
+    auto out = cv::gapi::add(in1, in2);
+    const auto custom_pkg = cv::gapi::kernels<GCompoundAddImpl>();
+    const auto full_pkg   = cv::gapi::combine(cv::gapi::core::cpu::kernels(), custom_pkg, cv::unite_policy::REPLACE);
+    cv::GComputation comp(cv::GIn(in1, in2), cv::GOut(out));
+    cv::Mat in_mat1 = cv::Mat::eye(3, 3, CV_8UC1),
+            in_mat2 = cv::Mat::eye(3, 3, CV_8UC1),
+            out_mat(3, 3, CV_8UC1),
+            ref_mat(3, 3, CV_8UC1);
+
+    comp.apply(cv::gin(in_mat1, in_mat2), cv::gout(out_mat), cv::compile_args(full_pkg));
+    ref_mat = in_mat1 - in_mat2 - in_mat2;
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(GCompoundKernel, DoubleAddC)
+{
+    cv::GMat in1, in2;
+    cv::GScalar s;
+    auto add_res   = cv::gapi::add(in1, in2);
+    auto super     = GCompoundDoubleAddC::on(add_res, s);
+    auto out       = cv::gapi::addC(super, s);
+
+    const auto custom_pkg = cv::gapi::kernels<GCompoundDoubleAddCImpl>();
+    const auto full_pkg   = cv::gapi::combine(custom_pkg, cv::gapi::core::cpu::kernels(), cv::unite_policy::KEEP);
+    cv::GComputation comp(cv::GIn(in1, in2, s), cv::GOut(out));
+
+    cv::Mat in_mat1 = cv::Mat::eye(3, 3, CV_8UC1),
+        in_mat2 = cv::Mat::eye(3, 3, CV_8UC1),
+        out_mat(3, 3, CV_8UC1),
+        ref_mat(3, 3, CV_8UC1);
+
+    cv::Scalar scalar = 2;
+
+    comp.apply(cv::gin(in_mat1, in_mat2, scalar), cv::gout(out_mat), cv::compile_args(full_pkg));
+    ref_mat = in_mat1 + in_mat2 + scalar + scalar + scalar;
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(GCompoundKernel, AddC)
+{
+    cv::GMat in1, in2;
+    cv::GScalar s;
+    auto add_res   = cv::gapi::add(in1, in2);
+    auto super     = GCompoundAddC::on(add_res, s);
+    auto out       = cv::gapi::addC(super, s);
+
+    const auto custom_pkg = cv::gapi::kernels<GCompoundAddCImpl>();
+    const auto full_pkg   = cv::gapi::combine(custom_pkg, cv::gapi::core::cpu::kernels(), cv::unite_policy::KEEP);
+    cv::GComputation comp(cv::GIn(in1, in2, s), cv::GOut(out));
+
+    cv::Mat in_mat1 = cv::Mat::eye(3, 3, CV_8UC1),
+        in_mat2 = cv::Mat::eye(3, 3, CV_8UC1),
+        out_mat(3, 3, CV_8UC1),
+        ref_mat(3, 3, CV_8UC1);
+
+    cv::Scalar scalar = 2;
+
+    comp.apply(cv::gin(in_mat1, in_mat2, scalar), cv::gout(out_mat), cv::compile_args(full_pkg));
+    ref_mat = in_mat1 + in_mat2 + scalar + scalar;
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(GCompoundKernel, MergeWithSplit)
+{
+    cv::GMat in, a1, b1, c1,
+        a2, b2, c2;
+
+    std::tie(a1, b1, c1) = cv::gapi::split3(in);
+    std::tie(a2, b2, c2) = GCompoundMergeWithSplit::on(a1, b1, c1);
+    auto out = cv::gapi::merge3(a2, b2, c2);
+
+    const auto custom_pkg = cv::gapi::kernels<GCompoundMergeWithSplitImpl>();
+    const auto full_pkg   = cv::gapi::combine(custom_pkg, cv::gapi::core::cpu::kernels(), cv::unite_policy::KEEP);
+    cv::GComputation comp(cv::GIn(in), cv::GOut(out));
+
+    cv::Mat in_mat = cv::Mat::eye(3, 3, CV_8UC3), out_mat, ref_mat;
+    comp.apply(cv::gin(in_mat), cv::gout(out_mat), cv::compile_args(full_pkg));
+    ref_mat = in_mat;
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(GCompoundKernel, AddWithAddC)
+{
+    cv::GMat in1, in2;
+    cv::GScalar s;
+    auto out = GCompoundAddWithAddC::on(in1, in2, s);
+
+    const auto custom_pkg = cv::gapi::kernels<GCompoundAddWithAddCImpl>();
+    const auto full_pkg   = cv::gapi::combine(custom_pkg, cv::gapi::core::cpu::kernels(), cv::unite_policy::KEEP);
+    cv::GComputation comp(cv::GIn(in1, in2, s), cv::GOut(out));
+
+    cv::Mat in_mat1 = cv::Mat::eye(3, 3, CV_8UC1),
+        in_mat2 = cv::Mat::eye(3, 3, CV_8UC1),
+        out_mat(3, 3, CV_8UC1),
+        ref_mat(3, 3, CV_8UC1);
+
+    cv::Scalar scalar = 2;
+
+    comp.apply(cv::gin(in_mat1, in_mat2, scalar), cv::gout(out_mat), cv::compile_args(full_pkg));
+    ref_mat = in_mat1 + in_mat2 + scalar;
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(GCompoundKernel, SplitWithAdd)
+{
+    cv::GMat in, out1, out2;
+    std::tie(out1, out2) = GCompoundSplitWithAdd::on(in);
+
+    const auto custom_pkg = cv::gapi::kernels<GCompoundSplitWithAddImpl>();
+    const auto full_pkg   = cv::gapi::combine(custom_pkg, cv::gapi::core::cpu::kernels(), cv::unite_policy::KEEP);
+    cv::GComputation comp(cv::GIn(in), cv::GOut(out1, out2));
+
+    cv::Mat in_mat = cv::Mat::eye(3, 3, CV_8UC3),
+        out_mat1(3, 3, CV_8UC1),
+        out_mat2(3, 3, CV_8UC1),
+        ref_mat1(3, 3, CV_8UC1),
+        ref_mat2(3, 3, CV_8UC1);
+
+    comp.apply(cv::gin(in_mat), cv::gout(out_mat1, out_mat2), cv::compile_args(full_pkg));
+
+    std::vector<cv::Mat> channels(3);
+    cv::split(in_mat, channels);
+
+    ref_mat1 = channels[0] + channels[1];
+    ref_mat2 = channels[2];
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat1 != ref_mat1));
+    EXPECT_EQ(0, cv::countNonZero(out_mat2 != ref_mat2));
+}
+
+TEST(GCompoundKernel, ParallelAddC)
+{
+    cv::GMat in1, out1, out2;
+    cv::GScalar in2;
+    std::tie(out1, out2) = GCompoundParallelAddC::on(in1, in2);
+
+    const auto custom_pkg = cv::gapi::kernels<GCompoundParallelAddCImpl>();
+    const auto full_pkg   = cv::gapi::combine(custom_pkg, cv::gapi::core::cpu::kernels(), cv::unite_policy::KEEP);
+    cv::GComputation comp(cv::GIn(in1, in2), cv::GOut(out1, out2));
+
+    cv::Mat in_mat = cv::Mat::eye(3, 3, CV_8UC1),
+        out_mat1(3, 3, CV_8UC1),
+        out_mat2(3, 3, CV_8UC1),
+        ref_mat1(3, 3, CV_8UC1),
+        ref_mat2(3, 3, CV_8UC1);
+
+    cv::Scalar scalar = 2;
+
+    comp.apply(cv::gin(in_mat, scalar), cv::gout(out_mat1, out_mat2), cv::compile_args(full_pkg));
+
+    ref_mat1 = in_mat + scalar;
+    ref_mat2 = in_mat + scalar;
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat1 != ref_mat1));
+    EXPECT_EQ(0, cv::countNonZero(out_mat2 != ref_mat2));
+}
+
+TEST(GCompoundKernel, GCompoundKernelAndDefaultUseOneData)
+{
+    cv::GMat in1, in2;
+    cv::GScalar s;
+    auto out = cv::gapi::add(GCompoundAddWithAddC::on(in1, in2, s), cv::gapi::addC(in2, s));
+
+    const auto custom_pkg = cv::gapi::kernels<GCompoundAddWithAddCImpl>();
+    const auto full_pkg   = cv::gapi::combine(custom_pkg, cv::gapi::core::cpu::kernels(), cv::unite_policy::KEEP);
+    cv::GComputation comp(cv::GIn(in1, in2, s), cv::GOut(out));
+
+    cv::Mat in_mat1 = cv::Mat::eye(3, 3, CV_8UC1),
+        in_mat2 = cv::Mat::eye(3, 3, CV_8UC1),
+        out_mat(3, 3, CV_8UC1),
+        ref_mat(3, 3, CV_8UC1);
+
+    cv::Scalar scalar = 2;
+
+    comp.apply(cv::gin(in_mat1, in_mat2, scalar), cv::gout(out_mat), cv::compile_args(full_pkg));
+    ref_mat = in_mat1 + in_mat2 + scalar + in_mat2 + scalar;
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(GCompoundKernel, CompoundExpandedToCompound)
+{
+    cv::GMat in1, in2;
+    cv::GScalar s;
+    auto out = GCompoundAddWithAddCWithDoubleAddC::on(in1, in2, s);
+
+    const auto custom_pkg = cv::gapi::kernels<GCompoundAddWithAddCWithDoubleAddCImpl,
+                                              GCompoundAddWithAddCImpl,
+                                              GCompoundDoubleAddCImpl>();
+
+    const auto full_pkg   = cv::gapi::combine(custom_pkg, cv::gapi::core::cpu::kernels(), cv::unite_policy::KEEP);
+    cv::GComputation comp(cv::GIn(in1, in2, s), cv::GOut(out));
+
+    cv::Mat in_mat1 = cv::Mat::eye(3, 3, CV_8UC1),
+            in_mat2 = cv::Mat::eye(3, 3, CV_8UC1),
+            out_mat(3, 3, CV_8UC1),
+            ref_mat(3, 3, CV_8UC1);
+
+    cv::Scalar scalar = 2;
+
+    comp.apply(cv::gin(in_mat1, in_mat2, scalar), cv::gout(out_mat), cv::compile_args(full_pkg));
+    ref_mat = in_mat1 + in_mat2 + scalar + scalar + scalar;
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(GCompoundKernel, MaxInArray)
+{
+    GDoubleArray in;
+    auto out = GCompoundMaxInArray::on(in);
+    const auto custom_pkg = cv::gapi::kernels<GCompoundMaxInArrayImpl, GMaxInArrayImpl>();
+    const auto full_pkg   = cv::gapi::combine(custom_pkg, cv::gapi::core::cpu::kernels(), cv::unite_policy::KEEP);
+    cv::GComputation comp(cv::GIn(in), cv::GOut(out));
+    std::vector<double> v = { 1, 5, -2, 3, 10, 2};
+    cv::Scalar out_scl;
+    cv::Scalar ref_scl(*std::max_element(v.begin(), v.end()));
+
+    comp.apply(cv::gin(v), cv::gout(out_scl), cv::compile_args(full_pkg));
+
+    EXPECT_EQ(out_scl, ref_scl);
+}
+
+TEST(GCompoundKernel, NegateArray)
+{
+    GDoubleArray in;
+    GDoubleArray out = GCompoundNegateArray::on(in);
+    const auto custom_pkg = cv::gapi::kernels<GCompoundNegateArrayImpl, GNegateArrayImpl>();
+    const auto full_pkg   = cv::gapi::combine(custom_pkg, cv::gapi::core::cpu::kernels(), cv::unite_policy::KEEP);
+    cv::GComputation comp(cv::GIn(in), cv::GOut(out));
+    std::vector<double> in_v = {1, 5, -2, -10, 3};
+    std::vector<double> out_v;
+    std::vector<double> ref_v;
+    ade::util::transform(in_v, std::back_inserter(ref_v), std::negate<double>());
+
+    comp.apply(cv::gin(in_v), cv::gout(out_v), cv::compile_args(full_pkg));
+
+    EXPECT_EQ(out_v, ref_v);
+}
+
+TEST(GCompoundKernel, RightGArrayHandle)
+{
+    cv::GMat in[2];
+    GDoubleArray a;
+    cv::GMat out = GCompoundGMatGArrayGMat::on(in[0], a, in[1]);
+    const auto custom_pkg = cv::gapi::kernels<GCompoundGMatGArrayGMatImpl, SetDiagKernelImpl>();
+    const auto full_pkg   = cv::gapi::combine(custom_pkg, cv::gapi::core::cpu::kernels(), cv::unite_policy::KEEP);
+    cv::GComputation comp(cv::GIn(in[0], a, in[1]), cv::GOut(out));
+    std::vector<double> in_v(3, 1.0);
+    cv::Mat in_mat1 = cv::Mat::eye(cv::Size(3, 3), CV_8UC1),
+            in_mat2 = cv::Mat::eye(cv::Size(3, 3), CV_8UC1),
+            out_mat;
+    cv::Mat ref_mat = in_mat1 + in_mat2;
+    setDiag(ref_mat, in_v);
+
+    comp.apply(cv::gin(in_mat1, in_v, in_mat2), cv::gout(out_mat), cv::compile_args(full_pkg));
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+} // opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests.cpp
new file mode 100644 (file)
index 0000000..eb77612
--- /dev/null
@@ -0,0 +1,9 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "gapi_core_tests_inl.hpp"
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests.hpp
new file mode 100644 (file)
index 0000000..77a82df
--- /dev/null
@@ -0,0 +1,153 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_CORE_TESTS_HPP
+#define OPENCV_GAPI_CORE_TESTS_HPP
+
+#include <iostream>
+
+#include "gapi_tests_common.hpp"
+
+namespace opencv_test
+{
+enum mathOp
+{
+    ADD = 0,
+    SUB = 1,
+    MUL = 2,
+    DIV = 3
+};
+
+enum bitwiseOp
+{
+    AND = 0,
+    OR = 1,
+    XOR = 2,
+    NOT = 3
+};
+
+namespace
+{
+const char *MathOperations[] = {"ADD", "SUB", "MUL", "DIV"};
+const char *BitwiseOperations[] = {"And", "Or", "Xor"};
+const char *CompareOperations[] = {"CMP_EQ", "CMP_GT", "CMP_GE", "CMP_LT", "CMP_LE", "CMP_NE"};
+// array indices correspond to OpenCV NormTypes values (unused entries left empty)
+const char *NormOperations[] = {"", "NORM_INF", "NORM_L1", "", "NORM_L2"};
+}
+
+
+struct PrintMathOpCoreParams
+{
+    template <class TestParams>
+    std::string operator()(const ::testing::TestParamInfo<TestParams>& info) const
+    {
+        std::stringstream ss;
+        cv::Size sz = std::get<4>(info.param);
+        ss<<MathOperations[std::get<0>(info.param)]
+                    <<"_"<<std::get<1>(info.param)
+                    <<"_"<<std::get<2>(info.param)
+                    <<"_"<<(int)std::get<3>(info.param)
+                    <<"_"<<sz.width
+                    <<"x"<<sz.height
+                    <<"_"<<(std::get<5>(info.param)+1)
+                    <<"_"<<std::get<6>(info.param)
+                    <<"_"<<std::get<7>(info.param);
+        return ss.str();
+    }
+};
+
+struct PrintCmpCoreParams
+{
+    template <class TestParams>
+    std::string operator()(const ::testing::TestParamInfo<TestParams>& info) const
+    {
+        std::stringstream ss;
+        cv::Size sz = std::get<3>(info.param);
+        ss<<CompareOperations[std::get<0>(info.param)]
+                    <<"_"<<std::get<1>(info.param)
+                    <<"_"<<std::get<2>(info.param)
+                    <<"_"<<sz.width
+                    <<"x"<<sz.height
+                    <<"_"<<std::get<4>(info.param);
+        return ss.str();
+    }
+};
+
+struct PrintBWCoreParams
+{
+    template <class TestParams>
+    std::string operator()(const ::testing::TestParamInfo<TestParams>& info) const
+    {
+        std::stringstream ss;
+        cv::Size sz = std::get<2>(info.param);
+        ss<<BitwiseOperations[std::get<0>(info.param)]
+                    <<"_"<<std::get<1>(info.param)
+                    <<"_"<<sz.width
+                    <<"x"<<sz.height
+                    <<"_"<<std::get<3>(info.param);
+        return ss.str();
+    }
+};
+
+struct PrintNormCoreParams
+{
+    template <class TestParams>
+    std::string operator()(const ::testing::TestParamInfo<TestParams>& info) const
+    {
+        std::stringstream ss;
+        cv::Size sz = std::get<2>(info.param);
+        ss<<NormOperations[std::get<0>(info.param)]
+                    <<"_"<<std::get<1>(info.param)
+                    <<"_"<<sz.width
+                    <<"x"<<sz.height;
+        return ss.str();
+    }
+};
+
+struct MathOpTest        : public TestParams<std::tuple<mathOp,bool,int,double,cv::Size,int,bool,bool,cv::GCompileArgs>>{};
+struct MulDoubleTest     : public TestParams<std::tuple<int,cv::Size,int,bool,cv::GCompileArgs>>{};
+struct DivTest           : public TestParams<std::tuple<int,cv::Size,int,bool, cv::GCompileArgs>>{};
+struct DivCTest          : public TestParams<std::tuple<int,cv::Size,int,bool, cv::GCompileArgs>>{};
+struct MeanTest          : public TestParams<std::tuple<int,cv::Size,bool, cv::GCompileArgs>> {};
+struct MaskTest          : public TestParams<std::tuple<int,cv::Size,bool, cv::GCompileArgs>> {};
+struct Polar2CartTest    : public TestParams<std::tuple<cv::Size,bool, cv::GCompileArgs>> {};
+struct Cart2PolarTest    : public TestParams<std::tuple<cv::Size,bool, cv::GCompileArgs>> {};
+struct CmpTest           : public TestParams<std::tuple<CmpTypes,bool,int,cv::Size,bool, cv::GCompileArgs>>{};
+struct BitwiseTest       : public TestParams<std::tuple<bitwiseOp,int,cv::Size,bool, cv::GCompileArgs>>{};
+struct NotTest           : public TestParams<std::tuple<int,cv::Size,bool, cv::GCompileArgs>> {};
+struct SelectTest        : public TestParams<std::tuple<int,cv::Size,bool, cv::GCompileArgs>> {};
+struct MinTest           : public TestParams<std::tuple<int,cv::Size,bool, cv::GCompileArgs>>{};
+struct MaxTest           : public TestParams<std::tuple<int,cv::Size,bool, cv::GCompileArgs>>{};
+struct AbsDiffTest       : public TestParams<std::tuple<int,cv::Size,bool, cv::GCompileArgs>>{};
+struct AbsDiffCTest      : public TestParams<std::tuple<int,cv::Size,bool, cv::GCompileArgs>> {};
+struct SumTest           : public TestParams<std::tuple<int, cv::Size,bool,double,cv::GCompileArgs>> {};
+struct AddWeightedTest   : public TestParams<std::tuple<int,cv::Size,int,bool,double,cv::GCompileArgs>>{};
+struct NormTest          : public TestParams<std::tuple<NormTypes,int,cv::Size, double, cv::GCompileArgs>>{};
+struct IntegralTest      : public TestWithParam<std::tuple<int,cv::Size, cv::GCompileArgs>> {};
+struct ThresholdTest     : public TestParams<std::tuple<int,cv::Size,int,bool, cv::GCompileArgs>> {};
+struct ThresholdOTTest   : public TestParams<std::tuple<int,cv::Size,int,bool, cv::GCompileArgs>> {};
+struct InRangeTest       : public TestParams<std::tuple<int,cv::Size,bool, cv::GCompileArgs>> {};
+struct Split3Test        : public TestParams<std::tuple<cv::Size, cv::GCompileArgs>> {};
+struct Split4Test        : public TestParams<std::tuple<cv::Size, cv::GCompileArgs>> {};
+struct ResizeTest        : public TestWithParam<std::tuple<compare_f, int, int, cv::Size, cv::Size, cv::GCompileArgs>> {};
+struct ResizeTestFxFy    : public TestWithParam<std::tuple<compare_f, int, int, cv::Size, double, double, cv::GCompileArgs>> {};
+struct Merge3Test        : public TestParams<std::tuple<cv::Size, cv::GCompileArgs>> {};
+struct Merge4Test        : public TestParams<std::tuple<cv::Size, cv::GCompileArgs>> {};
+struct RemapTest         : public TestParams<std::tuple<int,cv::Size,bool, cv::GCompileArgs>> {};
+struct FlipTest          : public TestParams<std::tuple<int, int, cv::Size,bool, cv::GCompileArgs>> {};
+struct CropTest          : public TestParams<std::tuple<int,cv::Rect,cv::Size,bool, cv::GCompileArgs>> {};
+struct ConcatHorTest     : public TestWithParam<std::tuple<int, cv::Size, cv::GCompileArgs>> {};
+struct ConcatVertTest    : public TestWithParam<std::tuple<int, cv::Size, cv::GCompileArgs>> {};
+struct ConcatVertVecTest : public TestWithParam<std::tuple<int, cv::Size, cv::GCompileArgs>> {};
+struct ConcatHorVecTest  : public TestWithParam<std::tuple<int, cv::Size, cv::GCompileArgs>> {};
+struct LUTTest           : public TestParams<std::tuple<int, int, cv::Size,bool, cv::GCompileArgs>> {};
+struct ConvertToTest     : public TestParams<std::tuple<int, int, cv::Size, cv::GCompileArgs>> {};
+struct PhaseTest         : public TestParams<std::tuple<int, cv::Size, bool, cv::GCompileArgs>> {};
+struct SqrtTest          : public TestParams<std::tuple<int, cv::Size, cv::GCompileArgs>> {};
+} // opencv_test
+
+#endif //OPENCV_GAPI_CORE_TESTS_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests_inl.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_core_tests_inl.hpp
new file mode 100644 (file)
index 0000000..d33b5cc
--- /dev/null
@@ -0,0 +1,1479 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_CORE_TESTS_INL_HPP
+#define OPENCV_GAPI_CORE_TESTS_INL_HPP
+
+#include "opencv2/gapi/core.hpp"
+#include "gapi_core_tests.hpp"
+
+namespace opencv_test
+{
+
+TEST_P(MathOpTest, MatricesAccuracyTest)
+{
+    mathOp opType = ADD;
+    int type = 0, dtype = 0;
+    cv::Size sz;
+    double scale = 1; // mul, div
+    bool testWithScalar = false, initOutMatr = false, doReverseOp = false;
+    cv::GCompileArgs compile_args;
+    std::tie(opType, testWithScalar, type, scale, sz, dtype, initOutMatr, doReverseOp, compile_args) = GetParam();
+    initMatsRandU(type, sz, dtype, initOutMatr);
+
+    // G-API code & corresponding OpenCV code ////////////////////////////////
+    cv::GMat in1, in2, out;
+    if( testWithScalar )
+    {
+        cv::GScalar sc1;
+        switch(opType)
+        {
+        case (ADD):
+        {
+            out = cv::gapi::addC(in1, sc1, dtype);
+            cv::add(in_mat1, sc, out_mat_ocv, cv::noArray(), dtype);
+            break;
+        }
+        case (SUB):
+        {
+            if( doReverseOp )
+            {
+                out = cv::gapi::subRC(sc1, in1, dtype);
+                cv::subtract(sc, in_mat1, out_mat_ocv, cv::noArray(), dtype);
+            }
+            else
+            {
+                out = cv::gapi::subC(in1, sc1, dtype);
+                cv::subtract(in_mat1, sc, out_mat_ocv, cv::noArray(), dtype);
+            }
+            break;
+        }
+        case (DIV):
+        {
+            if( doReverseOp )
+            {
+                in_mat1.setTo(1, in_mat1 == 0);  // avoid zeros in divide input data
+                out = cv::gapi::divRC(sc1, in1, scale, dtype);
+                cv::divide(sc, in_mat1, out_mat_ocv, scale, dtype);
+                break;
+            }
+            else
+            {
+                sc += Scalar(1, 1, 1, 1);  // avoid zeros in divide input data
+                out = cv::gapi::divC(in1, sc1, scale, dtype);
+                cv::divide(in_mat1, sc, out_mat_ocv, scale, dtype);
+                break;
+            }
+        }
+        case (MUL):
+        {
+            // FIXME: add `scale` parameter to mulC
+            out = cv::gapi::mulC(in1, sc1, /* scale, */ dtype);
+            cv::multiply(in_mat1, sc, out_mat_ocv, 1., dtype);
+            break;
+        }
+        default:
+        {
+            FAIL() << "no such math operation type for scalar and matrix!";
+        }
+        }
+        cv::GComputation c(GIn(in1, sc1), GOut(out));
+        c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+    }
+    else
+    {
+        switch(opType)
+        {
+        case (ADD):
+        {
+            out = cv::gapi::add(in1, in2, dtype);
+            cv::add(in_mat1, in_mat2, out_mat_ocv, cv::noArray(), dtype);
+            break;
+        }
+        case (SUB):
+        {
+            out = cv::gapi::sub(in1, in2, dtype);
+            cv::subtract(in_mat1, in_mat2, out_mat_ocv, cv::noArray(), dtype);
+            break;
+        }
+        case (DIV):
+        {
+            in_mat2.setTo(1, in_mat2 == 0);  // avoid zeros in divide input data
+            out = cv::gapi::div(in1, in2, scale, dtype);
+            cv::divide(in_mat1, in_mat2, out_mat_ocv, scale, dtype);
+            break;
+        }
+        case (MUL):
+        {
+            out = cv::gapi::mul(in1, in2, scale, dtype);
+            cv::multiply(in_mat1, in_mat2, out_mat_ocv, scale, dtype);
+            break;
+        }
+        default:
+        {
+            FAIL() << "no such math operation type for matrix and matrix!";
+        }
+        }
+        cv::GComputation c(GIn(in1, in2), GOut(out));
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+    // TODO: make the threshold-vs-bit-exact criterion driven by a test parameter
+    #if 1
+        if (CV_MAT_DEPTH(out_mat_ocv.type()) != CV_32F &&
+            CV_MAT_DEPTH(out_mat_ocv.type()) != CV_64F)
+        {
+            // integral: allow 1% of differences, and no diffs by >1 unit
+            EXPECT_LE(countNonZeroPixels(cv::abs(out_mat_gapi - out_mat_ocv) > 0),
+                                                           0.01*out_mat_ocv.total());
+            EXPECT_LE(countNonZeroPixels(cv::abs(out_mat_gapi - out_mat_ocv) > 1), 0);
+        }
+        else
+        {
+            // floating-point: expect 6 decimal digits - best we expect of F32
+            EXPECT_EQ(0, cv::countNonZero(cv::abs(out_mat_gapi - out_mat_ocv) >
+                                                    1e-6*cv::abs(out_mat_ocv)));
+        }
+    #else
+        EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+    #endif
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(MulDoubleTest, AccuracyTest)
+{
+    auto param = GetParam();
+    int type = std::get<0>(param);
+    int dtype = std::get<2>(param);
+    cv::Size sz_in = std::get<1>(param);
+    bool initOut = std::get<3>(param);
+
+    auto& rng = cv::theRNG();
+    double d = rng.uniform(0.0, 10.0);
+    auto compile_args = std::get<4>(param);
+    initMatrixRandU(type, sz_in, dtype, initOut);
+
+    // G-API code ////////////////////////////////////////////////////////////
+    cv::GMat in1, out;
+    out = cv::gapi::mulC(in1, d, dtype);
+    cv::GComputation c(in1, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    // OpenCV code ///////////////////////////////////////////////////////////
+    cv::multiply(in_mat1, d, out_mat_ocv, 1, dtype);
+
+    // Comparison ////////////////////////////////////////////////////////////
+#if 1
+    if (CV_MAT_DEPTH(out_mat_ocv.type()) != CV_32F &&
+        CV_MAT_DEPTH(out_mat_ocv.type()) != CV_64F)
+    {
+        // integral: allow 1% of differences, and no diffs by >1 unit
+        EXPECT_LE(countNonZeroPixels(cv::abs(out_mat_gapi - out_mat_ocv) > 0),
+                                                    0.01*out_mat_ocv.total());
+        EXPECT_LE(countNonZeroPixels(cv::abs(out_mat_gapi - out_mat_ocv) > 1), 0);
+    }
+    else
+    {
+        // floating-point: expect 6 decimal digits - best we expect of F32
+        EXPECT_EQ(0, cv::countNonZero(cv::abs(out_mat_gapi - out_mat_ocv) >
+            1e-6*cv::abs(out_mat_ocv)));
+    }
+#else
+    EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+#endif
+    EXPECT_EQ(out_mat_gapi.size(), sz_in);
+}
+
+TEST_P(DivTest, DISABLED_DivByZeroTest)  // https://github.com/opencv/opencv/pull/12826
+{
+    int type = 0, dtype = 0;
+    cv::Size sz_in;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(type, sz_in, dtype, initOut, compile_args) = GetParam();
+
+    initMatrixRandU(type, sz_in, dtype, initOut);
+    in_mat2 = cv::Mat(sz_in, type);
+    in_mat2.setTo(cv::Scalar::all(0));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2;
+    auto out = cv::gapi::div(in1, in2, 1.0, dtype);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::divide(in_mat1, in_mat2, out_mat_ocv, 1.0, dtype);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(DivCTest, DISABLED_DivByZeroTest)  // https://github.com/opencv/opencv/pull/12826
+{
+    int type = 0, dtype = 0;
+    cv::Size sz_in;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(type, sz_in, dtype, initOut, compile_args) = GetParam();
+
+    initMatrixRandU(type, sz_in, dtype, initOut);
+    sc = cv::Scalar::all(0);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1;
+    cv::GScalar sc1;
+    auto out = cv::gapi::divC(in1, sc1, 1.0, dtype);
+    cv::GComputation c(GIn(in1, sc1), GOut(out));
+
+    c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::divide(in_mat1, sc, out_mat_ocv, 1.0, dtype);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+        cv::Mat zeros = cv::Mat::zeros(sz_in, type);
+        EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != zeros));
+    }
+}
+
+TEST_P(MeanTest, AccuracyTest)
+{
+    int type = 0;
+    bool initOut = false;
+    cv::Size sz_in;
+    cv::GCompileArgs compile_args;
+    std::tie(type, sz_in, initOut, compile_args) = GetParam();
+    initMatrixRandU(type, sz_in, initOut);
+    cv::Scalar out_norm;
+    cv::Scalar out_norm_ocv;
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::mean(in);
+
+    cv::GComputation c(cv::GIn(in), cv::GOut(out));
+    c.apply(cv::gin(in_mat1), cv::gout(out_norm), std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        out_norm_ocv = cv::mean(in_mat1);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(out_norm[0], out_norm_ocv[0]);
+    }
+}
+
+TEST_P(MaskTest, AccuracyTest)
+{
+    int type = 0;
+    bool initOut = false;
+    cv::Size sz_in;
+    cv::GCompileArgs compile_args;
+    std::tie(type, sz_in, initOut, compile_args) = GetParam();
+    initMatrixRandU(type, sz_in, type, initOut);
+
+    in_mat2 = cv::Mat(sz_in, CV_8UC1);
+    cv::randu(in_mat2, cv::Scalar::all(0), cv::Scalar::all(255));
+    in_mat2 = in_mat2 > 128;
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in, m;
+    auto out = cv::gapi::mask(in, m);
+
+    cv::GComputation c(cv::GIn(in, m), cv::GOut(out));
+    c.apply(cv::gin(in_mat1, in_mat2), cv::gout(out_mat_gapi), std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        out_mat_ocv = cv::Mat::zeros(in_mat1.size(), in_mat1.type());
+        in_mat1.copyTo(out_mat_ocv, in_mat2);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+    }
+}
+
+TEST_P(Polar2CartTest, AccuracyTest)
+{
+    auto param = GetParam();
+    cv::Size sz_in = std::get<0>(param);
+    auto compile_args = std::get<2>(param);
+    initMatsRandU(CV_32FC1, sz_in, CV_32FC1, std::get<1>(param));
+
+    cv::Mat out_mat2;
+    cv::Mat out_mat_ocv2;
+    if (std::get<1>(param))
+    {
+        out_mat2 = cv::Mat(sz_in, CV_32FC1);
+        out_mat_ocv2 = cv::Mat(sz_in, CV_32FC1);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, out1, out2;
+    std::tie(out1, out2) = cv::gapi::polarToCart(in1, in2);
+
+    cv::GComputation c(GIn(in1, in2), GOut(out1, out2));
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi, out_mat2), std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::polarToCart(in_mat1, in_mat2, out_mat_ocv, out_mat_ocv2);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        // Note that we cannot rely on bit-exact sin/cos functions used for this
+        // transform, so we need a threshold for verifying results vs reference.
+        //
+        // A relative threshold like 1e-6 is very restrictive - nearly the best
+        // we can expect of a single-precision elementary function implementation.
+        //
+        // However, it is a good idea to make such a threshold configurable: a
+        // parameter of this test which a specific test instantiation could set up.
+        //
+        // Note that test instantiation for the OpenCV back-end could even let
+        // the threshold equal to zero, as CV back-end calls the same kernel.
+        //
+        // TODO: Make threshold a configurable parameter of this test (ADE-221)
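+        //
+        // (Illustrative: with a 1e-6 relative threshold, a reference value of
+        // 100.f may differ from the G-API result by up to 1e-4 in absolute
+        // terms and still pass the checks below.)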
+
+        cv::Mat &outx = out_mat_gapi,
+                &outy = out_mat2;
+        cv::Mat &refx = out_mat_ocv,
+                &refy = out_mat_ocv2;
+        cv::Mat difx = cv::abs(refx - outx),
+                dify = cv::abs(refy - outy);
+        cv::Mat absx = cv::abs(refx),
+                absy = cv::abs(refy);
+
+        EXPECT_EQ(0, cv::countNonZero(difx > 1e-6*absx));
+        EXPECT_EQ(0, cv::countNonZero(dify > 1e-6*absy));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(Cart2PolarTest, AccuracyTest)
+{
+    auto param = GetParam();
+    cv::Size sz_in = std::get<0>(param);
+    auto compile_args = std::get<2>(param);
+    initMatsRandU(CV_32FC1, sz_in, CV_32FC1, std::get<1>(param));
+
+    cv::Mat out_mat2(sz_in, CV_32FC1);
+    cv::Mat out_mat_ocv2(sz_in, CV_32FC1);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, out1, out2;
+    std::tie(out1, out2) = cv::gapi::cartToPolar(in1, in2);
+
+    cv::GComputation c(GIn(in1, in2), GOut(out1, out2));
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi, out_mat2), std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cartToPolar(in_mat1, in_mat2, out_mat_ocv, out_mat_ocv2);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        // Note that we cannot rely on bit-exact sin/cos functions used for this
+        // transform, so we need a threshold for verifying results vs reference.
+        //
+        // A relative threshold like 1e-6 is very restrictive - nearly the best
+        // we can expect of a single-precision elementary function implementation.
+        //
+        // However, it is a good idea to make such a threshold configurable: a
+        // parameter of this test which a specific test instantiation could set up.
+        //
+        // Note that test instantiation for the OpenCV back-end could even let
+        // the threshold equal to zero, as CV back-end calls the same kernel.
+        //
+        // TODO: Make threshold a configurable parameter of this test (ADE-221)
+
+        cv::Mat &outm = out_mat_gapi,
+                &outa = out_mat2;
+        cv::Mat &refm = out_mat_ocv,
+                &refa = out_mat_ocv2;
+        cv::Mat difm = cv::abs(refm - outm),
+                difa = cv::abs(refa - outa);
+        cv::Mat absm = cv::abs(refm),
+                absa = cv::abs(refa);
+
+        // FIXME: Angle result looks inaccurate in OpenCV
+        //        (expected relative accuracy like 1e-6)
+        EXPECT_EQ(0, cv::countNonZero(difm > 1e-6*absm));
+        EXPECT_EQ(0, cv::countNonZero(difa > 1e-3*absa));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(CmpTest, AccuracyTest)
+{
+    CmpTypes opType = CMP_EQ;
+    int type = 0;
+    cv::Size sz;
+    bool testWithScalar = false, initOutMatr = false;
+    cv::GCompileArgs compile_args;
+    std::tie(opType, testWithScalar, type, sz, initOutMatr, compile_args) = GetParam();
+    initMatsRandU(type, sz, CV_8U, initOutMatr);
+
+    // G-API code & corresponding OpenCV code ////////////////////////////////
+    cv::GMat in1, out;
+    if( testWithScalar )
+    {
+        cv::GScalar in2;
+        switch(opType)
+        {
+        case CMP_EQ: out = cv::gapi::cmpEQ(in1, in2); break;
+        case CMP_GT: out = cv::gapi::cmpGT(in1, in2); break;
+        case CMP_GE: out = cv::gapi::cmpGE(in1, in2); break;
+        case CMP_LT: out = cv::gapi::cmpLT(in1, in2); break;
+        case CMP_LE: out = cv::gapi::cmpLE(in1, in2); break;
+        case CMP_NE: out = cv::gapi::cmpNE(in1, in2); break;
+        default: FAIL() << "no such compare operation type for matrix and scalar!";
+        }
+
+        cv::compare(in_mat1, sc, out_mat_ocv, opType);
+
+        cv::GComputation c(GIn(in1, in2), GOut(out));
+        c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+    }
+    else
+    {
+        cv::GMat in2;
+        switch(opType)
+        {
+        case CMP_EQ: out = cv::gapi::cmpEQ(in1, in2); break;
+        case CMP_GT: out = cv::gapi::cmpGT(in1, in2); break;
+        case CMP_GE: out = cv::gapi::cmpGE(in1, in2); break;
+        case CMP_LT: out = cv::gapi::cmpLT(in1, in2); break;
+        case CMP_LE: out = cv::gapi::cmpLE(in1, in2); break;
+        case CMP_NE: out = cv::gapi::cmpNE(in1, in2); break;
+        default: FAIL() << "no such compare operation type for two matrices!";
+        }
+
+        cv::compare(in_mat1, in_mat2, out_mat_ocv, opType);
+
+        cv::GComputation c(GIn(in1, in2), GOut(out));
+        c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(BitwiseTest, AccuracyTest)
+{
+    bitwiseOp opType = AND;
+    int type = 0;
+    cv::Size sz;
+    bool initOutMatr = false;
+    cv::GCompileArgs compile_args;
+    std::tie(opType, type, sz, initOutMatr, compile_args) = GetParam();
+    initMatsRandU(type, sz, type, initOutMatr);
+
+    // G-API code & corresponding OpenCV code ////////////////////////////////
+    cv::GMat in1, in2, out;
+    switch(opType)
+    {
+        case AND:
+        {
+            out = cv::gapi::bitwise_and(in1, in2);
+            cv::bitwise_and(in_mat1, in_mat2, out_mat_ocv);
+            break;
+        }
+        case OR:
+        {
+            out = cv::gapi::bitwise_or(in1, in2);
+            cv::bitwise_or(in_mat1, in_mat2, out_mat_ocv);
+            break;
+        }
+        case XOR:
+        {
+            out = cv::gapi::bitwise_xor(in1, in2);
+            cv::bitwise_xor(in_mat1, in_mat2, out_mat_ocv);
+            break;
+        }
+        default:
+        {
+            FAIL() << "no such bitwise operation type!";
+        }
+    }
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(NotTest, AccuracyTest)
+{
+    auto param = GetParam();
+    cv::Size sz_in = std::get<1>(param);
+    auto compile_args = std::get<3>(param);
+    initMatrixRandU(std::get<0>(param), sz_in, std::get<0>(param), std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::bitwise_not(in);
+    cv::GComputation c(in, out);
+
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::bitwise_not(in_mat1, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(SelectTest, AccuracyTest)
+{
+    auto param = GetParam();
+    int type = std::get<0>(param);
+    cv::Size sz_in = std::get<1>(param);
+    auto compile_args = std::get<3>(param);
+    initMatsRandU(type, sz_in, type, std::get<2>(param));
+    cv::Mat in_mask(sz_in, CV_8UC1);
+    cv::randu(in_mask, cv::Scalar::all(0), cv::Scalar::all(255));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, in3;
+    auto out = cv::gapi::select(in1, in2, in3);
+    cv::GComputation c(GIn(in1, in2, in3), GOut(out));
+
+    c.apply(gin(in_mat1, in_mat2, in_mask), gout(out_mat_gapi), std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        in_mat2.copyTo(out_mat_ocv);
+        in_mat1.copyTo(out_mat_ocv, in_mask);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(MinTest, AccuracyTest)
+{
+    auto param = GetParam();
+    cv::Size sz_in = std::get<1>(param);
+    auto compile_args = std::get<3>(param);
+    initMatsRandU(std::get<0>(param), sz_in, std::get<0>(param), std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2;
+    auto out = cv::gapi::min(in1, in2);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::min(in_mat1, in_mat2, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(MaxTest, AccuracyTest)
+{
+    auto param = GetParam();
+    cv::Size sz_in = std::get<1>(param);
+    auto compile_args = std::get<3>(param);
+    initMatsRandU(std::get<0>(param), sz_in, std::get<0>(param), std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2;
+    auto out = cv::gapi::max(in1, in2);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::max(in_mat1, in_mat2, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(AbsDiffTest, AccuracyTest)
+{
+    auto param = GetParam();
+    cv::Size sz_in = std::get<1>(param);
+    auto compile_args = std::get<3>(param);
+    initMatsRandU(std::get<0>(param), sz_in, std::get<0>(param), std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2;
+    auto out = cv::gapi::absDiff(in1, in2);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::absdiff(in_mat1, in_mat2, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(AbsDiffCTest, AccuracyTest)
+{
+    auto param = GetParam();
+    cv::Size sz_in = std::get<1>(param);
+    auto compile_args = std::get<3>(param);
+    initMatsRandU(std::get<0>(param), sz_in, std::get<0>(param), std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1;
+    cv::GScalar sc1;
+    auto out = cv::gapi::absDiffC(in1, sc1);
+    cv::GComputation c(cv::GIn(in1, sc1), cv::GOut(out));
+
+    c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::absdiff(in_mat1, sc, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(SumTest, AccuracyTest)
+{
+    auto param = GetParam();
+    cv::Size sz_in = std::get<1>(param);
+    auto tolerance = std::get<3>(param);
+    auto compile_args = std::get<4>(param);
+    //initMatrixRandU(std::get<0>(param), sz_in, std::get<2>(param));
+    initMatsRandN(std::get<0>(param), sz_in, std::get<2>(param)); // TODO: workaround trying to fix SumTest failures
+
+    cv::Scalar out_sum;
+    cv::Scalar out_sum_ocv;
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::sum(in);
+
+    cv::GComputation c(cv::GIn(in), cv::GOut(out));
+    c.apply(cv::gin(in_mat1), cv::gout(out_sum), std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        out_sum_ocv = cv::sum(in_mat1);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_LE(std::abs(out_sum[0] - out_sum_ocv[0]) / std::max(1.0, std::abs(out_sum_ocv[0])), tolerance)
+            << "OCV=" << out_sum_ocv[0] << "   GAPI=" << out_sum[0];
+    }
+}
+
+TEST_P(AddWeightedTest, AccuracyTest)
+{
+    int type = 0, dtype = 0;
+    cv::Size sz_in;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    double tolerance = 0.0;
+    std::tie(type, sz_in, dtype, initOut, tolerance, compile_args) = GetParam();
+
+    auto& rng = cv::theRNG();
+    double alpha = rng.uniform(0.0, 1.0);
+    double beta = rng.uniform(0.0, 1.0);
+    double gamma = rng.uniform(0.0, 1.0);
+    initMatsRandU(type, sz_in, dtype, initOut);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2;
+    auto out = cv::gapi::addWeighted(in1, alpha, in2, beta, gamma, dtype);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::addWeighted(in_mat1, alpha, in_mat2, beta, gamma, out_mat_ocv, dtype);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        // Note that we cannot expect bitwise-identical results for add-weighted:
+        //
+        //    tmp = src1*alpha + src2*beta + gamma;
+        //    dst = saturate<DST>( round(tmp) );
+        //
+        // Because tmp is floating-point, dst depends on compiler optimizations
+        //
+        // However, tmp must still be computed accurately, and the rounding must still be correct.
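+        //
+        // For example, with alpha = beta = 0.5 and gamma = 0, src1 = 10 and
+        // src2 = 11 give tmp = 10.5 exactly; a sub-ULP difference in how the
+        // products are accumulated can push tmp to either side of .5, so the
+        // rounded integral results may legitimately differ by 1.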
+
+        cv::Mat failures;
+
+        if (out_mat_ocv.type() == CV_32FC1)
+        {
+            // result: float - may vary in 7th decimal digit
+            failures = abs(out_mat_gapi - out_mat_ocv) > abs(out_mat_ocv) * 1e-6;
+        }
+        else
+        {
+            // result: integral - rounding may vary if fractional part of tmp
+            //                    is nearly 0.5
+
+            cv::Mat inexact, incorrect, diff, tmp;
+
+            inexact = out_mat_gapi != out_mat_ocv;
+
+            // even if rounded differently, check if still rounded correctly
+            cv::addWeighted(in_mat1, alpha, in_mat2, beta, gamma, tmp, CV_32F);
+            cv::subtract(out_mat_gapi, tmp, diff, cv::noArray(), CV_32F);
+            incorrect = abs(diff) >= tolerance; // e.g. 0.5000005f: 0.5 plus a 6-digit relative margin
+
+            failures = inexact & incorrect;
+        }
+
+        EXPECT_EQ(0, cv::countNonZero(failures));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(NormTest, AccuracyTest)
+{
+    NormTypes opType = NORM_INF;
+    int type = 0;
+    cv::Size sz;
+    double tolerance = 0.0;
+    cv::GCompileArgs compile_args;
+    std::tie(opType, type, sz, tolerance, compile_args) = GetParam();
+    initMatrixRandU(type, sz, type, false);
+
+    cv::Scalar out_norm;
+    cv::Scalar out_norm_ocv;
+
+    // G-API code & corresponding OpenCV code ////////////////////////////////
+    cv::GMat in1;
+    cv::GScalar out;
+    switch(opType)
+    {
+        case NORM_L1: out = cv::gapi::normL1(in1); break;
+        case NORM_L2: out = cv::gapi::normL2(in1); break;
+        case NORM_INF: out = cv::gapi::normInf(in1); break;
+        default: FAIL() << "no such norm operation type!";
+    }
+    out_norm_ocv = cv::norm(in_mat1, opType);
+    cv::GComputation c(GIn(in1), GOut(out));
+    c.apply(gin(in_mat1), gout(out_norm), std::move(compile_args));
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_LE(std::abs(out_norm[0] - out_norm_ocv[0]) / std::max(1.0, std::abs(out_norm_ocv[0])), tolerance)
+            << "OCV=" << out_norm_ocv[0] << "   GAPI=" << out_norm[0];
+    }
+}
+
+TEST_P(IntegralTest, AccuracyTest)
+{
+    int type = std::get<0>(GetParam());
+    cv::Size sz_in = std::get<1>(GetParam());
+    auto compile_args = std::get<2>(GetParam());
+
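+    // Match the sum-image depth cv::integral conventionally produces:
+    // CV_32S for 8-bit input, CV_64F otherwise (the squared-sum image
+    // requested below is always CV_64F).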
+    int type_out = (type == CV_8U) ? CV_32SC1 : CV_64FC1;
+    cv::Mat in_mat1(sz_in, type);
+
+    cv::randu(in_mat1, cv::Scalar::all(0), cv::Scalar::all(255));
+
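+    // An integral image is one pixel larger than the source in each dimension.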
+    cv::Size sz_out = cv::Size(sz_in.width + 1, sz_in.height + 1);
+    cv::Mat out_mat1(sz_out, type_out);
+    cv::Mat out_mat_ocv1(sz_out, type_out);
+
+    cv::Mat out_mat2(sz_out, CV_64FC1);
+    cv::Mat out_mat_ocv2(sz_out, CV_64FC1);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, out1, out2;
+    std::tie(out1, out2)  = cv::gapi::integral(in1, type_out, CV_64FC1);
+    cv::GComputation c(cv::GIn(in1), cv::GOut(out1, out2));
+
+    c.apply(cv::gin(in_mat1), cv::gout(out_mat1, out_mat2), std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::integral(in_mat1, out_mat_ocv1, out_mat_ocv2);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv1 != out_mat1));
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv2 != out_mat2));
+    }
+}
+
+TEST_P(ThresholdTest, AccuracyTestBinary)
+{
+    auto param = GetParam();
+    int type = std::get<0>(param);
+    cv::Size sz_in = std::get<1>(param);
+    int tt = std::get<2>(param);
+
+    auto compile_args = std::get<4>(param);
+    cv::Scalar thr = initScalarRandU(50);
+    cv::Scalar maxval = initScalarRandU(50) + cv::Scalar(50, 50, 50, 50);
+    initMatrixRandU(type, sz_in, type, std::get<3>(param));
+    cv::Scalar out_scalar;
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, out;
+    cv::GScalar th1, mv1;
+    out = cv::gapi::threshold(in1, th1, mv1, tt);
+    cv::GComputation c(GIn(in1, th1, mv1), GOut(out));
+
+    c.apply(gin(in_mat1, thr, maxval), gout(out_mat_gapi), std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::threshold(in_mat1, out_mat_ocv, thr.val[0], maxval.val[0], tt);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        ASSERT_EQ(out_mat_gapi.size(), sz_in);
+        EXPECT_EQ(0, cv::norm(out_mat_ocv, out_mat_gapi, NORM_L1));
+    }
+}
+
+TEST_P(ThresholdOTTest, AccuracyTestOtsu)
+{
+    auto param = GetParam();
+    int type = std::get<0>(param);
+    cv::Size sz_in = std::get<1>(param);
+    int tt = std::get<2>(param);
+
+    auto compile_args = std::get<4>(param);
+    cv::Scalar maxval = initScalarRandU(50) + cv::Scalar(50, 50, 50, 50);
+    initMatrixRandU(type, sz_in, type, std::get<3>(param));
+    cv::Scalar out_gapi_scalar;
+    double ocv_res;
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, out;
+    cv::GScalar mv1, scout;
+    std::tie<cv::GMat, cv::GScalar>(out, scout) = cv::gapi::threshold(in1, mv1, tt);
+    cv::GComputation c(cv::GIn(in1, mv1), cv::GOut(out, scout));
+
+    c.apply(gin(in_mat1, maxval), gout(out_mat_gapi, out_gapi_scalar), std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
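+        // With cv::THRESH_OTSU set in tt, the explicit threshold argument is
+        // ignored; cv::threshold computes the Otsu threshold and returns it.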
+        ocv_res = cv::threshold(in_mat1, out_mat_ocv, maxval.val[0], maxval.val[0], tt);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+        EXPECT_EQ(ocv_res, out_gapi_scalar.val[0]);
+    }
+}
+
+TEST_P(InRangeTest, AccuracyTest)
+{
+    auto param = GetParam();
+    int type = std::get<0>(param);
+    cv::Size sz_in = std::get<1>(param);
+
+    auto compile_args = std::get<3>(param);
+    cv::Scalar thrLow = initScalarRandU(100);
+    cv::Scalar thrUp = initScalarRandU(100) + cv::Scalar(100, 100, 100, 100);
+    initMatrixRandU(type, sz_in, type, std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1;
+    cv::GScalar th1, mv1;
+    auto out = cv::gapi::inRange(in1, th1, mv1);
+    cv::GComputation c(GIn(in1, th1, mv1), GOut(out));
+
+    c.apply(gin(in_mat1, thrLow, thrUp), gout(out_mat_gapi), std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::inRange(in_mat1, thrLow, thrUp, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(Split3Test, AccuracyTest)
+{
+    cv::Size sz_in = std::get<0>(GetParam());
+    auto compile_args = std::get<1>(GetParam());
+    initMatrixRandU(CV_8UC3, sz_in, CV_8UC1);
+
+    cv::Mat out_mat2 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat3 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv2 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv3 = cv::Mat(sz_in, CV_8UC1);
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, out1, out2, out3;
+    std::tie(out1, out2, out3)  = cv::gapi::split3(in1);
+    cv::GComputation c(cv::GIn(in1), cv::GOut(out1, out2, out3));
+
+    c.apply(cv::gin(in_mat1), cv::gout(out_mat_gapi, out_mat2, out_mat3), std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        std::vector<cv::Mat> out_mats_ocv = {out_mat_ocv, out_mat_ocv2, out_mat_ocv3};
+        cv::split(in_mat1, out_mats_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv  != out_mat_gapi));
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv2 != out_mat2));
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv3 != out_mat3));
+    }
+}
+
+TEST_P(Split4Test, AccuracyTest)
+{
+    cv::Size sz_in = std::get<0>(GetParam());
+    auto compile_args = std::get<1>(GetParam());
+    initMatrixRandU(CV_8UC4, sz_in, CV_8UC1);
+    cv::Mat out_mat2 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat3 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat4 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv2 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv3 = cv::Mat(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv4 = cv::Mat(sz_in, CV_8UC1);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, out1, out2, out3, out4;
+    std::tie(out1, out2, out3, out4)  = cv::gapi::split4(in1);
+    cv::GComputation c(cv::GIn(in1), cv::GOut(out1, out2, out3, out4));
+
+    c.apply(cv::gin(in_mat1), cv::gout(out_mat_gapi, out_mat2, out_mat3, out_mat4), std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        std::vector<cv::Mat> out_mats_ocv = {out_mat_ocv, out_mat_ocv2, out_mat_ocv3, out_mat_ocv4};
+        cv::split(in_mat1, out_mats_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv  != out_mat_gapi));
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv2 != out_mat2));
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv3 != out_mat3));
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv4 != out_mat4));
+    }
+}
+
+static void ResizeAccuracyTest(compare_f cmpF, int type, int interp, cv::Size sz_in, cv::Size sz_out, double fx, double fy, cv::GCompileArgs&& compile_args)
+{
+    cv::Mat in_mat1(sz_in, type);
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::randn(in_mat1, mean, stddev);
+
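+    // cv::resize treats an empty dsize as "derive the output size from fx/fy";
+    // precompute that size here so the comparison Mats can be allocated up front.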
+    auto out_mat_sz = sz_out.area() == 0 ? cv::Size(saturate_cast<int>(sz_in.width *fx),
+                                                    saturate_cast<int>(sz_in.height*fy))
+                                         : sz_out;
+    cv::Mat out_mat(out_mat_sz, type);
+    cv::Mat out_mat_ocv(out_mat_sz, type);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::resize(in, sz_out, fx, fy, interp);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::resize(in_mat1, out_mat_ocv, sz_out, fx, fy, interp);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat, out_mat_ocv));
+    }
+}
+
+TEST_P(ResizeTest, AccuracyTest)
+{
+    compare_f cmpF;
+    int type = 0, interp = 0;
+    cv::Size sz_in, sz_out;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, interp, sz_in, sz_out, compile_args) = GetParam();
+    ResizeAccuracyTest(cmpF, type, interp, sz_in, sz_out, 0.0, 0.0, std::move(compile_args));
+}
+
+TEST_P(ResizeTestFxFy, AccuracyTest)
+{
+    compare_f cmpF;
+    int type = 0, interp = 0;
+    cv::Size sz_in;
+    double fx = 0.0, fy = 0.0;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, interp, sz_in, fx, fy, compile_args) = GetParam();
+    ResizeAccuracyTest(cmpF, type, interp, sz_in, cv::Size{0, 0}, fx, fy, std::move(compile_args));
+}
+
+TEST_P(Merge3Test, AccuracyTest)
+{
+    cv::Size sz_in = std::get<0>(GetParam());
+    initMatsRandU(CV_8UC1, sz_in, CV_8UC3);
+    auto compile_args = std::get<1>(GetParam());
+    cv::Mat in_mat3(sz_in, CV_8UC1);
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::randn(in_mat3, mean, stddev);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, in3;
+    auto out = cv::gapi::merge3(in1, in2, in3);
+
+    cv::GComputation c(cv::GIn(in1, in2, in3), cv::GOut(out));
+    c.apply(cv::gin(in_mat1, in_mat2, in_mat3), cv::gout(out_mat_gapi), std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        std::vector<cv::Mat> in_mats_ocv = {in_mat1, in_mat2, in_mat3};
+        cv::merge(in_mats_ocv, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+    }
+}
+
+TEST_P(Merge4Test, AccuracyTest)
+{
+    cv::Size sz_in = std::get<0>(GetParam());
+    initMatsRandU(CV_8UC1, sz_in, CV_8UC4);
+    auto compile_args = std::get<1>(GetParam());
+    cv::Mat in_mat3(sz_in, CV_8UC1);
+    cv::Mat in_mat4(sz_in, CV_8UC1);
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::randn(in_mat3, mean, stddev);
+    cv::randn(in_mat4, mean, stddev);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2, in3, in4;
+    auto out = cv::gapi::merge4(in1, in2, in3, in4);
+
+    cv::GComputation c(cv::GIn(in1, in2, in3, in4), cv::GOut(out));
+    c.apply(cv::gin(in_mat1, in_mat2, in_mat3, in_mat4), cv::gout(out_mat_gapi), std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        std::vector<cv::Mat> in_mats_ocv = {in_mat1, in_mat2, in_mat3, in_mat4};
+        cv::merge(in_mats_ocv, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+    }
+}
+
+TEST_P(RemapTest, AccuracyTest)
+{
+    auto param = GetParam();
+    int type = std::get<0>(param);
+    cv::Size sz_in = std::get<1>(param);
+    auto compile_args = std::get<3>(param);
+    initMatrixRandU(type, sz_in, type, std::get<2>(param));
+    cv::Mat in_map1(sz_in, CV_16SC2);
+    cv::Mat in_map2 = cv::Mat();
+    cv::randu(in_map1, cv::Scalar::all(0), cv::Scalar::all(255));
+    cv::Scalar bv = cv::Scalar();
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1;
+    auto out = cv::gapi::remap(in1, in_map1, in_map2, cv::INTER_NEAREST, cv::BORDER_REPLICATE, bv);
+    cv::GComputation c(in1, out);
+
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::remap(in_mat1, out_mat_ocv, in_map1, in_map2, cv::INTER_NEAREST, cv::BORDER_REPLICATE, bv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(FlipTest, AccuracyTest)
+{
+    auto param = GetParam();
+    int type = std::get<0>(param);
+    int flipCode =  std::get<1>(param);
+    cv::Size sz_in = std::get<2>(param);
+    initMatrixRandU(type, sz_in, type, false);
+    auto compile_args = std::get<4>(GetParam());
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::flip(in, flipCode);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::flip(in_mat1, out_mat_ocv, flipCode);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(CropTest, AccuracyTest)
+{
+    auto param = GetParam();
+    int type = std::get<0>(param);
+    cv::Rect rect_to = std::get<1>(param);
+    cv::Size sz_in = std::get<2>(param);
+    auto compile_args = std::get<4>(param);
+
+    initMatrixRandU(type, sz_in, type, false);
+    cv::Size sz_out = cv::Size(rect_to.width, rect_to.height);
+    if (std::get<3>(param))
+    {
+        out_mat_gapi = cv::Mat(sz_out, type);
+        out_mat_ocv = cv::Mat(sz_out, type);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::crop(in, rect_to);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::Mat(in_mat1, rect_to).copyTo(out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+        EXPECT_EQ(out_mat_gapi.size(), sz_out);
+    }
+}
+
+TEST_P(ConcatHorTest, AccuracyTest)
+{
+    auto param = GetParam();
+    int type = std::get<0>(param);
+    cv::Size sz_out = std::get<1>(param);
+    auto compile_args = std::get<2>(param);
+
+    int wpart = sz_out.width / 4;
+    cv::Size sz_in1 = cv::Size(wpart, sz_out.height);
+    cv::Size sz_in2 = cv::Size(sz_out.width - wpart, sz_out.height);
+
+    cv::Mat in_mat1(sz_in1, type);
+    cv::Mat in_mat2(sz_in2, type);
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::randn(in_mat1, mean, stddev);
+    cv::randn(in_mat2, mean, stddev);
+
+    cv::Mat out_mat(sz_out, type);
+    cv::Mat out_mat_ocv(sz_out, type);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2;
+    auto out = cv::gapi::concatHor(in1, in2);
+
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat), std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::hconcat(in_mat1, in_mat2, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat));
+    }
+}
+
+TEST_P(ConcatVertTest, AccuracyTest)
+{
+    auto param = GetParam();
+    int type = std::get<0>(param);
+    cv::Size sz_out = std::get<1>(param);
+    auto compile_args = std::get<2>(param);
+
+    int hpart = sz_out.height * 2/3;
+    cv::Size sz_in1 = cv::Size(sz_out.width, hpart);
+    cv::Size sz_in2 = cv::Size(sz_out.width, sz_out.height - hpart);
+
+    cv::Mat in_mat1(sz_in1, type);
+    cv::Mat in_mat2(sz_in2, type);
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::randn(in_mat1, mean, stddev);
+    cv::randn(in_mat2, mean, stddev);
+
+    cv::Mat out_mat(sz_out, type);
+    cv::Mat out_mat_ocv(sz_out, type);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in1, in2;
+    auto out = cv::gapi::concatVert(in1, in2);
+
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat), std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::vconcat(in_mat1, in_mat2, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat));
+    }
+}
+
+TEST_P(ConcatVertVecTest, AccuracyTest)
+{
+    auto param = GetParam();
+    int type = std::get<0>(param);
+    cv::Size sz_out = std::get<1>(param);
+    auto compile_args = std::get<2>(param);
+
+    int hpart1 = sz_out.height * 2/5;
+    int hpart2 = sz_out.height / 5;
+    cv::Size sz_in1 = cv::Size(sz_out.width, hpart1);
+    cv::Size sz_in2 = cv::Size(sz_out.width, hpart2);
+    cv::Size sz_in3 = cv::Size(sz_out.width, sz_out.height - hpart1 - hpart2);
+
+    cv::Mat in_mat1(sz_in1, type);
+    cv::Mat in_mat2(sz_in2, type);
+    cv::Mat in_mat3(sz_in3, type);
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::randn(in_mat1, mean, stddev);
+    cv::randn(in_mat2, mean, stddev);
+    cv::randn(in_mat3, mean, stddev);
+
+    cv::Mat out_mat(sz_out, type);
+    cv::Mat out_mat_ocv(sz_out, type);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    std::vector<cv::GMat> mats(3);
+    auto out = cv::gapi::concatVert(mats);
+
+    std::vector<cv::Mat> cvmats = {in_mat1, in_mat2, in_mat3};
+
+    cv::GComputation c({mats[0], mats[1], mats[2]}, {out});
+    c.apply(gin(in_mat1, in_mat2, in_mat3), gout(out_mat), std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::vconcat(cvmats, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat));
+    }
+}
+
+TEST_P(ConcatHorVecTest, AccuracyTest)
+{
+    auto param = GetParam();
+    int type = std::get<0>(param);
+    cv::Size sz_out = std::get<1>(param);
+    auto compile_args = std::get<2>(param);
+
+    int wpart1 = sz_out.width / 3;
+    int wpart2 = sz_out.width / 4;
+    cv::Size sz_in1 = cv::Size(wpart1, sz_out.height);
+    cv::Size sz_in2 = cv::Size(wpart2, sz_out.height);
+    cv::Size sz_in3 = cv::Size(sz_out.width - wpart1 - wpart2, sz_out.height);
+
+    cv::Mat in_mat1(sz_in1, type);
+    cv::Mat in_mat2(sz_in2, type);
+    cv::Mat in_mat3(sz_in3, type);
+    cv::Scalar mean = cv::Scalar::all(127);
+    cv::Scalar stddev = cv::Scalar::all(40.f);
+
+    cv::randn(in_mat1, mean, stddev);
+    cv::randn(in_mat2, mean, stddev);
+    cv::randn(in_mat3, mean, stddev);
+
+    cv::Mat out_mat(sz_out, type);
+    cv::Mat out_mat_ocv(sz_out, type);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    std::vector<cv::GMat> mats(3);
+    auto out = cv::gapi::concatHor(mats);
+
+    std::vector<cv::Mat> cvmats = {in_mat1, in_mat2, in_mat3};
+
+    cv::GComputation c({mats[0], mats[1], mats[2]}, {out});
+    c.apply(gin(in_mat1, in_mat2, in_mat3), gout(out_mat), std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::hconcat(cvmats, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat));
+    }
+}
+
+TEST_P(LUTTest, AccuracyTest)
+{
+    auto param = GetParam();
+    int type_mat = std::get<0>(param);
+    int type_lut = std::get<1>(param);
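+    // cv::LUT keeps the input's channel count and takes its depth from the LUT.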
+    int type_out = CV_MAKETYPE(CV_MAT_DEPTH(type_lut), CV_MAT_CN(type_mat));
+    cv::Size sz_in = std::get<2>(param);
+    auto compile_args = std::get<4>(GetParam());
+
+    initMatrixRandU(type_mat, sz_in, type_out);
+    cv::Size sz_lut = cv::Size(1, 256);
+    cv::Mat in_lut(sz_lut, type_lut);
+    cv::randu(in_lut, cv::Scalar::all(0), cv::Scalar::all(255));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::LUT(in, in_lut);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::LUT(in_mat1, in_lut, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(ConvertToTest, AccuracyTest)
+{
+    auto param = GetParam();
+    int type_mat = std::get<0>(param);
+    int depth_to = std::get<1>(param);
+    cv::Size sz_in = std::get<2>(param);
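+    // convertTo changes only the depth; the channel count follows the input.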
+    int type_out = CV_MAKETYPE(depth_to, CV_MAT_CN(type_mat));
+    initMatrixRandU(type_mat, sz_in, type_out);
+    auto compile_args = std::get<3>(GetParam());
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::convertTo(in, depth_to);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        in_mat1.convertTo(out_mat_ocv, depth_to);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+
+TEST_P(PhaseTest, AccuracyTest)
+{
+    int img_type = -1;
+    cv::Size img_size;
+    bool angle_in_degrees = false;
+    cv::GCompileArgs compile_args;
+    std::tie(img_type, img_size, angle_in_degrees, compile_args) = GetParam();
+    initMatsRandU(img_type, img_size, img_type);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in_x, in_y;
+    auto out = cv::gapi::phase(in_x, in_y, angle_in_degrees);
+
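+    // GComputation's two-input convenience constructor (in, in, out) fits this
+    // binary expression directly, with the matching two-input apply() below.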
+    cv::GComputation c(in_x, in_y, out);
+    c.apply(in_mat1, in_mat2, out_mat_gapi, std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    cv::phase(in_mat1, in_mat2, out_mat_ocv, angle_in_degrees);
+
+    // Comparison //////////////////////////////////////////////////////////////
+    // FIXME: use a comparison functor instead (after enabling OpenCL)
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+    }
+}
+
+TEST_P(SqrtTest, AccuracyTest)
+{
+    int img_type = -1;
+    cv::Size img_size;
+    cv::GCompileArgs compile_args;
+    std::tie(img_type, img_size, compile_args) = GetParam();
+    initMatrixRandU(img_type, img_size, img_type);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::sqrt(in);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    cv::sqrt(in_mat1, out_mat_ocv);
+
+    // Comparison //////////////////////////////////////////////////////////////
+    // FIXME: use a comparison functor instead (after enabling OpenCL)
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+    }
+}
+
+
+} // opencv_test
+
+#endif //OPENCV_GAPI_CORE_TESTS_INL_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests.cpp
new file mode 100644 (file)
index 0000000..b7c0279
--- /dev/null
@@ -0,0 +1,9 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "gapi_imgproc_tests_inl.hpp"
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests.hpp
new file mode 100644 (file)
index 0000000..c21b26b
--- /dev/null
@@ -0,0 +1,42 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_IMGPROC_TESTS_HPP
+#define OPENCV_GAPI_IMGPROC_TESTS_HPP
+
+#include <iostream>
+
+#include "gapi_tests_common.hpp"
+
+namespace opencv_test
+{
+
+struct Filter2DTest : public TestParams<std::tuple<compare_f,MatType,int,cv::Size,int,int,bool,cv::GCompileArgs>> {};
+struct BoxFilterTest : public TestParams<std::tuple<compare_f,MatType,int,cv::Size,int,int,bool,cv::GCompileArgs>> {};
+struct SepFilterTest : public TestParams<std::tuple<compare_f,MatType,int,cv::Size,int,bool,cv::GCompileArgs>> {};
+struct BlurTest : public TestParams<std::tuple<compare_f,MatType,int,cv::Size,int,bool,cv::GCompileArgs>> {};
+struct GaussianBlurTest : public TestParams<std::tuple<compare_f,MatType,int,cv::Size,bool,cv::GCompileArgs>> {};
+struct MedianBlurTest : public TestParams<std::tuple<compare_f,MatType,int,cv::Size,bool,cv::GCompileArgs>> {};
+struct ErodeTest : public TestParams<std::tuple<compare_f,MatType,int,cv::Size,int,bool,cv::GCompileArgs>> {};
+struct Erode3x3Test : public TestParams<std::tuple<compare_f,MatType,cv::Size,bool,int,cv::GCompileArgs>> {};
+struct DilateTest : public TestParams<std::tuple<compare_f,MatType,int,cv::Size,int,bool,cv::GCompileArgs>> {};
+struct Dilate3x3Test : public TestParams<std::tuple<compare_f,MatType,cv::Size,bool,int,cv::GCompileArgs>> {};
+struct SobelTest : public TestParams<std::tuple<compare_f,MatType,int,cv::Size,int,int,int,bool,cv::GCompileArgs>> {};
+struct EqHistTest : public TestParams<std::tuple<compare_f,cv::Size,bool,cv::GCompileArgs>> {};
+struct CannyTest : public TestParams<std::tuple<compare_f,MatType,cv::Size,double,double,int,bool,bool,cv::GCompileArgs>> {};
+struct RGB2GrayTest : public TestParams<std::tuple<compare_f,cv::Size,bool,cv::GCompileArgs>> {};
+struct BGR2GrayTest : public TestParams<std::tuple<compare_f,cv::Size,bool,cv::GCompileArgs>> {};
+struct RGB2YUVTest : public TestParams<std::tuple<compare_f,cv::Size,bool,cv::GCompileArgs>> {};
+struct YUV2RGBTest : public TestParams<std::tuple<compare_f,cv::Size,bool,cv::GCompileArgs>> {};
+struct RGB2LabTest : public TestParams<std::tuple<compare_f,cv::Size,bool,cv::GCompileArgs>> {};
+struct BGR2LUVTest : public TestParams<std::tuple<compare_f,cv::Size,bool,cv::GCompileArgs>> {};
+struct LUV2BGRTest : public TestParams<std::tuple<compare_f,cv::Size,bool,cv::GCompileArgs>> {};
+struct BGR2YUVTest : public TestParams<std::tuple<compare_f,cv::Size,bool,cv::GCompileArgs>> {};
+struct YUV2BGRTest : public TestParams<std::tuple<compare_f,cv::Size,bool,cv::GCompileArgs>> {};
+
+} // opencv_test
+
+#endif //OPENCV_GAPI_IMGPROC_TESTS_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests_inl.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_imgproc_tests_inl.hpp
new file mode 100644 (file)
index 0000000..3de4289
--- /dev/null
@@ -0,0 +1,630 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_IMGPROC_TESTS_INL_HPP
+#define OPENCV_GAPI_IMGPROC_TESTS_INL_HPP
+
+#include "opencv2/gapi/imgproc.hpp"
+#include "gapi_imgproc_tests.hpp"
+
+namespace opencv_test
+{
+TEST_P(Filter2DTest, AccuracyTest)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int kernSize = 0, borderType = 0, dtype = 0;
+    cv::Size sz;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, kernSize, sz, borderType, dtype, initOut, compile_args) = GetParam();
+    initMatsRandN(type, sz, dtype, initOut);
+
+    cv::Point anchor = {-1, -1};
+    double delta = 0;
+
+    cv::Mat kernel = cv::Mat(kernSize, kernSize, CV_32FC1);
+    cv::Scalar kernMean = cv::Scalar(1.0);
+    cv::Scalar kernStddev = cv::Scalar(2.0/3);
+    randn(kernel, kernMean, kernStddev);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::filter2D(in, dtype, kernel, anchor, delta, borderType);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::filter2D(in_mat1, out_mat_ocv, dtype, kernel, anchor, delta, borderType);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(BoxFilterTest, AccuracyTest)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int filterSize = 0, borderType = 0, dtype = 0;
+    cv::Size sz;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, filterSize, sz, borderType, dtype, initOut, compile_args) = GetParam();
+    initMatsRandN(type, sz, dtype, initOut);
+
+    cv::Point anchor = {-1, -1};
+    bool normalize = true;
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::boxFilter(in, dtype, cv::Size(filterSize, filterSize), anchor, normalize, borderType);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::boxFilter(in_mat1, out_mat_ocv, dtype, cv::Size(filterSize, filterSize), anchor, normalize, borderType);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(SepFilterTest, AccuracyTest)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int kernSize = 0, dtype = 0;
+    cv::Size sz;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, kernSize, sz, dtype, initOut, compile_args) = GetParam();
+
+    cv::Mat kernelX(kernSize, 1, CV_32F);
+    cv::Mat kernelY(kernSize, 1, CV_32F);
+    randu(kernelX, -1, 1);
+    randu(kernelY, -1, 1);
+    initMatsRandN(type, sz, dtype, initOut);
+
+    cv::Point anchor = cv::Point(-1, -1);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::sepFilter(in, dtype, kernelX, kernelY, anchor, cv::Scalar());
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::sepFilter2D(in_mat1, out_mat_ocv, dtype, kernelX, kernelY);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(BlurTest, AccuracyTest)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int filterSize = 0, borderType = 0;
+    cv::Size sz;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, filterSize, sz, borderType, initOut, compile_args) = GetParam();
+    initMatsRandN(type, sz, type, initOut);
+
+    cv::Point anchor = {-1, -1};
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::blur(in, cv::Size(filterSize, filterSize), anchor, borderType);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::blur(in_mat1, out_mat_ocv, cv::Size(filterSize, filterSize), anchor, borderType);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(GaussianBlurTest, AccuracyTest)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int kernSize = 0;
+    cv::Size sz;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, kernSize, sz, initOut, compile_args) = GetParam();
+    initMatsRandN(type, sz, type, initOut);
+
+    cv::Size kSize = cv::Size(kernSize, kernSize);
+    double sigmaX = rand();
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::gaussianBlur(in, kSize, sigmaX);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::GaussianBlur(in_mat1, out_mat_ocv, kSize, sigmaX);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(MedianBlurTest, AccuracyTest)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int kernSize = 0;
+    cv::Size sz;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, kernSize, sz, initOut, compile_args) = GetParam();
+    initMatsRandN(type, sz, type, initOut);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::medianBlur(in, kernSize);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::medianBlur(in_mat1, out_mat_ocv, kernSize);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(ErodeTest, AccuracyTest)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int kernSize = 0, kernType = 0;
+    cv::Size sz;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, kernSize, sz, kernType, initOut, compile_args) = GetParam();
+    initMatsRandN(type, sz, type, initOut);
+
+    cv::Mat kernel = cv::getStructuringElement(kernType, cv::Size(kernSize, kernSize));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::erode(in, kernel);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::erode(in_mat1, out_mat_ocv, kernel);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(Erode3x3Test, AccuracyTest)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int numIters = 0;
+    cv::Size sz;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, sz, initOut, numIters, compile_args) = GetParam();
+    initMatsRandN(type, sz, type, initOut);
+
+    cv::Mat kernel = cv::getStructuringElement(cv::MorphShapes::MORPH_RECT, cv::Size(3,3));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::erode3x3(in, numIters);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::erode(in_mat1, out_mat_ocv, kernel, cv::Point(-1, -1), numIters);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(DilateTest, AccuracyTest)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int kernSize = 0, kernType = 0;
+    cv::Size sz;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, kernSize, sz, kernType, initOut, compile_args) = GetParam();
+    initMatsRandN(type, sz, type, initOut);
+
+    cv::Mat kernel = cv::getStructuringElement(kernType, cv::Size(kernSize, kernSize));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::dilate(in, kernel);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::dilate(in_mat1, out_mat_ocv, kernel);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(Dilate3x3Test, AccuracyTest)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int numIters = 0;
+    cv::Size sz;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, sz, initOut, numIters, compile_args) = GetParam();
+    initMatsRandN(type, sz, type, initOut);
+
+    cv::Mat kernel = cv::getStructuringElement(cv::MorphShapes::MORPH_RECT, cv::Size(3,3));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::dilate3x3(in, numIters);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::dilate(in_mat1, out_mat_ocv, kernel, cv::Point(-1,-1), numIters);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(SobelTest, AccuracyTest)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int kernSize = 0, dtype = 0, dx = 0, dy = 0;
+    cv::Size sz;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, kernSize, sz, dtype, dx, dy, initOut, compile_args) = GetParam();
+    initMatsRandN(type, sz, dtype, initOut);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::Sobel(in, dtype, dx, dy, kernSize);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::Sobel(in_mat1, out_mat_ocv, dtype, dx, dy, kernSize);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(EqHistTest, AccuracyTest)
+{
+    compare_f cmpF;
+    cv::Size sz;
+    bool initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, sz, initOut, compile_args) = GetParam();
+    initMatsRandN(CV_8UC1, sz, CV_8UC1, initOut);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::equalizeHist(in);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::equalizeHist(in_mat1, out_mat_ocv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(CannyTest, AccuracyTest)
+{
+    compare_f cmpF;
+    MatType type = 0;
+    int apSize = 0;
+    double thrLow = 0.0, thrUp = 0.0;
+    cv::Size sz;
+    bool l2gr = false, initOut = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, type, sz, thrLow, thrUp, apSize, l2gr, initOut, compile_args) = GetParam();
+
+    initMatsRandN(type, sz, CV_8UC1, initOut);
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::Canny(in, thrLow, thrUp, apSize, l2gr);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::Canny(in_mat1, out_mat_ocv, thrLow, thrUp, apSize, l2gr);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(RGB2GrayTest, AccuracyTest)
+{
+    auto param = GetParam();
+    auto compile_args = std::get<3>(param);
+    compare_f cmpF = std::get<0>(param);
+    initMatsRandN(CV_8UC3, std::get<1>(param), CV_8UC1, std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::RGB2Gray(in);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_RGB2GRAY);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), std::get<1>(param));
+    }
+}
+
+TEST_P(BGR2GrayTest, AccuracyTest)
+{
+    auto param = GetParam();
+    auto compile_args = std::get<3>(param);
+    compare_f cmpF = std::get<0>(param);
+    initMatsRandN(CV_8UC3, std::get<1>(param), CV_8UC1, std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::BGR2Gray(in);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_BGR2GRAY);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), std::get<1>(param));
+    }
+}
+
+TEST_P(RGB2YUVTest, AccuracyTest)
+{
+    auto param = GetParam();
+    auto compile_args = std::get<3>(param);
+    compare_f cmpF = std::get<0>(param);
+    initMatsRandN(CV_8UC3, std::get<1>(param), CV_8UC3, std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::RGB2YUV(in);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_RGB2YUV);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), std::get<1>(param));
+    }
+}
+
+TEST_P(YUV2RGBTest, AccuracyTest)
+{
+    auto param = GetParam();
+    auto compile_args = std::get<3>(param);
+    compare_f cmpF = std::get<0>(param);
+    initMatsRandN(CV_8UC3, std::get<1>(param), CV_8UC3, std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::YUV2RGB(in);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_YUV2RGB);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), std::get<1>(param));
+    }
+}
+
+TEST_P(RGB2LabTest, AccuracyTest)
+{
+    auto param = GetParam();
+    auto compile_args = std::get<3>(param);
+    compare_f cmpF = std::get<0>(param);
+    initMatsRandN(CV_8UC3, std::get<1>(param), CV_8UC3, std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::RGB2Lab(in);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_RGB2Lab);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), std::get<1>(param));
+    }
+}
+
+TEST_P(BGR2LUVTest, AccuracyTest)
+{
+    auto param = GetParam();
+    auto compile_args = std::get<3>(param);
+    compare_f cmpF = std::get<0>(param);
+    initMatsRandN(CV_8UC3, std::get<1>(param), CV_8UC3, std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::BGR2LUV(in);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_BGR2Luv);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), std::get<1>(param));
+    }
+}
+
+TEST_P(LUV2BGRTest, AccuracyTest)
+{
+    auto param = GetParam();
+    auto compile_args = std::get<3>(param);
+    compare_f cmpF = std::get<0>(param);
+    initMatsRandN(CV_8UC3, std::get<1>(param), CV_8UC3, std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::LUV2BGR(in);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_Luv2BGR);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), std::get<1>(param));
+    }
+}
+
+TEST_P(BGR2YUVTest, AccuracyTest)
+{
+    auto param = GetParam();
+    auto compile_args = std::get<3>(param);
+    compare_f cmpF = std::get<0>(param);
+    initMatsRandN(CV_8UC3, std::get<1>(param), CV_8UC3, std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::BGR2YUV(in);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_BGR2YUV);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), std::get<1>(param));
+    }
+}
+
+TEST_P(YUV2BGRTest, AccuracyTest)
+{
+    auto param = GetParam();
+    auto compile_args = std::get<3>(param);
+    compare_f cmpF = std::get<0>(param);
+    initMatsRandN(CV_8UC3, std::get<1>(param), CV_8UC3, std::get<2>(param));
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = cv::gapi::YUV2BGR(in);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::cvtColor(in_mat1, out_mat_ocv, cv::COLOR_YUV2BGR);
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), std::get<1>(param));
+    }
+}
+} // opencv_test
+
+#endif //OPENCV_GAPI_IMGPROC_TESTS_INL_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests.cpp
new file mode 100644 (file)
index 0000000..1f6f0ce
--- /dev/null
@@ -0,0 +1,9 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "gapi_operators_tests_inl.hpp"
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests.hpp
new file mode 100644 (file)
index 0000000..9f53d36
--- /dev/null
@@ -0,0 +1,192 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OPERATOR_TESTS_COMMON_HPP
+#define OPENCV_GAPI_OPERATOR_TESTS_COMMON_HPP
+
+#include "gapi_tests_common.hpp"
+
+namespace opencv_test
+{
+
+struct g_api_ocv_pair_mat_scalar {
+    using g_api_function_t = std::function<cv::GMat(cv::GMat,cv::GScalar)>;
+    using ocv_function_t   = std::function<void(cv::Mat const&, cv::Scalar, cv::Mat&)>;
+
+    std::string      name;
+    g_api_function_t g_api_function;
+    ocv_function_t   ocv_function;
+
+
+    g_api_ocv_pair_mat_scalar(std::string const& n, g_api_function_t const& g, ocv_function_t const& o)
+    : name(n), g_api_function(g), ocv_function(o) {}
+
+    g_api_ocv_pair_mat_scalar() = default;
+
+    friend std::ostream& operator<<(std::ostream& o, const g_api_ocv_pair_mat_scalar& p)
+    {
+        return o<<p.name;
+    }
+};
+
+struct g_api_ocv_pair_mat_mat {
+    using g_api_function_t = std::function<cv::GMat(cv::GMat,cv::GMat)>;
+    using ocv_function_t   = std::function<void(cv::Mat const&, cv::Mat const&, cv::Mat&)>;
+
+    std::string      name;
+    g_api_function_t g_api_function;
+    ocv_function_t   ocv_function;
+
+
+    g_api_ocv_pair_mat_mat(std::string const& n, g_api_function_t const& g, ocv_function_t const& o)
+    : name(n), g_api_function(g), ocv_function(o) {}
+
+    g_api_ocv_pair_mat_mat() = default;
+
+    friend std::ostream& operator<<(std::ostream& o, const g_api_ocv_pair_mat_mat& p)
+    {
+        return o<<p.name;
+    }
+};
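+
+// Usage sketch (fixture names in_mat, sc, out_gapi, out_ocv are hypothetical):
+// a parameterized test takes one of these pairs, builds the G-API graph from
+// g_api_function, runs ocv_function as the reference, and compares the outputs:
+//
+//     cv::GMat in; cv::GScalar c;
+//     auto op = opPlus;   // one of the pairs declared below
+//     cv::GComputation(GIn(in, c), GOut(op.g_api_function(in, c)))
+//         .apply(gin(in_mat, sc), gout(out_gapi));
+//     op.ocv_function(in_mat, sc, out_ocv);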
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// FIXME: Please refactor this test to a template test (T,U) with enum (OP)
+//
+////////////////////////////////////////////////////////////////////////////////
+namespace
+{
+
+
+//declare test cases for matrix and scalar operators
+g_api_ocv_pair_mat_scalar opPlus =  {std::string{"operator+"},
+                                    [](cv::GMat in,cv::GScalar c){return in+c;},
+                                    [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::add(in, c, out);}};
+g_api_ocv_pair_mat_scalar opPlusR = {std::string{"rev_operator+"},
+                                    [](cv::GMat in,cv::GScalar c){return c+in;},
+                                    [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::add(c, in, out);}};
+g_api_ocv_pair_mat_scalar opMinus = {std::string{"operator-"},
+                                    [](cv::GMat in,cv::GScalar c){return in-c;},
+                                    [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::subtract(in, c, out);}};
+g_api_ocv_pair_mat_scalar opMinusR = {std::string{"rev_operator-"},
+                                    [](cv::GMat in,cv::GScalar c){return c-in;},
+                                    [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::subtract(c, in, out);}};
+g_api_ocv_pair_mat_scalar opMul =   {std::string{"operator*"},
+                                    [](cv::GMat in,cv::GScalar c){return in*c;},
+                                    [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::multiply(in, c, out);}};
+g_api_ocv_pair_mat_scalar opMulR =  {std::string{"rev_operator*"},
+                                    [](cv::GMat in,cv::GScalar c){return c*in;},
+                                    [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::multiply(c, in, out);}};
+g_api_ocv_pair_mat_scalar opDiv =   {std::string{"operator/"},
+                                    [](cv::GMat in,cv::GScalar c){return in/c;},
+                                    [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::divide(in, c, out);}};
+g_api_ocv_pair_mat_scalar opDivR =  {std::string{"rev_operator/"},
+                                    [](cv::GMat in,cv::GScalar c){return c/in;},
+                                    [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::divide(c, in, out);}};
+
+g_api_ocv_pair_mat_scalar opGT = {std::string{"operator>"},
+                                            [](cv::GMat in,cv::GScalar c){return in>c;},
+                                            [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::compare(in, c, out,cv::CMP_GT);}};
+g_api_ocv_pair_mat_scalar opLT = {std::string{"operator<"},
+                                            [](cv::GMat in,cv::GScalar c){return in<c;},
+                                            [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::compare(in, c, out,cv::CMP_LT);}};
+g_api_ocv_pair_mat_scalar opGE = {std::string{"operator>="},
+                                            [](cv::GMat in,cv::GScalar c){return in>=c;},
+                                            [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::compare(in, c, out,cv::CMP_GE);}};
+g_api_ocv_pair_mat_scalar opLE = {std::string{"operator<="},
+                                            [](cv::GMat in,cv::GScalar c){return in<=c;},
+                                            [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::compare(in, c, out,cv::CMP_LE);}};
+g_api_ocv_pair_mat_scalar opEQ = {std::string{"operator=="},
+                                            [](cv::GMat in,cv::GScalar c){return in==c;},
+                                            [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::compare(in, c, out,cv::CMP_EQ);}};
+g_api_ocv_pair_mat_scalar opNE = {std::string{"operator!="},
+                                            [](cv::GMat in,cv::GScalar c){return in!=c;},
+                                            [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::compare(in, c, out,cv::CMP_NE);}};
+g_api_ocv_pair_mat_scalar opGTR = {std::string{"rev_operator>"},
+                                            [](cv::GMat in,cv::GScalar c){return c>in;},
+                                            [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::compare(c, in, out,cv::CMP_GT);}};
+g_api_ocv_pair_mat_scalar opLTR = {std::string{"rev_operator<"},
+                                            [](cv::GMat in,cv::GScalar c){return c<in;},
+                                            [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::compare(c, in, out,cv::CMP_LT);}};
+g_api_ocv_pair_mat_scalar opGER = {std::string{"rev_operator>="},
+                                            [](cv::GMat in,cv::GScalar c){return c>=in;},
+                                            [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::compare(c, in, out,cv::CMP_GE);}};
+g_api_ocv_pair_mat_scalar opLER = {std::string{"rev_operator<="},
+                                            [](cv::GMat in,cv::GScalar c){return c<=in;},
+                                            [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::compare(c, in, out,cv::CMP_LE);}};
+g_api_ocv_pair_mat_scalar opEQR = {std::string{"rev_operator=="},
+                                            [](cv::GMat in,cv::GScalar c){return c==in;},
+                                            [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::compare(c, in, out,cv::CMP_EQ);}};
+g_api_ocv_pair_mat_scalar opNER = {std::string{"rev_operator!="},
+                                            [](cv::GMat in,cv::GScalar c){return c!=in;},
+                                            [](const cv::Mat& in, cv::Scalar c, cv::Mat& out){cv::compare(c, in, out,cv::CMP_NE);}};
+
+g_api_ocv_pair_mat_scalar opAND = {std::string{"operator&"},
+                                        [](cv::GMat in1,cv::GScalar in2){return in1&in2;},
+                                        [](const cv::Mat& in1, const cv::Scalar& in2, cv::Mat& out){cv::bitwise_and(in1, in2, out);}};
+g_api_ocv_pair_mat_scalar opOR = {std::string{"operator|"},
+                                        [](cv::GMat in1,cv::GScalar in2){return in1|in2;},
+                                        [](const cv::Mat& in1, const cv::Scalar& in2, cv::Mat& out){cv::bitwise_or(in1, in2, out);}};
+g_api_ocv_pair_mat_scalar opXOR = {std::string{"operator^"},
+                                        [](cv::GMat in1,cv::GScalar in2){return in1^in2;},
+                                        [](const cv::Mat& in1, const cv::Scalar& in2, cv::Mat& out){cv::bitwise_xor(in1, in2, out);}};
+g_api_ocv_pair_mat_scalar opANDR = {std::string{"rev_operator&"},
+                                        [](cv::GMat in1,cv::GScalar in2){return in2&in1;},
+                                        [](const cv::Mat& in1, const cv::Scalar& in2, cv::Mat& out){cv::bitwise_and(in2, in1, out);}};
+g_api_ocv_pair_mat_scalar opORR = {std::string{"rev_operator|"},
+                                        [](cv::GMat in1,cv::GScalar in2){return in2|in1;},
+                                        [](const cv::Mat& in1, const cv::Scalar& in2, cv::Mat& out){cv::bitwise_or(in2, in1, out);}};
+g_api_ocv_pair_mat_scalar opXORR = {std::string{"rev_operator^"},
+                                        [](cv::GMat in1,cv::GScalar in2){return in2^in1;},
+                                        [](const cv::Mat& in1, const cv::Scalar& in2, cv::Mat& out){cv::bitwise_xor(in2, in1, out);}};
+
+// Declare test cases for matrix-matrix operators
+g_api_ocv_pair_mat_mat opPlusM =  {std::string{"operator+"},
+                                            [](cv::GMat in1,cv::GMat in2){return in1+in2;},
+                                            [](const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out){cv::add(in1, in2, out);}};
+g_api_ocv_pair_mat_mat opMinusM = {std::string{"operator-"},
+                                            [](cv::GMat in,cv::GMat in2){return in-in2;},
+                                            [](const cv::Mat& in, const cv::Mat& in2, cv::Mat& out){cv::subtract(in, in2, out);}};
+g_api_ocv_pair_mat_mat opDivM = {std::string{"operator/"},
+                                            [](cv::GMat in,cv::GMat in2){return in/in2;},
+                                            [](const cv::Mat& in, const cv::Mat& in2, cv::Mat& out){cv::divide(in, in2, out);}};
+g_api_ocv_pair_mat_mat opGreater =  {std::string{"operator>"},
+                                            [](cv::GMat in1,cv::GMat in2){return in1>in2;},
+                                            [](const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out){cv::compare(in1, in2, out, cv::CMP_GT);}};
+g_api_ocv_pair_mat_mat opGreaterEq = {std::string{"operator>="},
+                                            [](cv::GMat in1,cv::GMat in2){return in1>=in2;},
+                                            [](const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out){cv::compare(in1, in2, out, cv::CMP_GE);}};
+g_api_ocv_pair_mat_mat opLess = {std::string{"operator<"},
+                                            [](cv::GMat in1,cv::GMat in2){return in1<in2;},
+                                            [](const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out){cv::compare(in1, in2, out, cv::CMP_LT);}};
+g_api_ocv_pair_mat_mat opLessEq = {std::string{"operator<="},
+                                            [](cv::GMat in1,cv::GMat in2){return in1<=in2;},
+                                            [](const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out){cv::compare(in1, in2, out, cv::CMP_LE);}};
+g_api_ocv_pair_mat_mat opEq = {std::string{"operator=="},
+                                            [](cv::GMat in1,cv::GMat in2){return in1==in2;},
+                                            [](const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out){cv::compare(in1, in2, out, cv::CMP_EQ);}};
+g_api_ocv_pair_mat_mat opNotEq = {std::string{"operator!="},
+                                            [](cv::GMat in1,cv::GMat in2){return in1!=in2;},
+                                            [](const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out){cv::compare(in1, in2, out, cv::CMP_NE);}};
+
+g_api_ocv_pair_mat_mat opAnd = {std::string{"operator&"},
+                                        [](cv::GMat in1,cv::GMat in2){return in1&in2;},
+                                        [](const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out){cv::bitwise_and(in1, in2, out);}};
+g_api_ocv_pair_mat_mat opOr = {std::string{"operator|"},
+                                        [](cv::GMat in1,cv::GMat in2){return in1|in2;},
+                                        [](const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out){cv::bitwise_or(in1, in2, out);}};
+g_api_ocv_pair_mat_mat opXor = {std::string{"operator^"},
+                                        [](cv::GMat in1,cv::GMat in2){return in1^in2;},
+                                        [](const cv::Mat& in1, const cv::Mat& in2, cv::Mat& out){cv::bitwise_xor(in1, in2, out);}};
+
+} // anonymous namespace
+struct MathOperatorMatScalarTest : public TestParams<std::tuple<compare_f, g_api_ocv_pair_mat_scalar,int,cv::Size,int,bool,cv::GCompileArgs>>{};
+struct MathOperatorMatMatTest : public TestParams<std::tuple<compare_f, g_api_ocv_pair_mat_mat,int,cv::Size,int,bool,cv::GCompileArgs>>{};
+struct NotOperatorTest : public TestParams<std::tuple<int,cv::Size,bool,cv::GCompileArgs>> {};
+} // opencv_test
+
+#endif // OPENCV_GAPI_OPERATOR_TESTS_COMMON_HPP
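
The header above only declares the operator/reference pairs (opPlus, opGT, opAND, ...) and the parameterized fixtures; the backend-specific instantiations live in separate test files added later in this patch. A minimal sketch of such an instantiation, assuming the CPU kernel package and the fixture's tuple layout (compare function, op pair, input type, size, output depth, init-output flag, compile args):

    #include "../common/gapi_operators_tests.hpp"
    #include "opencv2/gapi/cpu/core.hpp"

    #define CORE_CPU cv::gapi::core::cpu::kernels()

    namespace opencv_test
    {
    // Illustrative only; opPlus etc. are the pairs declared in the header above.
    INSTANTIATE_TEST_CASE_P(MathOperatorTestCPU, MathOperatorMatScalarTest,
                            Combine(Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_f()),
                                    Values(opPlus, opMinus, opMul, opDiv),
                                    Values(CV_8UC1, CV_16SC1, CV_32FC1),
                                    Values(cv::Size(1280, 720),
                                           cv::Size(128, 128)),
                                    Values(-1),            // dtype: same as input
                    /*init output*/ testing::Bool(),
                                    Values(cv::compile_args(CORE_CPU))));
    } // namespace opencv_test
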
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests_inl.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_operators_tests_inl.hpp
new file mode 100644 (file)
index 0000000..7ec702a
--- /dev/null
@@ -0,0 +1,104 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef OPENCV_GAPI_OPERATOR_TESTS_INL_COMMON_HPP
+#define OPENCV_GAPI_OPERATOR_TESTS_INL_COMMON_HPP
+
+#include "gapi_operators_tests.hpp"
+
+namespace opencv_test
+{
+TEST_P(MathOperatorMatScalarTest, OperatorAccuracyTest)
+{
+    compare_f cmpF;
+    g_api_ocv_pair_mat_scalar op;
+    int type = 0, dtype = 0;
+    cv::Size sz;
+    bool initOutMatr = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, op, type, sz, dtype, initOutMatr, compile_args) = GetParam();
+    initMatsRandU(type, sz, dtype, initOutMatr);
+
+    auto fun_gapi = op.g_api_function;
+    auto fun_ocv = op.ocv_function;
+
+    // G-API code & corresponding OpenCV code ////////////////////////////////
+
+    cv::GMat in1;
+    cv::GScalar in2;
+    auto out = fun_gapi(in1, in2);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    c.apply(gin(in_mat1, sc), gout(out_mat_gapi), std::move(compile_args));
+
+    fun_ocv(in_mat1, sc, out_mat_ocv);
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(MathOperatorMatMatTest, OperatorAccuracyTest)
+{
+    compare_f cmpF;
+    g_api_ocv_pair_mat_mat op;
+    int type = 0, dtype = 0;
+    cv::Size sz;
+    bool initOutMatr = false;
+    cv::GCompileArgs compile_args;
+    std::tie(cmpF, op, type, sz, dtype, initOutMatr, compile_args) = GetParam();
+    initMatsRandU(type, sz, dtype, initOutMatr);
+
+    auto fun_gapi = op.g_api_function;
+    auto fun_ocv = op.ocv_function;
+
+    // G-API code & corresponding OpenCV code ////////////////////////////////
+
+    cv::GMat in1;
+    cv::GMat in2;
+    auto out = fun_gapi(in1, in2);
+    cv::GComputation c(GIn(in1, in2), GOut(out));
+
+    c.apply(gin(in_mat1, in_mat2), gout(out_mat_gapi), std::move(compile_args));
+
+    fun_ocv(in_mat1, in_mat2, out_mat_ocv);
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_TRUE(cmpF(out_mat_gapi, out_mat_ocv));
+        EXPECT_EQ(out_mat_gapi.size(), sz);
+    }
+}
+
+TEST_P(NotOperatorTest, OperatorAccuracyTest)
+{
+    cv::Size sz_in = std::get<1>(GetParam());
+    initMatrixRandU(std::get<0>(GetParam()), sz_in, std::get<0>(GetParam()), std::get<2>(GetParam()));
+    cv::GCompileArgs compile_args = std::get<3>(GetParam());
+
+    // G-API code //////////////////////////////////////////////////////////////
+    cv::GMat in;
+    auto out = ~in;
+    cv::GComputation c(in, out);
+
+    c.apply(in_mat1, out_mat_gapi, std::move(compile_args));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        out_mat_ocv = ~in_mat1;
+    }
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        EXPECT_EQ(0, cv::countNonZero(out_mat_ocv != out_mat_gapi));
+        EXPECT_EQ(out_mat_gapi.size(), sz_in);
+    }
+}
+} // opencv_test
+
+#endif // OPENCV_GAPI_OPERATOR_TESTS_INL_COMMON_HPP
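
Each TEST_P above follows the same round trip: build a small graph from the stored G-API lambda, execute it next to the direct OpenCV call, then compare the two outputs with the parameterized compare_f. The same round trip outside the GTest harness, as a standalone sketch (assumes an OpenCV build with the G-API module; not part of this change):

    #include <opencv2/core.hpp>
    #include <opencv2/gapi.hpp>
    #include <opencv2/gapi/operators.hpp>
    #include <opencv2/gapi/cpu/core.hpp>
    #include <iostream>

    int main()
    {
        cv::Mat in(cv::Size(128, 128), CV_8UC1);
        cv::randu(in, cv::Scalar::all(0), cv::Scalar::all(255));
        cv::Scalar c(40);

        // G-API path: operator+ builds a graph; apply() executes it.
        cv::GMat g_in;
        cv::GScalar g_c;
        cv::GComputation add(cv::GIn(g_in, g_c), cv::GOut(g_in + g_c));

        cv::Mat out_gapi, out_ocv;
        add.apply(cv::gin(in, c), cv::gout(out_gapi),
                  cv::compile_args(cv::gapi::core::cpu::kernels()));

        // Reference path: the equivalent immediate OpenCV call.
        cv::add(in, c, out_ocv);

        std::cout << "bit-exact: "
                  << (cv::norm(out_gapi, out_ocv, cv::NORM_INF) == 0) << std::endl;
        return 0;
    }
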
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_tests_common.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/common/gapi_tests_common.hpp
new file mode 100644 (file)
index 0000000..be0fc3c
--- /dev/null
@@ -0,0 +1,296 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include <iostream>
+
+#include "opencv2/ts.hpp"
+#include "opencv2/gapi.hpp"
+
+namespace
+{
+    inline std::ostream& operator<<(std::ostream& o, const cv::GCompileArg& arg)
+    {
+        return o << (arg.tag.empty() ? "empty" : arg.tag);
+    }
+}
+
+namespace opencv_test
+{
+
+class TestFunctional
+{
+public:
+    cv::Mat in_mat1;
+    cv::Mat in_mat2;
+    cv::Mat out_mat_gapi;
+    cv::Mat out_mat_ocv;
+
+    cv::Scalar sc;
+
+    cv::Scalar initScalarRandU(unsigned upper)
+    {
+        auto& rng = cv::theRNG();
+        double s1 = rng(upper);
+        double s2 = rng(upper);
+        double s3 = rng(upper);
+        double s4 = rng(upper);
+        return cv::Scalar(s1, s2, s3, s4);
+    }
+
+    void initMatsRandU(int type, cv::Size sz_in, int dtype, bool createOutputMatrices = true)
+    {
+        in_mat1 = cv::Mat(sz_in, type);
+        in_mat2 = cv::Mat(sz_in, type);
+
+        sc = initScalarRandU(100);
+        cv::randu(in_mat1, cv::Scalar::all(0), cv::Scalar::all(255));
+        cv::randu(in_mat2, cv::Scalar::all(0), cv::Scalar::all(255));
+
+        if (createOutputMatrices && dtype != -1)
+        {
+            out_mat_gapi = cv::Mat (sz_in, dtype);
+            out_mat_ocv = cv::Mat (sz_in, dtype);
+        }
+    }
+
+    void initMatrixRandU(int type, cv::Size sz_in, int dtype, bool createOutputMatrices = true)
+    {
+        in_mat1 = cv::Mat(sz_in, type);
+
+        sc = initScalarRandU(100);
+
+        cv::randu(in_mat1, cv::Scalar::all(0), cv::Scalar::all(255));
+
+        if (createOutputMatrices && dtype != -1)
+        {
+            out_mat_gapi = cv::Mat (sz_in, dtype);
+            out_mat_ocv = cv::Mat (sz_in, dtype);
+        }
+    }
+
+    void initMatsRandN(int type, cv::Size sz_in, int dtype, bool createOutputMatrices = true)
+    {
+        in_mat1  = cv::Mat(sz_in, type);
+        cv::randn(in_mat1, cv::Scalar::all(127), cv::Scalar::all(40.f));
+
+        if (createOutputMatrices  && dtype != -1)
+        {
+            out_mat_gapi = cv::Mat(sz_in, dtype);
+            out_mat_ocv = cv::Mat(sz_in, dtype);
+        }
+    }
+
+    static cv::Mat nonZeroPixels(const cv::Mat& mat)
+    {
+        int channels = mat.channels();
+        std::vector<cv::Mat> split(channels);
+        cv::split(mat, split);
+        cv::Mat result;
+        for (int c=0; c < channels; c++)
+        {
+            if (c == 0)
+                result = split[c] != 0;
+            else
+                result = result | (split[c] != 0);
+        }
+        return result;
+    }
+
+    static int countNonZeroPixels(const cv::Mat& mat)
+    {
+        return cv::countNonZero( nonZeroPixels(mat) );
+    }
+
+};
+
+template<class T>
+class TestParams: public TestFunctional, public TestWithParam<T>{};
+
+template<class T>
+class TestPerfParams: public TestFunctional, public perf::TestBaseWithParam<T>{};
+
+using compare_f = std::function<bool(const cv::Mat &a, const cv::Mat &b)>;
+
+template<typename T>
+struct Wrappable
+{
+    compare_f to_compare_f()
+    {
+        T t = *static_cast<T*const>(this);
+        return [t](const cv::Mat &a, const cv::Mat &b)
+        {
+            return t(a, b);
+        };
+    }
+};
+
+class AbsExact : public Wrappable<AbsExact>
+{
+public:
+    AbsExact() {}
+    bool operator() (const cv::Mat& in1, const cv::Mat& in2) const
+    {
+        if (cv::norm(in1, in2, NORM_INF) != 0)
+        {
+            std::cout << "AbsExact error: G-API output and reference output matrixes are not bitexact equal."  << std::endl;
+            return false;
+        }
+        else
+        {
+            return true;
+        }
+    }
+private:
+};
+
+class AbsTolerance : public Wrappable<AbsTolerance>
+{
+public:
+    AbsTolerance(double tol) : _tol(tol) {}
+    bool operator() (const cv::Mat& in1, const cv::Mat& in2) const
+    {
+        if (cv::norm(in1, in2, NORM_INF) > _tol)
+        {
+            std::cout << "AbsTolerance error: Number of different pixels in " << std::endl;
+            std::cout << "G-API output and reference output matrixes exceeds " << _tol << " pixels threshold." << std::endl;
+            return false;
+        }
+        else
+        {
+            return true;
+        }
+    }
+private:
+    double _tol;
+};
+
+class Tolerance_FloatRel_IntAbs : public Wrappable<Tolerance_FloatRel_IntAbs>
+{
+public:
+    Tolerance_FloatRel_IntAbs(double tol, double tol8u) : _tol(tol), _tol8u(tol8u) {}
+    bool operator() (const cv::Mat& in1, const cv::Mat& in2) const
+    {
+        int depth = CV_MAT_DEPTH(in1.type());
+        {
+            double err = depth >= CV_32F ? cv::norm(in1, in2, NORM_L1 | NORM_RELATIVE)
+                                                     : cv::norm(in1, in2, NORM_INF);
+            double tolerance = depth >= CV_32F ? _tol : _tol8u;
+            if (err > tolerance)
+            {
+                std::cout << "Tolerance_FloatRel_IntAbs error: err=" << err
+                          << "  tolerance=" << tolerance
+                          << "  depth=" << cv::typeToString(depth) << std::endl;
+                return false;
+            }
+            else
+            {
+                return true;
+            }
+        }
+    }
+private:
+    double _tol;
+    double _tol8u;
+};
+
+
+class AbsSimilarPoints : public Wrappable<AbsSimilarPoints>
+{
+public:
+    AbsSimilarPoints(double tol, double percent) : _tol(tol), _percent(percent) {}
+    bool operator() (const cv::Mat& in1, const cv::Mat& in2) const
+    {
+        Mat diff;
+        cv::absdiff(in1, in2, diff);
+        Mat err_mask = diff > _tol;
+        int err_points = cv::countNonZero(err_mask.reshape(1));
+        double max_err_points = _percent * std::max((size_t)1000, in1.total());
+        if (err_points > max_err_points)
+        {
+            std::cout << "AbsSimilarPoints error: err_points=" << err_points
+                      << "  max_err_points=" << max_err_points << " (total=" << in1.total() << ")"
+                      << "  diff_tolerance=" << _tol << std::endl;
+            return false;
+        }
+        else
+        {
+            return true;
+        }
+    }
+private:
+    double _tol;
+    double _percent;
+};
+
+
+class ToleranceFilter : public Wrappable<ToleranceFilter>
+{
+public:
+    ToleranceFilter(double tol, double tol8u, double inf_tol = 2.0) : _tol(tol), _tol8u(tol8u), _inf_tol(inf_tol) {}
+    bool operator() (const cv::Mat& in1, const cv::Mat& in2) const
+    {
+        int depth = CV_MAT_DEPTH(in1.type());
+        {
+            double err_Inf = cv::norm(in1, in2, NORM_INF);
+            if (err_Inf > _inf_tol)
+            {
+                std::cout << "ToleranceFilter error: err_Inf=" << err_Inf << "  tolerance=" << _inf_tol << std::endl;
+                return false;
+            }
+            double err = cv::norm(in1, in2, NORM_L2 | NORM_RELATIVE);
+            double tolerance = depth >= CV_32F ? _tol : _tol8u;
+            if (err > tolerance)
+            {
+                std::cout << "ToleranceFilter error: err=" << err << "  tolerance=" << tolerance
+                          << "  depth=" << cv::depthToString(depth)
+                          << std::endl;
+                return false;
+            }
+        }
+        return true;
+    }
+private:
+    double _tol;
+    double _tol8u;
+    double _inf_tol;
+};
+
+class ToleranceColor : public Wrappable<ToleranceColor>
+{
+public:
+    ToleranceColor(double tol, double inf_tol = 2.0) : _tol(tol), _inf_tol(inf_tol) {}
+    bool operator() (const cv::Mat& in1, const cv::Mat& in2) const
+    {
+        {
+            double err_Inf = cv::norm(in1, in2, NORM_INF);
+            if (err_Inf > _inf_tol)
+            {
+                std::cout << "ToleranceColor error: err_Inf=" << err_Inf << "  tolerance=" << _inf_tol << std::endl;;
+                return false;
+            }
+            double err = cv::norm(in1, in2, NORM_L1 | NORM_RELATIVE);
+            if (err > _tol)
+            {
+                std::cout << "ToleranceColor error: err=" << err << "  tolerance=" << _tol << std::endl;;
+                return false;
+            }
+        }
+        return true;
+    }
+private:
+    double _tol;
+    double _inf_tol;
+};
+} // namespace opencv_test
+
+namespace
+{
+    inline std::ostream& operator<<(std::ostream& os, const opencv_test::compare_f&)
+    {
+        return os << "compare_f";
+    }
+}
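
The Wrappable CRTP above exists so that a comparator can be constructed with its tolerances, copied by value into a std::function, and carried through the test parameter tuples as a plain compare_f. Adding a new comparator only requires operator(); a hypothetical example (RelTolerance is not part of this change):

    class RelTolerance : public opencv_test::Wrappable<RelTolerance>
    {
    public:
        explicit RelTolerance(double tol) : _tol(tol) {}
        bool operator()(const cv::Mat& a, const cv::Mat& b) const
        {
            // Relative L2 error, as in ToleranceFilter above.
            return cv::norm(a, b, cv::NORM_L2 | cv::NORM_RELATIVE) <= _tol;
        }
    private:
        double _tol;
    };

    // Used in an instantiation exactly like the built-in comparators:
    //     Values(RelTolerance(1e-5).to_compare_f())
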
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_core_tests_cpu.cpp
new file mode 100644 (file)
index 0000000..11e78bd
--- /dev/null
@@ -0,0 +1,405 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "../test_precomp.hpp"
+#include "../common/gapi_core_tests.hpp"
+#include "opencv2/gapi/cpu/core.hpp"
+
+#define CORE_CPU cv::gapi::core::cpu::kernels()
+
+namespace opencv_test
+{
+
+
+// FIXME: Possible duplicate of MulTestCPU/MathOpTest below; review and deduplicate.
+INSTANTIATE_TEST_CASE_P(AddTestCPU, MathOpTest,
+                        Combine(Values(ADD, MUL),
+                                testing::Bool(),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(1.0),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+    /*init output matrices or not*/ testing::Bool(),
+                                Values(false),
+                                Values(cv::compile_args(CORE_CPU))),
+                        opencv_test::PrintMathOpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(MulTestCPU, MathOpTest,
+                        Combine(Values(MUL),
+                                testing::Bool(),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(1.0, 0.5, 2.0),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+    /*init output matrices or not*/ testing::Bool(),
+                                Values(false),
+                                Values(cv::compile_args(CORE_CPU))),
+                        opencv_test::PrintMathOpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(SubTestCPU, MathOpTest,
+                        Combine(Values(SUB),
+                                testing::Bool(),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(1.0),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+    /*init output matrices or not*/ testing::Bool(),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))),
+                        opencv_test::PrintMathOpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(DivTestCPU, MathOpTest,
+                        Combine(Values(DIV),
+                                testing::Bool(),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(1.0, 0.5, 2.0),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+    /*init output matrices or not*/ testing::Bool(),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))),
+                        opencv_test::PrintMathOpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(MulTestCPU, MulDoubleTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+    /*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(DivTestCPU, DivTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+    /*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(DivCTestCPU, DivCTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+    /*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(MeanTestCPU, MeanTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+    /*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(MaskTestCPU, MaskTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SelectTestCPU, SelectTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Polar2CartCPU, Polar2CartTest,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Cart2PolarCPU, Cart2PolarTest,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(PhaseCPU, PhaseTest,
+                        Combine(Values(CV_32F, CV_32FC3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SqrtCPU, SqrtTest,
+                        Combine(Values(CV_32F, CV_32FC3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(CompareTestCPU, CmpTest,
+                        Combine(Values(CMP_EQ, CMP_GE, CMP_NE, CMP_GT, CMP_LT, CMP_LE),
+                                testing::Bool(),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))),
+                        opencv_test::PrintCmpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(BitwiseTestCPU, BitwiseTest,
+                        Combine(Values(AND, OR, XOR),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))),
+                        opencv_test::PrintBWCoreParams());
+
+INSTANTIATE_TEST_CASE_P(BitwiseNotTestCPU, NotTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+ /*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(MinTestCPU, MinTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(MaxTestCPU, MaxTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SumTestCPU, SumTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(1e-5),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(AbsDiffTestCPU, AbsDiffTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(AbsDiffCTestCPU, AbsDiffCTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+// FIXME: Comparison introduced by YL doesn't work with C3
+INSTANTIATE_TEST_CASE_P(AddWeightedTestCPU, AddWeightedTest,
+                        Combine(Values( CV_8UC1/*, CV_8UC3*/, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(0.5000005),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(NormTestCPU, NormTest,
+                        Combine(Values(NORM_INF, NORM_L1, NORM_L2),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(1e-5),
+                                Values(cv::compile_args(CORE_CPU))),
+                        opencv_test::PrintNormCoreParams());
+
+INSTANTIATE_TEST_CASE_P(IntegralTestCPU, IntegralTest,
+                        Combine(Values( CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ThresholdTestCPU, ThresholdTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ThresholdTestCPU, ThresholdOTTest,
+                        Combine(Values(CV_8UC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::THRESH_OTSU, cv::THRESH_TRIANGLE),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+
+INSTANTIATE_TEST_CASE_P(InRangeTestCPU, InRangeTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Split3TestCPU, Split3Test,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Split4TestCPU, Split4Test,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ResizeTestCPU, ResizeTest,
+                        Combine(Values(AbsSimilarPoints(2, 0.05).to_compare_f()),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::Size(64,64),
+                                       cv::Size(30,30)),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ResizeTestCPU, ResizeTestFxFy,
+                        Combine(Values(AbsSimilarPoints(2, 0.05).to_compare_f()),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(0.5, 0.1),
+                                Values(0.5, 0.1),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Merge3TestCPU, Merge3Test,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Merge4TestCPU, Merge4Test,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(RemapTestCPU, RemapTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(FlipTestCPU, FlipTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(0,1,-1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(CropTestCPU, CropTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Rect(10, 8, 20, 35), cv::Rect(4, 10, 37, 50)),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(LUTTestCPU, LUTTest,
+                        Combine(Values(CV_8UC1, CV_8UC3),
+                                Values(CV_8UC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(LUTTestCustomCPU, LUTTest,
+                        Combine(Values(CV_8UC3),
+                                Values(CV_8UC3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ConvertToCPU, ConvertToTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(CV_8U, CV_16U, CV_16S, CV_32F),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ConcatHorTestCPU, ConcatHorTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ConcatVertTestCPU, ConcatVertTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ConcatVertVecTestCPU, ConcatVertVecTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ConcatHorVecTestCPU, ConcatHorVecTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_CPU))));
+} // namespace opencv_test
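
Every instantiation above pins the backend through Values(cv::compile_args(CORE_CPU)): the kernel package passed in the compile arguments selects which implementation executes the graph, so the same fixtures are reused for the CPU (OCV) and Fluid backends. A sketch of that mechanism in isolation (assumes the standard G-API headers; not part of this change):

    #include <opencv2/gapi.hpp>
    #include <opencv2/gapi/operators.hpp>
    #include <opencv2/gapi/cpu/core.hpp>
    #include <opencv2/gapi/fluid/core.hpp>

    static cv::Mat run_sum(const cv::Mat& a, const cv::Mat& b, cv::GCompileArgs&& args)
    {
        cv::GMat in1, in2;
        cv::GComputation sum(cv::GIn(in1, in2), cv::GOut(in1 + in2));

        cv::Mat out;
        sum.apply(cv::gin(a, b), cv::gout(out), std::move(args));
        return out;
    }

    // Same graph, two backends:
    //     run_sum(a, b, cv::compile_args(cv::gapi::core::cpu::kernels()));   // OCV/CPU
    //     run_sum(a, b, cv::compile_args(cv::gapi::core::fluid::kernels())); // Fluid
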
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp
new file mode 100644 (file)
index 0000000..c65052b
--- /dev/null
@@ -0,0 +1,506 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "../test_precomp.hpp"
+#include "../common/gapi_core_tests.hpp"
+
+namespace opencv_test
+{
+
+#define CORE_FLUID cv::gapi::core::fluid::kernels()
+
+
+// FIXME: Windows accuracy problems after recent update!
+INSTANTIATE_TEST_CASE_P(MathOpTestFluid, MathOpTest,
+                        Combine(Values(ADD, SUB, DIV, MUL),
+                                testing::Bool(),
+                                Values(CV_8UC3, CV_8UC1, CV_16SC1, CV_32FC1),
+                                Values(1.0),
+                                Values(cv::Size(1920, 1080),
+                                       cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1, CV_8U, CV_32F),
+                                testing::Bool(),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))),
+                        opencv_test::PrintMathOpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(MulSTestFluid, MulDoubleTest,
+                        Combine(Values(CV_8UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1), // FIXME: extend with more types
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(DivCTestFluid, DivCTest,
+                        Combine(Values(CV_8UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(CV_8U, CV_32F),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(AbsDiffTestFluid, AbsDiffTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(AbsDiffCTestFluid, AbsDiffCTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(BitwiseTestFluid, BitwiseTest,
+                        Combine(Values(AND, OR, XOR),
+                                Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1920, 1080),
+                                       cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))),
+                        opencv_test::PrintBWCoreParams());
+
+INSTANTIATE_TEST_CASE_P(BitwiseNotTestFluid, NotTest,
+                        Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1920, 1080),
+                                       cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(MinTestFluid, MinTest,
+                        Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1920, 1080),
+                                       cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(MaxTestFluid, MaxTest,
+                        Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1920, 1080),
+                                       cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(CompareTestFluid, CmpTest,
+                        Combine(Values(CMP_EQ, CMP_GE, CMP_NE, CMP_GT, CMP_LT, CMP_LE),
+                                testing::Bool(),
+                                Values(CV_8UC3, CV_8UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1920, 1080),
+                                       cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))),
+                        opencv_test::PrintCmpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(AddWeightedTestFluid, AddWeightedTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1, CV_8U, CV_32F),
+                                testing::Bool(),
+                                Values(0.5000005),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(LUTTestFluid, LUTTest,
+                        Combine(Values(CV_8UC1, CV_8UC3),
+                                Values(CV_8UC1),
+                                Values(cv::Size(1920, 1080),
+                                       cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(ConvertToFluid, ConvertToTest,
+                        Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_32FC1),
+                                Values(CV_8U, CV_16U, CV_32F),
+                                Values(cv::Size(1920, 1080),
+                                       cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(Split3TestFluid, Split3Test,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(Split4TestFluid, Split4Test,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(Merge3TestFluid, Merge3Test,
+                        Combine(Values(cv::Size(1920, 1080),
+                                       cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(Merge4TestFluid, Merge4Test,
+                        Combine(Values(cv::Size(1920, 1080),
+                                       cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(SelectTestFluid, SelectTest,
+                        Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1920, 1080),
+                                       cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(Polar2CartFluid, Polar2CartTest,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(Cart2PolarFluid, Cart2PolarTest,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(PhaseFluid, PhaseTest,
+                        Combine(Values(CV_32F, CV_32FC3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(SqrtFluid, SqrtTest,
+                        Combine(Values(CV_32F, CV_32FC3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(ThresholdTestFluid, ThresholdTest,
+                        Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1920, 1080),
+                                       cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::THRESH_BINARY, cv::THRESH_BINARY_INV,
+                                       cv::THRESH_TRUNC,
+                                       cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(InRangeTestFluid, InRangeTest,
+                        Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1920, 1080),
+                                       cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(ResizeTestFluid, ResizeTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC3/*CV_8UC1, CV_16UC1, CV_16SC1*/),
+                                Values(/*cv::INTER_NEAREST,*/ cv::INTER_LINEAR/*, cv::INTER_AREA*/),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128),
+                                       cv::Size(64, 64),
+                                       cv::Size(30, 30)),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128),
+                                       cv::Size(64, 64),
+                                       cv::Size(30, 30)),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+//----------------------------------------------------------------------
+// FIXME: Clean-up test configurations which are enabled already
+#if 0
+INSTANTIATE_TEST_CASE_P(MathOpTestCPU, MathOpTest,
+                        Combine(Values(ADD, DIV, MUL),
+                                testing::Bool(),
+                                Values(CV_8UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1, CV_8U, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(false)),
+                        opencv_test::PrintMathOpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(SubTestCPU, MathOpTest,
+                        Combine(Values(SUB),
+                                testing::Bool(),
+                                Values(CV_8UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1, CV_8U, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                testing::Bool()),
+                        opencv_test::PrintMathOpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(MulSTestCPU, MulSTest,
+                        Combine(Values(CV_8UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(DivCTestCPU, DivCTest,
+                        Combine(Values(CV_8UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(CV_8U, CV_32F),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(MeanTestCPU, MeanTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(SelectTestCPU, SelectTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(Polar2CartCPU, Polar2CartTest,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(Cart2PolarCPU, Cart2PolarTest,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(CompareTestCPU, CmpTest,
+                        Combine(Values(CMP_EQ, CMP_GE, CMP_NE, CMP_GT, CMP_LT, CMP_LE),
+                                testing::Bool(),
+                                Values(CV_8UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()),
+                        opencv_test::PrintCmpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(BitwiseTestCPU, BitwiseTest,
+                        Combine(Values(AND, OR, XOR),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()),
+                        opencv_test::PrintBWCoreParams());
+
+INSTANTIATE_TEST_CASE_P(BitwiseNotTestCPU, NotTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(MinTestCPU, MinTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(MaxTestCPU, MaxTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(SumTestCPU, SumTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(0.0)));
+
+INSTANTIATE_TEST_CASE_P(AbsDiffTestCPU, AbsDiffTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(AbsDiffCTestCPU, AbsDiffCTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(AddWeightedTestCPU, AddWeightedTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1, CV_8U, CV_32F),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(NormTestCPU, NormTest,
+                        Combine(Values(NORM_INF, NORM_L1, NORM_L2),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(0.0)),
+                        opencv_test::PrintNormCoreParams());
+
+INSTANTIATE_TEST_CASE_P(IntegralTestCPU, IntegralTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128))));
+
+INSTANTIATE_TEST_CASE_P(ThresholdTestCPU, ThresholdTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(ThresholdTestCPU, ThresholdOTTest,
+                        Combine(Values(CV_8UC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::THRESH_OTSU, cv::THRESH_TRIANGLE),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(InRangeTestCPU, InRangeTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(Split3TestCPU, Split3Test,
+                        (Values(cv::Size(1280, 720),
+                                cv::Size(640, 480),
+                                cv::Size(128, 128))));
+
+INSTANTIATE_TEST_CASE_P(Split4TestCPU, Split4Test,
+                        (Values(cv::Size(1280, 720),
+                                cv::Size(640, 480),
+                                cv::Size(128, 128))));
+
+INSTANTIATE_TEST_CASE_P(Merge3TestCPU, Merge3Test,
+                        (Values(cv::Size(1280, 720),
+                                cv::Size(640, 480),
+                                cv::Size(128, 128))));
+
+INSTANTIATE_TEST_CASE_P(Merge4TestCPU, Merge4Test,
+                        (Values(cv::Size(1280, 720),
+                                cv::Size(640, 480),
+                                cv::Size(128, 128))));
+
+INSTANTIATE_TEST_CASE_P(RemapTestCPU, RemapTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(FlipTestCPU, FlipTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(0,1,-1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(CropTestCPU, CropTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Rect(10, 8, 20, 35), cv::Rect(4, 10, 37, 50)),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(LUTTestCPU, LUTTest,
+                        Combine(Values(CV_8UC1, CV_8UC3),
+                                Values(CV_8UC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(LUTTestCustomCPU, LUTTest,
+                        Combine(Values(CV_8UC3),
+                                Values(CV_8UC3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool()));
+
+INSTANTIATE_TEST_CASE_P(ConvertToCPU, ConvertToTest,
+                        Combine(Values(CV_8UC3, CV_8UC1, CV_16UC1, CV_32FC1),
+                                Values(CV_8U, CV_16U, CV_32F),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128))));
+
+INSTANTIATE_TEST_CASE_P(ConcatHorTestCPU, ConcatHorTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128))));
+INSTANTIATE_TEST_CASE_P(ConcatVertTestCPU, ConcatVertTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128))));
+
+//----------------------------------------------------------------------
+#endif // 0
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_imgproc_tests_cpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_imgproc_tests_cpu.cpp
new file mode 100644 (file)
index 0000000..beda022
--- /dev/null
@@ -0,0 +1,238 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "../test_precomp.hpp"
+
+#include "../common/gapi_imgproc_tests.hpp"
+#include "opencv2/gapi/cpu/imgproc.hpp"
+
+#define IMGPROC_CPU cv::gapi::imgproc::cpu::kernels()
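+
+// IMGPROC_CPU expands to the cv::gapi::GKernelPackage with the OpenCV-based
+// imgproc kernels; passing it through cv::compile_args() (the last Combine()
+// parameter of every instantiation below) pins the tests to the CPU backend.
+// A minimal sketch of the same mechanism outside the test fixtures (the Mat
+// names here are illustrative only):
+//
+//     cv::GMat in;
+//     cv::GComputation comp(in, cv::gapi::blur(in, cv::Size(3, 3)));
+//     cv::Mat in_mat = cv::Mat::eye(64, 64, CV_8UC1), out_mat;
+//     comp.apply(in_mat, out_mat, cv::compile_args(IMGPROC_CPU));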
+
+namespace opencv_test
+{
+
+
+INSTANTIATE_TEST_CASE_P(Filter2DTestCPU, Filter2DTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3, 4, 5, 7),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::BORDER_DEFAULT),
+                                Values(-1, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BoxFilterTestCPU, BoxFilterTest,
+                        Combine(Values(AbsTolerance(0).to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3,5),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(cv::BORDER_DEFAULT),
+                                Values(-1, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SepFilterTestCPU_8U, SepFilterTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3),
+                                Values(3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(-1, CV_16S, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SepFilterTestCPU_other, SepFilterTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(-1, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BlurTestCPU, BlurTest,
+                        Combine(Values(AbsTolerance(0.0).to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3,5),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(cv::BORDER_DEFAULT),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(gaussBlurTestCPU, GaussianBlurTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3, 5),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(MedianBlurTestCPU, MedianBlurTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3, 5),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(ErodeTestCPU, ErodeTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3, 5),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(cv::MorphShapes::MORPH_RECT,
+                                       cv::MorphShapes::MORPH_CROSS,
+                                       cv::MorphShapes::MORPH_ELLIPSE),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Erode3x3TestCPU, Erode3x3Test,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(1,2,4),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(DilateTestCPU, DilateTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3, 5),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(cv::MorphShapes::MORPH_RECT,
+                                       cv::MorphShapes::MORPH_CROSS,
+                                       cv::MorphShapes::MORPH_ELLIPSE),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(Dilate3x3TestCPU, Dilate3x3Test,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(1,2,4),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SobelTestCPU, SobelTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
+                                Values(3, 5),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(-1, CV_16S, CV_32F),
+                                Values(0, 1),
+                                Values(1, 2),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(SobelTestCPU32F, SobelTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_32FC1),
+                                Values(3, 5),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(CV_32F),
+                                Values(0, 1),
+                                Values(1, 2),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(EqHistTestCPU, EqHistTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(CannyTestCPU, CannyTest,
+                        Combine(Values(AbsSimilarPoints(0, 0.05).to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(3.0, 120.0),
+                                Values(125.0, 240.0),
+                                Values(3, 5),
+                                testing::Bool(),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(RGB2GrayTestCPU, RGB2GrayTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BGR2GrayTestCPU, BGR2GrayTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(RGB2YUVTestCPU, RGB2YUVTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(YUV2RGBTestCPU, YUV2RGBTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(RGB2LabTestCPU, RGB2LabTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BGR2LUVTestCPU, BGR2LUVTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(LUV2BGRTestCPU, LUV2BGRTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BGR2YUVTestCPU, BGR2YUVTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+INSTANTIATE_TEST_CASE_P(YUV2BGRTestCPU, YUV2BGRTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_CPU))));
+
+} // opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_imgproc_tests_fluid.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_imgproc_tests_fluid.cpp
new file mode 100644 (file)
index 0000000..5dca209
--- /dev/null
@@ -0,0 +1,168 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "../test_precomp.hpp"
+#include "../common/gapi_imgproc_tests.hpp"
+
+#define IMGPROC_FLUID cv::gapi::imgproc::fluid::kernels()
+
+namespace opencv_test
+{
+
+INSTANTIATE_TEST_CASE_P(RGB2GrayTestFluid, RGB2GrayTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(BGR2GrayTestFluid, BGR2GrayTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(RGB2YUVTestFluid, RGB2YUVTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(YUV2RGBTestFluid, YUV2RGBTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(RGB2LabTestFluid, RGB2LabTest,
+                        Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+// FIXME: Not supported by Fluid yet (no kernel implemented)
+INSTANTIATE_TEST_CASE_P(BGR2LUVTestFluid, BGR2LUVTest,
+                        Combine(Values(ToleranceColor(5e-3, 6).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(blurTestFluid, BlurTest,
+                        Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(3), // add kernel size=5 when implementation is ready
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(cv::BORDER_DEFAULT),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(gaussBlurTestFluid, GaussianBlurTest,
+                        Combine(Values(ToleranceFilter(1e-3f, 0.01).to_compare_f()),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(3), // add kernel size=5 when implementation is ready
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(medianBlurTestFluid, MedianBlurTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(3), // add kernel size=5 when implementation is ready
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(erodeTestFluid, ErodeTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(3), // add kernel size=5 when implementation is ready
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(cv::MorphShapes::MORPH_RECT,
+                                       cv::MorphShapes::MORPH_CROSS,
+                                       cv::MorphShapes::MORPH_ELLIPSE),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(dilateTestFluid, DilateTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(3), // add kernel size=5 when implementation is ready
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(cv::MorphShapes::MORPH_RECT,
+                                       cv::MorphShapes::MORPH_CROSS,
+                                       cv::MorphShapes::MORPH_ELLIPSE),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(SobelTestFluid, SobelTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(3), // add kernel size=5 when implementation is ready
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(-1, CV_16S, CV_32F),
+                                Values(0, 1),
+                                Values(1, 2),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(SobelTestFluid32F, SobelTest,
+                        Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+                                Values(CV_32FC1),
+                                Values(3), // add kernel size=5 when implementation is ready
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(CV_32F),
+                                Values(0, 1),
+                                Values(1, 2),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(boxFilterTestFluid32, BoxFilterTest,
+                        Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(3), // add kernel size=5 when implementation is ready
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(cv::BORDER_DEFAULT),
+                                Values(-1, CV_32F),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(sepFilterTestFluid, SepFilterTest,
+                        Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+                                Values(CV_32FC1),
+                                Values(3), // add kernel size=5 when implementation is ready
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(-1, CV_32F),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(filter2DTestFluid, Filter2DTest,
+                        Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(3), // add kernel size=4,5,7 when implementation ready
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::BORDER_DEFAULT),
+                                Values(-1, CV_32F),
+                                Values(true, false),
+                                Values(cv::compile_args(IMGPROC_FLUID))));
+
+} // opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_operators_tests_cpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_operators_tests_cpu.cpp
new file mode 100644 (file)
index 0000000..435c798
--- /dev/null
@@ -0,0 +1,73 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "../test_precomp.hpp"
+#include "../common/gapi_operators_tests.hpp"
+#include "opencv2/gapi/cpu/core.hpp"
+
+#define CORE_CPU cv::gapi::core::cpu::kernels()
+
+namespace opencv_test
+{
+
+
+// FIXME: CPU test runs are disabled since Fluid is an exclusive plugin now!
+INSTANTIATE_TEST_CASE_P(MathOperatorTestCPU, MathOperatorMatMatTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values( opPlusM, opMinusM, opDivM,
+                                        opGreater, opLess, opGreaterEq, opLessEq, opEq, opNotEq),
+                                Values(CV_8UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1, CV_8U, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(MathOperatorTestCPU, MathOperatorMatScalarTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values( opPlus, opPlusR, opMinus, opMinusR, opMul, opMulR,  // FIXIT avoid division by values near zero: opDiv, opDivR,
+                                        opGT, opLT, opGE, opLE, opEQ, opNE,
+                                        opGTR, opLTR, opGER, opLER, opEQR, opNER),
+                                Values(CV_8UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1, CV_8U, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestCPU, MathOperatorMatMatTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values( opAnd, opOr, opXor ),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestCPU, MathOperatorMatScalarTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values( opAND, opOR, opXOR, opANDR, opORR, opXORR ),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+INSTANTIATE_TEST_CASE_P(BitwiseNotOperatorTestCPU, NotOperatorTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_CPU))));
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_operators_tests_fluid.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/cpu/gapi_operators_tests_fluid.cpp
new file mode 100644 (file)
index 0000000..4179fa5
--- /dev/null
@@ -0,0 +1,72 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "../common/gapi_operators_tests.hpp"
+
+#define CORE_FLUID cv::gapi::core::fluid::kernels()
+
+namespace opencv_test
+{
+
+INSTANTIATE_TEST_CASE_P(MathOperatorTestFluid, MathOperatorMatMatTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values( opPlusM, opMinusM, opDivM,
+                                        opGreater, opLess, opGreaterEq, opLessEq, opEq, opNotEq),
+                                Values(CV_8UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1, CV_8U, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+//FIXME: Some Mat/Scalar Fluid kernels are not there yet!
+INSTANTIATE_TEST_CASE_P(DISABLED_MathOperatorTestFluid, MathOperatorMatScalarTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values( opPlus, opPlusR, opMinus, opMinusR, opMul, opMulR,  // FIXIT avoid division by values near zero: opDiv, opDivR,
+                                        opGT, opLT, opGE, opLE, opEQ, opNE,
+                                        opGTR, opLTR, opGER, opLER, opEQR, opNER),
+                                Values(CV_8UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1, CV_8U, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestFluid, MathOperatorMatMatTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values( opAnd, opOr, opXor ),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+//FIXME: Some Mat/Scalar Fluid kernels are not there yet!
+INSTANTIATE_TEST_CASE_P(DISABLED_BitwiseOperatorTestFluid, MathOperatorMatScalarTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values( opAND, opOR, opXOR, opANDR, opORR, opXORR ),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+INSTANTIATE_TEST_CASE_P(BitwiseNotOperatorTestFluid, NotOperatorTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_FLUID))));
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_array_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_array_tests.cpp
new file mode 100644 (file)
index 0000000..e576562
--- /dev/null
@@ -0,0 +1,166 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+#include <vector>
+#include <ade/util/algorithm.hpp>
+
+namespace opencv_test
+{
+
+namespace ThisTest
+{
+using GPointArray = cv::GArray<cv::Point>;
+G_TYPED_KERNEL(GeneratePoints, <GPointArray(GMat)>, "test.array.out_const")
+{
+    static GArrayDesc outMeta(const GMatDesc&) { return empty_array_desc(); }
+};
+G_TYPED_KERNEL(FindCorners,    <GPointArray(GMat)>, "test.array.out")
+{
+    static GArrayDesc outMeta(const GMatDesc&) { return empty_array_desc(); }
+};
+G_TYPED_KERNEL(CountCorners,   <GScalar(GPointArray)>,  "test.array.in")
+{
+    static GScalarDesc outMeta(const GArrayDesc &) { return empty_scalar_desc(); }
+};
+} // namespace ThisTest
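+
+// Note: G_TYPED_KERNEL above only declares a kernel interface: a C++
+// signature, a unique string id, and an outMeta() callback the framework uses
+// to deduce output metadata (array/scalar descriptors here) without running
+// any code. The GAPI_OCV_KERNEL structs below bind the actual CPU
+// implementations to these interfaces.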
+
+namespace
+{
+GAPI_OCV_KERNEL(OCVGeneratePoints, ThisTest::GeneratePoints)
+{
+    static void run(cv::Mat, std::vector<cv::Point> &out)
+    {
+        for (int i = 0; i < 10; i++)
+            out.emplace_back(i, i);
+    }
+};
+
+GAPI_OCV_KERNEL(OCVFindCorners, ThisTest::FindCorners)
+{
+    static void run(cv::Mat in, std::vector<cv::Point> &out)
+    {
+        cv::goodFeaturesToTrack(in, out, 1024, 0.01, 3);
+    }
+};
+
+GAPI_OCV_KERNEL(OCVCountCorners, ThisTest::CountCorners)
+{
+    static void run(const std::vector<cv::Point> &in, cv::Scalar &out)
+    {
+        out[0] = static_cast<double>(in.size());
+    }
+};
+
+cv::Mat cross(int w, int h)
+{
+    cv::Mat mat = cv::Mat::eye(h, w, CV_8UC1)*255;
+    cv::Mat yee;
+    cv::flip(mat, yee, 0); // X-axis
+    mat |= yee;            // make an "X" matrix
+    return mat;
+}
+} // (anonymous namespace)
+
+TEST(GArray, TestReturnValue)
+{
+    // FIXME: Make .apply() able to take compile arguments
+    cv::GComputationT<ThisTest::GPointArray(cv::GMat)> c(ThisTest::FindCorners::on);
+    auto cc = c.compile(cv::GMatDesc{CV_8U,1,{32,32}},
+                        cv::compile_args(cv::gapi::kernels<OCVFindCorners>()));
+
+    // Prepare input matrix
+    cv::Mat input = cross(32, 32);
+
+    std::vector<cv::Point> points;
+    cc(input, points);
+
+    // OCV goodFeaturesToTrack should find 5 points here (with these settings)
+    EXPECT_EQ(5u, points.size());
+    EXPECT_TRUE(ade::util::find(points, cv::Point(16,16)) != points.end());
+    EXPECT_TRUE(ade::util::find(points, cv::Point(30,30)) != points.end());
+    EXPECT_TRUE(ade::util::find(points, cv::Point( 1,30)) != points.end());
+    EXPECT_TRUE(ade::util::find(points, cv::Point(30, 1)) != points.end());
+    EXPECT_TRUE(ade::util::find(points, cv::Point( 1, 1)) != points.end());
+}
+
+TEST(GArray, TestInputArg)
+{
+    cv::GComputationT<cv::GScalar(ThisTest::GPointArray)> c(ThisTest::CountCorners::on);
+    auto cc = c.compile(cv::empty_array_desc(),
+                        cv::compile_args(cv::gapi::kernels<OCVCountCorners>()));
+
+    const std::vector<cv::Point> arr = {cv::Point(1,1), cv::Point(2,2)};
+    cv::Scalar out;
+    cc(arr, out);
+    EXPECT_EQ(2, out[0]);
+}
+
+TEST(GArray, TestPipeline)
+{
+    cv::GComputationT<cv::GScalar(cv::GMat)> c([](cv::GMat in)
+    {
+        return ThisTest::CountCorners::on(ThisTest::FindCorners::on(in));
+    });
+    auto cc = c.compile(cv::GMatDesc{CV_8U,1,{32,32}},
+                        cv::compile_args(cv::gapi::kernels<OCVFindCorners, OCVCountCorners>()));
+
+    cv::Mat input = cross(32, 32);
+    cv::Scalar out;
+    cc(input, out);
+    EXPECT_EQ(5, out[0]);
+}
+
+TEST(GArray, NoAggregationBetweenRuns)
+{
+    cv::GComputationT<cv::GScalar(cv::GMat)> c([](cv::GMat in)
+    {
+        return ThisTest::CountCorners::on(ThisTest::GeneratePoints::on(in));
+    });
+    auto cc = c.compile(cv::GMatDesc{CV_8U,1,{32,32}},
+                        cv::compile_args(cv::gapi::kernels<OCVGeneratePoints, OCVCountCorners>()));
+
+    cv::Mat input = cv::Mat::eye(32, 32, CV_8UC1);
+    cv::Scalar out;
+
+    cc(input, out);
+    EXPECT_EQ(10, out[0]);
+
+    // The last kernel in the graph counts the number of elements in the array
+    // returned by the previous kernel (in this test that number is constant).
+    // After 10 executions it MUST remain the same: the 1st kernel appends new
+    // values on every run, and it is the graph's responsibility to reset the
+    // internal object state between runs.
+    cv::Scalar out2;
+    for (int i = 0; i < 10; i++)
+    {
+        cc(input, out2);
+    }
+    EXPECT_EQ(10, out2[0]);
+}
+
+TEST(GArray, TestIntermediateOutput)
+{
+    using Result = std::tuple<ThisTest::GPointArray, cv::GScalar>;
+    cv::GComputationT<Result(cv::GMat)> c([](cv::GMat in)
+    {
+        auto corners = ThisTest::GeneratePoints::on(in);
+        return std::make_tuple(corners, ThisTest::CountCorners::on(corners));
+    });
+
+    cv::Mat in_mat = cv::Mat::eye(32, 32, CV_8UC1);
+    std::vector<cv::Point> out_points;
+    cv::Scalar out_count;
+
+    auto cc = c.compile(cv::descr_of(in_mat),
+                        cv::compile_args(cv::gapi::kernels<OCVGeneratePoints, OCVCountCorners>()));
+    cc(in_mat, out_points, out_count);
+
+    EXPECT_EQ(10u, out_points.size());
+    EXPECT_EQ(10,  out_count[0]);
+}
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_basic_hetero_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_basic_hetero_tests.cpp
new file mode 100644 (file)
index 0000000..62069d8
--- /dev/null
@@ -0,0 +1,312 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "gapi_mock_kernels.hpp"
+
+#include "opencv2/gapi/fluid/gfluidkernel.hpp"
+
+namespace opencv_test
+{
+
+namespace
+{
+    GAPI_OCV_KERNEL(OCVFoo, I::Foo)
+    {
+        static void run(const cv::Mat &in, cv::Mat &out)
+        {
+            out = in + 2;
+        }
+    };
+
+    GAPI_OCV_KERNEL(OCVBar, I::Bar)
+    {
+        static void run(const cv::Mat &a, const cv::Mat &b, cv::Mat &out)
+        {
+            out = 4*(a + b);
+        }
+    };
+
+    void FluidFooRow(const uint8_t* in, uint8_t* out, int length)
+    {
+        for (int i = 0; i < length; i++)
+        {
+            out[i] = in[i] + 3;
+        }
+    }
+
+    void FluidBarRow(const uint8_t* in1, const uint8_t* in2, uint8_t* out, int length)
+    {
+        for (int i = 0; i < length; i++)
+        {
+            out[i] = 3*(in1[i] + in2[i]);
+        }
+    }
+
+    GAPI_FLUID_KERNEL(FFoo, I::Foo, false)
+    {
+        static const int Window = 1;
+
+        static void run(const cv::gapi::fluid::View   &in,
+                              cv::gapi::fluid::Buffer &out)
+        {
+            FluidFooRow(in.InLineB(0), out.OutLineB(), in.length());
+        }
+    };
+
+    GAPI_FLUID_KERNEL(FBar, I::Bar, false)
+    {
+        static const int Window = 1;
+
+        static void run(const cv::gapi::fluid::View   &in1,
+                        const cv::gapi::fluid::View   &in2,
+                              cv::gapi::fluid::Buffer &out)
+        {
+            FluidBarRow(in1.InLineB(0), in2.InLineB(0), out.OutLineB(), in1.length());
+        }
+    };
+
+    G_TYPED_KERNEL(FluidFooI, <cv::GMat(cv::GMat)>, "test.kernels.fluid_foo")
+    {
+        static cv::GMatDesc outMeta(const cv::GMatDesc &in) { return in; }
+    };
+
+    G_TYPED_KERNEL(FluidBarI, <cv::GMat(cv::GMat,cv::GMat)>, "test.kernels.fluid_bar")
+    {
+        static cv::GMatDesc outMeta(const cv::GMatDesc &in, const cv::GMatDesc &) { return in; }
+    };
+
+    GAPI_FLUID_KERNEL(FluidFoo, FluidFooI, false)
+    {
+        static const int Window = 1;
+
+        static void run(const cv::gapi::fluid::View   &in,
+                              cv::gapi::fluid::Buffer &out)
+        {
+            FluidFooRow(in.InLineB(0), out.OutLineB(), in.length());
+        }
+    };
+
+    GAPI_FLUID_KERNEL(FluidBar, FluidBarI, false)
+    {
+        static const int Window = 1;
+
+        static void run(const cv::gapi::fluid::View   &in1,
+                        const cv::gapi::fluid::View   &in2,
+                              cv::gapi::fluid::Buffer &out)
+        {
+            FluidBarRow(in1.InLineB(0), in2.InLineB(0), out.OutLineB(), in1.length());
+        }
+    };
+
+    GAPI_FLUID_KERNEL(FluidFoo2lpi, FluidFooI, false)
+    {
+        static const int Window = 1;
+        static const int LPI    = 2;
+
+        static void run(const cv::gapi::fluid::View   &in,
+                              cv::gapi::fluid::Buffer &out)
+        {
+            for (int l = 0; l < out.lpi(); l++)
+            {
+                FluidFooRow(in.InLineB(l), out.OutLineB(l), in.length());
+            }
+        }
+    };
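+
+    // In a GAPI_FLUID_KERNEL, Window is the number of input lines the kernel
+    // may observe per output line (1 means pointwise access), while LPI
+    // ("lines per iteration") lets the backend ask one run() call to produce
+    // several output lines - hence the loop over out.lpi() in FluidFoo2lpi.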
+
+    cv::Mat ocvFoo(const cv::Mat &in)
+    {
+        cv::Mat out;
+        OCVFoo::run(in, out);
+        return out;
+    }
+    cv::Mat ocvBar(const cv::Mat &in1, const cv::Mat &in2)
+    {
+        cv::Mat out;
+        OCVBar::run(in1, in2, out);
+        return out;
+    }
+    cv::Mat fluidFoo(const cv::Mat &in)
+    {
+        cv::Mat out(in.rows, in.cols, in.type());
+        for (int y = 0; y < in.rows; y++)
+        {
+            FluidFooRow(in.ptr(y), out.ptr(y), in.cols);
+        }
+        return out;
+    }
+    cv::Mat fluidBar(const cv::Mat &in1, const cv::Mat &in2)
+    {
+        cv::Mat out(in1.rows, in1.cols, in1.type());
+        for (int y = 0; y < in1.rows; y++)
+        {
+            FluidBarRow(in1.ptr(y), in2.ptr(y), out.ptr(y), in1.cols);
+        }
+        return out;
+    }
+} // anonymous namespace
+
+struct GAPIHeteroTest: public ::testing::Test
+{
+    cv::GComputation m_comp;
+    cv::gapi::GKernelPackage m_ocv_kernels;
+    cv::gapi::GKernelPackage m_fluid_kernels;
+    cv::gapi::GKernelPackage m_hetero_kernels;
+
+    cv::Mat m_in_mat;
+    cv::Mat m_out_mat;
+
+    GAPIHeteroTest();
+};
+
+GAPIHeteroTest::GAPIHeteroTest()
+    : m_comp([](){
+            cv::GMat in;
+            cv::GMat out = I::Bar::on(I::Foo::on(in),
+                                      I::Foo::on(in));
+            return cv::GComputation(in, out);
+        })
+    , m_ocv_kernels(cv::gapi::kernels<OCVFoo, OCVBar>())
+    , m_fluid_kernels(cv::gapi::kernels<FFoo, FBar>())
+    , m_hetero_kernels(cv::gapi::kernels<OCVFoo, FBar>())
+    , m_in_mat(cv::Mat::eye(cv::Size(64, 64), CV_8UC1))
+{
+}
+
+TEST_F(GAPIHeteroTest, TestOCV)
+{
+    EXPECT_TRUE(cv::gapi::cpu::backend() == m_ocv_kernels.lookup<I::Foo>());
+    EXPECT_TRUE(cv::gapi::cpu::backend() == m_ocv_kernels.lookup<I::Bar>());
+
+    cv::Mat ref = ocvBar(ocvFoo(m_in_mat), ocvFoo(m_in_mat));
+    EXPECT_NO_THROW(m_comp.apply(m_in_mat, m_out_mat, cv::compile_args(m_ocv_kernels)));
+    EXPECT_EQ(0, cv::countNonZero(ref != m_out_mat));
+}
+
+TEST_F(GAPIHeteroTest, TestFluid)
+{
+    EXPECT_TRUE(cv::gapi::fluid::backend() == m_fluid_kernels.lookup<I::Foo>());
+    EXPECT_TRUE(cv::gapi::fluid::backend() == m_fluid_kernels.lookup<I::Bar>());
+
+    cv::Mat ref = fluidBar(fluidFoo(m_in_mat), fluidFoo(m_in_mat));
+    EXPECT_NO_THROW(m_comp.apply(m_in_mat, m_out_mat, cv::compile_args(m_fluid_kernels)));
+    EXPECT_EQ(0, cv::countNonZero(ref != m_out_mat));
+}
+
+TEST_F(GAPIHeteroTest, TestBoth)
+{
+    EXPECT_TRUE(cv::gapi::cpu::backend()   == m_hetero_kernels.lookup<I::Foo>());
+    EXPECT_TRUE(cv::gapi::fluid::backend() == m_hetero_kernels.lookup<I::Bar>());
+
+    cv::Mat ref = fluidBar(ocvFoo(m_in_mat), ocvFoo(m_in_mat));
+    EXPECT_NO_THROW(m_comp.apply(m_in_mat, m_out_mat, cv::compile_args(m_hetero_kernels)));
+    EXPECT_EQ(0, cv::countNonZero(ref != m_out_mat));
+}
+
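+// Parameterized fixture: each of the nine flags selects the backend for one
+// node of the graph below (1 = OpenCV/CPU, 0 = Fluid), so all 2^9 = 512
+// possible backend assignments are exercised.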
+struct GAPIBigHeteroTest : public ::testing::TestWithParam<std::array<int, 9>>
+{
+    cv::GComputation m_comp;
+    cv::gapi::GKernelPackage m_kernels;
+
+    cv::Mat m_in_mat;
+    cv::Mat m_out_mat1;
+    cv::Mat m_out_mat2;
+
+    cv::Mat m_ref_mat1;
+    cv::Mat m_ref_mat2;
+
+    GAPIBigHeteroTest();
+};
+
+//                                    Foo7
+//                .-> Foo2 -> Foo3 -<
+//   Foo0 -> Foo1                     Bar -> Foo6
+//                `-> Foo4 -> Foo5 -`
+
+GAPIBigHeteroTest::GAPIBigHeteroTest()
+    : m_comp([&](){
+        auto flags = GetParam();
+        std::array<std::function<cv::GMat(cv::GMat)>, 8> foos;
+
+        for (int i = 0; i < 8; i++)
+        {
+            foos[i] = flags[i] ? &I::Foo::on : &FluidFooI::on;
+        }
+        auto bar = flags[8] ? &I::Bar::on : &FluidBarI::on;
+
+        cv::GMat in;
+        auto foo1Out = foos[1](foos[0](in));
+        auto foo3Out = foos[3](foos[2](foo1Out));
+        auto foo6Out = foos[6](bar(foo3Out,
+                               foos[5](foos[4](foo1Out))));
+        auto foo7Out = foos[7](foo3Out);
+
+        return cv::GComputation(GIn(in), GOut(foo6Out, foo7Out));
+    })
+    , m_kernels(cv::gapi::kernels<OCVFoo, OCVBar, FluidFoo, FluidBar>())
+    , m_in_mat(cv::Mat::eye(cv::Size(64, 64), CV_8UC1))
+{
+    auto flags = GetParam();
+    std::array<std::function<cv::Mat(cv::Mat)>, 8> foos;
+
+    for (int i = 0; i < 8; i++)
+    {
+        foos[i] = flags[i] ? ocvFoo : fluidFoo;
+    }
+    auto bar = flags[8] ? ocvBar : fluidBar;
+
+    cv::Mat foo1OutMat = foos[1](foos[0](m_in_mat));
+    cv::Mat foo3OutMat = foos[3](foos[2](foo1OutMat));
+
+    m_ref_mat1 = foos[6](bar(foo3OutMat,
+                             foos[5](foos[4](foo1OutMat))));
+
+    m_ref_mat2 = foos[7](foo3OutMat);
+}
+
+TEST_P(GAPIBigHeteroTest, Test)
+{
+    EXPECT_NO_THROW(m_comp.apply(gin(m_in_mat), gout(m_out_mat1, m_out_mat2), cv::compile_args(m_kernels)));
+    EXPECT_EQ(0, cv::countNonZero(m_ref_mat1 != m_out_mat1));
+    EXPECT_EQ(0, cv::countNonZero(m_ref_mat2 != m_out_mat2));
+}
+
+static auto configurations = []()
+{
+    // Fill all possible configurations
+    // from 000000000 to 111111111
+    std::array<std::array<int, 9>, 512> arr;
+    for (auto n = 0; n < 512; n++)
+    {
+        for (auto i = 0; i < 9; i++)
+        {
+            arr[n][i] = (n >> (8 - i)) & 1;
+        }
+    }
+    return arr;
+}();
+
+INSTANTIATE_TEST_CASE_P(GAPIBigHeteroTest, GAPIBigHeteroTest,
+                        ::testing::ValuesIn(configurations));
+
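+// Two explicit islands around 2-LPI fluid kernels: verifies that multi-line
+// processing still works when the graph is split at an island boundary.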
+TEST(GAPIHeteroTestLPI, Test)
+{
+    cv::GMat in;
+    auto mid = FluidFooI::on(in);
+    auto out = FluidFooI::on(mid);
+    cv::gapi::island("isl0", GIn(in),  GOut(mid));
+    cv::gapi::island("isl1", GIn(mid), GOut(out));
+    cv::GComputation c(in, out);
+
+    cv::Mat in_mat = cv::Mat::eye(cv::Size(64, 64), CV_8UC1);
+    cv::Mat out_mat;
+    EXPECT_NO_THROW(c.apply(in_mat, out_mat, cv::compile_args(cv::gapi::kernels<FluidFoo2lpi>())));
+    cv::Mat ref = fluidFoo(fluidFoo(in_mat));
+    EXPECT_EQ(0, cv::countNonZero(ref != out_mat));
+}
+
+}  // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_desc_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_desc_tests.cpp
new file mode 100644 (file)
index 0000000..711211d
--- /dev/null
@@ -0,0 +1,202 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+#include "opencv2/gapi/cpu/gcpukernel.hpp"
+
+namespace opencv_test
+{
+
+namespace
+{
+    G_TYPED_KERNEL(KTest, <cv::GScalar(cv::GScalar)>, "org.opencv.test.scalar_kernel") {
+        static cv::GScalarDesc outMeta(cv::GScalarDesc in) { return in; }
+    };
+    GAPI_OCV_KERNEL(GOCVScalarTest, KTest)
+    {
+        static void run(const cv::Scalar &in, cv::Scalar &out) { out = in+cv::Scalar(1); }
+    };
+}
+
+TEST(GAPI_MetaDesc, MatDesc)
+{
+    cv::Mat m1(240, 320, CV_8U);
+    const auto desc1 = cv::descr_of(m1);
+    EXPECT_EQ(CV_8U, desc1.depth);
+    EXPECT_EQ(1,     desc1.chan);
+    EXPECT_EQ(320,   desc1.size.width);
+    EXPECT_EQ(240,   desc1.size.height);
+
+    cv::Mat m2(480, 640, CV_8UC3);
+    const auto desc2 = cv::descr_of(m2);
+    EXPECT_EQ(CV_8U, desc2.depth);
+    EXPECT_EQ(3,     desc2.chan);
+    EXPECT_EQ(640,   desc2.size.width);
+    EXPECT_EQ(480,   desc2.size.height);
+}
+
+TEST(GAPI_MetaDesc, Compare_Equal_MatDesc)
+{
+    const auto desc1 = cv::GMatDesc{CV_8U, 1, {64, 64}};
+    const auto desc2 = cv::GMatDesc{CV_8U, 1, {64, 64}};
+
+    EXPECT_TRUE(desc1 == desc2);
+}
+
+TEST(GAPI_MetaDesc, Compare_Not_Equal_MatDesc)
+{
+    const auto desc1 = cv::GMatDesc{CV_8U,  1, {64, 64}};
+    const auto desc2 = cv::GMatDesc{CV_32F, 1, {64, 64}};
+
+    EXPECT_TRUE(desc1 != desc2);
+}
+
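+// compile() is expected to validate the number of metadata arguments against
+// the protocol of the computation and throw on mismatch.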
+TEST(GAPI_MetaDesc, Compile_MatchMetaNumber_1)
+{
+    cv::GMat in;
+    cv::GComputation cc(in, in+in);
+
+    const auto desc1 = cv::GMatDesc{CV_8U,1,{64,64}};
+    const auto desc2 = cv::GMatDesc{CV_32F,1,{128,128}};
+
+    EXPECT_NO_THROW(cc.compile(desc1));
+    EXPECT_NO_THROW(cc.compile(desc2));
+
+    // FIXME: custom exception type?
+    // It is worth checking if compilation fails with different number
+    // of meta parameters
+    EXPECT_THROW(cc.compile(desc1, desc1),        std::logic_error);
+    EXPECT_THROW(cc.compile(desc1, desc2, desc2), std::logic_error);
+}
+
+TEST(GAPI_MetaDesc, Compile_MatchMetaNumber_2)
+{
+    cv::GMat a, b;
+    cv::GComputation cc(cv::GIn(a, b), cv::GOut(a+b));
+
+    const auto desc1 = cv::GMatDesc{CV_8U,1,{64,64}};
+    EXPECT_NO_THROW(cc.compile(desc1, desc1));
+
+    const auto desc2 = cv::GMatDesc{CV_32F,1,{128,128}};
+    EXPECT_NO_THROW(cc.compile(desc2, desc2));
+
+    // FIXME: custom exception type?
+    EXPECT_THROW(cc.compile(desc1),               std::logic_error);
+    EXPECT_THROW(cc.compile(desc2),               std::logic_error);
+    EXPECT_THROW(cc.compile(desc2, desc2, desc2), std::logic_error);
+}
+
+TEST(GAPI_MetaDesc, Compile_MatchMetaType_Mat)
+{
+    cv::GMat in;
+    cv::GComputation cc(in, in+in);
+
+    EXPECT_NO_THROW(cc.compile(cv::GMatDesc{CV_8U,1,{64,64}}));
+
+    // FIXME: custom exception type?
+    EXPECT_THROW(cc.compile(cv::empty_scalar_desc()), std::logic_error);
+}
+
+TEST(GAPI_MetaDesc, Compile_MatchMetaType_Scalar)
+{
+    cv::GScalar in;
+    cv::GComputation cc(cv::GIn(in), cv::GOut(KTest::on(in)));
+
+    const auto desc1 = cv::descr_of(cv::Scalar(128));
+    const auto desc2 = cv::GMatDesc{CV_8U,1,{64,64}};
+    const auto pkg   = cv::gapi::kernels<GOCVScalarTest>();
+    EXPECT_NO_THROW(cc.compile(desc1, cv::compile_args(pkg)));
+
+    // FIXME: custom exception type?
+    EXPECT_THROW(cc.compile(desc2, cv::compile_args(pkg)), std::logic_error);
+}
+
+TEST(GAPI_MetaDesc, Compile_MatchMetaType_Mixed)
+{
+    cv::GMat a;
+    cv::GScalar v;
+    cv::GComputation cc(cv::GIn(a, v), cv::GOut(cv::gapi::addC(a, v)));
+
+    const auto desc1 = cv::GMatDesc{CV_8U,1,{64,64}};
+    const auto desc2 = cv::descr_of(cv::Scalar(4));
+
+    EXPECT_NO_THROW(cc.compile(desc1, desc2));
+
+    // FIXME: custom exception type(s)?
+    EXPECT_THROW(cc.compile(desc1),               std::logic_error);
+    EXPECT_THROW(cc.compile(desc2),               std::logic_error);
+    EXPECT_THROW(cc.compile(desc2, desc1),        std::logic_error);
+    EXPECT_THROW(cc.compile(desc1, desc1, desc1), std::logic_error);
+    EXPECT_THROW(cc.compile(desc1, desc2, desc1), std::logic_error);
+}
+
+TEST(GAPI_MetaDesc, Typed_Compile_MatchMetaNumber_1)
+{
+    cv::GComputationT<cv::GMat(cv::GMat)> cc([](cv::GMat in)
+    {
+        return in+in;
+    });
+
+    const auto desc1 = cv::GMatDesc{CV_8U,1,{64,64}};
+    const auto desc2 = cv::GMatDesc{CV_32F,1,{128,128}};
+
+    EXPECT_NO_THROW(cc.compile(desc1));
+    EXPECT_NO_THROW(cc.compile(desc2));
+}
+
+TEST(GAPI_MetaDesc, Typed_Compile_MatchMetaNumber_2)
+{
+    cv::GComputationT<cv::GMat(cv::GMat,cv::GMat)> cc([](cv::GMat a, cv::GMat b)
+    {
+        return a + b;
+    });
+
+    const auto desc1 = cv::GMatDesc{CV_8U,1,{64,64}};
+    EXPECT_NO_THROW(cc.compile(desc1, desc1));
+
+    const auto desc2 = cv::GMatDesc{CV_32F,1,{128,128}};
+    EXPECT_NO_THROW(cc.compile(desc2, desc2));
+}
+
+TEST(GAPI_MetaDesc, Typed_Compile_MatchMetaType_Mat)
+{
+    cv::GComputationT<cv::GMat(cv::GMat)> cc([](cv::GMat in)
+    {
+        return in+in;
+    });
+
+    EXPECT_NO_THROW(cc.compile(cv::GMatDesc{CV_8U,1,{64,64}}));
+}
+
+TEST(GAPI_MetaDesc, Typed_Compile_MatchMetaType_Scalar)
+{
+    cv::GComputationT<cv::GScalar(cv::GScalar)> cc([](cv::GScalar in)
+    {
+        return KTest::on(in);
+    });
+
+    const auto desc1 = cv::descr_of(cv::Scalar(128));
+    const auto pkg = cv::gapi::kernels<GOCVScalarTest>();
+    EXPECT_NO_THROW(cc.compile(desc1, cv::compile_args(pkg)));
+}
+
+TEST(GAPI_MetaDesc, Typed_Compile_MatchMetaType_Mixed)
+{
+    cv::GComputationT<cv::GMat(cv::GMat,cv::GScalar)> cc([](cv::GMat a, cv::GScalar v)
+    {
+        return cv::gapi::addC(a, v);
+    });
+
+    const auto desc1 = cv::GMatDesc{CV_8U,1,{64,64}};
+    const auto desc2 = cv::descr_of(cv::Scalar(4));
+
+    EXPECT_NO_THROW(cc.compile(desc1, desc2));
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_resize_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_resize_test.cpp
new file mode 100644 (file)
index 0000000..bc0b991
--- /dev/null
@@ -0,0 +1,720 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+#include "gapi_fluid_test_kernels.hpp"
+
+namespace opencv_test
+{
+
+using namespace cv::gapi_test_kernels;
+
+G_TYPED_KERNEL(TCopy, <GMat(GMat)>, "test.fluid.copy")
+{
+    static GMatDesc outMeta(const cv::GMatDesc &in) {
+        return in;
+    }
+};
+
+GAPI_FLUID_KERNEL(FCopy, TCopy, false)
+{
+    static const int Window = 1;
+
+    static void run(const cv::gapi::fluid::View   &in,
+                          cv::gapi::fluid::Buffer &out)
+    {
+        const uint8_t* in_row  = in .InLine <uint8_t>(0);
+        uint8_t* out_row = out.OutLine<uint8_t>();
+
+        for (int i = 0, w = in.length(); i < w; i++)
+        {
+            out_row[i] = in_row[i];
+        }
+    }
+};
+
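+// Nearest-neighbor resize, one output line per iteration: the source row is
+// picked via the vertical scale ratio, then every output pixel is copied from
+// the horizontally scaled source column.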
+GAPI_FLUID_KERNEL(FResizeNN1Lpi, cv::gapi::core::GResize, false)
+{
+    static const int Window = 1;
+    static const auto Kind = GFluidKernel::Kind::Resize;
+
+    static void run(const cv::gapi::fluid::View& in, cv::Size /*sz*/, double /*fx*/, double /*fy*/, int /*interp*/,
+                    cv::gapi::fluid::Buffer& out)
+
+    {
+        auto length = out.length();
+        double vRatio = (double)in.meta().size.height / out.meta().size.height;
+        double hRatio = (double)in.length() / length;
+        auto y = out.y();
+        auto inY = in.y();
+
+        for (int l = 0; l < out.lpi(); l++)
+        {
+            auto sy = static_cast<int>((y+l) * vRatio);
+            int idx = sy - inY;
+
+            const auto src = in.InLine <unsigned char>(idx);
+            auto dst = out.OutLine<unsigned char>(l);
+
+            for (int x = 0; x < length; x++)
+            {
+                auto inX = static_cast<int>(x * hRatio);
+                dst[x] = src[inX];
+            }
+        }
+    }
+};
+
+namespace
+{
+namespace func
+{
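+// Shared helpers for the interpolating resize kernels below: initScratch()
+// precomputes the horizontal mapping for every output column once (two source
+// taps s0/s1 plus fixed-point weights alpha0/alpha1, stored in the scratch
+// buffer), so calcRow() only has to resolve the vertical mapping per line.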
+template <class Mapper>
+void initScratch(const cv::GMatDesc& in, cv::Size outSz, cv::gapi::fluid::Buffer &scratch)
+{
+    CV_Assert(in.depth == CV_8U && in.chan == 1);
+
+    cv::Size scratch_size{static_cast<int>(outSz.width * sizeof(typename Mapper::Unit)), 1};
+
+    cv::GMatDesc desc;
+    desc.chan  = 1;
+    desc.depth = CV_8U;
+    desc.size  = scratch_size;
+
+    cv::gapi::fluid::Buffer buffer(desc);
+    scratch = std::move(buffer);
+
+    auto mapX = scratch.OutLine<typename Mapper::Unit>();
+    double hRatio = (double)in.size.width / outSz.width;
+
+    for (int x = 0, w = outSz.width; x < w; x++)
+    {
+        mapX[x] = Mapper::map(hRatio, 0, in.size.width, x);
+    }
+}
+
+template <class Mapper>
+inline void calcRow(const cv::gapi::fluid::View& in, cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch)
+{
+    double vRatio = (double)in.meta().size.height / out.meta().size.height;
+    auto mapX = scratch.OutLine<typename Mapper::Unit>();
+    auto inY = in.y();
+    auto inH = in.meta().size.height;
+    auto outY = out.y();
+    auto length = out.length();
+
+    for (int l = 0; l < out.lpi(); l++)
+    {
+        auto mapY = Mapper::map(vRatio, inY, inH, outY + l);
+
+        const auto src0 = in.InLine <unsigned char>(mapY.s0);
+        const auto src1 = in.InLine <unsigned char>(mapY.s1);
+
+        auto dst = out.OutLine<unsigned char>(l);
+
+        for (int x = 0; x < length; x++)
+        {
+            auto alpha0 = mapX[x].alpha0;
+            auto alpha1 = mapX[x].alpha1;
+            auto sx0 = mapX[x].s0;
+            auto sx1 = mapX[x].s1;
+
+            int res0 = src0[sx0]*alpha0 + src0[sx1]*alpha1;
+            int res1 = src1[sx0]*alpha0 + src1[sx1]*alpha1;
+
+            dst[x] = uchar(( ((mapY.alpha0 * (res0 >> 4)) >> 16) + ((mapY.alpha1 * (res1 >> 4)) >> 16) + 2)>>2);
+        }
+    }
+}
+} // namespace func
+
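+// 11-bit fixed-point interpolation coefficients; this matches the scale used
+// inside OpenCV's own resize, which lets the tests compare results exactly.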
+constexpr static const int INTER_RESIZE_COEF_BITS = 11;
+constexpr static const int INTER_RESIZE_COEF_SCALE = 1 << INTER_RESIZE_COEF_BITS;
+
+namespace linear
+{
+struct Mapper
+{
+    struct Unit
+    {
+        short alpha0;
+        short alpha1;
+        int   s0;
+        int   s1;
+    };
+
+    static inline Unit map(double ratio, int start, int max, int outCoord)
+    {
+        auto f = static_cast<float>((outCoord + 0.5f) * ratio - 0.5f);
+        int s = cvFloor(f);
+        f -= s;
+
+        Unit u;
+
+        u.s0 = std::max(s - start, 0);
+        u.s1 = ((f == 0.0) || s + 1 >= max) ? s - start : s - start + 1;
+
+        u.alpha0 = saturate_cast<short>((1.0f - f) * INTER_RESIZE_COEF_SCALE);
+        u.alpha1 = saturate_cast<short>((f) * INTER_RESIZE_COEF_SCALE);
+
+        return u;
+    }
+};
+
+} // namespace linear
+
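+// For INTER_AREA upscaling OpenCV effectively falls back to a two-tap
+// interpolation scheme; the mapper below mirrors that coefficient logic
+// instead of doing true pixel-area averaging.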
+namespace areaUpscale
+{
+struct Mapper
+{
+    struct Unit
+    {
+        short alpha0;
+        short alpha1;
+        int   s0;
+        int   s1;
+    };
+
+    static inline Unit map(double ratio, int start, int max, int outCoord)
+    {
+        int s = cvFloor(outCoord*ratio);
+        float f = (float)((outCoord+1) - (s+1)/ratio);
+        f = f <= 0 ? 0.f : f - cvFloor(f);
+
+        Unit u;
+
+        u.s0 = std::max(s - start, 0);
+        u.s1 = ((f == 0.0) || s + 1 >= max) ? s - start : s - start + 1;
+
+        u.alpha0 = saturate_cast<short>((1.0f - f) * INTER_RESIZE_COEF_SCALE);
+        u.alpha1 = saturate_cast<short>((f) * INTER_RESIZE_COEF_SCALE);
+
+        return u;
+    }
+};
+} // namespace areaUpscale
+} // anonymous namespace
+
+GAPI_FLUID_KERNEL(FResizeLinear1Lpi, cv::gapi::core::GResize, true)
+{
+    static const int Window = 1;
+    static const auto Kind = GFluidKernel::Kind::Resize;
+
+    static void initScratch(const cv::GMatDesc& in,
+                            cv::Size outSz, double /*fx*/, double /*fy*/, int /*interp*/,
+                            cv::gapi::fluid::Buffer &scratch)
+    {
+        func::initScratch<linear::Mapper>(in, outSz, scratch);
+    }
+
+    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/)
+    {}
+
+    static void run(const cv::gapi::fluid::View& in, cv::Size /*sz*/, double /*fx*/, double /*fy*/, int /*interp*/,
+                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch)
+
+    {
+        func::calcRow<linear::Mapper>(in, out, scratch);
+    }
+};
+
+namespace
+{
+// FIXME
+// Move to some common place (to reuse/align with ResizeAgent)
+auto startInCoord = [](int outCoord, double ratio) {
+    return static_cast<int>(outCoord * ratio + 1e-3);
+};
+auto endInCoord = [](int outCoord, double ratio) {
+    return static_cast<int>(std::ceil((outCoord + 1) * ratio - 1e-3));
+};
+} // namespace
+
+GAPI_FLUID_KERNEL(FResizeArea1Lpi, cv::gapi::core::GResize, false)
+{
+    static const int Window = 1;
+    static const auto Kind = GFluidKernel::Kind::Resize;
+
+    static void run(const cv::gapi::fluid::View& in, cv::Size /*sz*/, double /*fx*/, double /*fy*/, int /*interp*/,
+                    cv::gapi::fluid::Buffer& out)
+
+    {
+        auto firstOutLineIdx = out.y();
+        auto firstViewLineIdx = in.y();
+        auto length = out.length();
+        double vRatio = (double)in.meta().size.height / out.meta().size.height;
+        double hRatio = (double)in.length() / length;
+
+        for (int l = 0; l < out.lpi(); l++)
+        {
+            int outY = firstOutLineIdx + l;
+            int startY = startInCoord(outY, vRatio);
+            int endY   = endInCoord  (outY, vRatio);
+
+            auto dst = out.OutLine<unsigned char>(l);
+
+            for (int x = 0; x < length; x++)
+            {
+                float res = 0.0f;
+
+                int startX = startInCoord(x, hRatio);
+                int endX   = endInCoord  (x, hRatio);
+
+                for (int inY = startY; inY < endY; inY++)
+                {
+                    double startCoordY = inY / vRatio;
+                    double endCoordY = startCoordY + 1/vRatio;
+
+                    if (startCoordY < outY) startCoordY = outY;
+                    if (endCoordY > outY + 1) endCoordY = outY + 1;
+
+                    float fracY = static_cast<float>((inY == startY || inY == endY - 1) ? endCoordY - startCoordY : 1/vRatio);
+
+                    const auto src = in.InLine <unsigned char>(inY - firstViewLineIdx);
+
+                    float rowSum = 0.0f;
+
+                    for (int inX = startX; inX < endX; inX++)
+                    {
+                        double startCoordX = inX / hRatio;
+                        double endCoordX = startCoordX + 1/hRatio;
+
+                        if (startCoordX < x) startCoordX = x;
+                        if (endCoordX > x + 1) endCoordX = x + 1;
+
+                        float fracX = static_cast<float>((inX == startX || inX == endX - 1) ? endCoordX - startCoordX : 1/hRatio);
+
+                        rowSum += src[inX] * fracX;
+                    }
+                    res += rowSum * fracY;
+                }
+                dst[x] = static_cast<unsigned char>(std::rint(res));
+            }
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FResizeAreaUpscale1Lpi, cv::gapi::core::GResize, true)
+{
+    static const int Window = 1;
+    static const auto Kind = GFluidKernel::Kind::Resize;
+
+    static void initScratch(const cv::GMatDesc& in,
+                            cv::Size outSz, double /*fx*/, double /*fy*/, int /*interp*/,
+                            cv::gapi::fluid::Buffer &scratch)
+    {
+        func::initScratch<areaUpscale::Mapper>(in, outSz, scratch);
+    }
+
+    static void resetScratch(cv::gapi::fluid::Buffer& /*scratch*/)
+    {}
+
+    static void run(const cv::gapi::fluid::View& in, cv::Size /*sz*/, double /*fx*/, double /*fy*/, int /*interp*/,
+                    cv::gapi::fluid::Buffer& out, cv::gapi::fluid::Buffer &scratch)
+    {
+        func::calcRow<areaUpscale::Mapper>(in, out, scratch);
+    }
+};
+
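+// Stamp out 2-, 3- and 4-lines-per-iteration variants of each resize kernel:
+// the helper struct only overrides LPI, everything else is inherited from the
+// corresponding 1-LPI implementation above.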
+#define ADD_RESIZE_KERNEL_WITH_LPI(interp, lpi, scratch)                                                                           \
+struct Resize##interp##lpi##LpiHelper : public FResize##interp##1Lpi { static const int LPI = lpi; };                              \
+struct FResize##interp##lpi##Lpi : public cv::GFluidKernelImpl<Resize##interp##lpi##LpiHelper, cv::gapi::core::GResize, scratch>{};
+
+ADD_RESIZE_KERNEL_WITH_LPI(NN, 2, false)
+ADD_RESIZE_KERNEL_WITH_LPI(NN, 3, false)
+ADD_RESIZE_KERNEL_WITH_LPI(NN, 4, false)
+
+ADD_RESIZE_KERNEL_WITH_LPI(Linear, 2, true)
+ADD_RESIZE_KERNEL_WITH_LPI(Linear, 3, true)
+ADD_RESIZE_KERNEL_WITH_LPI(Linear, 4, true)
+
+ADD_RESIZE_KERNEL_WITH_LPI(Area, 2, false)
+ADD_RESIZE_KERNEL_WITH_LPI(Area, 3, false)
+ADD_RESIZE_KERNEL_WITH_LPI(Area, 4, false)
+
+ADD_RESIZE_KERNEL_WITH_LPI(AreaUpscale, 2, true)
+ADD_RESIZE_KERNEL_WITH_LPI(AreaUpscale, 3, true)
+ADD_RESIZE_KERNEL_WITH_LPI(AreaUpscale, 4, true)
+#undef ADD_RESIZE_KERNEL_WITH_LPI
+
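+// Assembles the kernel package for a test case: picks the resize flavor by
+// interpolation mode (with a special case for INTER_AREA upscaling) and by
+// the requested LPI, then merges it with the common fluid test kernels.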
+static auto fluidResizeTestPackage = [](int interpolation, cv::Size szIn, cv::Size szOut, int lpi = 1)
+{
+    using namespace cv;
+    using namespace cv::gapi;
+    bool upscale = szIn.width < szOut.width || szIn.height < szOut.height;
+
+#define RESIZE_CASE(interp, lpi) \
+    case lpi: pkg = kernels<FCopy, FResize##interp##lpi##Lpi>(); break;
+
+#define RESIZE_SWITCH(interp)   \
+    switch(lpi)                 \
+    {                           \
+    RESIZE_CASE(interp, 1)      \
+    RESIZE_CASE(interp, 2)      \
+    RESIZE_CASE(interp, 3)      \
+    RESIZE_CASE(interp, 4)      \
+    default: CV_Assert(false);  \
+    }
+
+    GKernelPackage pkg;
+    switch (interpolation)
+    {
+    case INTER_NEAREST: RESIZE_SWITCH(NN); break;
+    case INTER_LINEAR:  RESIZE_SWITCH(Linear); break;
+    case INTER_AREA:
+    {
+        if (upscale)
+        {
+            RESIZE_SWITCH(AreaUpscale)
+        }
+        else
+        {
+            RESIZE_SWITCH(Area);
+        }
+    } break;
+    default: CV_Assert(false);
+    }
+    return combine(pkg, fluidTestPackage, unite_policy::KEEP);
+
+#undef RESIZE_SWITCH
+#undef RESIZE_CASE
+};
+
+struct ResizeTestFluid : public TestWithParam<std::tuple<int, int, cv::Size, std::tuple<cv::Size, cv::Rect>, int, double>> {};
+TEST_P(ResizeTestFluid, SanityTest)
+{
+    int type = 0, interp = 0;
+    cv::Size sz_in, sz_out;
+    int lpi = 0;
+    double tolerance = 0.0;
+    cv::Rect outRoi;
+    std::tuple<cv::Size, cv::Rect> outSizeAndRoi;
+    std::tie(type, interp, sz_in, outSizeAndRoi, lpi, tolerance) = GetParam();
+    std::tie(sz_out, outRoi) = outSizeAndRoi;
+    if (outRoi == cv::Rect{}) outRoi = {0,0,sz_out.width,sz_out.height};
+    if (outRoi.width == 0) outRoi.width = sz_out.width;
+    double fx = 0, fy = 0;
+
+    cv::Mat in_mat1 (sz_in, type );
+    cv::Scalar mean = cv::Scalar(127);
+    cv::Scalar stddev = cv::Scalar(40.f);
+
+    cv::randn(in_mat1, mean, stddev);
+
+    cv::Mat out_mat = cv::Mat::zeros(sz_out, type);
+    cv::Mat out_mat_ocv = cv::Mat::zeros(sz_out, type);
+
+    cv::GMat in;
+    auto mid = TBlur3x3::on(in, cv::BORDER_REPLICATE, {});
+    auto out = cv::gapi::resize(mid, sz_out, fx, fy, interp);
+
+    cv::GComputation c(in, out);
+    c.apply(in_mat1, out_mat, cv::compile_args(GFluidOutputRois{{outRoi}}, fluidResizeTestPackage(interp, sz_in, sz_out, lpi)));
+
+    cv::Mat mid_mat;
+    cv::blur(in_mat1, mid_mat, {3,3}, {-1,-1},  cv::BORDER_REPLICATE);
+    cv::resize(mid_mat, out_mat_ocv, sz_out, fx, fy, interp);
+
+    cv::Mat absDiff;
+    cv::absdiff(out_mat(outRoi), out_mat_ocv(outRoi), absDiff);
+    EXPECT_EQ(0, cv::countNonZero(absDiff > tolerance));
+}
+
+INSTANTIATE_TEST_CASE_P(ResizeTestCPU, ResizeTestFluid,
+                        Combine(Values(CV_8UC1),
+                                Values(cv::INTER_NEAREST, cv::INTER_LINEAR),
+                                Values(cv::Size(8, 7),
+                                       cv::Size(8, 8),
+                                       cv::Size(8, 64),
+                                       cv::Size(8, 25),
+                                       cv::Size(16, 8),
+                                       cv::Size(16, 7)),
+                                Values(std::make_tuple(cv::Size(5, 4), cv::Rect{}),
+                                       std::make_tuple(cv::Size(5, 4), cv::Rect{0, 0, 0, 2}),
+                                       std::make_tuple(cv::Size(5, 4), cv::Rect{0, 1, 0, 2}),
+                                       std::make_tuple(cv::Size(5, 4), cv::Rect{0, 2, 0, 2}),
+                                       std::make_tuple(cv::Size(7, 7), cv::Rect{}),
+                                       std::make_tuple(cv::Size(7, 7), cv::Rect{0, 0, 0, 3}),
+                                       std::make_tuple(cv::Size(7, 7), cv::Rect{0, 2, 0, 2}),
+                                       std::make_tuple(cv::Size(7, 7), cv::Rect{0, 4, 0, 3}),
+                                       std::make_tuple(cv::Size(8, 4), cv::Rect{}),
+                                       std::make_tuple(cv::Size(8, 4), cv::Rect{0, 0, 0, 3}),
+                                       std::make_tuple(cv::Size(8, 4), cv::Rect{0, 1, 0, 2}),
+                                       std::make_tuple(cv::Size(8, 4), cv::Rect{0, 3, 0, 1})),
+                                Values(1, 2, 3, 4), // lpi
+                                Values(0.0)));
+
+INSTANTIATE_TEST_CASE_P(ResizeAreaTestCPU, ResizeTestFluid,
+                        Combine(Values(CV_8UC1),
+                                Values(cv::INTER_AREA),
+                                Values(cv::Size(8, 7),
+                                       cv::Size(8, 8),
+                                       cv::Size(8, 64),
+                                       cv::Size(8, 25),
+                                       cv::Size(16, 8),
+                                       cv::Size(16, 7)),
+                                Values(std::make_tuple(cv::Size(5, 4), cv::Rect{}),
+                                       std::make_tuple(cv::Size(5, 4), cv::Rect{0, 0, 0, 2}),
+                                       std::make_tuple(cv::Size(5, 4), cv::Rect{0, 1, 0, 2}),
+                                       std::make_tuple(cv::Size(5, 4), cv::Rect{0, 2, 0, 2}),
+                                       std::make_tuple(cv::Size(7, 7), cv::Rect{}),
+                                       std::make_tuple(cv::Size(7, 7), cv::Rect{0, 0, 0, 3}),
+                                       std::make_tuple(cv::Size(7, 7), cv::Rect{0, 2, 0, 2}),
+                                       std::make_tuple(cv::Size(7, 7), cv::Rect{0, 4, 0, 3}),
+                                       std::make_tuple(cv::Size(8, 4), cv::Rect{}),
+                                       std::make_tuple(cv::Size(8, 4), cv::Rect{0, 0, 0, 3}),
+                                       std::make_tuple(cv::Size(8, 4), cv::Rect{0, 1, 0, 2}),
+                                       std::make_tuple(cv::Size(8, 4), cv::Rect{0, 3, 0, 1})),
+                                Values(1, 2, 3, 4), // lpi
+                                // Actually this tolerance only for cases where OpenCV
+                                // uses ResizeAreaFast
+                                Values(1.0)));
+
+INSTANTIATE_TEST_CASE_P(ResizeUpscaleTestCPU, ResizeTestFluid,
+                        Combine(Values(CV_8UC1),
+                                Values(cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(cv::Size(1, 5),
+                                       cv::Size(3, 5),
+                                       cv::Size(7, 5),
+                                       cv::Size(1, 7),
+                                       cv::Size(3, 7),
+                                       cv::Size(7, 7)),
+                                Values(std::make_tuple(cv::Size(8, 8), cv::Rect{0,0,8,2}),
+                                       std::make_tuple(cv::Size(8, 8), cv::Rect{0,2,8,2}),
+                                       std::make_tuple(cv::Size(8, 8), cv::Rect{0,4,8,2}),
+                                       std::make_tuple(cv::Size(8, 8), cv::Rect{0,6,8,2}),
+                                       std::make_tuple(cv::Size(8, 8), cv::Rect{0,0,8,8}),
+                                       std::make_tuple(cv::Size(16, 8), cv::Rect{}),
+                                       std::make_tuple(cv::Size(16, 64), cv::Rect{0, 0,16,16}),
+                                       std::make_tuple(cv::Size(16, 64), cv::Rect{0,16,16,16}),
+                                       std::make_tuple(cv::Size(16, 64), cv::Rect{0,32,16,16}),
+                                       std::make_tuple(cv::Size(16, 64), cv::Rect{0,48,16,16}),
+                                       std::make_tuple(cv::Size(16, 64), cv::Rect{0, 0,16,64}),
+                                       std::make_tuple(cv::Size(16, 25), cv::Rect{0, 0,16, 7}),
+                                       std::make_tuple(cv::Size(16, 25), cv::Rect{0, 7,16, 6}),
+                                       std::make_tuple(cv::Size(16, 25), cv::Rect{0,13,16, 6}),
+                                       std::make_tuple(cv::Size(16, 25), cv::Rect{0,19,16, 6}),
+                                       std::make_tuple(cv::Size(16, 25), cv::Rect{0, 0,16, 7}),
+                                       std::make_tuple(cv::Size(16, 25), cv::Rect{0, 7,16, 7}),
+                                       std::make_tuple(cv::Size(16, 25), cv::Rect{0,14,16, 7}),
+                                       std::make_tuple(cv::Size(16, 25), cv::Rect{0,21,16, 4}),
+                                       std::make_tuple(cv::Size(16, 25), cv::Rect{0, 0,16,25}),
+                                       std::make_tuple(cv::Size(16, 7), cv::Rect{}),
+                                       std::make_tuple(cv::Size(16, 8), cv::Rect{})),
+                                Values(1, 2, 3, 4), // lpi
+                                Values(0.0)));
+
+INSTANTIATE_TEST_CASE_P(ResizeUpscaleOneDimDownscaleAnother, ResizeTestFluid,
+                        Combine(Values(CV_8UC1),
+                                Values(cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(cv::Size(6, 6),
+                                       cv::Size(8, 7),
+                                       cv::Size(8, 8),
+                                       cv::Size(8, 10),
+                                       cv::Size(10, 8),
+                                       cv::Size(10, 7)),
+                                Values(std::make_tuple(cv::Size(11, 5), cv::Rect{}),
+                                       std::make_tuple(cv::Size(11, 5), cv::Rect{0, 0, 0, 2}),
+                                       std::make_tuple(cv::Size(11, 5), cv::Rect{0, 2, 0, 2}),
+                                       std::make_tuple(cv::Size(11, 5), cv::Rect{0, 4, 0, 1}),
+                                       std::make_tuple(cv::Size(12, 2), cv::Rect{}),
+                                       std::make_tuple(cv::Size(12, 2), cv::Rect{0, 0, 0, 1}),
+                                       std::make_tuple(cv::Size(12, 2), cv::Rect{0, 1, 0, 1}),
+                                       std::make_tuple(cv::Size(23, 3), cv::Rect{}),
+                                       std::make_tuple(cv::Size(23, 3), cv::Rect{0, 0, 0, 1}),
+                                       std::make_tuple(cv::Size(23, 3), cv::Rect{0, 1, 0, 1}),
+                                       std::make_tuple(cv::Size(23, 3), cv::Rect{0, 2, 0, 1}),
+                                       std::make_tuple(cv::Size(3, 24), cv::Rect{}),
+                                       std::make_tuple(cv::Size(3, 24), cv::Rect{0,  0, 0, 6}),
+                                       std::make_tuple(cv::Size(3, 24), cv::Rect{0,  6, 0, 6}),
+                                       std::make_tuple(cv::Size(3, 24), cv::Rect{0, 12, 0, 6}),
+                                       std::make_tuple(cv::Size(3, 24), cv::Rect{0, 18, 0, 6}),
+                                       std::make_tuple(cv::Size(5, 11), cv::Rect{}),
+                                       std::make_tuple(cv::Size(5, 11), cv::Rect{0, 0, 0, 3}),
+                                       std::make_tuple(cv::Size(5, 11), cv::Rect{0, 3, 0, 3}),
+                                       std::make_tuple(cv::Size(5, 11), cv::Rect{0, 6, 0, 3}),
+                                       std::make_tuple(cv::Size(5, 11), cv::Rect{0, 9, 0, 2})),
+                                Values(1, 2, 3, 4), // lpi
+                                Values(0.0)));
+
+INSTANTIATE_TEST_CASE_P(Resize400_384TestCPU, ResizeTestFluid,
+                        Combine(Values(CV_8UC1),
+                                Values(cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(cv::Size(128, 400)),
+                                Values(std::make_tuple(cv::Size(128, 384), cv::Rect{})),
+                                Values(1, 2, 3, 4), // lpi
+                                Values(0.0)));
+
+INSTANTIATE_TEST_CASE_P(Resize220_400TestCPU, ResizeTestFluid,
+                        Combine(Values(CV_8UC1),
+                                Values(cv::INTER_LINEAR),
+                                Values(cv::Size(220, 220)),
+                                Values(std::make_tuple(cv::Size(400, 400), cv::Rect{})),
+                                Values(1, 2, 3, 4), // lpi
+                                Values(0.0)));
+
+static auto cvBlur = [](const cv::Mat& in, cv::Mat& out, int kernelSize)
+{
+    if (kernelSize == 1)
+    {
+        out = in;
+    }
+    else
+    {
+        cv::blur(in, out, {kernelSize, kernelSize});
+    }
+};
+
+using SizesWithRois = std::tuple<cv::Size, cv::Rect, cv::Size, cv::Rect>;
+struct ResizeAndAnotherReaderTest : public TestWithParam<std::tuple<int, int, bool, SizesWithRois>>{};
+TEST_P(ResizeAndAnotherReaderTest, SanityTest)
+{
+    bool readFromInput = false;
+    int interp = -1, kernelSize = -1;
+    SizesWithRois sizesWithRois;
+    std::tie(interp, kernelSize, readFromInput, sizesWithRois) = GetParam();
+
+    cv::Size sz,  resizedSz;
+    cv::Rect roi, resizedRoi;
+    std::tie(sz, roi, resizedSz, resizedRoi) = sizesWithRois;
+
+    cv::Mat in_mat(sz, CV_8UC1);
+    cv::Scalar mean = cv::Scalar(127);
+    cv::Scalar stddev = cv::Scalar(40.f);
+    cv::randn(in_mat, mean, stddev);
+
+    cv::Mat gapi_resize_out = cv::Mat::zeros(resizedSz, CV_8UC1);
+    cv::Mat gapi_blur_out = cv::Mat::zeros(sz, CV_8UC1);
+
+    auto blur = kernelSize == 1 ? &TBlur1x1::on : kernelSize == 3 ? &TBlur3x3::on : &TBlur5x5::on;
+
+    cv::GMat in, resize_out, blur_out;
+
+    if (readFromInput)
+    {
+        resize_out = gapi::resize(in, resizedSz, 0, 0, interp);
+        blur_out   = blur(in, cv::BORDER_DEFAULT, {});
+    }
+    else
+    {
+        auto mid   = TCopy::on(in);
+        resize_out = gapi::resize(mid, resizedSz, 0, 0, interp);
+        blur_out   = blur(mid, cv::BORDER_DEFAULT, {});
+    }
+
+    cv::GComputation c(GIn(in), GOut(resize_out, blur_out));
+    c.apply(gin(in_mat), gout(gapi_resize_out, gapi_blur_out), cv::compile_args(GFluidOutputRois{{resizedRoi, roi}},
+                                                                                fluidResizeTestPackage(interp, sz, resizedSz)));
+
+    cv::Mat ocv_resize_out = cv::Mat::zeros(resizedSz, CV_8UC1);
+    cv::resize(in_mat, ocv_resize_out, resizedSz, 0, 0, interp);
+    cv::Mat ocv_blur_out = cv::Mat::zeros(sz, CV_8UC1);
+    cvBlur(in_mat, ocv_blur_out, kernelSize);
+
+    EXPECT_EQ(0, cv::countNonZero(gapi_resize_out(resizedRoi) != ocv_resize_out(resizedRoi)));
+    EXPECT_EQ(0, cv::countNonZero(gapi_blur_out(roi) != ocv_blur_out(roi)));
+}
+
+INSTANTIATE_TEST_CASE_P(ResizeTestCPU, ResizeAndAnotherReaderTest,
+                        Combine(Values(cv::INTER_NEAREST, cv::INTER_LINEAR),
+                                Values(1, 3, 5),
+                                testing::Bool(), // Read from input directly or place a copy node at start
+                                Values(std::make_tuple(cv::Size{8,8}, cv::Rect{0,0,8,8},
+                                                       cv::Size{4,4}, cv::Rect{0,0,4,4}),
+                                       std::make_tuple(cv::Size{8,8}, cv::Rect{0,0,8,2},
+                                                       cv::Size{4,4}, cv::Rect{0,0,4,1}),
+                                       std::make_tuple(cv::Size{8,8}, cv::Rect{0,2,8,4},
+                                                       cv::Size{4,4}, cv::Rect{0,1,4,2}),
+                                       std::make_tuple(cv::Size{8,8}, cv::Rect{0,4,8,4},
+                                                       cv::Size{4,4}, cv::Rect{0,2,4,2}),
+                                       std::make_tuple(cv::Size{64,64}, cv::Rect{0, 0,64,64},
+                                                       cv::Size{49,49}, cv::Rect{0, 0,49,49}),
+                                       std::make_tuple(cv::Size{64,64}, cv::Rect{0, 0,64,15},
+                                                       cv::Size{49,49}, cv::Rect{0, 0,49,11}),
+                                       std::make_tuple(cv::Size{64,64}, cv::Rect{0,11,64,23},
+                                                       cv::Size{49,49}, cv::Rect{0, 9,49,17}),
+                                       std::make_tuple(cv::Size{64,64}, cv::Rect{0,50,64,14},
+                                                       cv::Size{49,49}, cv::Rect{0,39,49,10}))));
+
+struct BlursAfterResizeTest : public TestWithParam<std::tuple<int, int, int, bool, std::tuple<cv::Size, cv::Size, cv::Rect>>>{};
+TEST_P(BlursAfterResizeTest, SanityTest)
+{
+    bool readFromInput = false;
+    int interp = -1, kernelSize1 = -1, kernelSize2 = -1;
+    std::tuple<cv::Size, cv::Size, cv::Rect> sizesWithRoi;
+    std::tie(interp, kernelSize1, kernelSize2, readFromInput, sizesWithRoi) = GetParam();
+
+    cv::Size inSz,  outSz;
+    cv::Rect outRoi;
+    std::tie(inSz, outSz, outRoi) = sizesWithRoi;
+
+    cv::Mat in_mat(inSz, CV_8UC1);
+    cv::Scalar mean = cv::Scalar(127);
+    cv::Scalar stddev = cv::Scalar(40.f);
+    cv::randn(in_mat, mean, stddev);
+    cv::Mat gapi_out1 = cv::Mat::zeros(outSz, CV_8UC1);
+    cv::Mat gapi_out2 = cv::Mat::zeros(outSz, CV_8UC1);
+
+    auto blur1 = kernelSize1 == 1 ? &TBlur1x1::on : kernelSize1 == 3 ? &TBlur3x3::on : &TBlur5x5::on;
+    auto blur2 = kernelSize2 == 1 ? &TBlur1x1::on : kernelSize2 == 3 ? &TBlur3x3::on : &TBlur5x5::on;
+
+    cv::GMat in, out1, out2;
+    if (readFromInput)
+    {
+        auto resized = gapi::resize(in, outSz, 0, 0, interp);
+        out1 = blur1(resized, cv::BORDER_DEFAULT, {});
+        out2 = blur2(resized, cv::BORDER_DEFAULT, {});
+    }
+    else
+    {
+        auto mid = TCopy::on(in);
+        auto resized = gapi::resize(mid, outSz, 0, 0, interp);
+        out1 = blur1(resized, cv::BORDER_DEFAULT, {});
+        out2 = blur2(resized, cv::BORDER_DEFAULT, {});
+    }
+
+    cv::GComputation c(GIn(in), GOut(out1, out2));
+    c.apply(gin(in_mat), gout(gapi_out1, gapi_out2), cv::compile_args(GFluidOutputRois{{outRoi, outRoi}},
+                                                                      fluidResizeTestPackage(interp, inSz, outSz)));
+
+    cv::Mat ocv_out1 = cv::Mat::zeros(outSz, CV_8UC1);
+    cv::Mat ocv_out2 = cv::Mat::zeros(outSz, CV_8UC1);
+    cv::Mat resized = cv::Mat::zeros(outSz, CV_8UC1);
+    cv::resize(in_mat, resized, outSz, 0, 0, interp);
+    cvBlur(resized, ocv_out1, kernelSize1);
+    cvBlur(resized, ocv_out2, kernelSize2);
+
+    EXPECT_EQ(0, cv::countNonZero(gapi_out1(outRoi) != ocv_out1(outRoi)));
+    EXPECT_EQ(0, cv::countNonZero(gapi_out2(outRoi) != ocv_out2(outRoi)));
+}
+
+INSTANTIATE_TEST_CASE_P(ResizeTestCPU, BlursAfterResizeTest,
+                        Combine(Values(cv::INTER_NEAREST, cv::INTER_LINEAR),
+                                Values(1, 3, 5),
+                                Values(1, 3, 5),
+                                testing::Bool(), // Read from input directly or place a copy node at start
+                                Values(std::make_tuple(cv::Size{8,8},
+                                                       cv::Size{4,4}, cv::Rect{0,0,4,4}),
+                                       std::make_tuple(cv::Size{8,8},
+                                                       cv::Size{4,4}, cv::Rect{0,0,4,1}),
+                                       std::make_tuple(cv::Size{8,8},
+                                                       cv::Size{4,4}, cv::Rect{0,1,4,2}),
+                                       std::make_tuple(cv::Size{8,8},
+                                                       cv::Size{4,4}, cv::Rect{0,2,4,2}),
+                                       std::make_tuple(cv::Size{64,64},
+                                                       cv::Size{49,49}, cv::Rect{0, 0,49,49}),
+                                       std::make_tuple(cv::Size{64,64},
+                                                       cv::Size{49,49}, cv::Rect{0, 0,49,11}),
+                                       std::make_tuple(cv::Size{64,64},
+                                                       cv::Size{49,49}, cv::Rect{0, 9,49,17}),
+                                       std::make_tuple(cv::Size{64,64},
+                                                       cv::Size{49,49}, cv::Rect{0,39,49,10}))));
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_roi_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_roi_test.cpp
new file mode 100644 (file)
index 0000000..ee8674e
--- /dev/null
@@ -0,0 +1,197 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+#include "gapi_fluid_test_kernels.hpp"
+
+namespace opencv_test
+{
+
+using namespace cv::gapi_test_kernels;
+
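+// GFluidOutputRois restricts execution to a sub-rectangle of the output;
+// since both output buffers start zero-initialized, comparing them in full
+// also verifies that pixels outside the ROI are left untouched.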
+struct PartialComputation : public TestWithParam <std::tuple<cv::Rect>> {};
+TEST_P(PartialComputation, Test)
+{
+    cv::Rect roi;
+    std::tie(roi) = GetParam();
+
+    int borderType = BORDER_REPLICATE;
+    int kernelSize = 3;
+    cv::Point anchor = {-1, -1};
+
+    cv::GMat in;
+    cv::GMat out = TBlur3x3::on(in, borderType, {});
+    cv::GComputation c(cv::GIn(in), cv::GOut(out));
+
+    const auto sz = cv::Size(8, 10);
+    cv::Mat in_mat(sz, CV_8UC1);
+    cv::Scalar mean   = cv::Scalar(127.0f);
+    cv::Scalar stddev = cv::Scalar(40.f);
+    cv::randn(in_mat, mean, stddev);
+
+    cv::Mat out_mat_gapi = cv::Mat::zeros(sz, CV_8UC1);
+    cv::Mat out_mat_ocv = cv::Mat::zeros(sz, CV_8UC1);
+
+    // Run G-API
+    auto cc = c.compile(cv::descr_of(in_mat), cv::compile_args(fluidTestPackage, GFluidOutputRois{{to_own(roi)}}));
+    cc(cv::gin(in_mat), cv::gout(out_mat_gapi));
+
+    // Check with OpenCV
+    if (roi == cv::Rect{}) roi = cv::Rect{0,0,sz.width,sz.height};
+    cv::blur(in_mat(roi), out_mat_ocv(roi), {kernelSize, kernelSize}, anchor, borderType);
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+}
+
+INSTANTIATE_TEST_CASE_P(Fluid, PartialComputation,
+                        Values(cv::Rect{},        cv::Rect{0,0,8,6}, cv::Rect{0,1,8,3},
+                               cv::Rect{0,2,8,3}, cv::Rect{0,3,8,5}, cv::Rect{0,4,8,6}));
+
+struct PartialComputationAddC : public TestWithParam <std::tuple<cv::Rect>> {};
+TEST_P(PartialComputationAddC, Test)
+{
+    cv::Rect roi;
+    std::tie(roi) = GetParam();
+
+    cv::GMat in;
+    cv::GMat out = TAddCSimple::on(in, 1);
+    cv::GComputation c(cv::GIn(in), cv::GOut(out));
+
+    const auto sz = cv::Size(8, 10);
+    cv::Mat in_mat(sz, CV_8UC1);
+    cv::Scalar mean   = cv::Scalar(127.0f);
+    cv::Scalar stddev = cv::Scalar(40.f);
+    cv::randn(in_mat, mean, stddev);
+
+    cv::Mat out_mat_gapi = cv::Mat::zeros(sz, CV_8UC1);
+    cv::Mat out_mat_ocv = cv::Mat::zeros(sz, CV_8UC1);
+
+    // Run G-API
+    auto cc = c.compile(cv::descr_of(in_mat), cv::compile_args(fluidTestPackage, GFluidOutputRois{{to_own(roi)}}));
+    cc(cv::gin(in_mat), cv::gout(out_mat_gapi));
+
+    // Check with OpenCV
+    if (roi == cv::Rect{}) roi = cv::Rect{0,0,sz.width,sz.height};
+    out_mat_ocv(roi) = in_mat(roi) + 1;
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+}
+
+INSTANTIATE_TEST_CASE_P(FluidRoi, PartialComputationAddC,
+                        Values(cv::Rect{},        cv::Rect{0,0,8,6}, cv::Rect{0,1,8,3},
+                               cv::Rect{0,2,8,3}, cv::Rect{0,3,8,5}, cv::Rect{0,4,8,6}));
+
+struct SequenceOfBlursRoiTest : public TestWithParam <std::tuple<int, cv::Rect>> {};
+TEST_P(SequenceOfBlursRoiTest, Test)
+{
+    cv::Size sz_in = { 320, 240 };
+
+    int borderType = 0;
+    cv::Rect roi;
+    std::tie(borderType, roi) = GetParam();
+    cv::Mat in_mat(sz_in, CV_8UC1);
+    cv::Scalar mean   = cv::Scalar(127.0f);
+    cv::Scalar stddev = cv::Scalar(40.f);
+
+    cv::randn(in_mat, mean, stddev);
+
+    cv::Point anchor = {-1, -1};
+    cv::Scalar borderValue(0);
+
+    GMat in;
+    auto mid = TBlur3x3::on(in,  borderType, borderValue);
+    auto out = TBlur5x5::on(mid, borderType, borderValue);
+
+    Mat out_mat_gapi = Mat::zeros(sz_in, CV_8UC1);
+
+    GComputation c(GIn(in), GOut(out));
+    auto cc = c.compile(descr_of(in_mat), cv::compile_args(fluidTestPackage, GFluidOutputRois{{to_own(roi)}}));
+    cc(gin(in_mat), gout(out_mat_gapi));
+
+    cv::Mat mid_mat_ocv = Mat::zeros(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv = Mat::zeros(sz_in, CV_8UC1);
+
+    cv::blur(in_mat, mid_mat_ocv, {3,3}, anchor, borderType);
+
+    if (roi == cv::Rect{})
+    {
+        roi = cv::Rect{0, 0, sz_in.width, sz_in.height};
+    }
+
+    cv::blur(mid_mat_ocv(roi), out_mat_ocv(roi), {5,5}, anchor, borderType);
+
+    EXPECT_EQ(0, countNonZero(out_mat_ocv != out_mat_gapi));
+}
+
+INSTANTIATE_TEST_CASE_P(FluidRoi, SequenceOfBlursRoiTest,
+                        Combine(Values(BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT_101),
+                                Values(cv::Rect{0,0,320,240}, cv::Rect{0,64,320,128}, cv::Rect{0,128,320,112})));
+
+struct TwoBlursRoiTest : public TestWithParam <std::tuple<int, int, int, int, int, int, bool, cv::Rect>> {};
+TEST_P(TwoBlursRoiTest, Test)
+{
+    cv::Size sz_in = { 320, 240 };
+
+    int kernelSize1 = 0, kernelSize2 = 0;
+    int borderType1 = -1, borderType2 = -1;
+    cv::Scalar borderValue1{}, borderValue2{};
+    bool readFromInput = false;
+    cv::Rect outRoi;
+    std::tie(kernelSize1, borderType1, borderValue1, kernelSize2, borderType2, borderValue2, readFromInput, outRoi) = GetParam();
+    cv::Mat in_mat(sz_in, CV_8UC1);
+    cv::Scalar mean   = cv::Scalar(127.0f);
+    cv::Scalar stddev = cv::Scalar(40.f);
+
+    cv::randn(in_mat, mean, stddev);
+
+    cv::Point anchor = {-1, -1};
+
+    auto blur1 = kernelSize1 == 3 ? &TBlur3x3::on : &TBlur5x5::on;
+    auto blur2 = kernelSize2 == 3 ? &TBlur3x3::on : &TBlur5x5::on;
+
+    GMat in, out1, out2;
+    if (readFromInput)
+    {
+        out1 = blur1(in, borderType1, borderValue1);
+        out2 = blur2(in, borderType2, borderValue2);
+    }
+    else
+    {
+        auto mid = TAddCSimple::on(in, 0);
+        out1 = blur1(mid, borderType1, borderValue1);
+        out2 = blur2(mid, borderType2, borderValue2);
+    }
+
+    Mat out_mat_gapi1 = Mat::zeros(sz_in, CV_8UC1);
+    Mat out_mat_gapi2 = Mat::zeros(sz_in, CV_8UC1);
+
+    GComputation c(GIn(in), GOut(out1, out2));
+    auto cc = c.compile(descr_of(in_mat), cv::compile_args(fluidTestPackage, GFluidOutputRois{{outRoi, outRoi}}));
+    cc(gin(in_mat), gout(out_mat_gapi1, out_mat_gapi2));
+
+    cv::Mat out_mat_ocv1 = Mat::zeros(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv2 = Mat::zeros(sz_in, CV_8UC1);
+
+    cv::blur(in_mat(outRoi), out_mat_ocv1(outRoi), {kernelSize1, kernelSize1}, anchor, borderType1);
+    cv::blur(in_mat(outRoi), out_mat_ocv2(outRoi), {kernelSize2, kernelSize2}, anchor, borderType2);
+
+    EXPECT_EQ(0, countNonZero(out_mat_ocv1 != out_mat_gapi1));
+    EXPECT_EQ(0, countNonZero(out_mat_ocv2 != out_mat_gapi2));
+}
+
+INSTANTIATE_TEST_CASE_P(FluidRoi, TwoBlursRoiTest,
+                        Combine(Values(3, 5),
+                                Values(cv::BORDER_CONSTANT, cv::BORDER_REPLICATE, cv::BORDER_REFLECT_101),
+                                Values(0),
+                                Values(3, 5),
+                                Values(cv::BORDER_CONSTANT, cv::BORDER_REPLICATE, cv::BORDER_REFLECT_101),
+                                Values(0),
+                                testing::Bool(), // Read from input directly or place a copy node at start
+                                Values(cv::Rect{0,0,320,240}, cv::Rect{0,64,320,128}, cv::Rect{0,128,320,112})));
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test.cpp
new file mode 100644 (file)
index 0000000..5b35011
--- /dev/null
@@ -0,0 +1,713 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+#include "opencv2/gapi/core.hpp"
+
+#include "opencv2/gapi/fluid/gfluidbuffer.hpp"
+#include "opencv2/gapi/fluid/gfluidkernel.hpp"
+
+// FIXME: move these tests with priv() to internal suite
+#include "backends/fluid/gfluidbuffer_priv.hpp"
+
+#include "gapi_fluid_test_kernels.hpp"
+#include "logger.hpp"
+
+namespace opencv_test
+{
+
+using namespace cv::gapi_test_kernels;
+
+namespace
+{
+    void WriteFunction(uint8_t* row, int nr, int w) {
+        for (int i = 0; i < w; i++)
+            row[i] = static_cast<uint8_t>(nr+i);
+    }
+    void ReadFunction1x1(const uint8_t* row, int w) {
+        for (int i = 0; i < w; i++)
+            std::cout << std::setw(4) << static_cast<int>(row[i]) << " ";
+        std::cout << "\n";
+    }
+    void ReadFunction3x3(const uint8_t* rows[3], int w) {
+        for (int i = 0; i < 3; i++) {
+            for (int j = -1; j < w+1; j++) {
+                std::cout << std::setw(4) << static_cast<int>(rows[i][j]) << " ";
+            }
+            std::cout << "\n";
+        }
+        std::cout << "\n";
+    }
+}
+
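+// Reads an 8x8 identity matrix line by line through a View and checks every
+// line returned by the buffer against the corresponding source row.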
+TEST(FluidBuffer, InputTest)
+{
+    const cv::Size buffer_size = {8,8};
+    cv::Mat in_mat = cv::Mat::eye(buffer_size, CV_8U);
+
+    cv::gapi::fluid::Buffer buffer(to_own(in_mat), true);
+    cv::gapi::fluid::View  view = buffer.mkView(0, false);
+    view.priv().allocate(1, {});
+    view.priv().reset(1);
+    int this_y = 0;
+
+    while (this_y < buffer_size.height)
+    {
+        view.priv().prepareToRead();
+        const uint8_t* rrow = view.InLine<uint8_t>(0);
+        ReadFunction1x1(rrow, buffer_size.width);
+        view.priv().readDone(1,1);
+
+        cv::Mat from_buffer(1, buffer_size.width, CV_8U, const_cast<uint8_t*>(rrow));
+        EXPECT_EQ(0, cv::countNonZero(in_mat.row(this_y) != from_buffer));
+
+        this_y++;
+    }
+}
+
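+// Emulates streamed processing through a 3-line circular buffer with a
+// constant border (value 255): writes and reads are interleaved, and each
+// 3x3 read window is validated against the previously written data.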
+TEST(FluidBuffer, CircularTest)
+{
+    const cv::Size buffer_size = {8,16};
+
+    cv::gapi::fluid::Buffer buffer(cv::GMatDesc{CV_8U,1,buffer_size}, 3, 1, 0, 1,
+        util::make_optional(cv::gapi::fluid::Border{cv::BORDER_CONSTANT, cv::gapi::own::Scalar(255)}));
+    cv::gapi::fluid::View view = buffer.mkView(1, {});
+    view.priv().reset(3);
+    view.priv().allocate(3, {});
+    buffer.debug(std::cout);
+
+    const auto whole_line_is = [](const uint8_t *line, int len, int value)
+    {
+        return std::all_of(line, line+len, [&](const uint8_t v){return v == value;});
+    };
+
+    // Store all read/written data in separate Mats to compare with
+    cv::Mat written_data(buffer_size, CV_8U);
+
+    // Simulate write/read process
+    int num_reads = 0, num_writes = 0;
+    while (num_reads < buffer_size.height)
+    {
+        if (num_writes < buffer_size.height)
+        {
+            uint8_t* wrow = buffer.OutLine<uint8_t>();
+            WriteFunction(wrow, num_writes, buffer_size.width);
+            buffer.priv().writeDone();
+
+            cv::Mat(1, buffer_size.width, CV_8U, wrow)
+                .copyTo(written_data.row(num_writes));
+            num_writes++;
+        }
+        buffer.debug(std::cout);
+
+        if (view.ready())
+        {
+            view.priv().prepareToRead();
+            const uint8_t* rrow[3] = {
+                view.InLine<uint8_t>(-1),
+                view.InLine<uint8_t>( 0),
+                view.InLine<uint8_t>( 1),
+            };
+            ReadFunction3x3(rrow, buffer_size.width);
+            view.priv().readDone(1,3);
+            buffer.debug(std::cout);
+
+            // Check borders right here
+            EXPECT_EQ(255u, rrow[0][-1]);
+            EXPECT_EQ(255u, rrow[0][buffer_size.width]);
+            if (num_reads == 0)
+            {
+                EXPECT_TRUE(whole_line_is(rrow[0]-1, buffer_size.width+2, 255u));
+            }
+            if (num_reads == buffer_size.height-1)
+            {
+                EXPECT_TRUE(whole_line_is(rrow[2]-1, buffer_size.width+2, 255u));
+            }
+
+            // Check window (without borders)
+            if (num_reads > 0 && num_reads < buffer_size.height-1)
+            {
+                // +1 everywhere since num_writes was just incremented above
+                cv::Mat written_lastLine2 = written_data.row(num_writes - (2+1));
+                cv::Mat written_lastLine1 = written_data.row(num_writes - (1+1));
+                cv::Mat written_lastLine0 = written_data.row(num_writes - (0+1));
+
+                cv::Mat read_prevLine(1, buffer_size.width, CV_8U, const_cast<uint8_t*>(rrow[0]));
+                cv::Mat read_thisLine(1, buffer_size.width, CV_8U, const_cast<uint8_t*>(rrow[1]));
+                cv::Mat read_nextLine(1, buffer_size.width, CV_8U, const_cast<uint8_t*>(rrow[2]));
+
+                EXPECT_EQ(0, cv::countNonZero(written_lastLine2 != read_prevLine));
+                EXPECT_EQ(0, cv::countNonZero(written_lastLine1 != read_thisLine));
+                EXPECT_EQ(0, cv::countNonZero(written_lastLine0 != read_nextLine));
+            }
+            num_reads++;
+        }
+    }
+}
+
+TEST(FluidBuffer, OutputTest)
+{
+    const cv::Size buffer_size = {8,16};
+    cv::Mat out_mat = cv::Mat(buffer_size, CV_8U);
+
+    cv::gapi::fluid::Buffer buffer(to_own(out_mat), false);
+    int num_writes = 0;
+    while (num_writes < buffer_size.height)
+    {
+        uint8_t* wrow = buffer.OutLine<uint8_t>();
+        WriteFunction(wrow, num_writes, buffer_size.width);
+        buffer.priv().writeDone();
+        num_writes++;
+    }
+
+    GAPI_LOG_INFO(NULL, "\n" << out_mat);
+
+    // Validity check
+    for (int r = 0; r < buffer_size.height; r++)
+    {
+        for (int c = 0; c < buffer_size.width; c++)
+        {
+            EXPECT_EQ(r+c, out_mat.at<uint8_t>(r, c));
+        }
+    }
+}
+
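+// The next tests check that GScalar arguments are delivered to fluid kernels
+// correctly in various positions of the operation signature.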
+TEST(Fluid, AddC_WithScalar)
+{
+    cv::GMat in;
+    cv::GScalar s;
+
+    cv::GComputation c(cv::GIn(in, s), cv::GOut(TAddScalar::on(in, s)));
+    cv::Mat in_mat = cv::Mat::eye(3, 3, CV_8UC1), out_mat(3, 3, CV_8UC1), ref_mat;
+    cv::Scalar in_s(100);
+
+    auto cc = c.compile(cv::descr_of(in_mat), cv::descr_of(in_s), cv::compile_args(fluidTestPackage));
+
+    cc(cv::gin(in_mat, in_s), cv::gout(out_mat));
+    ref_mat = in_mat + in_s;
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(Fluid, Scalar_In_Middle_Graph)
+{
+    cv::GMat in;
+    cv::GScalar s;
+
+    cv::GComputation c(cv::GIn(in, s), cv::GOut(TAddScalar::on(TAddCSimple::on(in, 5), s)));
+    cv::Mat in_mat = cv::Mat::eye(3, 3, CV_8UC1), out_mat(3, 3, CV_8UC1), ref_mat;
+    cv::Scalar in_s(100);
+
+    auto cc = c.compile(cv::descr_of(in_mat), cv::descr_of(in_s), cv::compile_args(fluidTestPackage));
+
+    cc(cv::gin(in_mat, in_s), cv::gout(out_mat));
+    ref_mat = (in_mat + 5) + in_s;
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(Fluid, Add_Scalar_To_Mat)
+{
+    cv::GMat in;
+    cv::GScalar s;
+
+    cv::GComputation c(cv::GIn(s, in), cv::GOut(TAddScalarToMat::on(s, in)));
+    cv::Mat in_mat = cv::Mat::eye(3, 3, CV_8UC1), out_mat(3, 3, CV_8UC1), ref_mat;
+    cv::Scalar in_s(100);
+
+    auto cc = c.compile(cv::descr_of(in_s), cv::descr_of(in_mat), cv::compile_args(fluidTestPackage));
+
+    cc(cv::gin(in_s, in_mat), cv::gout(out_mat));
+    ref_mat = in_mat + in_s;
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(Fluid, Sum_2_Mats_And_Scalar)
+{
+    cv::GMat a, b;
+    cv::GScalar s;
+
+    cv::GComputation c(cv::GIn(a, s, b), cv::GOut(TSum2MatsAndScalar::on(a, s, b)));
+    cv::Mat in_mat1 = cv::Mat::eye(3, 3, CV_8UC1),
+            in_mat2 = cv::Mat::eye(3, 3, CV_8UC1),
+            out_mat(3, 3, CV_8UC1),
+            ref_mat;
+    cv::Scalar in_s(100);
+
+    auto cc = c.compile(cv::descr_of(in_mat1), cv::descr_of(in_s), cv::descr_of(in_mat2), cv::compile_args(fluidTestPackage));
+
+    cc(cv::gin(in_mat1, in_s, in_mat2), cv::gout(out_mat));
+    ref_mat = in_mat1 + in_mat2 + in_s;
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(Fluid, Split3)
+{
+    cv::GMat bgr;
+    cv::GMat r,g,b;
+    std::tie(b,g,r) = cv::gapi::split3(bgr);
+    auto rr = TAddSimple::on(r, TId::on(b));
+    auto rrr = TAddSimple::on(TId::on(rr), g);
+    cv::GComputation c(bgr, TId::on(rrr));
+
+    cv::Size sz(5120, 5120);
+    cv::Mat eye_1 = cv::Mat::eye(sz, CV_8UC1);
+    std::vector<cv::Mat> eyes = {eye_1, eye_1, eye_1};
+    cv::Mat in_mat;
+    cv::merge(eyes, in_mat);
+    cv::Mat out_mat(sz, CV_8UC1);
+
+    // G-API
+    auto cc = c.compile(cv::descr_of(in_mat),
+                        cv::compile_args(fluidTestPackage));
+    cc(in_mat, out_mat);
+
+    // OCV
+    std::vector<cv::Mat> chans;
+    cv::split(in_mat, chans);
+
+    // Compare
+    EXPECT_EQ(0, cv::countNonZero(out_mat != (chans[2]*3)));
+}
+
+TEST(Fluid, ScratchTest)
+{
+    cv::GMat in;
+    cv::GMat out = TPlusRow0::on(TPlusRow0::on(in));
+    cv::GComputation c(in, out);
+
+    cv::Size sz(8, 8);
+    cv::Mat in_mat = cv::Mat::eye(sz, CV_8UC1);
+    cv::Mat out_mat(sz, CV_8UC1);
+
+    // OpenCV (reference)
+    cv::Mat ref;
+    {
+        cv::Mat first_row = cv::Mat::zeros(1, sz.width, CV_8U);
+        cv::Mat remaining = cv::repeat(in_mat.row(0), sz.height-1, 1);
+        cv::Mat operand;
+        cv::vconcat(first_row, 2*remaining, operand);
+        ref = in_mat + operand;
+    }
+    GAPI_LOG_INFO(NULL, "\n" << ref);
+
+    // G-API
+    auto cc = c.compile(cv::descr_of(in_mat),
+                        cv::compile_args(fluidTestPackage));
+    cc(in_mat, out_mat);
+    GAPI_LOG_INFO(NULL, "\n" << out_mat);
+    EXPECT_EQ(0, cv::countNonZero(ref != out_mat));
+
+    cc(in_mat, out_mat);
+    GAPI_LOG_INFO(NULL, "\n" << out_mat);
+    EXPECT_EQ(0, cv::countNonZero(ref != out_mat));
+}
+
+TEST(Fluid, MultipleOutRowsTest)
+{
+    cv::GMat in;
+    cv::GMat out = TAddCSimple::on(TAddCSimple::on(in, 1), 2);
+    cv::GComputation c(in, out);
+
+    cv::Size sz(4, 4);
+    cv::Mat in_mat = cv::Mat::eye(sz, CV_8UC1);
+    cv::Mat out_mat(sz, CV_8UC1);
+
+    auto cc = c.compile(cv::descr_of(in_mat),
+                        cv::compile_args(fluidTestPackage));
+    cc(in_mat, out_mat);
+
+    std::cout << out_mat << std::endl;
+
+    cv::Mat ocv_ref = in_mat + 1 + 2;
+    EXPECT_EQ(0, cv::countNonZero(ocv_ref != out_mat));
+}
+
+
+TEST(Fluid, LPIWindow)
+{
+    cv::GMat in;
+    cv::GMat r,g,b;
+    std::tie(r,g,b) = cv::gapi::split3(in);
+    cv::GMat rr = TId7x7::on(r);
+    cv::GMat tmp = TAddSimple::on(rr, g);
+    cv::GMat out = TAddSimple::on(tmp, b);
+
+    cv::GComputation c(in, out);
+
+    cv::Size sz(8, 8);
+
+    cv::Mat eye_1 = cv::Mat::eye(sz, CV_8UC1);
+    std::vector<cv::Mat> eyes = {eye_1, eye_1, eye_1};
+    cv::Mat in_mat;
+    cv::merge(eyes, in_mat);
+
+    cv::Mat out_mat(sz, CV_8U);
+    auto cc = c.compile(cv::descr_of(in_mat), cv::compile_args(fluidTestPackage));
+    cc(in_mat, out_mat);
+
+    //std::cout << out_mat << std::endl;
+
+    // OpenCV reference
+    cv::Mat ocv_ref = eyes[0]+eyes[1]+eyes[2];
+
+    EXPECT_EQ(0, cv::countNonZero(ocv_ref != out_mat));
+}
+
+TEST(Fluid, MultipleReaders_SameLatency)
+{
+    //  in -> AddC -> a -> AddC -> b -> Add -> out
+    //                '--> AddC -> c -'
+    //
+    // b and c have the same skew
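+    // Since both branches consume 'a' at the same rate, the runtime only
+    // needs to retain lines of 'a' until both readers have passed them.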
+
+    cv::GMat in;
+    cv::GMat a = TAddCSimple::on(in, 1); // FIXME - align naming (G, non-G)
+    cv::GMat b = TAddCSimple::on(a,  2);
+    cv::GMat c = TAddCSimple::on(a,  3);
+    cv::GMat out = TAddSimple::on(b, c);
+    cv::GComputation comp(in, out);
+
+    const auto sz = cv::Size(32, 32);
+    cv::Mat in_mat = cv::Mat::eye(sz, CV_8UC1);
+    cv::Mat out_mat_gapi(sz, CV_8UC1);
+    cv::Mat out_mat_ocv (sz, CV_8UC1);
+
+    // Run G-API
+    auto cc = comp.compile(cv::descr_of(in_mat), cv::compile_args(fluidTestPackage));
+    cc(in_mat, out_mat_gapi);
+
+    // Check with OpenCV
+    cv::Mat tmp = in_mat + 1;
+    out_mat_ocv = (tmp+2) + (tmp+3);
+    EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+}
+
+TEST(Fluid, MultipleReaders_DifferentLatency)
+{
+    //  in1 -> AddC -> a -> AddC -------------> b -> Add -> out
+    //                 '--------------> Add --> c -'
+    //                 '--> Id7x7-> d -'
+    //
+    // b and c have different skew (due to latency introduced by Id7x7)
+    // a is read by multiple views with different latency.
+
+    cv::GMat in;
+    cv::GMat a   = TAddCSimple::on(in, 1); // FIXME - align naming (G, non-G)
+    cv::GMat b   = TAddCSimple::on(a,  2);
+    cv::GMat d   = TId7x7::on(a);
+    cv::GMat c   = TAddSimple::on(a, d);
+    cv::GMat out = TAddSimple::on(b, c);
+    cv::GComputation comp(in, out);
+
+    const auto sz = cv::Size(32, 32);
+    cv::Mat in_mat = cv::Mat::eye(sz, CV_8UC1);
+    cv::Mat out_mat_gapi(sz, CV_8UC1);
+
+    // Run G-API
+    auto cc = comp.compile(cv::descr_of(in_mat), cv::compile_args(fluidTestPackage));
+    cc(in_mat, out_mat_gapi);
+
+    // Check with OpenCV
+    cv::Mat ocv_a = in_mat + 1;
+    cv::Mat ocv_b = ocv_a + 2;
+    cv::Mat ocv_d = ocv_a;
+    cv::Mat ocv_c = ocv_a + ocv_d;
+    cv::Mat out_mat_ocv = ocv_b + ocv_c;
+    EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+}
+
+TEST(Fluid, MultipleOutputs)
+{
+    // in -> AddC -> a -> AddC ------------------> out1
+    //               `--> Id7x7  --> b --> AddC -> out2
+
+    cv::GMat in;
+    cv::GMat a    = TAddCSimple::on(in, 1);
+    cv::GMat b    = TId7x7::on(a);
+    cv::GMat out1 = TAddCSimple::on(a, 2);
+    cv::GMat out2 = TAddCSimple::on(b, 7);
+    cv::GComputation comp(cv::GIn(in), cv::GOut(out1, out2));
+
+    const auto sz = cv::Size(32, 32);
+    cv::Mat in_mat = cv::Mat::eye(sz, CV_8UC1);
+    cv::Mat out_mat_gapi1(sz, CV_8UC1), out_mat_gapi2(sz, CV_8UC1);
+    cv::Mat out_mat_ocv1(sz, CV_8UC1), out_mat_ocv2(sz, CV_8UC1);
+
+    // Run G-API
+    auto cc = comp.compile(cv::descr_of(in_mat), cv::compile_args(fluidTestPackage));
+    cc(cv::gin(in_mat), cv::gout(out_mat_gapi1, out_mat_gapi2));
+
+    // Check with OpenCV
+    out_mat_ocv1 = in_mat + 1 + 2;
+    out_mat_ocv2 = in_mat + 1 + 7;
+    EXPECT_EQ(0, cv::countNonZero(out_mat_gapi1 != out_mat_ocv1));
+    EXPECT_EQ(0, cv::countNonZero(out_mat_gapi2 != out_mat_ocv2));
+}
+
+TEST(Fluid, EmptyOutputMatTest)
+{
+    cv::GMat in;
+    cv::GMat out = TAddCSimple::on(in, 2);
+    cv::GComputation c(in, out);
+
+    cv::Mat in_mat = cv::Mat::eye(cv::Size(32, 24), CV_8UC1);
+    cv::Mat out_mat;
+
+    auto cc = c.compile(cv::descr_of(in_mat), cv::compile_args(fluidTestPackage));
+
+    cc(in_mat,    out_mat);
+    EXPECT_EQ(CV_8UC1, out_mat.type());
+    EXPECT_EQ(32, out_mat.cols);
+    EXPECT_EQ(24, out_mat.rows);
+    EXPECT_TRUE(out_mat.ptr() != nullptr);
+}
+
+struct LPISequenceTest : public TestWithParam<int>{};
+TEST_P(LPISequenceTest, LPISequenceTest)
+{
+    // in -> AddC -> a -> Blur (2lpi) -> out
+
+    int kernelSize = GetParam();
+    cv::GMat in;
+    cv::GMat a = TAddCSimple::on(in, 1);
+    auto blur = kernelSize == 3 ? &TBlur3x3_2lpi::on : &TBlur5x5_2lpi::on;
+    cv::GMat out = blur(a, cv::BORDER_CONSTANT, cv::Scalar(0));
+    cv::GComputation comp(cv::GIn(in), cv::GOut(out));
+
+    const auto sz = cv::Size(8, 10);
+    cv::Mat in_mat = cv::Mat::eye(sz, CV_8UC1);
+    cv::Mat out_mat_gapi(sz, CV_8UC1);
+    cv::Mat out_mat_ocv(sz, CV_8UC1);
+
+    // Run G-API
+    auto cc = comp.compile(cv::descr_of(in_mat), cv::compile_args(fluidTestPackage));
+    cc(cv::gin(in_mat), cv::gout(out_mat_gapi));
+
+    // Check with OpenCV
+    cv::blur(in_mat + 1, out_mat_ocv, {kernelSize,kernelSize}, {-1,-1}, cv::BORDER_CONSTANT);
+    EXPECT_EQ(0, cv::countNonZero(out_mat_gapi != out_mat_ocv));
+}
+
+INSTANTIATE_TEST_CASE_P(Fluid, LPISequenceTest,
+                        Values(3, 5));
+
+struct InputImageBorderTest : public TestWithParam <std::tuple<int, int>> {};
+TEST_P(InputImageBorderTest, InputImageBorderTest)
+{
+    cv::Size sz_in = { 320, 240 };
+
+    int ks         = 0;
+    int borderType = 0;
+    std::tie(ks, borderType) = GetParam();
+    cv::Mat in_mat1(sz_in, CV_8UC1);
+    cv::Scalar mean   = cv::Scalar(127.0f);
+    cv::Scalar stddev = cv::Scalar(40.f);
+
+    cv::randn(in_mat1, mean, stddev);
+
+    cv::Size kernelSize = {ks, ks};
+    cv::Point anchor = {-1, -1};
+    cv::Scalar borderValue(0);
+
+    auto gblur = ks == 3 ? &TBlur3x3::on : &TBlur5x5::on;
+
+    GMat in;
+    auto out = gblur(in, borderType, borderValue);
+
+    Mat out_mat_gapi = Mat::zeros(sz_in, CV_8UC1);
+
+    GComputation c(GIn(in), GOut(out));
+    auto cc = c.compile(descr_of(in_mat1), cv::compile_args(fluidTestPackage));
+    cc(gin(in_mat1), gout(out_mat_gapi));
+
+    cv::Mat out_mat_ocv = Mat::zeros(sz_in, CV_8UC1);
+    cv::blur(in_mat1, out_mat_ocv, kernelSize, anchor, borderType);
+
+    EXPECT_EQ(0, countNonZero(out_mat_ocv != out_mat_gapi));
+}
+
+INSTANTIATE_TEST_CASE_P(Fluid, InputImageBorderTest,
+                        Combine(Values(3, 5),
+                                Values(BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT_101)));
+
+struct SequenceOfBlursTest : public TestWithParam <std::tuple<int>> {};
+TEST_P(SequenceOfBlursTest, Test)
+{
+    cv::Size sz_in = { 320, 240 };
+
+    int borderType = 0;
+    std::tie(borderType) = GetParam();
+    cv::Mat in_mat(sz_in, CV_8UC1);
+    cv::Scalar mean   = cv::Scalar(127.0f);
+    cv::Scalar stddev = cv::Scalar(40.f);
+
+    cv::randn(in_mat, mean, stddev);
+
+    cv::Point anchor = {-1, -1};
+    cv::Scalar borderValue(0);
+
+    GMat in;
+    auto mid = TBlur3x3::on(in,  borderType, borderValue);
+    auto out = TBlur5x5::on(mid, borderType, borderValue);
+
+    Mat out_mat_gapi = Mat::zeros(sz_in, CV_8UC1);
+
+    GComputation c(GIn(in), GOut(out));
+    auto cc = c.compile(descr_of(in_mat), cv::compile_args(fluidTestPackage));
+    cc(gin(in_mat), gout(out_mat_gapi));
+
+    cv::Mat mid_mat_ocv = Mat::zeros(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv = Mat::zeros(sz_in, CV_8UC1);
+    cv::blur(in_mat, mid_mat_ocv, {3,3}, anchor, borderType);
+    cv::blur(mid_mat_ocv, out_mat_ocv, {5,5}, anchor, borderType);
+
+    EXPECT_EQ(0, countNonZero(out_mat_ocv != out_mat_gapi));
+}
+
+INSTANTIATE_TEST_CASE_P(Fluid, SequenceOfBlursTest,
+                               Values(BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT_101));
+
+struct TwoBlursTest : public TestWithParam <std::tuple<int, int, int, int, int, int, bool>> {};
+TEST_P(TwoBlursTest, Test)
+{
+    cv::Size sz_in = { 320, 240 };
+
+    int kernelSize1 = 0, kernelSize2 = 0;
+    int borderType1 = -1, borderType2 = -1;
+    cv::Scalar borderValue1{}, borderValue2{};
+    bool readFromInput = false;
+    std::tie(kernelSize1, borderType1, borderValue1, kernelSize2, borderType2, borderValue2, readFromInput) = GetParam();
+    cv::Mat in_mat(sz_in, CV_8UC1);
+    cv::Scalar mean   = cv::Scalar(127.0f);
+    cv::Scalar stddev = cv::Scalar(40.f);
+
+    cv::randn(in_mat, mean, stddev);
+
+    cv::Point anchor = {-1, -1};
+
+    auto blur1 = kernelSize1 == 3 ? &TBlur3x3::on : &TBlur5x5::on;
+    auto blur2 = kernelSize2 == 3 ? &TBlur3x3::on : &TBlur5x5::on;
+
+    GMat in, out1, out2;
+    if (readFromInput)
+    {
+        out1 = blur1(in, borderType1, borderValue1);
+        out2 = blur2(in, borderType2, borderValue2);
+    }
+    else
+    {
+        auto mid = TAddCSimple::on(in, 0);
+        out1 = blur1(mid, borderType1, borderValue1);
+        out2 = blur2(mid, borderType2, borderValue2);
+    }
+
+    Mat out_mat_gapi1 = Mat::zeros(sz_in, CV_8UC1);
+    Mat out_mat_gapi2 = Mat::zeros(sz_in, CV_8UC1);
+
+    GComputation c(GIn(in), GOut(out1, out2));
+    auto cc = c.compile(descr_of(in_mat), cv::compile_args(fluidTestPackage));
+    cc(gin(in_mat), gout(out_mat_gapi1, out_mat_gapi2));
+
+    cv::Mat out_mat_ocv1 = Mat::zeros(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv2 = Mat::zeros(sz_in, CV_8UC1);
+    cv::blur(in_mat, out_mat_ocv1, {kernelSize1, kernelSize1}, anchor, borderType1);
+    cv::blur(in_mat, out_mat_ocv2, {kernelSize2, kernelSize2}, anchor, borderType2);
+
+    EXPECT_EQ(0, countNonZero(out_mat_ocv1 != out_mat_gapi1));
+    EXPECT_EQ(0, countNonZero(out_mat_ocv2 != out_mat_gapi2));
+}
+
+INSTANTIATE_TEST_CASE_P(Fluid, TwoBlursTest,
+                               Combine(Values(3, 5),
+                                       Values(cv::BORDER_CONSTANT, cv::BORDER_REPLICATE, cv::BORDER_REFLECT_101),
+                                       Values(0),
+                                       Values(3, 5),
+                                       Values(cv::BORDER_CONSTANT, cv::BORDER_REPLICATE, cv::BORDER_REFLECT_101),
+                                       Values(0),
+                                       testing::Bool())); // Read from input directly or place a copy node at start
+
+struct TwoReadersTest : public TestWithParam <std::tuple<int, int, int, bool>> {};
+TEST_P(TwoReadersTest, Test)
+{
+    cv::Size sz_in = { 320, 240 };
+
+    int kernelSize = 0;
+    int borderType = -1;
+    cv::Scalar borderValue;
+    bool readFromInput = false;
+    std::tie(kernelSize, borderType, borderValue, readFromInput) = GetParam();
+    cv::Mat in_mat(sz_in, CV_8UC1);
+    cv::Scalar mean   = cv::Scalar(127.0f);
+    cv::Scalar stddev = cv::Scalar(40.f);
+
+    cv::randn(in_mat, mean, stddev);
+
+    cv::Point anchor = {-1, -1};
+
+    auto blur = kernelSize == 3 ? &TBlur3x3::on : &TBlur5x5::on;
+
+    GMat in, out1, out2;
+    if (readFromInput)
+    {
+        out1 = TAddCSimple::on(in, 0);
+        out2 = blur(in, borderType, borderValue);
+    }
+    else
+    {
+        auto mid = TAddCSimple::on(in, 0);
+        out1 = TAddCSimple::on(mid, 0);
+        out2 = blur(mid, borderType, borderValue);
+    }
+
+    Mat out_mat_gapi1 = Mat::zeros(sz_in, CV_8UC1);
+    Mat out_mat_gapi2 = Mat::zeros(sz_in, CV_8UC1);
+
+    GComputation c(GIn(in), GOut(out1, out2));
+    auto cc = c.compile(descr_of(in_mat), cv::compile_args(fluidTestPackage));
+    cc(gin(in_mat), gout(out_mat_gapi1, out_mat_gapi2));
+
+    cv::Mat out_mat_ocv1 = Mat::zeros(sz_in, CV_8UC1);
+    cv::Mat out_mat_ocv2 = Mat::zeros(sz_in, CV_8UC1);
+    out_mat_ocv1 = in_mat;
+    cv::blur(in_mat, out_mat_ocv2, {kernelSize, kernelSize}, anchor, borderType);
+
+    EXPECT_EQ(0, countNonZero(out_mat_ocv1 != out_mat_gapi1));
+    EXPECT_EQ(0, countNonZero(out_mat_ocv2 != out_mat_gapi2));
+}
+
+INSTANTIATE_TEST_CASE_P(Fluid, TwoReadersTest,
+                               Combine(Values(3, 5),
+                                       Values(cv::BORDER_CONSTANT, cv::BORDER_REPLICATE, cv::BORDER_REFLECT_101),
+                                       Values(0),
+                                       testing::Bool())); // Read from input directly or place a copy node at start
+
+TEST(FluidTwoIslands, SanityTest)
+{
+    cv::Size sz_in{8,8};
+
+    GMat in1, in2;
+    auto out1 = TAddScalar::on(in1, {0});
+    auto out2 = TAddScalar::on(in2, {0});
+
+    cv::Mat in_mat1(sz_in, CV_8UC1);
+    cv::Mat in_mat2(sz_in, CV_8UC1);
+    cv::Scalar mean   = cv::Scalar(127.0f);
+    cv::Scalar stddev = cv::Scalar(40.f);
+
+    cv::randn(in_mat1, mean, stddev);
+    cv::randn(in_mat2, mean, stddev);
+
+    Mat out_mat1 = Mat::zeros(sz_in, CV_8UC1);
+    Mat out_mat2 = Mat::zeros(sz_in, CV_8UC1);
+
+    GComputation c(GIn(in1, in2), GOut(out1, out2));
+    EXPECT_NO_THROW(c.apply(gin(in_mat1, in_mat2), gout(out_mat1, out_mat2), cv::compile_args(fluidTestPackage)));
+    EXPECT_EQ(0, countNonZero(in_mat1 != out_mat1));
+    EXPECT_EQ(0, countNonZero(in_mat2 != out_mat2));
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test_kernels.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test_kernels.cpp
new file mode 100644 (file)
index 0000000..6bd06fe
--- /dev/null
@@ -0,0 +1,436 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+#include "test_precomp.hpp"
+
+#include <iomanip>
+#include "gapi_fluid_test_kernels.hpp"
+#include <opencv2/gapi/core.hpp>
+
+namespace cv
+{
+namespace gapi_test_kernels
+{
+
+GAPI_FLUID_KERNEL(FAddSimple, TAddSimple, false)
+{
+    static const int Window = 1;
+
+    static void run(const cv::gapi::fluid::View   &a,
+                    const cv::gapi::fluid::View   &b,
+                          cv::gapi::fluid::Buffer &o)
+    {
+        // std::cout << "AddSimple {{{\n";
+        // std::cout << "  a - "; a.debug(std::cout);
+        // std::cout << "  b - "; b.debug(std::cout);
+        // std::cout << "  o - "; o.debug(std::cout);
+
+        const uint8_t* in1 = a.InLine<uint8_t>(0);
+        const uint8_t* in2 = b.InLine<uint8_t>(0);
+              uint8_t* out = o.OutLine<uint8_t>();
+
+        // std::cout << "a: ";
+        // for (int i = 0, w = a.length(); i < w; i++)
+        // {
+        //     std::cout << std::setw(4) << int(in1[i]);
+        // }
+        // std::cout << "\n";
+
+        // std::cout << "b: ";
+        // for (int i = 0, w = a.length(); i < w; i++)
+        // {
+        //     std::cout << std::setw(4) << int(in2[i]);
+        // }
+        // std::cout << "\n";
+
+        for (int i = 0, w = a.length(); i < w; i++)
+        {
+            out[i] = in1[i] + in2[i];
+        }
+
+        // std::cout << "}}} " << std::endl;
+    }
+};
+
+GAPI_FLUID_KERNEL(FAddCSimple, TAddCSimple, false)
+{
+    static const int Window = 1;
+    static const int LPI    = 2;
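+    // LPI = lines per iteration: run() is invoked once per two output lines,
+    // hence the loop over out.lpi() below.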
+
+    static void run(const cv::gapi::fluid::View   &in,
+                    const int                      cval,
+                          cv::gapi::fluid::Buffer &out)
+    {
+        for (int l = 0, lpi = out.lpi(); l < lpi; l++)
+        {
+            const uint8_t* in_row  = in .InLine <uint8_t>(l);
+                  uint8_t* out_row = out.OutLine<uint8_t>(l);
+            //std::cout << "l=" << l << ": ";
+            for (int i = 0, w = in.length(); i < w; i++)
+            {
+                //std::cout << std::setw(4) << int(in_row[i]);
+                out_row[i] = static_cast<uint8_t>(in_row[i] + cval);
+            }
+            //std::cout << std::endl;
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FAddScalar, TAddScalar, false)
+{
+    static const int Window = 1;
+    static const int LPI    = 2;
+
+    static void run(const cv::gapi::fluid::View   &in,
+                    const cv::Scalar              &cval,
+                          cv::gapi::fluid::Buffer &out)
+    {
+        for (int l = 0, lpi = out.lpi(); l < lpi; l++)
+        {
+            const uint8_t* in_row  = in .InLine <uint8_t>(l);
+                  uint8_t* out_row = out.OutLine<uint8_t>(l);
+            std::cout << "l=" << l << ": ";
+            for (int i = 0, w = in.length(); i < w; i++)
+            {
+                std::cout << std::setw(4) << int(in_row[i]);
+                out_row[i] = static_cast<uint8_t>(in_row[i] + cval[0]);
+            }
+            std::cout << std::endl;
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FAddScalarToMat, TAddScalarToMat, false)
+{
+    static const int Window = 1;
+    static const int LPI    = 2;
+
+    static void run(const cv::Scalar              &cval,
+                    const cv::gapi::fluid::View   &in,
+                          cv::gapi::fluid::Buffer &out)
+    {
+        for (int l = 0, lpi = out.lpi(); l < lpi; l++)
+        {
+            const uint8_t* in_row  = in .InLine <uint8_t>(l);
+                  uint8_t* out_row = out.OutLine<uint8_t>(l);
+            std::cout << "l=" << l << ": ";
+            for (int i = 0, w = in.length(); i < w; i++)
+            {
+                std::cout << std::setw(4) << int(in_row[i]);
+                out_row[i] = static_cast<uint8_t>(in_row[i] + cval[0]);
+            }
+            std::cout << std::endl;
+        }
+    }
+};
+
+template<int kernelSize, int lpi = 1>
+static void runBlur(const cv::gapi::fluid::View& src, cv::gapi::fluid::Buffer& dst)
+{
+    const auto borderSize = (kernelSize - 1) / 2;
+    const unsigned char* ins[kernelSize];
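+    // Gather pointers to the kernelSize input lines centered on output line
+    // l, then box-average each pixel over its kernelSize x kernelSize
+    // neighborhood.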
+
+    for (int l = 0; l < lpi; l++)
+    {
+        for (int i = 0; i < kernelSize; i++)
+        {
+            ins[i] = src.InLine<unsigned char>(i - borderSize + l);
+        }
+
+        auto out = dst.OutLine<unsigned char>(l);
+        const auto width = dst.length();
+
+        for (int w = 0; w < width; w++)
+        {
+            float res = 0.0f;
+            for (int i = 0; i < kernelSize; i++)
+            {
+                for (int j = -borderSize; j < borderSize + 1; j++)
+                {
+                    res += ins[i][w+j];
+                }
+            }
+            out[w] = static_cast<unsigned char>(std::rint(res / (kernelSize * kernelSize)));
+        }
+    }
+}
+
+GAPI_FLUID_KERNEL(FBlur1x1, TBlur1x1, false)
+{
+    static const int Window = 1;
+
+    static void run(const cv::gapi::fluid::View &src, int /*borderType*/,
+                    cv::Scalar /*borderValue*/, cv::gapi::fluid::Buffer &dst)
+    {
+        runBlur<Window>(src, dst);
+    }
+};
+
+GAPI_FLUID_KERNEL(FBlur3x3, TBlur3x3, false)
+{
+    static const int Window = 3;
+
+    static void run(const cv::gapi::fluid::View &src, int /*borderType*/,
+                    cv::Scalar /*borderValue*/, cv::gapi::fluid::Buffer &dst)
+    {
+        runBlur<Window>(src, dst);
+    }
+
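+    // getBorder() tells the fluid runtime how to extrapolate lines outside
+    // the image; FBlur1x1 above omits it since a 1-line window needs no
+    // border.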
+    static cv::gapi::fluid::Border getBorder(const cv::GMatDesc &/*src*/, int borderType, cv::Scalar borderValue)
+    {
+        return { borderType, to_own(borderValue)};
+    }
+};
+
+GAPI_FLUID_KERNEL(FBlur5x5, TBlur5x5, false)
+{
+    static const int Window = 5;
+
+    static void run(const cv::gapi::fluid::View &src, int /*borderType*/,
+                    cv::Scalar /*borderValue*/, cv::gapi::fluid::Buffer &dst)
+    {
+        runBlur<Window>(src, dst);
+    }
+
+    static cv::gapi::fluid::Border getBorder(const cv::GMatDesc &/*src*/, int borderType, cv::Scalar borderValue)
+    {
+        return { borderType, to_own(borderValue)};
+    }
+};
+
+GAPI_FLUID_KERNEL(FBlur3x3_2lpi, TBlur3x3_2lpi, false)
+{
+    static const int Window = 3;
+    static const int LPI    = 2;
+
+    static void run(const cv::gapi::fluid::View &src, int /*borderType*/,
+                    cv::Scalar /*borderValue*/, cv::gapi::fluid::Buffer &dst)
+    {
+        runBlur<Window, LPI>(src, dst);
+    }
+
+    static cv::gapi::fluid::Border getBorder(const cv::GMatDesc &/*src*/, int borderType, cv::Scalar borderValue)
+    {
+        return { borderType, to_own(borderValue)};
+    }
+};
+
+GAPI_FLUID_KERNEL(FBlur5x5_2lpi, TBlur5x5_2lpi, false)
+{
+    static const int Window = 5;
+    static const int LPI    = 2;
+
+    static void run(const cv::gapi::fluid::View &src, int /*borderType*/,
+                    cv::Scalar /*borderValue*/, cv::gapi::fluid::Buffer &dst)
+    {
+        runBlur<Window, LPI>(src, dst);
+    }
+
+    static cv::gapi::fluid::Border getBorder(const cv::GMatDesc &/*src*/, int borderType, cv::Scalar borderValue)
+    {
+        return { borderType, to_own(borderValue)};
+    }
+};
+
+GAPI_FLUID_KERNEL(FIdentity, TId, false)
+{
+    static const int Window = 3;
+
+    static void run(const cv::gapi::fluid::View   &a,
+                          cv::gapi::fluid::Buffer &o)
+    {
+        const uint8_t* in[3] = {
+            a.InLine<uint8_t>(-1),
+            a.InLine<uint8_t>( 0),
+            a.InLine<uint8_t>(+1)
+        };
+        uint8_t* out = o.OutLine<uint8_t>();
+
+        // ReadFunction3x3(in, a.length());
+        for (int i = 0, w = a.length(); i < w; i++)
+        {
+            out[i] = in[1][i];
+        }
+    }
+
+    static gapi::fluid::Border getBorder(const cv::GMatDesc &)
+    {
+        return { cv::BORDER_REPLICATE, cv::gapi::own::Scalar{} };
+    }
+};
+
+GAPI_FLUID_KERNEL(FId7x7, TId7x7, false)
+{
+    static const int Window = 7;
+    static const int LPI    = 2;
+
+    static void run(const cv::gapi::fluid::View   &a,
+                          cv::gapi::fluid::Buffer &o)
+    {
+        for (int l = 0, lpi = o.lpi(); l < lpi; l++)
+        {
+            const uint8_t* in[Window] = {
+                a.InLine<uint8_t>(-3 + l),
+                a.InLine<uint8_t>(-2 + l),
+                a.InLine<uint8_t>(-1 + l),
+                a.InLine<uint8_t>( 0 + l),
+                a.InLine<uint8_t>(+1 + l),
+                a.InLine<uint8_t>(+2 + l),
+                a.InLine<uint8_t>(+3 + l),
+            };
+            uint8_t* out = o.OutLine<uint8_t>(l);
+
+            // std::cout << "Id7x7 " << l << " of " << lpi << " {{{\n";
+            // std::cout << "  a - "; a.debug(std::cout);
+            // std::cout << "  o - "; o.debug(std::cout);
+            // std::cout << "}}} " << std::endl;
+
+            // // std::cout << "Id7x7 at " << a.y() << "/L" << l <<  " {{{" << std::endl;
+            // for (int j = 0; j < Window; j++)
+            // {
+            //     // std::cout << std::setw(2) << j-(Window-1)/2 << ": ";
+            //     for (int i = 0, w = a.length(); i < w; i++)
+            //         std::cout << std::setw(4) << int(in[j][i]);
+            //     std::cout << std::endl;
+            // }
+            // std::cout << "}}}" << std::endl;
+
+            for (int i = 0, w = a.length(); i < w; i++)
+                out[i] = in[(Window-1)/2][i];
+        }
+    }
+
+    static cv::gapi::fluid::Border getBorder(const cv::GMatDesc&/* src*/)
+    {
+        return { cv::BORDER_REPLICATE, cv::gapi::own::Scalar{} };
+    }
+};
+
+GAPI_FLUID_KERNEL(FPlusRow0, TPlusRow0, true)
+{
+    static const int Window = 1;
+
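+    // The trailing 'true' in GAPI_FLUID_KERNEL above enables a scratch
+    // buffer: initScratch() allocates a one-line buffer which run() uses to
+    // remember the first input row.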
+    static void initScratch(const cv::GMatDesc            &in,
+                                  cv::gapi::fluid::Buffer &scratch)
+    {
+        cv::Size scratch_size{in.size.width, 1};
+        cv::gapi::fluid::Buffer buffer(in.withSize(scratch_size));
+        scratch = std::move(buffer);
+    }
+
+    static void resetScratch(cv::gapi::fluid::Buffer &scratch)
+    {
+        // FIXME: only 1 line can be used!
+        uint8_t* out_row = scratch.OutLine<uint8_t>();
+        for (int i = 0, w = scratch.length(); i < w; i++)
+        {
+            out_row[i] = 0;
+        }
+    }
+
+    static void run(const cv::gapi::fluid::View   &in,
+                          cv::gapi::fluid::Buffer &out,
+                          cv::gapi::fluid::Buffer &scratch)
+    {
+        const uint8_t* in_row  = in     .InLine <uint8_t>(0);
+              uint8_t* out_row = out    .OutLine<uint8_t>();
+              uint8_t* tmp_row = scratch.OutLine<uint8_t>();
+
+        if (in.y() == 0)
+        {
+            // Copy 1st row to scratch buffer
+            for (int i = 0, w = in.length(); i < w; i++)
+            {
+                out_row[i] = in_row[i];
+                tmp_row[i] = in_row[i];
+            }
+        }
+        else
+        {
+            // Output is 1st row + in
+            for (int i = 0, w = in.length(); i < w; i++)
+            {
+                out_row[i] = in_row[i] + tmp_row[i];
+            }
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FTestSplit3, cv::gapi::core::GSplit3, false)
+{
+    static const int Window = 1;
+
+    static void run(const cv::gapi::fluid::View   &in,
+                          cv::gapi::fluid::Buffer &o1,
+                          cv::gapi::fluid::Buffer &o2,
+                          cv::gapi::fluid::Buffer &o3)
+    {
+        // std::cout << "Split3  {{{\n";
+        // std::cout << "  a - "; in.debug(std::cout);
+        // std::cout << "  1 - "; o1.debug(std::cout);
+        // std::cout << "  2 - "; o2.debug(std::cout);
+        // std::cout << "  3 - "; o3.debug(std::cout);
+        // std::cout << "}}} " << std::endl;;
+
+        const uint8_t* in_rgb = in.InLine<uint8_t>(0);
+              uint8_t* out_r  = o1.OutLine<uint8_t>();
+              uint8_t* out_g  = o2.OutLine<uint8_t>();
+              uint8_t* out_b  = o3.OutLine<uint8_t>();
+
+        for (int i = 0, w = in.length(); i < w; i++)
+        {
+            out_r[i] = in_rgb[3*i];
+            out_g[i] = in_rgb[3*i+1];
+            out_b[i] = in_rgb[3*i+2];
+        }
+    }
+};
+
+GAPI_FLUID_KERNEL(FSum2MatsAndScalar, TSum2MatsAndScalar, false)
+{
+    static const int Window = 1;
+    static const int LPI    = 2;
+
+    static void run(const cv::gapi::fluid::View   &a,
+                    const cv::Scalar              &cval,
+                    const cv::gapi::fluid::View   &b,
+                          cv::gapi::fluid::Buffer &out)
+    {
+        for (int l = 0, lpi = out.lpi(); l < lpi; l++)
+        {
+            const uint8_t* in_row1  = a .InLine <uint8_t>(l);
+            const uint8_t* in_row2  = b .InLine <uint8_t>(l);
+                  uint8_t* out_row = out.OutLine<uint8_t>(l);
+            std::cout << "l=" << l << ": ";
+            for (int i = 0, w = a.length(); i < w; i++)
+            {
+                std::cout << std::setw(4) << int(in_row1[i]);
+                std::cout << std::setw(4) << int(in_row2[i]);
+                out_row[i] = static_cast<uint8_t>(in_row1[i] + in_row2[i] + cval[0]);
+            }
+            std::cout << std::endl;
+        }
+    }
+};
+
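+// All kernels above are bundled into a single package which the tests pass
+// to compile()/apply() via cv::compile_args(fluidTestPackage).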
+cv::gapi::GKernelPackage fluidTestPackage = cv::gapi::kernels
+        <FAddSimple
+        ,FAddCSimple
+        ,FAddScalar
+        ,FAddScalarToMat
+        ,FBlur1x1
+        ,FBlur3x3
+        ,FBlur5x5
+        ,FBlur3x3_2lpi
+        ,FBlur5x5_2lpi
+        ,FIdentity
+        ,FId7x7
+        ,FPlusRow0
+        ,FSum2MatsAndScalar
+        ,FTestSplit3
+        >();
+} // namespace gapi_test_kernels
+} // namespace cv
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test_kernels.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_fluid_test_kernels.hpp
new file mode 100644 (file)
index 0000000..f5d83ed
--- /dev/null
@@ -0,0 +1,105 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#ifndef GAPI_FLUID_TEST_KERNELS_HPP
+#define GAPI_FLUID_TEST_KERNELS_HPP
+
+#include "opencv2/gapi/fluid/gfluidkernel.hpp"
+
+namespace cv
+{
+namespace gapi_test_kernels
+{
+
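+// Each G_TYPED_KERNEL below declares a kernel API: its call signature, a
+// unique string id, and an outMeta() deriving output metadata from inputs.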
+G_TYPED_KERNEL(TAddSimple, <GMat(GMat, GMat)>, "test.fluid.add_simple") {
+    static cv::GMatDesc outMeta(cv::GMatDesc a, cv::GMatDesc) {
+        return a;
+    }
+};
+
+G_TYPED_KERNEL(TAddCSimple, <GMat(GMat,int)>, "test.fluid.addc_simple")
+{
+    static GMatDesc outMeta(const cv::GMatDesc &in, int) {
+        return in;
+    }
+};
+
+G_TYPED_KERNEL(TAddScalar, <GMat(GMat,GScalar)>, "test.fluid.addc_scalar")
+{
+    static GMatDesc outMeta(const cv::GMatDesc &in, const cv::GScalarDesc&) {
+        return in;
+    }
+};
+
+G_TYPED_KERNEL(TAddScalarToMat, <GMat(GScalar,GMat)>, "test.fluid.add_scalar_to_mat")
+{
+    static GMatDesc outMeta(const cv::GScalarDesc&, const cv::GMatDesc &in) {
+        return in;
+    }
+};
+
+G_TYPED_KERNEL(TBlur1x1, <GMat(GMat,int,Scalar)>, "org.opencv.imgproc.filters.blur1x1"){
+    static GMatDesc outMeta(GMatDesc in, int, Scalar) {
+        return in;
+    }
+};
+
+G_TYPED_KERNEL(TBlur3x3, <GMat(GMat,int,Scalar)>, "org.opencv.imgproc.filters.blur3x3"){
+    static GMatDesc outMeta(GMatDesc in, int, Scalar) {
+        return in;
+    }
+};
+
+G_TYPED_KERNEL(TBlur5x5, <GMat(GMat,int,Scalar)>, "org.opencv.imgproc.filters.blur5x5"){
+    static GMatDesc outMeta(GMatDesc in, int, Scalar) {
+        return in;
+    }
+};
+
+G_TYPED_KERNEL(TBlur3x3_2lpi, <GMat(GMat,int,Scalar)>, "org.opencv.imgproc.filters.blur3x3_2lpi"){
+    static GMatDesc outMeta(GMatDesc in, int, Scalar) {
+        return in;
+    }
+};
+
+G_TYPED_KERNEL(TBlur5x5_2lpi, <GMat(GMat,int,Scalar)>, "org.opencv.imgproc.filters.blur5x5_2lpi"){
+    static GMatDesc outMeta(GMatDesc in, int, Scalar) {
+        return in;
+    }
+};
+
+G_TYPED_KERNEL(TId, <GMat(GMat)>, "test.fluid.identity") {
+    static cv::GMatDesc outMeta(cv::GMatDesc a) {
+        return a;
+    }
+};
+
+G_TYPED_KERNEL(TId7x7, <GMat(GMat)>, "test.fluid.identity7x7") {
+    static cv::GMatDesc outMeta(cv::GMatDesc a) {
+        return a;
+    }
+};
+
+G_TYPED_KERNEL(TPlusRow0, <GMat(GMat)>, "test.fluid.plus_row0") {
+    static cv::GMatDesc outMeta(cv::GMatDesc a) {
+        return a;
+    }
+};
+
+G_TYPED_KERNEL(TSum2MatsAndScalar, <GMat(GMat,GScalar,GMat)>, "test.fluid.sum_2_mats_and_scalar")
+{
+    static GMatDesc outMeta(const cv::GMatDesc &in, const cv::GScalarDesc&, const cv::GMatDesc&) {
+        return in;
+    }
+};
+
+extern cv::gapi::GKernelPackage fluidTestPackage;
+
+} // namespace gapi_test_kernels
+} // namespace cv
+
+#endif // GAPI_FLUID_TEST_KERNELS_HPP
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gcompiled_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gcompiled_tests.cpp
new file mode 100644 (file)
index 0000000..e482e2e
--- /dev/null
@@ -0,0 +1,173 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+namespace opencv_test
+{
+
+namespace
+{
+    static cv::GMat DemoCC(cv::GMat in, cv::GScalar scale)
+    {
+        return cv::gapi::medianBlur(in + in*scale, 3);
+    }
+
+    struct GCompiledValidateMetaTyped: public ::testing::Test
+    {
+        cv::GComputationT<cv::GMat(cv::GMat,cv::GScalar)> m_cc;
+
+        GCompiledValidateMetaTyped() : m_cc(DemoCC)
+        {
+        }
+    };
+
+    struct GCompiledValidateMetaUntyped: public ::testing::Test
+    {
+        cv::GMat in;
+        cv::GScalar scale;
+        cv::GComputation m_ucc;
+
+        GCompiledValidateMetaUntyped() : m_ucc(cv::GIn(in, scale),
+                                               cv::GOut(DemoCC(in, scale)))
+        {
+        }
+    };
+} // anonymous namespace
+
+TEST_F(GCompiledValidateMetaTyped, ValidMeta)
+{
+    cv::Mat in = cv::Mat::eye(cv::Size(128, 32), CV_8UC1);
+    cv::Scalar sc(127);
+
+    auto f = m_cc.compile(cv::descr_of(in),
+                          cv::descr_of(sc));
+
+    // Correct operation when meta is exactly the same
+    cv::Mat out;
+    EXPECT_NO_THROW(f(in, sc, out));
+
+    // Correct operation on next invocation with same meta
+    // taken from different input objects
+    cv::Mat in2 = cv::Mat::zeros(cv::Size(128, 32), CV_8UC1);
+    cv::Scalar sc2(64);
+    cv::Mat out2;
+    EXPECT_NO_THROW(f(in2, sc2, out2));
+}
+
+TEST_F(GCompiledValidateMetaTyped, InvalidMeta)
+{
+    auto f = m_cc.compile(cv::GMatDesc{CV_8U,1,cv::gapi::own::Size(64,32)},
+                          cv::empty_scalar_desc());
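+    // The compiled object is pinned to a 64x32 single-channel 8U input;
+    // each call below deliberately violates that meta in a different way.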
+
+    cv::Scalar sc(33);
+    cv::Mat out;
+
+    // 3 channels instead of 1
+    cv::Mat in1 = cv::Mat::eye(cv::Size(64,32), CV_8UC3);
+    EXPECT_THROW(f(in1, sc, out), std::logic_error);
+
+    // 32f instead of 8u
+    cv::Mat in2 = cv::Mat::eye(cv::Size(64,32), CV_32F);
+    EXPECT_THROW(f(in2, sc, out), std::logic_error);
+
+    // 32x32 instead of 64x32
+    cv::Mat in3 = cv::Mat::eye(cv::Size(32,32), CV_8UC1);
+    EXPECT_THROW(f(in3, sc, out), std::logic_error);
+
+    // All of the above at once: wrong channels, depth, and size
+    cv::Mat in4 = cv::Mat::eye(cv::Size(128,64), CV_32FC3);
+    EXPECT_THROW(f(in4, sc, out), std::logic_error);
+}
+
+TEST_F(GCompiledValidateMetaUntyped, ValidMeta)
+{
+    cv::Mat in1 = cv::Mat::eye(cv::Size(128, 32), CV_8UC1);
+    cv::Scalar sc(127);
+
+    auto f = m_ucc.compile(cv::descr_of(in1),
+                           cv::descr_of(sc));
+
+    // Correct operation when meta is exactly the same
+    cv::Mat out1;
+    EXPECT_NO_THROW(f(cv::gin(in1, sc), cv::gout(out1)));
+
+    // Correct operation on next invocation with same meta
+    // taken from different input objects
+    cv::Mat in2 = cv::Mat::zeros(cv::Size(128, 32), CV_8UC1);
+    cv::Scalar sc2(64);
+    cv::Mat out2;
+    EXPECT_NO_THROW(f(cv::gin(in2, sc2), cv::gout(out2)));
+}
+
+TEST_F(GCompiledValidateMetaUntyped, InvalidMetaValues)
+{
+    auto f = m_ucc.compile(cv::GMatDesc{CV_8U,1,cv::gapi::own::Size(64,32)},
+                           cv::empty_scalar_desc());
+
+    cv::Scalar sc(33);
+    cv::Mat out;
+
+    // 3 channels instead of 1
+    cv::Mat in1 = cv::Mat::eye(cv::Size(64,32), CV_8UC3);
+    EXPECT_THROW(f(cv::gin(in1, sc), cv::gout(out)), std::logic_error);
+
+    // 32f instead of 8u
+    cv::Mat in2 = cv::Mat::eye(cv::Size(64,32), CV_32F);
+    EXPECT_THROW(f(cv::gin(in2, sc), cv::gout(out)), std::logic_error);
+
+    // 32x32 instead of 64x32
+    cv::Mat in3 = cv::Mat::eye(cv::Size(32,32), CV_8UC1);
+    EXPECT_THROW(f(cv::gin(in3, sc), cv::gout(out)), std::logic_error);
+
+    // All of the above at once: wrong channels, depth, and size
+    cv::Mat in4 = cv::Mat::eye(cv::Size(128,64), CV_32FC3);
+    EXPECT_THROW(f(cv::gin(in4, sc), cv::gout(out)), std::logic_error);
+}
+
+TEST_F(GCompiledValidateMetaUntyped, InvalidMetaShape)
+{
+    auto f = m_ucc.compile(cv::GMatDesc{CV_8U,1,cv::gapi::own::Size(64,32)},
+                           cv::empty_scalar_desc());
+
+    cv::Mat in1 = cv::Mat::eye(cv::Size(64,32), CV_8UC1);
+    cv::Scalar sc(33);
+    cv::Mat out1;
+
+    // call as f(Mat,Mat) while f(Mat,Scalar) is expected
+    EXPECT_THROW(f(cv::gin(in1, in1), cv::gout(out1)), std::logic_error);
+
+    // call as f(Scalar,Mat) while f(Mat,Scalar) is expected
+    EXPECT_THROW(f(cv::gin(sc, in1), cv::gout(out1)), std::logic_error);
+
+    // call as f(Scalar,Scalar) while f(Mat,Scalar) is expected
+    EXPECT_THROW(f(cv::gin(sc, sc), cv::gout(out1)), std::logic_error);
+}
+
+TEST_F(GCompiledValidateMetaUntyped, InvalidMetaNumber)
+{
+    auto f = m_ucc.compile(cv::GMatDesc{CV_8U,1,cv::Size(64,32)},
+                           cv::empty_scalar_desc());
+
+    cv::Mat in1 = cv::Mat::eye(cv::Size(64,32), CV_8UC1);
+    cv::Scalar sc(33);
+    cv::Mat out1, out2;
+
+    // call as f(Mat,Scalar,Scalar) while f(Mat,Scalar) is expected
+    EXPECT_THROW(f(cv::gin(in1, sc, sc), cv::gout(out1)), std::logic_error);
+
+    // call as f(Scalar,Mat,Scalar) while f(Mat,Scalar) is expected
+    EXPECT_THROW(f(cv::gin(sc, in1, sc), cv::gout(out1)), std::logic_error);
+
+    // call as f(Scalar) while f(Mat,Scalar) is expected
+    EXPECT_THROW(f(cv::gin(sc), cv::gout(out1)), std::logic_error);
+
+    // call as f(Mat,Scalar,[out1],[out2]) while f(Mat,Scalar,[out]) is expected
+    EXPECT_THROW(f(cv::gin(in1, sc), cv::gout(out1, out2)), std::logic_error);
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gcomputation_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_gcomputation_tests.cpp
new file mode 100644 (file)
index 0000000..070cea6
--- /dev/null
@@ -0,0 +1,68 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "opencv2/gapi/cpu/gcpukernel.hpp"
+
+namespace opencv_test
+{
+
+  namespace
+  {
+      G_TYPED_KERNEL(CustomResize, <cv::GMat(cv::GMat, cv::Size, double, double, int)>, "org.opencv.customk.resize")
+      {
+          static cv::GMatDesc outMeta(cv::GMatDesc in, cv::Size sz, double fx, double fy, int) {
+              if (sz.width != 0 && sz.height != 0)
+              {
+                  return in.withSize(to_own(sz));
+              }
+              else
+              {
+                  GAPI_Assert(fx != 0. && fy != 0.);
+                  return in.withSize
+                    (cv::gapi::own::Size(static_cast<int>(std::round(in.size.width  * fx)),
+                                         static_cast<int>(std::round(in.size.height * fy))));
+              }
+          }
+      };
+
+      GAPI_OCV_KERNEL(CustomResizeImpl, CustomResize)
+      {
+          static void run(const cv::Mat& in, cv::Size sz, double fx, double fy, int interp, cv::Mat &out)
+          {
+              cv::resize(in, out, sz, fx, fy, interp);
+          }
+      };
+
+      struct GComputationApplyTest: public ::testing::Test
+      {
+          cv::GMat in;
+          cv::Mat  in_mat;
+          cv::Mat  out_mat;
+          cv::GComputation m_c;
+
+          GComputationApplyTest() : in_mat(300, 300, CV_8UC1),
+                                    m_c(cv::GIn(in), cv::GOut(CustomResize::on(in, cv::Size(100, 100),
+                                                                               0.0, 0.0, cv::INTER_LINEAR)))
+          {
+          }
+      };
+  }
+
+  TEST_F(GComputationApplyTest, ThrowDontPassCustomKernel)
+  {
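+      // CustomResize has no implementation registered by default, so the
+      // graph cannot be compiled until a kernel package providing one is
+      // passed (see the next test).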
+      EXPECT_THROW(m_c.apply(in_mat, out_mat), std::logic_error);
+  }
+
+  TEST_F(GComputationApplyTest, NoThrowPassCustomKernel)
+  {
+      const auto pkg = cv::gapi::kernels<CustomResizeImpl>();
+
+      ASSERT_NO_THROW(m_c.apply(in_mat, out_mat, cv::compile_args(pkg)));
+  }
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_kernel_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_kernel_tests.cpp
new file mode 100644 (file)
index 0000000..aeb4762
--- /dev/null
@@ -0,0 +1,284 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "opencv2/gapi/cpu/gcpukernel.hpp"
+#include "gapi_mock_kernels.hpp"
+
+namespace opencv_test
+{
+
+namespace
+{
+    G_TYPED_KERNEL(GClone, <GMat(GMat)>, "org.opencv.test.clone")
+    {
+        static GMatDesc outMeta(GMatDesc in) { return in; }
+    };
+
+    GAPI_OCV_KERNEL(GCloneImpl, GClone)
+    {
+        static void run(const cv::Mat& in, cv::Mat &out)
+        {
+            out = in.clone();
+        }
+    };
+}
+
+TEST(KernelPackage, Create)
+{
+    namespace J = Jupiter;
+    auto pkg = cv::gapi::kernels<J::Foo, J::Bar, J::Baz>();
+    EXPECT_EQ(3u, pkg.size());
+}
+
+TEST(KernelPackage, Includes)
+{
+    namespace J = Jupiter;
+    auto pkg = cv::gapi::kernels<J::Foo, J::Bar, J::Baz>();
+    EXPECT_TRUE (pkg.includes<J::Foo>());
+    EXPECT_TRUE (pkg.includes<J::Bar>());
+    EXPECT_TRUE (pkg.includes<J::Baz>());
+    EXPECT_FALSE(pkg.includes<J::Qux>());
+}
+
+TEST(KernelPackage, IncludesAPI)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    auto pkg = cv::gapi::kernels<J::Foo, S::Bar>();
+    EXPECT_TRUE (pkg.includesAPI<I::Foo>());
+    EXPECT_TRUE (pkg.includesAPI<I::Bar>());
+    EXPECT_FALSE(pkg.includesAPI<I::Baz>());
+    EXPECT_FALSE(pkg.includesAPI<I::Qux>());
+}
+
+TEST(KernelPackage, IncludesAPI_Overlapping)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    auto pkg = cv::gapi::kernels<J::Foo, J::Bar, S::Foo, S::Bar>();
+    EXPECT_TRUE (pkg.includesAPI<I::Foo>());
+    EXPECT_TRUE (pkg.includesAPI<I::Bar>());
+    EXPECT_FALSE(pkg.includesAPI<I::Baz>());
+    EXPECT_FALSE(pkg.includesAPI<I::Qux>());
+}
+
+TEST(KernelPackage, Include_Add)
+{
+    namespace J = Jupiter;
+    auto pkg = cv::gapi::kernels<J::Foo, J::Bar, J::Baz>();
+    EXPECT_FALSE(pkg.includes<J::Qux>());
+
+    pkg.include<J::Qux>();
+    EXPECT_TRUE(pkg.includes<J::Qux>());
+}
+
+TEST(KernelPackage, Include_KEEP)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    auto pkg = cv::gapi::kernels<J::Foo, J::Bar>();
+    EXPECT_FALSE(pkg.includes<S::Foo>());
+    EXPECT_FALSE(pkg.includes<S::Bar>());
+
+    pkg.include<S::Bar>(); // default (KEEP)
+    EXPECT_TRUE(pkg.includes<J::Bar>());
+    EXPECT_TRUE(pkg.includes<S::Bar>());
+
+    pkg.include<S::Foo>(cv::unite_policy::KEEP); // explicit (KEEP)
+    EXPECT_TRUE(pkg.includes<J::Foo>());
+    EXPECT_TRUE(pkg.includes<S::Foo>());
+}
+
+TEST(KernelPackage, Include_REPLACE)
+{
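+    // REPLACE evicts an existing implementation of the same API when a new
+    // one is included; KEEP (the default) retains both.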
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    auto pkg = cv::gapi::kernels<J::Foo, J::Bar>();
+    EXPECT_FALSE(pkg.includes<S::Bar>());
+
+    pkg.include<S::Bar>(cv::unite_policy::REPLACE);
+    EXPECT_FALSE(pkg.includes<J::Bar>());
+    EXPECT_TRUE(pkg.includes<S::Bar>());
+}
+
+TEST(KernelPackage, RemoveBackend)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    auto pkg = cv::gapi::kernels<J::Foo, J::Bar, S::Foo>();
+    EXPECT_TRUE(pkg.includes<J::Foo>());
+    EXPECT_TRUE(pkg.includes<J::Bar>());
+    EXPECT_TRUE(pkg.includes<S::Foo>());
+
+    pkg.remove(J::backend());
+    EXPECT_FALSE(pkg.includes<J::Foo>());
+    EXPECT_FALSE(pkg.includes<J::Bar>());
+    EXPECT_TRUE(pkg.includes<S::Foo>());
+}
+
+TEST(KernelPackage, RemoveAPI)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    auto pkg = cv::gapi::kernels<J::Foo, J::Bar, S::Foo, S::Bar>();
+    EXPECT_TRUE(pkg.includes<J::Foo>());
+    EXPECT_TRUE(pkg.includes<J::Bar>());
+    EXPECT_TRUE(pkg.includes<S::Foo>());
+
+    pkg.remove<I::Foo>();
+    EXPECT_TRUE(pkg.includes<J::Bar>());
+    EXPECT_TRUE(pkg.includes<S::Bar>());
+    EXPECT_FALSE(pkg.includes<J::Foo>());
+    EXPECT_FALSE(pkg.includes<S::Foo>());
+}
+
+TEST(KernelPackage, CreateHetero)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    auto pkg = cv::gapi::kernels<J::Foo, J::Bar, J::Baz, S::Qux>();
+    EXPECT_EQ(4u, pkg.size());
+}
+
+TEST(KernelPackage, IncludesHetero)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    auto pkg = cv::gapi::kernels<J::Foo, J::Bar, J::Baz, S::Qux>();
+    EXPECT_TRUE (pkg.includes<J::Foo>());
+    EXPECT_TRUE (pkg.includes<J::Bar>());
+    EXPECT_TRUE (pkg.includes<J::Baz>());
+    EXPECT_FALSE(pkg.includes<J::Qux>());
+    EXPECT_TRUE (pkg.includes<S::Qux>());
+}
+
+TEST(KernelPackage, IncludeHetero)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    auto pkg = cv::gapi::kernels<J::Foo, J::Bar, J::Baz>();
+    EXPECT_FALSE(pkg.includes<J::Qux>());
+    EXPECT_FALSE(pkg.includes<S::Qux>());
+
+    pkg.include<S::Qux>();
+    EXPECT_FALSE(pkg.includes<J::Qux>());
+    EXPECT_TRUE (pkg.includes<S::Qux>());
+}
+
+TEST(KernelPackage, Combine_REPLACE_Full)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    auto j_pkg = cv::gapi::kernels<J::Foo, J::Bar, J::Baz>();
+    auto s_pkg = cv::gapi::kernels<S::Foo, S::Bar, S::Baz>();
+    auto u_pkg = cv::gapi::combine(j_pkg, s_pkg, cv::unite_policy::REPLACE);
+
+    EXPECT_EQ(3u, u_pkg.size());
+    EXPECT_FALSE(u_pkg.includes<J::Foo>());
+    EXPECT_FALSE(u_pkg.includes<J::Bar>());
+    EXPECT_FALSE(u_pkg.includes<J::Baz>());
+    EXPECT_TRUE (u_pkg.includes<S::Foo>());
+    EXPECT_TRUE (u_pkg.includes<S::Bar>());
+    EXPECT_TRUE (u_pkg.includes<S::Baz>());
+}
+
+TEST(KernelPackage, Combine_REPLACE_Partial)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    auto j_pkg = cv::gapi::kernels<J::Foo, J::Bar>();
+    auto s_pkg = cv::gapi::kernels<S::Bar>();
+    auto u_pkg = cv::gapi::combine(j_pkg, s_pkg, cv::unite_policy::REPLACE);
+
+    EXPECT_EQ(2u, u_pkg.size());
+    EXPECT_TRUE (u_pkg.includes<J::Foo>());
+    EXPECT_FALSE(u_pkg.includes<J::Bar>());
+    EXPECT_TRUE (u_pkg.includes<S::Bar>());
+}
+
+TEST(KernelPackage, Combine_REPLACE_Append)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    auto j_pkg = cv::gapi::kernels<J::Foo, J::Bar>();
+    auto s_pkg = cv::gapi::kernels<S::Qux>();
+    auto u_pkg = cv::gapi::combine(j_pkg, s_pkg, cv::unite_policy::REPLACE);
+
+    EXPECT_EQ(3u, u_pkg.size());
+    EXPECT_TRUE(u_pkg.includes<J::Foo>());
+    EXPECT_TRUE(u_pkg.includes<J::Bar>());
+    EXPECT_TRUE(u_pkg.includes<S::Qux>());
+}
+
+TEST(KernelPackage, Combine_KEEP_AllDups)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    auto j_pkg = cv::gapi::kernels<J::Foo, J::Bar, J::Baz>();
+    auto s_pkg = cv::gapi::kernels<S::Foo, S::Bar, S::Baz>();
+    auto u_pkg = cv::gapi::combine(j_pkg, s_pkg, cv::unite_policy::KEEP);
+
+    EXPECT_EQ(6u, u_pkg.size());
+    EXPECT_TRUE(u_pkg.includes<J::Foo>());
+    EXPECT_TRUE(u_pkg.includes<J::Bar>());
+    EXPECT_TRUE(u_pkg.includes<J::Baz>());
+    EXPECT_TRUE(u_pkg.includes<S::Foo>());
+    EXPECT_TRUE(u_pkg.includes<S::Bar>());
+    EXPECT_TRUE(u_pkg.includes<S::Baz>());
+}
+
+TEST(KernelPackage, Combine_KEEP_Append_NoDups)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    auto j_pkg = cv::gapi::kernels<J::Foo, J::Bar>();
+    auto s_pkg = cv::gapi::kernels<S::Qux>();
+    auto u_pkg = cv::gapi::combine(j_pkg, s_pkg, cv::unite_policy::KEEP);
+
+    EXPECT_EQ(3u, u_pkg.size());
+    EXPECT_TRUE(u_pkg.includes<J::Foo>());
+    EXPECT_TRUE(u_pkg.includes<J::Bar>());
+    EXPECT_TRUE(u_pkg.includes<S::Qux>());
+}
+
+TEST(KernelPackage, TestWithEmptyLHS)
+{
+    namespace J = Jupiter;
+    auto lhs = cv::gapi::kernels<>();
+    auto rhs = cv::gapi::kernels<J::Foo>();
+    auto pkg = cv::gapi::combine(lhs, rhs, cv::unite_policy::KEEP);
+
+    EXPECT_EQ(1u, pkg.size());
+    EXPECT_TRUE(pkg.includes<J::Foo>());
+}
+
+TEST(KernelPackage, TestWithEmptyRHS)
+{
+    namespace J = Jupiter;
+    auto lhs = cv::gapi::kernels<J::Foo>();
+    auto rhs = cv::gapi::kernels<>();
+    auto pkg = cv::gapi::combine(lhs, rhs, cv::unite_policy::KEEP);
+
+    EXPECT_EQ(1u, pkg.size());
+    EXPECT_TRUE(pkg.includes<J::Foo>());
+}
+
+TEST(KernelPackage, Can_Use_Custom_Kernel)
+{
+    cv::GMat in[2];
+    auto out = GClone::on(cv::gapi::add(in[0], in[1]));
+    const auto in_meta = cv::GMetaArg(cv::GMatDesc{CV_8U,1,cv::Size(32,32)});
+
+    auto pkg = cv::gapi::kernels<GCloneImpl>();
+
+    EXPECT_NO_THROW(cv::GComputation(cv::GIn(in[0], in[1]), cv::GOut(out)).
+                        compile({in_meta, in_meta}, cv::compile_args(pkg)));
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_mock_kernels.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_mock_kernels.hpp
new file mode 100644 (file)
index 0000000..cd876ef
--- /dev/null
@@ -0,0 +1,123 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "opencv2/gapi/cpu/gcpukernel.hpp"
+
+#include "api/gbackend_priv.hpp" // directly instantiate GBackend::Priv
+
+namespace opencv_test
+{
+namespace {
+    // FIXME: Currently every Kernel implementation in this test file has
+    // its own backend() method and it is incorrect! API classes should
+    // provide it out of the box.
+
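+// Namespace I declares the abstract kernel APIs; Jupiter and Saturn below
+// provide two independent mock implementations of them.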
+namespace I
+{
+    G_TYPED_KERNEL(Foo, <cv::GMat(cv::GMat)>, "test.kernels.foo")
+    {
+        static cv::GMatDesc outMeta(const cv::GMatDesc &in) { return in; }
+    };
+
+    G_TYPED_KERNEL(Bar, <cv::GMat(cv::GMat,cv::GMat)>, "test.kernels.bar")
+    {
+        static cv::GMatDesc outMeta(const cv::GMatDesc &in, const cv::GMatDesc &) { return in; }
+    };
+
+    G_TYPED_KERNEL(Baz, <cv::GScalar(cv::GMat)>, "test.kernels.baz")
+    {
+        static cv::GScalarDesc outMeta(const cv::GMatDesc &) { return cv::empty_scalar_desc(); }
+    };
+
+    G_TYPED_KERNEL(Qux, <cv::GMat(cv::GMat, cv::GScalar)>, "test.kernels.qux")
+    {
+        static cv::GMatDesc outMeta(const cv::GMatDesc &in, const cv::GScalarDesc &) { return in; }
+    };
+
+    G_TYPED_KERNEL(Quux, <cv::GMat(cv::GScalar, cv::GMat)>, "test.kernels.quux")
+    {
+        static cv::GMatDesc outMeta(const cv::GScalarDesc &, const cv::GMatDesc& in) { return in; }
+    };
+}
+
+// Kernel implementations for imaginary Jupiter device
+namespace Jupiter
+{
+    namespace detail
+    {
+        static cv::gapi::GBackend backend(std::make_shared<cv::gapi::GBackend::Priv>());
+    }
+
+    inline cv::gapi::GBackend backend() { return detail::backend; }
+
+    GAPI_OCV_KERNEL(Foo, I::Foo)
+    {
+        static void run(const cv::Mat &, cv::Mat &) { /*Do nothing*/ }
+        static cv::gapi::GBackend backend() { return detail::backend; } // FIXME: Must be removed
+    };
+    GAPI_OCV_KERNEL(Bar, I::Bar)
+    {
+        static void run(const cv::Mat &, const cv::Mat &, cv::Mat &) { /*Do nothing*/ }
+        static cv::gapi::GBackend backend() { return detail::backend; } // FIXME: Must be removed
+    };
+    GAPI_OCV_KERNEL(Baz, I::Baz)
+    {
+        static void run(const cv::Mat &, cv::Scalar &) { /*Do nothing*/ }
+        static cv::gapi::GBackend backend() { return detail::backend; } // FIXME: Must be removed
+    };
+    GAPI_OCV_KERNEL(Qux, I::Qux)
+    {
+        static void run(const cv::Mat &, const cv::Scalar&, cv::Mat &) { /*Do nothing*/ }
+        static cv::gapi::GBackend backend() { return detail::backend; } // FIXME: Must be removed
+    };
+
+    GAPI_OCV_KERNEL(Quux, I::Quux)
+    {
+        static void run(const cv::Scalar&, const cv::Mat&, cv::Mat &) { /*Do nothing*/ }
+        static cv::gapi::GBackend backend() { return detail::backend; } // FIXME: Must be removed
+    };
+} // namespace Jupiter
+
+// Kernel implementations for imaginary Saturn device
+namespace Saturn
+{
+    namespace detail
+    {
+        static cv::gapi::GBackend backend(std::make_shared<cv::gapi::GBackend::Priv>());
+    }
+
+    inline cv::gapi::GBackend backend() { return detail::backend; }
+
+    GAPI_OCV_KERNEL(Foo, I::Foo)
+    {
+        static void run(const cv::Mat &, cv::Mat &) { /*Do nothing*/ }
+        static cv::gapi::GBackend backend() { return detail::backend; } // FIXME: Must be removed
+    };
+    GAPI_OCV_KERNEL(Bar, I::Bar)
+    {
+        static void run(const cv::Mat &, const cv::Mat &, cv::Mat &) { /*Do nothing*/ }
+        static cv::gapi::GBackend backend() { return detail::backend; } // FIXME: Must be removed
+    };
+    GAPI_OCV_KERNEL(Baz, I::Baz)
+    {
+        static void run(const cv::Mat &, cv::Scalar &) { /*Do nothing*/ }
+        static cv::gapi::GBackend backend() { return detail::backend; } // FIXME: Must be removed
+    };
+    GAPI_OCV_KERNEL(Qux, I::Qux)
+    {
+        static void run(const cv::Mat &, const cv::Scalar&, cv::Mat &) { /*Do nothing*/ }
+        static cv::gapi::GBackend backend() { return detail::backend; } // FIXME: Must be removed
+    };
+
+    GAPI_OCV_KERNEL(Quux, I::Quux)
+    {
+        static void run(const cv::Scalar&, const cv::Mat&, cv::Mat &) { /*Do nothing*/ }
+        static cv::gapi::GBackend backend() { return detail::backend; } // FIXME: Must be removed
+    };
+} // namespace Saturn
+} // anonymous namespace
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_sample_pipelines.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_sample_pipelines.cpp
new file mode 100644 (file)
index 0000000..815aa0d
--- /dev/null
@@ -0,0 +1,301 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+#include <stdexcept>
+#include <ade/util/iota_range.hpp>
+#include "logger.hpp"
+
+namespace opencv_test
+{
+
+namespace
+{
+    G_TYPED_KERNEL(GInvalidResize, <GMat(GMat,Size,double,double,int)>, "org.opencv.test.invalid_resize")
+    {
+        static GMatDesc outMeta(GMatDesc in, Size, double, double, int) { return in; }
+    };
+
+    GAPI_OCV_KERNEL(GOCVInvalidResize, GInvalidResize)
+    {
+        static void run(const cv::Mat& in, cv::Size sz, double fx, double fy, int interp, cv::Mat &out)
+        {
+            cv::resize(in, out, sz, fx, fy, interp);
+        }
+    };
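+    // NB: this kernel is deliberately inconsistent: outMeta() above reports
+    // the input descriptor unchanged, while run() resizes to a different
+    // size. PipelineWithInvalidKernel below expects the mismatch to be
+    // rejected at run time.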
+
+    G_TYPED_KERNEL(GReallocatingCopy, <GMat(GMat)>, "org.opencv.test.reallocating_copy")
+    {
+        static GMatDesc outMeta(GMatDesc in) { return in; }
+    };
+
+    GAPI_OCV_KERNEL(GOCVReallocatingCopy, GReallocatingCopy)
+    {
+        static void run(const cv::Mat& in, cv::Mat &out)
+        {
+            out = in.clone();
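+            // clone() replaces the buffer G-API pre-allocated for "out";
+            // PipelineAllocatingKernel below expects this reallocation to
+            // be rejected.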
+        }
+    };
+} // anonymous namespace
+
+TEST(GAPI_Pipeline, OverloadUnary_MatMat)
+{
+    cv::GMat in;
+    cv::GComputation comp(in, cv::gapi::bitwise_not(in));
+
+    cv::Mat in_mat = cv::Mat::eye(32, 32, CV_8UC1);
+    cv::Mat ref_mat = ~in_mat;
+
+    cv::Mat out_mat;
+    comp.apply(in_mat, out_mat);
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+
+    out_mat = cv::Mat();
+    auto cc = comp.compile(cv::descr_of(in_mat));
+    cc(in_mat, out_mat);
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(GAPI_Pipeline, OverloadUnary_MatScalar)
+{
+    cv::GMat in;
+    cv::GComputation comp(in, cv::gapi::sum(in));
+
+    cv::Mat in_mat = cv::Mat::eye(32, 32, CV_8UC1);
+    cv::Scalar ref_scl = cv::sum(in_mat);
+
+    cv::Scalar out_scl;
+    comp.apply(in_mat, out_scl);
+    EXPECT_EQ(out_scl, ref_scl);
+
+    out_scl = cv::Scalar();
+    auto cc = comp.compile(cv::descr_of(in_mat));
+    cc(in_mat, out_scl);
+    EXPECT_EQ(out_scl, ref_scl);
+}
+
+TEST(GAPI_Pipeline, OverloadBinary_Mat)
+{
+    cv::GMat a, b;
+    cv::GComputation comp(a, b, cv::gapi::add(a, b));
+
+    cv::Mat in_mat = cv::Mat::eye(32, 32, CV_8UC1);
+    cv::Mat ref_mat = (in_mat+in_mat);
+
+    cv::Mat out_mat;
+    comp.apply(in_mat, in_mat, out_mat);
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+
+    out_mat = cv::Mat();
+    auto cc = comp.compile(cv::descr_of(in_mat), cv::descr_of(in_mat));
+    cc(in_mat, in_mat, out_mat);
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(GAPI_Pipeline, OverloadBinary_Scalar)
+{
+    cv::GMat a, b;
+    cv::GComputation comp(a, b, cv::gapi::sum(a + b));
+
+    cv::Mat in_mat = cv::Mat::eye(32, 32, CV_8UC1);
+    cv::Scalar ref_scl = cv::sum(in_mat+in_mat);
+
+    cv::Scalar out_scl;
+    comp.apply(in_mat, in_mat, out_scl);
+    EXPECT_EQ(out_scl, ref_scl);
+
+    out_scl = cv::Scalar();
+    auto cc = comp.compile(cv::descr_of(in_mat), cv::descr_of(in_mat));
+    cc(in_mat, in_mat, out_scl);
+    EXPECT_EQ(out_scl, ref_scl);
+}
+
+TEST(GAPI_Pipeline, Sharpen)
+{
+    const cv::Size sz_in (1280, 720);
+    const cv::Size sz_out( 640, 480);
+    cv::Mat in_mat (sz_in,  CV_8UC3);
+    in_mat = cv::Scalar(128, 33, 53);
+
+    cv::Mat out_mat(sz_out, CV_8UC3);
+    cv::Mat out_mat_y;
+    cv::Mat out_mat_ocv(sz_out, CV_8UC3);
+
+    float sharpen_coeffs[] = {
+         0.0f, -1.f,  0.0f,
+        -1.0f,  5.f, -1.0f,
+         0.0f, -1.f,  0.0f
+    };
+    cv::Mat sharpen_kernel(3, 3, CV_32F, sharpen_coeffs);
+
+    // G-API code //////////////////////////////////////////////////////////////
+
+    cv::GMat in;
+    auto vga     = cv::gapi::resize(in, sz_out);
+    auto yuv     = cv::gapi::RGB2YUV(vga);
+    auto yuv_p   = cv::gapi::split3(yuv);
+    auto y_sharp = cv::gapi::filter2D(std::get<0>(yuv_p), -1, sharpen_kernel);
+    auto yuv_new = cv::gapi::merge3(y_sharp, std::get<1>(yuv_p), std::get<2>(yuv_p));
+    auto out     = cv::gapi::YUV2RGB(yuv_new);
+
+    cv::GComputation c(cv::GIn(in), cv::GOut(y_sharp, out));
+    c.apply(cv::gin(in_mat), cv::gout(out_mat_y, out_mat));
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::Mat smaller;
+        cv::resize(in_mat, smaller, sz_out);
+
+        cv::Mat yuv_mat;
+        cv::cvtColor(smaller, yuv_mat, cv::COLOR_RGB2YUV);
+        std::vector<cv::Mat> yuv_planar(3);
+        cv::split(yuv_mat, yuv_planar);
+        cv::filter2D(yuv_planar[0], yuv_planar[0], -1, sharpen_kernel);
+        cv::merge(yuv_planar, yuv_mat);
+        cv::cvtColor(yuv_mat, out_mat_ocv, cv::COLOR_YUV2RGB);
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        cv::Mat diff = out_mat_ocv != out_mat;
+        std::vector<cv::Mat> diffBGR(3);
+        cv::split(diff, diffBGR);
+        EXPECT_EQ(0, cv::countNonZero(diffBGR[0]));
+        EXPECT_EQ(0, cv::countNonZero(diffBGR[1]));
+        EXPECT_EQ(0, cv::countNonZero(diffBGR[2]));
+    }
+
+    // Metadata check /////////////////////////////////////////////////////////
+    {
+        auto cc    = c.compile(cv::descr_of(in_mat));
+        auto metas = cc.outMetas();
+        ASSERT_EQ(2u, metas.size());
+
+        auto out_y_meta = cv::util::get<cv::GMatDesc>(metas[0]);
+        auto out_meta   = cv::util::get<cv::GMatDesc>(metas[1]);
+
+        // Y-output
+        EXPECT_EQ(CV_8U,   out_y_meta.depth);
+        EXPECT_EQ(1,       out_y_meta.chan);
+        EXPECT_EQ(640,     out_y_meta.size.width);
+        EXPECT_EQ(480,     out_y_meta.size.height);
+
+        // Final output
+        EXPECT_EQ(CV_8U,   out_meta.depth);
+        EXPECT_EQ(3,       out_meta.chan);
+        EXPECT_EQ(640,     out_meta.size.width);
+        EXPECT_EQ(480,     out_meta.size.height);
+    }
+}
+
+TEST(GAPI_Pipeline, CustomRGB2YUV)
+{
+    const cv::Size sz(1280, 720);
+
+    // BEWARE:
+    //
+    //    std::vector<cv::Mat> out_mats_cv(3, cv::Mat(sz, CV_8U))
+    //
+    // creates a vector of 3 elements pointing to the same Mat!
+    // FIXME: Make a G-API check for that
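+    // Illustrative check of the pitfall (not part of this test):
+    //     std::vector<cv::Mat> tmp(3, cv::Mat(sz, CV_8U));
+    //     CV_Assert(tmp[0].data == tmp[1].data);  // one shared buffer!
+    // Hence the element-wise create() calls below.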
+    const int INS = 3;
+    std::vector<cv::Mat> in_mats(INS);
+    for (auto i : ade::util::iota(INS))
+    {
+        in_mats[i].create(sz, CV_8U);
+        cv::randu(in_mats[i], cv::Scalar::all(0), cv::Scalar::all(255));
+    }
+
+    const int OUTS = 3;
+    std::vector<cv::Mat> out_mats_cv(OUTS);
+    std::vector<cv::Mat> out_mats_gapi(OUTS);
+    for (auto i : ade::util::iota(OUTS))
+    {
+        out_mats_cv  [i].create(sz, CV_8U);
+        out_mats_gapi[i].create(sz, CV_8U);
+    }
+
+    // G-API code //////////////////////////////////////////////////////////////
+    {
+        cv::GMat r, g, b;
+        cv::GMat y = 0.299f*r + 0.587f*g + 0.114f*b;
+        cv::GMat u = 0.492f*(b - y);
+        cv::GMat v = 0.877f*(r - y);
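+        // (Classic BT.601 luma weights with the analog U/V scale factors.)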
+
+        cv::GComputation customCvt({r, g, b}, {y, u, v});
+        customCvt.apply(in_mats, out_mats_gapi);
+    }
+
+    // OpenCV code /////////////////////////////////////////////////////////////
+    {
+        cv::Mat r = in_mats[0], g = in_mats[1], b = in_mats[2];
+        cv::Mat y = 0.299f*r + 0.587f*g + 0.114f*b;
+        cv::Mat u = 0.492f*(b - y);
+        cv::Mat v = 0.877f*(r - y);
+
+        out_mats_cv[0] = y;
+        out_mats_cv[1] = u;
+        out_mats_cv[2] = v;
+    }
+
+    // Comparison //////////////////////////////////////////////////////////////
+    {
+        const auto diff = [](cv::Mat m1, cv::Mat m2, int t) {
+            return cv::abs(m1-m2) > t;
+        };
+
+        // FIXME: Not bit-accurate even now!
+        cv::Mat
+            diff_y = diff(out_mats_cv[0], out_mats_gapi[0], 2),
+            diff_u = diff(out_mats_cv[1], out_mats_gapi[1], 2),
+            diff_v = diff(out_mats_cv[2], out_mats_gapi[2], 2);
+
+        EXPECT_EQ(0, cv::countNonZero(diff_y));
+        EXPECT_EQ(0, cv::countNonZero(diff_u));
+        EXPECT_EQ(0, cv::countNonZero(diff_v));
+    }
+}
+
+TEST(GAPI_Pipeline, PipelineWithInvalidKernel)
+{
+    cv::GMat in, out;
+    cv::Mat in_mat(500, 500, CV_8UC1), out_mat;
+    out = GInvalidResize::on(in, cv::Size(300, 300), 0.0, 0.0, cv::INTER_LINEAR);
+
+    const auto pkg = cv::gapi::kernels<GOCVInvalidResize>();
+    cv::GComputation comp(cv::GIn(in), cv::GOut(out));
+
+    EXPECT_THROW(comp.apply(in_mat, out_mat, cv::compile_args(pkg)), std::logic_error);
+}
+
+TEST(GAPI_Pipeline, InvalidOutputComputation)
+{
+    cv::GMat in1, out1, out2, out3;
+
+    std::tie(out1, out2, out3) = cv::gapi::split3(in1);
+    cv::GComputation c({in1}, {out1, out2, out3});
+    cv::Mat in_mat;
+    cv::Mat out_mat1, out_mat2, out_mat3, out_mat4;
+    std::vector<cv::Mat> u_outs = {out_mat1, out_mat2, out_mat3, out_mat4};
+    std::vector<cv::Mat> u_ins = {in_mat};
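+    // Four output Mats are passed for a three-output graph; apply() is
+    // expected to reject the mismatch.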
+
+    EXPECT_THROW(c.apply(u_ins, u_outs), std::logic_error);
+}
+
+TEST(GAPI_Pipeline, PipelineAllocatingKernel)
+{
+    cv::GMat in, out;
+    cv::Mat in_mat(500, 500, CV_8UC1), out_mat;
+    out = GReallocatingCopy::on(in);
+
+    const auto pkg = cv::gapi::kernels<GOCVReallocatingCopy>();
+    cv::GComputation comp(cv::GIn(in), cv::GOut(out));
+
+    EXPECT_THROW(comp.apply(in_mat, out_mat, cv::compile_args(pkg)), std::logic_error);
+}
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_scalar_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_scalar_tests.cpp
new file mode 100644 (file)
index 0000000..7b4baa0
--- /dev/null
@@ -0,0 +1,117 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+#include <iostream>
+
+namespace opencv_test
+{
+
+TEST(GAPI_Scalar, Argument)
+{
+    cv::Size sz(2, 2);
+    cv::Mat in_mat(sz, CV_8U);
+    cv::randn(in_mat, cv::Scalar::all(127), cv::Scalar::all(40.f));
+
+    cv::GComputationT<cv::GMat (cv::GMat, cv::GScalar)> mulS([](cv::GMat in, cv::GScalar c)
+    {
+        return in*c;
+    });
+
+    cv::Mat out_mat(sz, CV_8U);
+    mulS.apply(in_mat, cv::Scalar(2), out_mat);
+
+    cv::Mat reference = in_mat*2;
+    EXPECT_EQ(0, cv::countNonZero(cv::abs(out_mat - reference)));
+}
+
+TEST(GAPI_Scalar, ReturnValue)
+{
+    const cv::Size sz(2, 2);
+    cv::Mat in_mat(sz, CV_8U, cv::Scalar(1));
+
+    cv::GComputationT<cv::GScalar (cv::GMat)> sum_of_sum([](cv::GMat in)
+    {
+        return cv::gapi::sum(in + in);
+    });
+
+    cv::Scalar out;
+    sum_of_sum.apply(in_mat, out);
+
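+    // (in + in) over a 2x2 matrix of ones is all twos, so the sum is 8.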
+    EXPECT_EQ(8, out[0]);
+}
+
+TEST(GAPI_Scalar, TmpScalar)
+{
+    const cv::Size sz(2, 2);
+    cv::Mat in_mat(sz, CV_8U, cv::Scalar(1));
+
+    cv::GComputationT<cv::GMat (cv::GMat)> mul_by_sum([](cv::GMat in)
+    {
+        return in * cv::gapi::sum(in);
+    });
+
+    cv::Mat out_mat(sz, CV_8U);
+    mul_by_sum.apply(in_mat, out_mat);
+
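+    // sum(in) over a 2x2 matrix of ones is 4, so each output pixel is 1*4.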
+    cv::Mat reference = cv::Mat(sz, CV_8U, cv::Scalar(4));
+    EXPECT_EQ(0, cv::countNonZero(cv::abs(out_mat - reference)));
+}
+
+TEST(GAPI_ScalarWithValue, Simple_Arithmetic_Pipeline)
+{
+    GMat in;
+    GMat out = (in + 1) * 2;
+    cv::GComputation comp(in, out);
+
+    cv::Mat in_mat  = cv::Mat::eye(3, 3, CV_8UC1);
+    cv::Mat ref_mat, out_mat;
+
+    ref_mat = (in_mat + 1) * 2;
+    comp.apply(in_mat, out_mat);
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(GAPI_ScalarWithValue, GScalar_Initialization)
+{
+    cv::Scalar sc(2);
+    cv::GMat in;
+    cv::GScalar s(sc);
+    cv::GComputation comp(in, cv::gapi::mulC(in, s));
+
+    cv::Mat in_mat = cv::Mat::eye(3, 3, CV_8UC1);
+    cv::Mat ref_mat, out_mat;
+    cv::multiply(in_mat, sc, ref_mat, 1, CV_8UC1);
+    comp.apply(cv::gin(in_mat), cv::gout(out_mat));
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+TEST(GAPI_ScalarWithValue, Constant_GScalar_In_Middle_Graph)
+{
+    cv::Scalar  sc(5);
+    cv::GMat    in1;
+    cv::GScalar in2;
+    cv::GScalar s(sc);
+
+    auto add_out = cv::gapi::addC(in1, in2);
+    cv::GComputation comp(cv::GIn(in1, in2), cv::GOut(cv::gapi::mulC(add_out, s)));
+
+    cv::Mat    in_mat = cv::Mat::eye(3, 3, CV_8UC1);
+    cv::Scalar in_scalar(3);
+
+    cv::Mat ref_mat, out_mat, add_mat;
+    cv::add(in_mat, in_scalar, add_mat);
+    cv::multiply(add_mat, sc, ref_mat, 1, CV_8UC1);
+    comp.apply(cv::gin(in_mat, in_scalar), cv::gout(out_mat));
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat != ref_mat));
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_smoke_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_smoke_test.cpp
new file mode 100644 (file)
index 0000000..9ac47f6
--- /dev/null
@@ -0,0 +1,97 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+namespace opencv_test
+{
+
+TEST(GAPI, Mat_Create_NoLink)
+{
+    cv::Mat m1;
+    cv::Mat m2 = m1;
+    m2.create(32, 32, CV_8U);
+
+    EXPECT_NE(m1.rows, m2.rows);
+    EXPECT_NE(m1.cols, m2.cols);
+    EXPECT_NE(m1.data, m2.data);
+}
+
+TEST(GAPI, Mat_Recreate)
+{
+    cv::Mat m1 = cv::Mat::zeros(480, 640, CV_8U);
+    m1.at<uchar>(0, 0) = 128;
+    cv::Mat m2 = m1;
+
+    EXPECT_EQ(m1.rows, m2.rows);
+    EXPECT_EQ(m1.cols, m2.cols);
+    EXPECT_EQ(m1.data, m2.data);
+    EXPECT_EQ(m1.at<uchar>(0, 0), m2.at<uchar>(0, 0));
+
+    // Calling "create" with the same meta is NOOP - both m1 and m2 are the same
+    m1.create(480, 640, CV_8U);
+    EXPECT_EQ(m1.rows, m2.rows);
+    EXPECT_EQ(m1.cols, m2.cols);
+    EXPECT_EQ(m1.data, m2.data);
+    EXPECT_EQ(m1.at<uchar>(0, 0), m2.at<uchar>(0, 0));
+
+    // Calling "create" on m2 with different meta doesn't update original m1
+    // Now m1 and m2 are distinct
+    m2.create(720, 1280, CV_8U);
+    m2.at<uchar>(0, 0) = 64; // Initialize 0,0 element since m2 is a new buffer
+    EXPECT_NE(m1.rows, m2.rows);
+    EXPECT_NE(m1.cols, m2.cols);
+    EXPECT_NE(m1.data, m2.data);
+    EXPECT_NE(m1.at<uchar>(0, 0), m2.at<uchar>(0, 0));
+
+    // What if a Mat is created from handle?
+    uchar data[] = {
+        32, 0, 0,
+         0, 0, 0,
+         0, 0, 0
+    };
+    cv::Mat m3(3, 3, CV_8U, data);
+    cv::Mat m4 = m3;
+    EXPECT_EQ(m3.rows, m4.rows);
+    EXPECT_EQ(m3.cols, m4.cols);
+    EXPECT_EQ(m3.data, m4.data);
+    EXPECT_EQ(data, m3.data);
+    EXPECT_EQ(data, m4.data);
+    EXPECT_EQ(m3.at<uchar>(0, 0), m4.at<uchar>(0, 0));
+
+    // cv::Mat::create must be NOOP if we don't change the meta,
+    // even if the original mat is created from handle.
+    m4.create(3, 3, CV_8U);
+    EXPECT_EQ(m3.rows, m4.rows);
+    EXPECT_EQ(m3.cols, m4.cols);
+    EXPECT_EQ(m3.data, m4.data);
+    EXPECT_EQ(data, m3.data);
+    EXPECT_EQ(data, m4.data);
+    EXPECT_EQ(m3.at<uchar>(0, 0), m4.at<uchar>(0, 0));
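+
+    // For contrast (illustrative only), clone() always detaches:
+    //     cv::Mat m5 = m3.clone();   // m5.data != m3.data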
+}
+
+TEST(GAPI, EmptyOutMat)
+{
+    cv::Mat in_mat = cv::Mat(480, 640, CV_8U, cv::Scalar(64));
+
+    cv::GComputation cc([]()
+    {
+        cv::GMat in;
+        cv::GMat out = in + in;
+        return cv::GComputation(in, out);
+    });
+
+    cv::Mat out;
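+    // "out" is left empty on purpose: apply() is expected to allocate it
+    // from the graph's output metadata, as checked below.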
+    cc.apply(in_mat, out);
+
+    EXPECT_EQ(640, out.cols);
+    EXPECT_EQ(480, out.rows);
+    EXPECT_EQ(CV_8U, out.type());
+    EXPECT_EQ(0, cv::countNonZero(out - (in_mat+in_mat)));
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_typed_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_typed_tests.cpp
new file mode 100644 (file)
index 0000000..1716b55
--- /dev/null
@@ -0,0 +1,185 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+namespace opencv_test
+{
+
+namespace
+{
+    cv::Mat diff(cv::Mat m1, cv::Mat m2, int t)
+    {
+        return cv::abs(m1-m2) > t;
+    }
+
+    int non_zero3(cv::Mat m3c)
+    {
+        std::vector<cv::Mat> mm(3);
+        cv::split(m3c, mm);
+        return (  cv::countNonZero(mm[0])
+                + cv::countNonZero(mm[1])
+                + cv::countNonZero(mm[2]));
+    }
+} // anonymous namespace
+
+TEST(GAPI_Typed, UnaryOp)
+{
+    // Initialization //////////////////////////////////////////////////////////
+    const cv::Size sz(32, 32);
+    cv::Mat
+        in_mat         (sz, CV_8UC3),
+        out_mat_untyped(sz, CV_8UC3),
+        out_mat_typed1 (sz, CV_8UC3),
+        out_mat_typed2 (sz, CV_8UC3),
+        out_mat_cv     (sz, CV_8UC3);
+    cv::randu(in_mat, cv::Scalar::all(0), cv::Scalar::all(255));
+
+    // Untyped G-API ///////////////////////////////////////////////////////////
+    cv::GComputation cvtU([]()
+    {
+        cv::GMat in;
+        cv::GMat out = cv::gapi::RGB2YUV(in);
+        return cv::GComputation(in, out);
+    });
+    cvtU.apply(in_mat, out_mat_untyped);
+
+    // Typed G-API /////////////////////////////////////////////////////////////
+    cv::GComputationT<cv::GMat (cv::GMat)> cvtT(cv::gapi::RGB2YUV);
+    auto cvtTComp = cvtT.compile(cv::descr_of(in_mat));
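+    // apply() compiles internally on demand, while compile() returns a
+    // reusable callable; both paths are exercised below and must agree.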
+
+    cvtT.apply(in_mat, out_mat_typed1);
+    cvtTComp(in_mat, out_mat_typed2);
+
+    // Plain OpenCV ////////////////////////////////////////////////////////////
+    cv::cvtColor(in_mat, out_mat_cv, cv::COLOR_RGB2YUV);
+
+    // Comparison //////////////////////////////////////////////////////////////
+    // FIXME: There must be OpenCV comparison test functions already available!
+    cv::Mat
+        diff_u  = diff(out_mat_cv, out_mat_untyped, 0),
+        diff_t  = diff(out_mat_cv, out_mat_typed1,  0),
+        diff_tc = diff(out_mat_cv, out_mat_typed2,  0);
+
+    EXPECT_EQ(0, non_zero3(diff_u));
+    EXPECT_EQ(0, non_zero3(diff_t));
+    EXPECT_EQ(0, non_zero3(diff_tc));
+}
+
+TEST(GAPI_Typed, BinaryOp)
+{
+    // Initialization //////////////////////////////////////////////////////////
+    const cv::Size sz(32, 32);
+    cv::Mat
+        in_mat1        (sz, CV_8UC1),
+        in_mat2        (sz, CV_8UC1),
+        out_mat_untyped(sz, CV_8UC1),
+        out_mat_typed1 (sz, CV_8UC1),
+        out_mat_typed2 (sz, CV_8UC1),
+        out_mat_cv     (sz, CV_8UC1);
+    cv::randu(in_mat1, cv::Scalar::all(0), cv::Scalar::all(255));
+    cv::randu(in_mat2, cv::Scalar::all(0), cv::Scalar::all(255));
+
+    // Untyped G-API ///////////////////////////////////////////////////////////
+    cv::GComputation cvtU([]()
+    {
+        cv::GMat in1, in2;
+        cv::GMat out = cv::gapi::add(in1, in2);
+        return cv::GComputation({in1, in2}, {out});
+    });
+    std::vector<cv::Mat> u_ins  = {in_mat1, in_mat2};
+    std::vector<cv::Mat> u_outs = {out_mat_untyped};
+    cvtU.apply(u_ins, u_outs);
+
+    // Typed G-API /////////////////////////////////////////////////////////////
+    cv::GComputationT<cv::GMat (cv::GMat, cv::GMat)> cvtT([](cv::GMat m1, cv::GMat m2)
+    {
+        return m1+m2;
+    });
+    auto cvtTC = cvtT.compile(cv::descr_of(in_mat1),
+                              cv::descr_of(in_mat2));
+
+    cvtT.apply(in_mat1, in_mat2, out_mat_typed1);
+    cvtTC(in_mat1, in_mat2, out_mat_typed2);
+
+    // Plain OpenCV ////////////////////////////////////////////////////////////
+    cv::add(in_mat1, in_mat2, out_mat_cv);
+
+    // Comparison //////////////////////////////////////////////////////////////
+    // FIXME: There must be OpenCV comparison test functions already available!
+    cv::Mat
+        diff_u  = diff(out_mat_cv, out_mat_untyped, 0),
+        diff_t  = diff(out_mat_cv, out_mat_typed1,  0),
+        diff_tc = diff(out_mat_cv, out_mat_typed2,  0);
+
+    EXPECT_EQ(0, cv::countNonZero(diff_u));
+    EXPECT_EQ(0, cv::countNonZero(diff_t));
+    EXPECT_EQ(0, cv::countNonZero(diff_tc));
+}
+
+
+TEST(GAPI_Typed, MultipleOuts)
+{
+    // Initialization //////////////////////////////////////////////////////////
+    const cv::Size sz(32, 32);
+    cv::Mat
+        in_mat        (sz, CV_8UC1),
+        out_mat_unt1  (sz, CV_8UC1),
+        out_mat_unt2  (sz, CV_8UC1),
+        out_mat_typed1(sz, CV_8UC1),
+        out_mat_typed2(sz, CV_8UC1),
+        out_mat_comp1 (sz, CV_8UC1),
+        out_mat_comp2 (sz, CV_8UC1),
+        out_mat_cv1   (sz, CV_8UC1),
+        out_mat_cv2   (sz, CV_8UC1);
+    cv::randu(in_mat, cv::Scalar::all(0), cv::Scalar::all(255));
+
+    // Untyped G-API ///////////////////////////////////////////////////////////
+    cv::GComputation cvtU([]()
+    {
+        cv::GMat in;
+        cv::GMat out1 = in * 2.f;
+        cv::GMat out2 = in * 4.f;
+        return cv::GComputation({in}, {out1, out2});
+    });
+    std::vector<cv::Mat> u_ins  = {in_mat};
+    std::vector<cv::Mat> u_outs = {out_mat_unt1, out_mat_unt2};
+    cvtU.apply(u_ins, u_outs);
+
+    // Typed G-API /////////////////////////////////////////////////////////////
+    cv::GComputationT<std::tuple<cv::GMat, cv::GMat> (cv::GMat)> cvtT([](cv::GMat in)
+    {
+        return std::make_tuple(in*2.f, in*4.f);
+    });
+    auto cvtTC = cvtT.compile(cv::descr_of(in_mat));
+
+    cvtT.apply(in_mat, out_mat_typed1, out_mat_typed2);
+    cvtTC(in_mat, out_mat_comp1, out_mat_comp2);
+
+    // Plain OpenCV ////////////////////////////////////////////////////////////
+    out_mat_cv1 = in_mat * 2.f;
+    out_mat_cv2 = in_mat * 4.f;
+
+    // Comparison //////////////////////////////////////////////////////////////
+    // FIXME: There must be OpenCV comparison test functions already available!
+    cv::Mat
+        diff_u1 = diff(out_mat_cv1, out_mat_unt1,   0),
+        diff_u2 = diff(out_mat_cv2, out_mat_unt2,   0),
+        diff_t1 = diff(out_mat_cv1, out_mat_typed1, 0),
+        diff_t2 = diff(out_mat_cv2, out_mat_typed2, 0),
+        diff_c1 = diff(out_mat_cv1, out_mat_comp1,  0),
+        diff_c2 = diff(out_mat_cv2, out_mat_comp2,  0);
+
+    EXPECT_EQ(0, cv::countNonZero(diff_u1));
+    EXPECT_EQ(0, cv::countNonZero(diff_u2));
+    EXPECT_EQ(0, cv::countNonZero(diff_t1));
+    EXPECT_EQ(0, cv::countNonZero(diff_t2));
+    EXPECT_EQ(0, cv::countNonZero(diff_c1));
+    EXPECT_EQ(0, cv::countNonZero(diff_c2));
+}
+
+} // opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_util_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gapi_util_tests.cpp
new file mode 100644 (file)
index 0000000..574c0ab
--- /dev/null
@@ -0,0 +1,43 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+#include <type_traits>
+
+#include "opencv2/gapi/util/util.hpp"
+
+namespace opencv_test
+{
+
+TEST(GAPIUtil, AllSatisfy)
+{
+    static_assert(true == cv::detail::all_satisfy<std::is_integral, long, int, char>::value,
+                  "[long, int, char] are all integral types");
+    static_assert(true == cv::detail::all_satisfy<std::is_integral, char>::value,
+                  "char is an integral type");
+
+    static_assert(false == cv::detail::all_satisfy<std::is_integral, float, int, char>::value,
+                  "[float, int, char] are NOT all integral types");
+    static_assert(false == cv::detail::all_satisfy<std::is_integral, int, char, float>::value,
+                  "[int, char, float] are NOT all integral types");
+    static_assert(false == cv::detail::all_satisfy<std::is_integral, float>::value,
+                  "float is not an integral types");
+}
+
+TEST(GAPIUtil, AllButLast)
+{
+    using test1 = cv::detail::all_but_last<long, int, float>::type;
+    static_assert(true == cv::detail::all_satisfy<std::is_integral, test1>::value,
+                  "[long, int] are all integral types (float skipped)");
+
+    using test2 = cv::detail::all_but_last<int, float, char>::type;
+    static_assert(false == cv::detail::all_satisfy<std::is_integral, test2>::value,
+                  "[int, float] are NOT all integral types");
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_core_tests_gpu.cpp
new file mode 100644 (file)
index 0000000..6c331c0
--- /dev/null
@@ -0,0 +1,395 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "../test_precomp.hpp"
+#include "../common/gapi_core_tests.hpp"
+#include "opencv2/gapi/gpu/core.hpp"
+
+#define CORE_GPU cv::gapi::core::gpu::kernels()
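+// Kernel package handed to every instantiation below via cv::compile_args().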
+
+namespace opencv_test
+{
+
+// FIXME: Wut? See MulTestGPU/MathOpTest below (duplicate?)
+INSTANTIATE_TEST_CASE_P(AddTestGPU, MathOpTest,
+                        Combine(Values(ADD, MUL),
+                                testing::Bool(),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(1.0),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+    /*init output matrices or not*/ testing::Bool(),
+                                Values(false),
+                                Values(cv::compile_args(CORE_GPU))),
+                        opencv_test::PrintMathOpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(MulTestGPU, MathOpTest,
+                        Combine(Values(MUL),
+                                testing::Bool(),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(1.0, 0.5, 2.0),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+    /*init output matrices or not*/ testing::Bool(),
+                                Values(false),
+                                Values(cv::compile_args(CORE_GPU))),
+                        opencv_test::PrintMathOpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(SubTestGPU, MathOpTest,
+                        Combine(Values(SUB),
+                                testing::Bool(),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(1.0),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+    /*init output matrices or not*/ testing::Bool(),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))),
+                        opencv_test::PrintMathOpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(DivTestGPU, MathOpTest,
+                        Combine(Values(DIV),
+                                testing::Bool(),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(1.0, 0.5, 2.0),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+    /*init output matrices or not*/ testing::Bool(),
+                                testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))),
+                        opencv_test::PrintMathOpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(MulTestGPU, MulDoubleTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+    /*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(DivTestGPU, DivTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+    /*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(DivCTestGPU, DivCTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+    /*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(MeanTestGPU, MeanTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+    /*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+//TODO: mask test doesn't work
+#if 0
+INSTANTIATE_TEST_CASE_P(MaskTestGPU, MaskTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+#endif
+
+INSTANTIATE_TEST_CASE_P(SelectTestGPU, SelectTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Polar2CartGPU, Polar2CartTest,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Cart2PolarGPU, Cart2PolarTest,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(CompareTestGPU, CmpTest,
+                        Combine(Values(CMP_EQ, CMP_GE, CMP_NE, CMP_GT, CMP_LT, CMP_LE),
+                                testing::Bool(),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))),
+                        opencv_test::PrintCmpCoreParams());
+
+INSTANTIATE_TEST_CASE_P(BitwiseTestGPU, BitwiseTest,
+                        Combine(Values(AND, OR, XOR),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))),
+                        opencv_test::PrintBWCoreParams());
+
+INSTANTIATE_TEST_CASE_P(BitwiseNotTestGPU, NotTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+ /*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(MinTestGPU, MinTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(MaxTestGPU, MaxTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(SumTestGPU, SumTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(1e-3), //TODO: too relaxed?
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(AbsDiffTestGPU, AbsDiffTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(AbsDiffCTestGPU, AbsDiffCTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+// FIXME: Comparison introduced by YL doesn't work with C3
+INSTANTIATE_TEST_CASE_P(AddWeightedTestGPU, AddWeightedTest,
+                        Combine(Values( CV_8UC1/*, CV_8UC3*/, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values( -1, CV_8U, CV_16U, CV_32F ),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(0.50005),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(NormTestGPU, NormTest,
+                        Combine(Values(NORM_INF, NORM_L1, NORM_L2),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(1e-3), //TODO: too relaxed?
+                                Values(cv::compile_args(CORE_GPU))),
+                        opencv_test::PrintNormCoreParams());
+
+INSTANTIATE_TEST_CASE_P(IntegralTestGPU, IntegralTest,
+                        Combine(Values( CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ThresholdTestGPU, ThresholdTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::THRESH_BINARY, cv::THRESH_BINARY_INV, cv::THRESH_TRUNC, cv::THRESH_TOZERO, cv::THRESH_TOZERO_INV),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ThresholdTestGPU, ThresholdOTTest,
+                        Combine(Values(CV_8UC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::THRESH_OTSU, cv::THRESH_TRIANGLE),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+
+INSTANTIATE_TEST_CASE_P(InRangeTestGPU, InRangeTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Split3TestGPU, Split3Test,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Split4TestGPU, Split4Test,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ResizeTestGPU, ResizeTest,
+                        Combine(Values(AbsSimilarPoints(2, 0.05).to_compare_f()),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::Size(64,64),
+                                       cv::Size(30,30)),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ResizeTestGPU, ResizeTestFxFy,
+                        Combine(Values(AbsSimilarPoints(2, 0.05).to_compare_f()),
+                                Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_AREA),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(0.5, 0.1),
+                                Values(0.5, 0.1),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Merge3TestGPU, Merge3Test,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Merge4TestGPU, Merge4Test,
+                        Combine(Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(RemapTestGPU, RemapTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(FlipTestGPU, FlipTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(0,1,-1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(CropTestGPU, CropTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Rect(10, 8, 20, 35), cv::Rect(4, 10, 37, 50)),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(LUTTestGPU, LUTTest,
+                        Combine(Values(CV_8UC1, CV_8UC3),
+                                Values(CV_8UC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(LUTTestCustomGPU, LUTTest,
+                        Combine(Values(CV_8UC3),
+                                Values(CV_8UC3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ConvertToGPU, ConvertToTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(CV_8U, CV_16U, CV_16S, CV_32F),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ConcatHorTestGPU, ConcatHorTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ConcatVertTestGPU, ConcatVertTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_GPU))));
+
+//TODO: fix this backend to allow ConcatVertVec and ConcatHorVec
+#if 0
+INSTANTIATE_TEST_CASE_P(ConcatVertVecTestGPU, ConcatVertVecTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ConcatHorVecTestGPU, ConcatHorVecTest,
+                        Combine(Values( CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1 ),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::compile_args(CORE_GPU))));
+#endif
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_imgproc_tests_gpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_imgproc_tests_gpu.cpp
new file mode 100644 (file)
index 0000000..65d452c
--- /dev/null
@@ -0,0 +1,227 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "../test_precomp.hpp"
+
+#include "../common/gapi_imgproc_tests.hpp"
+#include "opencv2/gapi/gpu/imgproc.hpp"
+
+#define IMGPROC_GPU cv::gapi::imgproc::gpu::kernels()
+
+namespace opencv_test
+{
+
+
+INSTANTIATE_TEST_CASE_P(Filter2DTestGPU, Filter2DTest,
+                        Combine(Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3, 4, 5, 7),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(cv::BORDER_DEFAULT),
+                                Values(-1, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(BoxFilterTestGPU, BoxFilterTest,
+                        Combine(Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_f()),
+                                Values(/*CV_8UC1,*/ CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3,5),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(cv::BORDER_DEFAULT),
+                                Values(-1, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));  //TODO: 8UC1 doesn't work
+
+INSTANTIATE_TEST_CASE_P(SepFilterTestGPU_8U, SepFilterTest,
+                        Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3),
+                                Values(3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(-1, CV_16S, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(SepFilterTestGPU_other, SepFilterTest,
+                        Combine(Values(ToleranceFilter(1e-4f, 0.01).to_compare_f()),
+                                Values(CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(-1, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(BlurTestGPU, BlurTest,
+                        Combine(Values(Tolerance_FloatRel_IntAbs(1e-4, 2).to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3,5),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(cv::BORDER_DEFAULT),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(gaussBlurTestGPU, GaussianBlurTest,
+                        Combine(Values(ToleranceFilter(1e-5f, 0.01).to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3),  // FIXIT 5
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(MedianBlurTestGPU, MedianBlurTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3, 5),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(ErodeTestGPU, ErodeTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3, 5),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(cv::MorphShapes::MORPH_RECT,
+                                       cv::MorphShapes::MORPH_CROSS,
+                                       cv::MorphShapes::MORPH_ELLIPSE),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Erode3x3TestGPU, Erode3x3Test,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(1,2,4),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(DilateTestGPU, DilateTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(3, 5),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(cv::MorphShapes::MORPH_RECT,
+                                       cv::MorphShapes::MORPH_CROSS,
+                                       cv::MorphShapes::MORPH_ELLIPSE),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(Dilate3x3TestGPU, Dilate3x3Test,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(1,2,4),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(SobelTestGPU, SobelTest,
+                        Combine(Values(Tolerance_FloatRel_IntAbs(1e-4, 2).to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1/*, CV_32FC1*/), //TODO: CV_32FC1 fails accuracy
+                                Values(3, 5),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(-1, CV_32F),
+                                Values(0, 1),
+                                Values(1, 2),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(EqHistTestGPU, EqHistTest,
+                        Combine(Values(AbsExact().to_compare_f()),  // FIXIT: unreliable check
+                                Values(cv::Size(1280, 720),
+                                cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(CannyTestGPU, CannyTest,
+                        Combine(Values(AbsSimilarPoints(0, 0.05).to_compare_f()),
+                                Values(CV_8UC1, CV_8UC3),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+                                Values(3.0, 120.0),
+                                Values(125.0, 240.0),
+                                Values(3, 5),
+                                testing::Bool(),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(RGB2GrayTestGPU, RGB2GrayTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(BGR2GrayTestGPU, BGR2GrayTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(RGB2YUVTestGPU, RGB2YUVTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(YUV2RGBTestGPU, YUV2RGBTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(RGB2LabTestGPU, RGB2LabTest,
+                        Combine(Values(AbsSimilarPoints(1, 0.05).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(BGR2LUVTestGPU, BGR2LUVTest,
+                        Combine(Values(ToleranceColor(5e-3, 6).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(LUV2BGRTestGPU, LUV2BGRTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(BGR2YUVTestGPU, BGR2YUVTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+INSTANTIATE_TEST_CASE_P(YUV2BGRTestGPU, YUV2BGRTest,
+                        Combine(Values(ToleranceColor(1e-3).to_compare_f()),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(IMGPROC_GPU))));
+
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_operators_tests_gpu.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/gpu/gapi_operators_tests_gpu.cpp
new file mode 100644 (file)
index 0000000..5a116bd
--- /dev/null
@@ -0,0 +1,72 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "../test_precomp.hpp"
+#include "../common/gapi_operators_tests.hpp"
+#include "opencv2/gapi/gpu/core.hpp"
+
+#define CORE_GPU cv::gapi::core::gpu::kernels()
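+// CORE_GPU names the GPU core kernel package; each instantiation below passes it to the graph via cv::compile_args(CORE_GPU).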
+
+namespace opencv_test
+{
+
+
+INSTANTIATE_TEST_CASE_P(MathOperatorTestGPU, MathOperatorMatMatTest,
+                    Combine(Values(Tolerance_FloatRel_IntAbs(1e-5, 2).to_compare_f()),
+                            Values( opPlusM, opMinusM, opDivM,
+                                    opGreater, opLess, opGreaterEq, opLessEq, opEq, opNotEq),
+                            Values(CV_8UC1, CV_16SC1, CV_32FC1),
+                            Values(cv::Size(1280, 720),
+                                   cv::Size(640, 480),
+                                   cv::Size(128, 128)),
+                            Values(-1, CV_8U, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                            Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(MathOperatorTestGPU, MathOperatorMatScalarTest,
+                        Combine(Values(Tolerance_FloatRel_IntAbs(1e-4, 2).to_compare_f()),
+                                Values( opPlus, opPlusR, opMinus, opMinusR, opMul, opMulR,  // FIXIT avoid division by values near zero: opDiv, opDivR,
+                                        opGT, opLT, opGE, opLE, opEQ, opNE,
+                                        opGTR, opLTR, opGER, opLER, opEQR, opNER),
+                                Values(CV_8UC1, CV_16SC1, CV_32FC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1, CV_8U, CV_32F),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestGPU, MathOperatorMatMatTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values( opAnd, opOr, opXor ),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(BitwiseOperatorTestGPU, MathOperatorMatScalarTest,
+                        Combine(Values(AbsExact().to_compare_f()),
+                                Values( opAND, opOR, opXOR, opANDR, opORR, opXORR ),
+                                Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+                                Values(-1),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+
+INSTANTIATE_TEST_CASE_P(BitwiseNotOperatorTestGPU, NotOperatorTest,
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                                Values(cv::Size(1280, 720),
+                                       cv::Size(640, 480),
+                                       cv::Size(128, 128)),
+/*init output matrices or not*/ testing::Bool(),
+                                Values(cv::compile_args(CORE_GPU))));
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_backend_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_backend_tests.cpp
new file mode 100644 (file)
index 0000000..67b6273
--- /dev/null
@@ -0,0 +1,86 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "gapi_mock_kernels.hpp"
+
+#include "compiler/gmodel.hpp"
+#include "compiler/gcompiler.hpp"
+
+namespace opencv_test {
+
+namespace {
+
+struct MockMeta
+{
+    static const char* name() { return "MockMeta"; }
+};
+
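+// A stub backend: unpackKernel() and compile() do nothing; its only observable
+// effect is the "set_mock_meta" pass it installs, which tags every graph node with MockMeta.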
+class GMockBackendImpl final: public cv::gapi::GBackend::Priv
+{
+    virtual void unpackKernel(ade::Graph            &,
+                              const ade::NodeHandle &,
+                              const cv::GKernelImpl &) override
+    {
+        // Do nothing here
+    }
+
+    virtual EPtr compile(const ade::Graph &,
+                         const cv::GCompileArgs &,
+                         const std::vector<ade::NodeHandle> &) const override
+    {
+        // Do nothing here as well
+        return {};
+    }
+
+    virtual void addBackendPasses(ade::ExecutionEngineSetupContext &ectx) override
+    {
+        ectx.addPass("transform", "set_mock_meta", [](ade::passes::PassContext &ctx) {
+                ade::TypedGraph<MockMeta> me(ctx.graph);
+                for (const auto &nh : me.nodes())
+                {
+                    me.metadata(nh).set(MockMeta{});
+                }
+            });
+    }
+};
+
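+// The backend instance reported by the mock kernel below.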
+static cv::gapi::GBackend mock_backend(std::make_shared<GMockBackendImpl>());
+
+GAPI_OCV_KERNEL(MockFoo, I::Foo)
+{
+    static void run(const cv::Mat &, cv::Mat &) { /*Do nothing*/ }
+    static cv::gapi::GBackend backend() { return mock_backend; } // FIXME: Must be removed
+};
+
+} // anonymous namespace
+
+TEST(GBackend, CustomPassesExecuted)
+{
+    cv::GMat in;
+    cv::GMat out = I::Foo::on(in);
+    cv::GComputation c(in, out);
+
+    // Prepare compilation parameters manually
+    const auto in_meta = cv::GMetaArg(cv::GMatDesc{CV_8U,1,cv::Size(32,32)});
+    const auto pkg     = cv::gapi::kernels<MockFoo>();
+
+    // Directly instantiate G-API graph compiler and run partial compilation
+    cv::gimpl::GCompiler compiler(c, {in_meta}, cv::compile_args(pkg));
+    cv::gimpl::GCompiler::GPtr graph = compiler.generateGraph();
+    compiler.runPasses(*graph);
+
+    // Inspect the graph and verify the metadata written by Mock backend
+    ade::TypedGraph<MockMeta> me(*graph);
+    EXPECT_LT(0u, static_cast<std::size_t>(me.nodes().size()));
+    for (const auto &nh : me.nodes())
+    {
+        EXPECT_TRUE(me.metadata(nh).contains<MockMeta>());
+    }
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_executor_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_executor_tests.cpp
new file mode 100644 (file)
index 0000000..20aad89
--- /dev/null
@@ -0,0 +1,83 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+namespace opencv_test
+{
+
+// FIXME: avoid code duplication
+// The below graph and config is taken from ComplexIslands test suite
+TEST(GExecutor, SmokeTest)
+{
+    cv::GMat    in[2];
+    cv::GMat    tmp[4];
+    cv::GScalar scl;
+    cv::GMat    out[2];
+
+    tmp[0] = cv::gapi::bitwise_not(cv::gapi::bitwise_not(in[0]));
+    tmp[1] = cv::gapi::boxFilter(in[1], -1, cv::Size(3,3));
+    tmp[2] = tmp[0] + tmp[1]; // FIXME: handle tmp[2] = tmp[0]+tmp[2] typo
+    scl    = cv::gapi::sum(tmp[1]);
+    tmp[3] = cv::gapi::medianBlur(tmp[1], 3);
+    out[0] = tmp[2] + scl;
+    out[1] = cv::gapi::boxFilter(tmp[3], -1, cv::Size(3,3));
+
+    //       isl0                                         #internal1
+    //       ...........................                  .........
+    // (in1) -> NotNot ->(tmp0) --> Add ---------> (tmp2) --> AddC -------> (out1)
+    //       :.....................^...:                  :..^....:
+    //                             :                         :
+    //                             :                         :
+    //      #internal0             :                         :
+    //        .....................:.........                :
+    // (in2) -> Blur -> (tmp1) ----'--> Sum ----> (scl0) ----'
+    //        :..........:..................:                  isl1
+    //                   :           ..............................
+    //                   `------------> Median -> (tmp3) --> Blur -------> (out2)
+    //                               :............................:
+
+    cv::gapi::island("isl0", cv::GIn(in[0], tmp[1]),  cv::GOut(tmp[2]));
+    cv::gapi::island("isl1", cv::GIn(tmp[1]), cv::GOut(out[1]));
+
+    cv::Mat in_mat1 = cv::Mat::eye(32, 32, CV_8UC1);
+    cv::Mat in_mat2 = cv::Mat::eye(32, 32, CV_8UC1);
+    cv::Mat out_gapi[2];
+
+    // Run G-API:
+    cv::GComputation(cv::GIn(in[0],   in[1]),    cv::GOut(out[0],      out[1]))
+              .apply(cv::gin(in_mat1, in_mat2),  cv::gout(out_gapi[0], out_gapi[1]));
+
+    // Run OpenCV
+    cv::Mat out_ocv[2];
+    {
+        cv::Mat    ocv_tmp0;
+        cv::Mat    ocv_tmp1;
+        cv::Mat    ocv_tmp2;
+        cv::Mat    ocv_tmp3;
+        cv::Scalar ocv_scl;
+
+        ocv_tmp0 = in_mat1; // bitwise_not applied twice is an identity, so reuse the input
+        cv::boxFilter(in_mat2, ocv_tmp1, -1, cv::Size(3,3));
+        ocv_tmp2 = ocv_tmp0 + ocv_tmp1;
+        ocv_scl  = cv::sum(ocv_tmp1);
+        cv::medianBlur(ocv_tmp1, ocv_tmp3, 3);
+        out_ocv[0] = ocv_tmp2 + ocv_scl;
+        cv::boxFilter(ocv_tmp3, out_ocv[1], -1, cv::Size(3,3));
+    }
+
+    EXPECT_EQ(0, cv::countNonZero(out_gapi[0] != out_ocv[0]));
+    EXPECT_EQ(0, cv::countNonZero(out_gapi[1] != out_ocv[1]));
+
+    // FIXME: check that GIslandModel has more than 1 island (e.g. fusion
+    // with breakdown worked)
+}
+
+// FIXME: Add explicit tests on GMat/GScalar/GArray<T> being connectors
+// between executed islands
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_garg_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_garg_test.cpp
new file mode 100644 (file)
index 0000000..67696db
--- /dev/null
@@ -0,0 +1,100 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+namespace opencv_test {
+// Tests on T/Kind matching ////////////////////////////////////////////////////
+// {{
+
+template<class T, cv::detail::ArgKind Exp>
+struct Expected
+{
+    using type = T;
+    static const constexpr cv::detail::ArgKind kind = Exp;
+};
+
+template<typename T>
+struct GArgKind: public ::testing::Test
+{
+    using Type = typename T::type;
+    const cv::detail::ArgKind Kind = T::kind;
+};
+
+// The point here is to list types and their kinds _manually_
+// (and NOT to reuse cv::detail::ArgKind::Traits<>, since that is the subject under test)
+using GArg_Test_Types = ::testing::Types
+   <
+  // G-API types
+     Expected<cv::GMat,                 cv::detail::ArgKind::GMAT>
+   , Expected<cv::GScalar,              cv::detail::ArgKind::GSCALAR>
+   , Expected<cv::GArray<int>,          cv::detail::ArgKind::GARRAY>
+   , Expected<cv::GArray<float>,        cv::detail::ArgKind::GARRAY>
+   , Expected<cv::GArray<cv::Point>,    cv::detail::ArgKind::GARRAY>
+   , Expected<cv::GArray<cv::Rect>,     cv::detail::ArgKind::GARRAY>
+
+ // Built-in types
+   , Expected<int,                      cv::detail::ArgKind::OPAQUE>
+   , Expected<float,                    cv::detail::ArgKind::OPAQUE>
+   , Expected<int*,                     cv::detail::ArgKind::OPAQUE>
+   , Expected<cv::Point,                cv::detail::ArgKind::OPAQUE>
+   , Expected<std::string,              cv::detail::ArgKind::OPAQUE>
+   , Expected<cv::Mat,                  cv::detail::ArgKind::OPAQUE>
+   , Expected<std::vector<int>,         cv::detail::ArgKind::OPAQUE>
+   , Expected<std::vector<cv::Point>,   cv::detail::ArgKind::OPAQUE>
+   >;
+
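+// Instantiate the GArgKind typed test suite for every Expected<> entry listed above.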
+TYPED_TEST_CASE(GArgKind, GArg_Test_Types);
+
+TYPED_TEST(GArgKind, LocalVar)
+{
+    typename TestFixture::Type val{};
+    cv::GArg arg(val);
+    EXPECT_EQ(TestFixture::Kind, arg.kind);
+}
+
+TYPED_TEST(GArgKind, ConstLocalVar)
+{
+    const typename TestFixture::Type val{};
+    cv::GArg arg(val);
+    EXPECT_EQ(TestFixture::Kind, arg.kind);
+}
+
+TYPED_TEST(GArgKind, RValue)
+{
+    cv::GArg arg = cv::GArg(typename TestFixture::Type());
+    EXPECT_EQ(TestFixture::Kind, arg.kind);
+}
+
+// }}
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(GArg, HasWrap)
+{
+    static_assert(!cv::detail::has_custom_wrap<cv::GMat>::value,
+                  "GMat has no custom marshalling logic");
+    static_assert(!cv::detail::has_custom_wrap<cv::GScalar>::value,
+                  "GScalar has no custom marshalling logic");
+
+    static_assert(cv::detail::has_custom_wrap<cv::GArray<int> >::value,
+                  "GArray<int> has custom marshalling logic");
+    static_assert(cv::detail::has_custom_wrap<cv::GArray<std::string> >::value,
+                  "GArray<int> has custom marshalling logic");
+}
+
+TEST(GArg, GArrayU)
+{
+    // Placing a GArray<T> into GArg automatically strips it to GArrayU
+    cv::GArg arg1 = cv::GArg(cv::GArray<int>());
+    EXPECT_NO_THROW(arg1.get<cv::detail::GArrayU>());
+
+    cv::GArg arg2 = cv::GArg(cv::GArray<cv::Point>());
+    EXPECT_NO_THROW(arg2.get<cv::detail::GArrayU>());
+}
+
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_gmetaarg_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_gmetaarg_test.cpp
new file mode 100644 (file)
index 0000000..6dbf777
--- /dev/null
@@ -0,0 +1,136 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+#include "api/gcomputation_priv.hpp"
+
+namespace opencv_test
+{
+
+TEST(GMetaArg, Traits_Is_Positive)
+{
+    using namespace cv::detail;
+
+    static_assert(is_meta_descr<cv::GScalarDesc>::value,
+                  "GScalarDesc is a meta description type");
+
+    static_assert(is_meta_descr<cv::GMatDesc>::value,
+                  "GMatDesc is a meta description type");
+}
+
+TEST(GMetaArg, Traits_Is_Negative)
+{
+    using namespace cv::detail;
+
+    static_assert(!is_meta_descr<cv::GCompileArgs>::value,
+                  "GCompileArgs is NOT a meta description type");
+
+    static_assert(!is_meta_descr<int>::value,
+                  "int is NOT a meta description type");
+
+    static_assert(!is_meta_descr<std::string>::value,
+                  "str::string is NOT a meta description type");
+}
+
+TEST(GMetaArg, Traits_Are_EntireList_Positive)
+{
+    using namespace cv::detail;
+
+    static_assert(are_meta_descrs<cv::GScalarDesc>::value,
+                  "GScalarDesc is a meta description type");
+
+    static_assert(are_meta_descrs<cv::GMatDesc>::value,
+                  "GMatDesc is a meta description type");
+
+    static_assert(are_meta_descrs<cv::GMatDesc, cv::GScalarDesc>::value,
+                  "Both GMatDesc and GScalarDesc are meta types");
+}
+
+TEST(GMetaArg, Traits_Are_EntireList_Negative)
+{
+    using namespace cv::detail;
+
+    static_assert(!are_meta_descrs<cv::GCompileArgs>::value,
+                  "GCompileArgs is NOT among meta types");
+
+    static_assert(!are_meta_descrs<int, std::string>::value,
+                  "Both int and std::string is NOT among meta types");
+
+    static_assert(!are_meta_descrs<cv::GMatDesc, cv::GScalarDesc, int>::value,
+                  "List of type is not valid for meta as there\'s int");
+
+    static_assert(!are_meta_descrs<cv::GMatDesc, cv::GScalarDesc, cv::GCompileArgs>::value,
+                  "List of type is not valid for meta as there\'s GCompileArgs");
+}
+
+TEST(GMetaArg, Traits_Are_ButLast_Positive)
+{
+    using namespace cv::detail;
+
+    static_assert(are_meta_descrs_but_last<cv::GScalarDesc, int>::value,
+                  "List is valid (int is ommitted)");
+
+    static_assert(are_meta_descrs_but_last<cv::GMatDesc, cv::GScalarDesc, cv::GCompileArgs>::value,
+                  "List is valid (GCompileArgs are omitted)");
+}
+
+TEST(GMetaArg, Traits_Are_ButLast_Negative)
+{
+    using namespace cv::detail;
+
+    static_assert(!are_meta_descrs_but_last<int, std::string>::value,
+                  "Both int is NOT among meta types (std::string is omitted)");
+
+    static_assert(!are_meta_descrs_but_last<cv::GMatDesc, cv::GScalarDesc, int, int>::value,
+                  "List of type is not valid for meta as there\'s two ints");
+
+    static_assert(!are_meta_descrs_but_last<cv::GMatDesc, cv::GScalarDesc, cv::GCompileArgs, float>::value,
+                  "List of type is not valid for meta as there\'s GCompileArgs");
+}
+
+TEST(GMetaArg, Can_Get_Metas_From_Input_Run_Args)
+{
+    cv::Mat m(3, 3, CV_8UC3);
+    cv::Scalar s;
+    std::vector<int> v;
+
+    GMatDesc m_desc;
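+    // descr_of() yields one GMetaArg per run-time argument: GMatDesc for Mat, GScalarDesc for Scalar, GArrayDesc for vector.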
+    GMetaArgs meta_args = descr_of(cv::gin(m, s, v));
+
+    EXPECT_EQ(meta_args.size(), 3u);
+    EXPECT_NO_THROW(m_desc = util::get<cv::GMatDesc>(meta_args[0]));
+    EXPECT_NO_THROW(util::get<cv::GScalarDesc>(meta_args[1]));
+    EXPECT_NO_THROW(util::get<cv::GArrayDesc>(meta_args[2]));
+
+    EXPECT_EQ(CV_8U, m_desc.depth);
+    EXPECT_EQ(3, m_desc.chan);
+    EXPECT_EQ(cv::gapi::own::Size(3, 3), m_desc.size);
+}
+
+TEST(GMetaArg, Can_Get_Metas_From_Output_Run_Args)
+{
+    cv::Mat m(3, 3, CV_8UC3);
+    cv::Scalar s;
+    std::vector<int> v;
+
+    GMatDesc m_desc;
+    GRunArgsP out_run_args = cv::gout(m, s, v);
+    GMetaArg m_meta = descr_of(out_run_args[0]);
+    GMetaArg s_meta = descr_of(out_run_args[1]);
+    GMetaArg v_meta = descr_of(out_run_args[2]);
+
+    EXPECT_NO_THROW(m_desc = util::get<cv::GMatDesc>(m_meta));
+    EXPECT_NO_THROW(util::get<cv::GScalarDesc>(s_meta));
+    EXPECT_NO_THROW(util::get<cv::GArrayDesc>(v_meta));
+
+    EXPECT_EQ(CV_8U, m_desc.depth);
+    EXPECT_EQ(3, m_desc.chan);
+    EXPECT_EQ(cv::Size(3, 3), m_desc.size);
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_gmodel_builder_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_gmodel_builder_test.cpp
new file mode 100644 (file)
index 0000000..a815e0d
--- /dev/null
@@ -0,0 +1,364 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+#include <ade/util/zip_range.hpp>   // util::indexed
+
+#include "opencv2/gapi/gkernel.hpp"
+#include "compiler/gmodelbuilder.hpp"
+#include "compiler/gmodel.hpp" // RcDesc, GModel::init
+
+namespace opencv_test
+{
+
+namespace test
+{
+
+namespace
+{
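+    // Dummy operations used to build test graphs: each wraps a GCall around a mock kernel id and yields a single GMat output.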
+    cv::GMat unaryOp(cv::GMat m)
+    {
+        return cv::GCall(cv::GKernel{"gapi.test.unaryop", nullptr, { GShape::GMAT } }).pass(m).yield(0);
+    }
+
+    cv::GMat binaryOp(cv::GMat m1, cv::GMat m2)
+    {
+        return cv::GCall(cv::GKernel{"gapi.test.binaryOp", nullptr, { GShape::GMAT } }).pass(m1, m2).yield(0);
+    }
+
+    std::vector<ade::NodeHandle> collectOperations(const cv::gimpl::GModel::Graph& gr)
+    {
+        std::vector<ade::NodeHandle> ops;
+        for (const auto& nh : gr.nodes())
+        {
+            if (gr.metadata(nh).get<cv::gimpl::NodeType>().t == cv::gimpl::NodeType::OP)
+                ops.push_back(nh);
+        }
+        return ops;
+    }
+
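+    // Returns the data node connected to the given input port of an op node; throws if the port is not found.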
+    ade::NodeHandle inputOf(cv::gimpl::GModel::Graph& gm, ade::NodeHandle nh, std::size_t port)
+    {
+        for (const auto& eh : nh->inEdges())
+        {
+            if (gm.metadata(eh).get<cv::gimpl::Input>().port == port)
+            {
+                return eh->srcNode();
+            }
+        }
+        util::throw_error(std::logic_error("port " + std::to_string(port) + " not found"));
+    }
+}
+}// namespace opencv_test::test
+
+TEST(GModelBuilder, Unroll_TestUnary)
+{
+    cv::GMat in;
+    cv::GMat out = test::unaryOp(in);
+
+    auto unrolled = cv::gimpl::unrollExpr(cv::GIn(in).m_args, cv::GOut(out).m_args);
+
+    EXPECT_EQ(1u, unrolled.all_ops.size());  // There is one operation
+    EXPECT_EQ(2u, unrolled.all_data.size()); // And two data objects (in, out)
+
+    // TODO: check what the operation is, and so on
+}
+
+TEST(GModelBuilder, Unroll_TestUnaryOfUnary)
+{
+    cv::GMat in;
+    cv::GMat out = test::unaryOp(test::unaryOp(in));
+
+    auto unrolled = cv::gimpl::unrollExpr(cv::GIn(in).m_args, cv::GOut(out).m_args);
+
+    EXPECT_EQ(2u, unrolled.all_ops.size());  // There are two operations
+    EXPECT_EQ(3u, unrolled.all_data.size()); // And three data objects (in, tmp, out)
+
+    // TODO: check what the operation is, and so on
+}
+
+TEST(GModelBuilder, Unroll_Not_All_Protocol_Inputs_Are_Reached)
+{
+    cv::GMat in1, in2;                                      // in1 -> unaryOp() -> u_op1 -> unaryOp() -> out
+    auto u_op1 = test::unaryOp(in1);                        // in2 -> unaryOp() -> u_op2
+    auto u_op2 = test::unaryOp(in2);
+    auto out   = test::unaryOp(u_op1);
+
+    EXPECT_THROW(cv::gimpl::unrollExpr(cv::GIn(in1, in2).m_args, cv::GOut(out).m_args), std::logic_error);
+}
+
+TEST(GModelBuilder, Unroll_Parallel_Path)
+{
+    cv::GMat in1, in2;                                      // in1 -> unaryOp() -> out1
+    auto out1 = test::unaryOp(in1);                         // in2 -> unaryOp() -> out2
+    auto out2 = test::unaryOp(in2);
+
+    auto unrolled = cv::gimpl::unrollExpr(cv::GIn(in1, in2).m_args, cv::GOut(out1, out2).m_args);
+
+    EXPECT_EQ(unrolled.all_ops.size(),  2u);
+    EXPECT_EQ(unrolled.all_data.size(), 4u);
+}
+
+TEST(GModelBuilder, Unroll_WithBranch)
+{
+    // in -> unaryOp() -> tmp -->unaryOp() -> out1
+    //                     `---->unaryOp() -> out2
+
+    GMat in;
+    auto tmp = test::unaryOp(in);
+    auto out1 = test::unaryOp(tmp);
+    auto out2 = test::unaryOp(tmp);
+
+    auto unrolled = cv::gimpl::unrollExpr(cv::GIn(in).m_args, cv::GOut(out1, out2).m_args);
+
+    EXPECT_EQ(unrolled.all_ops.size(),  3u);
+    EXPECT_EQ(unrolled.all_data.size(), 4u);
+}
+
+TEST(GModelBuilder, Build_Unary)
+{
+    cv::GMat in;
+    cv::GMat out = test::unaryOp(in);
+
+    ade::Graph g;
+    cv::gimpl::GModel::Graph gm(g);
+    cv::gimpl::GModel::init(gm);
+    cv::gimpl::GModelBuilder(g).put(cv::GIn(in).m_args, cv::GOut(out).m_args);
+
+    EXPECT_EQ(3u, static_cast<std::size_t>(g.nodes().size()));    // Generated graph should have three nodes
+
+    // TODO: Check what the nodes are
+}
+
+TEST(GModelBuilder, Constant_GScalar)
+{
+    // in -> addC()-----(GMat)---->mulC()-----(GMat)---->unaryOp()----out
+    //         ^                     ^
+    //         |                     |
+    // 3-------`           c_s-------'
+
+    cv::GMat in;
+    cv::GScalar c_s = 5;
+    auto out = test::unaryOp((in + 3) * c_s);    // 3 converted to GScalar
+
+    ade::Graph g;
+    cv::gimpl::GModel::Graph gm(g);
+    cv::gimpl::GModel::init(gm);
+    auto proto_slots = cv::gimpl::GModelBuilder(g).put(cv::GIn(in).m_args, cv::GOut(out).m_args);
+    cv::gimpl::Protocol p;
+    std::tie(p.inputs, p.outputs, p.in_nhs, p.out_nhs) = proto_slots;
+
+    auto in_nh   = p.in_nhs.front();
+    auto addC_nh = in_nh->outNodes().front();
+    auto mulC_nh = addC_nh->outNodes().front()->outNodes().front();
+
+    ASSERT_TRUE(gm.metadata(addC_nh).get<cv::gimpl::NodeType>().t == cv::gimpl::NodeType::OP);
+    ASSERT_TRUE(gm.metadata(mulC_nh).get<cv::gimpl::NodeType>().t == cv::gimpl::NodeType::OP);
+
+    auto s_3 = test::inputOf(gm, addC_nh, 1);
+    auto s_5 = test::inputOf(gm, mulC_nh, 1);
+
+    EXPECT_EQ(9u, static_cast<std::size_t>(g.nodes().size()));          // 6 data nodes (1 input, 1 output, 2 constants, 2 temps) and 3 op nodes
+    EXPECT_EQ(2u, static_cast<std::size_t>(addC_nh->inNodes().size())); // in and 3
+    EXPECT_EQ(2u, static_cast<std::size_t>(mulC_nh->inNodes().size())); // addC output and c_s
+    EXPECT_EQ(3, (util::get<cv::gapi::own::Scalar>(gm.metadata(s_3).get<cv::gimpl::ConstValue>().arg))[0]);
+    EXPECT_EQ(5, (util::get<cv::gapi::own::Scalar>(gm.metadata(s_5).get<cv::gimpl::ConstValue>().arg))[0]);
+}
+
+TEST(GModelBuilder, Check_Multiple_Outputs)
+{
+    //            ------------------------------> r
+    //            '
+    //            '                    -----------> i_out1
+    //            '                    '
+    // in ----> split3() ---> g ---> integral()
+    //            '                    '
+    //            '                    -----------> i_out2
+    //            '
+    //            '---------> b ---> unaryOp() ---> u_out
+
+    cv::GMat in, r, g, b, i_out1, i_out2, u_out;
+    std::tie(r, g, b) = cv::gapi::split3(in);
+    std::tie(i_out1, i_out2) = cv::gapi::integral(g, 1, 1);
+    u_out = test::unaryOp(b);
+
+    ade::Graph gr;
+    cv::gimpl::GModel::Graph gm(gr);
+    cv::gimpl::GModel::init(gm);
+    auto proto_slots = cv::gimpl::GModelBuilder(gr).put(cv::GIn(in).m_args, cv::GOut(r, i_out1, i_out2, u_out).m_args);
+    cv::gimpl::Protocol p;
+    std::tie(p.inputs, p.outputs, p.in_nhs, p.out_nhs) = proto_slots;
+
+    EXPECT_EQ(4u, static_cast<std::size_t>(p.out_nhs.size()));
+    EXPECT_EQ(0u, gm.metadata(p.out_nhs[0]->inEdges().front()).get<cv::gimpl::Output>().port);
+    EXPECT_EQ(0u, gm.metadata(p.out_nhs[1]->inEdges().front()).get<cv::gimpl::Output>().port);
+    EXPECT_EQ(1u, gm.metadata(p.out_nhs[2]->inEdges().front()).get<cv::gimpl::Output>().port);
+    EXPECT_EQ(0u, gm.metadata(p.out_nhs[3]->inEdges().front()).get<cv::gimpl::Output>().port);
+    for (const auto& it : ade::util::indexed(p.out_nhs))
+    {
+        const auto& out_nh = ade::util::value(it);
+
+        EXPECT_EQ(cv::gimpl::NodeType::DATA, gm.metadata(out_nh).get<cv::gimpl::NodeType>().t);
+        EXPECT_EQ(GShape::GMAT, gm.metadata(out_nh).get<cv::gimpl::Data>().shape);
+    }
+}
+
+TEST(GModelBuilder, Unused_Outputs)
+{
+    cv::GMat in;
+    auto yuv_p = cv::gapi::split3(in);
+
+    ade::Graph g;
+    cv::gimpl::GModel::Graph gm(g);
+    cv::gimpl::GModel::init(gm);
+    cv::gimpl::GModelBuilder(g).put(cv::GIn(in).m_args, cv::GOut(std::get<0>(yuv_p)).m_args);
+
+    EXPECT_EQ(5u, static_cast<std::size_t>(g.nodes().size()));    // 1 input, 1 operation, 3 outputs
+}
+
+TEST(GModelBuilder, Work_With_One_Channel_From_Split3)
+{
+    cv::GMat in, y, u, v;
+    std::tie(y, u, v) = cv::gapi::split3(in);
+    auto y_blur = cv::gapi::gaussianBlur(y, cv::Size(3, 3), 1);
+
+    ade::Graph g;
+    cv::gimpl::GModel::Graph gm(g);
+    cv::gimpl::GModel::init(gm);
+    cv::gimpl::GModelBuilder(g).put(cv::GIn(in).m_args, cv::GOut(y_blur).m_args);
+
+    EXPECT_EQ(7u, static_cast<std::size_t>(g.nodes().size())); // 1 input, 2 operations, 3 nodes from split3, 1 output
+}
+
+TEST(GModelBuilder, Add_Nodes_To_Unused_Nodes)
+{
+    cv::GMat in, y, u, v;
+    std::tie(y, u, v) = cv::gapi::split3(in);
+    auto y_blur = cv::gapi::gaussianBlur(y, cv::Size(3, 3), 1);
+    // unused nodes
+    auto u_blur = cv::gapi::gaussianBlur(y, cv::Size(3, 3), 1);
+    auto v_blur = cv::gapi::gaussianBlur(y, cv::Size(3, 3), 1);
+
+    ade::Graph g;
+    cv::gimpl::GModel::Graph gm(g);
+    cv::gimpl::GModel::init(gm);
+    cv::gimpl::GModelBuilder(g).put(cv::GIn(in).m_args, cv::GOut(y_blur).m_args);
+
+    EXPECT_EQ(7u, static_cast<std::size_t>(g.nodes().size())); // 1 input, 2 operations, 3 nodes from split3, 1 output
+}
+
+TEST(GModelBuilder, Unlisted_Inputs)
+{
+    // in1 -> binaryOp() -> out
+    //         ^
+    //         |
+    // in2 ----'
+
+    cv::GMat in1, in2;
+    auto out = test::binaryOp(in1, in2);
+
+    ade::Graph g;
+    cv::gimpl::GModel::Graph gm(g);
+    cv::gimpl::GModel::init(gm);
+    // add required 2 inputs but pass 1
+    EXPECT_THROW(cv::gimpl::GModelBuilder(g).put(cv::GIn(in1).m_args, cv::GOut(out).m_args), std::logic_error);
+}
+
+TEST(GModelBuilder, Unroll_No_Link_Between_In_And_Out)
+{
+    // in    -> unaryOp() -> u_op
+    // other -> unaryOp() -> out
+
+    cv::GMat in, other;
+    auto u_op = test::unaryOp(in);
+    auto out  = test::unaryOp(other);
+
+    EXPECT_THROW(cv::gimpl::unrollExpr(cv::GIn(in).m_args, cv::GOut(out).m_args), std::logic_error);
+}
+
+
+TEST(GModelBuilder, Check_Binary_Op)
+{
+    // in1 -> binaryOp() -> out
+    //          ^
+    //          |
+    // in2 -----'
+
+    cv::GMat in1, in2;
+    auto out = test::binaryOp(in1, in2);
+
+    ade::Graph g;
+    cv::gimpl::GModel::Graph gm(g);
+    cv::gimpl::GModel::init(gm);
+    auto proto_slots = cv::gimpl::GModelBuilder(g).put(cv::GIn(in1, in2).m_args, cv::GOut(out).m_args);
+
+    cv::gimpl::Protocol p;
+    std::tie(p.inputs, p.outputs, p.in_nhs, p.out_nhs) = proto_slots;
+    auto ops = test::collectOperations(g);
+
+    EXPECT_EQ(1u, ops.size());
+    EXPECT_EQ("gapi.test.binaryOp", gm.metadata(ops.front()).get<cv::gimpl::Op>().k.name);
+    EXPECT_EQ(2u, static_cast<std::size_t>(ops.front()->inEdges().size()));
+    EXPECT_EQ(1u, static_cast<std::size_t>(ops.front()->outEdges().size()));
+    EXPECT_EQ(1u, static_cast<std::size_t>(ops.front()->outNodes().size()));
+}
+
+TEST(GModelBuilder, Add_Operation_With_Two_Out_One_Time)
+{
+    // in -> integral() --> out_b1 -> unaryOp() -> out1
+    //            |             '---> unaryOp() -> out2
+    //            '-------> out_b2 (unused)
+
+    cv::GMat in, out_b1, out_b2;
+    std::tie(out_b1, out_b2) = cv::gapi::integral(in, 1, 1);
+    auto out1 = test::unaryOp(out_b1);
+    auto out2 = test::unaryOp(out_b1);
+
+    ade::Graph g;
+    cv::gimpl::GModel::Graph gm(g);
+    cv::gimpl::GModel::init(gm);
+    auto proto_slots = cv::gimpl::GModelBuilder(g).put(cv::GIn(in).m_args, cv::GOut(out1, out2).m_args);
+
+    auto ops = test::collectOperations(gm);
+
+    cv::gimpl::Protocol p;
+    std::tie(p.inputs, p.outputs, p.in_nhs, p.out_nhs) = proto_slots;
+    auto integral_nh = p.in_nhs.front()->outNodes().front();
+
+    EXPECT_EQ(3u, ops.size());
+    EXPECT_EQ("org.opencv.core.matrixop.integral", gm.metadata(integral_nh).get<cv::gimpl::Op>().k.name);
+    EXPECT_EQ(1u, static_cast<std::size_t>(integral_nh->inEdges().size()));
+    EXPECT_EQ(2u, static_cast<std::size_t>(integral_nh->outEdges().size()));
+    EXPECT_EQ(2u, static_cast<std::size_t>(integral_nh->outNodes().size()));
+}
+
+TEST(GModelBuilder, Add_Operation_With_One_Out_One_Time)
+{
+    // in1 -> binaryOp() -> b_out -> unaryOp() -> out1
+    //            ^           |
+    //            |           |
+    // in2 -------            '----> unaryOp() -> out2
+
+    cv::GMat in1, in2;
+    auto b_out = test::binaryOp(in1, in2);
+    auto out1 = test::unaryOp(b_out);
+    auto out2 = test::unaryOp(b_out);
+
+    ade::Graph g;
+    cv::gimpl::GModel::Graph gm(g);
+    cv::gimpl::GModel::init(gm);
+    auto proto_slots = cv::gimpl::GModelBuilder(g).put(cv::GIn(in1, in2).m_args, cv::GOut(out1, out2).m_args);
+    cv::gimpl::Protocol p;
+    std::tie(p.inputs, p.outputs, p.in_nhs, p.out_nhs) = proto_slots;
+    cv::gimpl::GModel::Graph gr(g);
+    auto binaryOp_nh = p.in_nhs.front()->outNodes().front();
+
+    EXPECT_EQ(2u, static_cast<std::size_t>(binaryOp_nh->inEdges().size()));
+    EXPECT_EQ(1u, static_cast<std::size_t>(binaryOp_nh->outEdges().size()));
+    EXPECT_EQ(8u, static_cast<std::size_t>(g.nodes().size()));
+}
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_island_fusion_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_island_fusion_tests.cpp
new file mode 100644 (file)
index 0000000..91e55be
--- /dev/null
@@ -0,0 +1,527 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "compiler/transactions.hpp"
+
+#include "gapi_mock_kernels.hpp"
+
+#include "compiler/gmodel.hpp"
+#include "compiler/gislandmodel.hpp"
+#include "compiler/gcompiler.hpp"
+
+namespace opencv_test
+{
+
+TEST(IslandFusion, TwoOps_OneIsland)
+{
+    namespace J = Jupiter; // see mock_kernels.cpp
+
+    // Define a computation:
+    //
+    //    (in) -> J::Foo1 -> (tmp0) -> J::Foo2 -> (out)
+    //          :                               :
+    //          :          "island0"            :
+    //          :<----------------------------->:
+
+    cv::GMat in;
+    cv::GMat tmp0 = I::Foo::on(in);
+    cv::GMat out  = I::Foo::on(tmp0);
+    cv::GComputation cc(in, out);
+
+    // Prepare compilation parameters manually
+    const auto in_meta = cv::GMetaArg(cv::GMatDesc{CV_8U,1,cv::gapi::own::Size(32,32)});
+    const auto pkg     = cv::gapi::kernels<J::Foo>();
+
+    // Directly instantiate G-API graph compiler and run partial compilation
+    cv::gimpl::GCompiler compiler(cc, {in_meta}, cv::compile_args(pkg));
+    cv::gimpl::GCompiler::GPtr graph = compiler.generateGraph();
+    compiler.runPasses(*graph);
+
+    // Inspect the graph and verify the islands configuration
+    cv::gimpl::GModel::ConstGraph gm(*graph);
+
+    auto in_nh  = cv::gimpl::GModel::dataNodeOf(gm, in);
+    auto tmp_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp0);
+    auto out_nh = cv::gimpl::GModel::dataNodeOf(gm, out);
+
+    // in/out mats shouldn't be assigned to any Island
+    EXPECT_FALSE(gm.metadata(in_nh ).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out_nh).contains<cv::gimpl::Island>());
+
+    // Since tmp is surrounded by two J kernels, tmp should be assigned
+    // to island J
+    EXPECT_TRUE(gm.metadata(tmp_nh).contains<cv::gimpl::Island>());
+}
+
+TEST(IslandFusion, TwoOps_TwoIslands)
+{
+    namespace J = Jupiter; // see mock_kernels.cpp
+    namespace S = Saturn;  // see mock_kernels.cpp
+
+    // Define a computation:
+    //
+    //    (in) -> J::Foo --> (tmp0) -> S::Bar --> (out)
+    //          :          :        ->          :
+    //          :          :         :          :
+    //          :<-------->:         :<-------->:
+
+    cv::GMat in;
+    cv::GMat tmp0 = I::Foo::on(in);
+    cv::GMat out  = I::Bar::on(tmp0, tmp0);
+    cv::GComputation cc(in, out);
+
+    // Prepare compilation parameters manually
+    const auto in_meta = cv::GMetaArg(cv::GMatDesc{CV_8U,1,cv::gapi::own::Size(32,32)});
+    const auto pkg     = cv::gapi::kernels<J::Foo, S::Bar>();
+
+    // Directly instantiate G-API graph compiler and run partial compilation
+    cv::gimpl::GCompiler compiler(cc, {in_meta}, cv::compile_args(pkg));
+    cv::gimpl::GCompiler::GPtr graph = compiler.generateGraph();
+    compiler.runPasses(*graph);
+
+    // Inspect the graph and verify the islands configuration
+    cv::gimpl::GModel::ConstGraph gm(*graph);
+
+    auto in_nh  = cv::gimpl::GModel::dataNodeOf(gm, in);
+    auto tmp_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp0);
+    auto out_nh = cv::gimpl::GModel::dataNodeOf(gm, out);
+
+    // in/tmp/out mats shouldn't be assigned to any Island
+    EXPECT_FALSE(gm.metadata(in_nh ).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out_nh).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(tmp_nh).contains<cv::gimpl::Island>());
+
+    auto isl_model = gm.metadata().get<cv::gimpl::IslandModel>().model;
+    cv::gimpl::GIslandModel::ConstGraph gim(*isl_model);
+
+    // There should be two islands in the GIslandModel
+    const auto is_island = [&](ade::NodeHandle nh) {
+        return (cv::gimpl::NodeKind::ISLAND
+                == gim.metadata(nh).get<cv::gimpl::NodeKind>().k);
+    };
+    const std::size_t num_isl = std::count_if(gim.nodes().begin(),
+                                              gim.nodes().end(),
+                                              is_island);
+    EXPECT_EQ(2u, num_isl);
+
+    auto isl_foo_nh  = cv::gimpl::GIslandModel::producerOf(gim, tmp_nh);
+    auto isl_bar_nh  = cv::gimpl::GIslandModel::producerOf(gim, out_nh);
+    ASSERT_NE(nullptr, isl_foo_nh);
+    ASSERT_NE(nullptr, isl_bar_nh);
+
+    // Islands should be different
+    auto isl_foo_obj = gim.metadata(isl_foo_nh).get<cv::gimpl::FusedIsland>().object;
+    auto isl_bar_obj = gim.metadata(isl_bar_nh).get<cv::gimpl::FusedIsland>().object;
+    EXPECT_FALSE(isl_foo_obj == isl_bar_obj);
+}
+
+TEST(IslandFusion, ConsumerHasTwoInputs)
+{
+    namespace J = Jupiter; // see mock_kernels.cpp
+
+    // Define a computation:     island
+    //            ............................
+    //    (in0) ->:J::Foo -> (tmp) -> J::Bar :--> (out)
+    //            :....................^.....:
+    //                                 |
+    //    (in1) -----------------------`
+    //
+
+    // Check that the island is built correctly when the consumer has two inputs
+
+    GMat in[2];
+    GMat tmp = I::Foo::on(in[0]);
+    GMat out = I::Bar::on(tmp, in[1]);
+
+    cv::GComputation cc(cv::GIn(in[0], in[1]), cv::GOut(out));
+
+    // Prepare compilation parameters manually
+    cv::GMetaArgs in_metas = {GMetaArg(cv::GMatDesc{CV_8U,1,cv::gapi::own::Size(32,32)}),
+                              GMetaArg(cv::GMatDesc{CV_8U,1,cv::gapi::own::Size(32,32)})};
+    const auto pkg = cv::gapi::kernels<J::Foo, J::Bar>();
+
+    // Directly instantiate G-API graph compiler and run partial compilation
+    cv::gimpl::GCompiler compiler(cc, std::move(in_metas), cv::compile_args(pkg));
+    cv::gimpl::GCompiler::GPtr graph = compiler.generateGraph();
+    compiler.runPasses(*graph);
+
+    cv::gimpl::GModel::ConstGraph gm(*graph);
+
+    auto in0_nh = cv::gimpl::GModel::dataNodeOf(gm, in[0]);
+    auto in1_nh = cv::gimpl::GModel::dataNodeOf(gm, in[1]);
+    auto tmp_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp);
+    auto out_nh = cv::gimpl::GModel::dataNodeOf(gm, out);
+
+    EXPECT_FALSE(gm.metadata(in0_nh ).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(in1_nh ).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out_nh).contains<cv::gimpl::Island>());
+    EXPECT_TRUE(gm.metadata(tmp_nh).contains<cv::gimpl::Island>());
+
+    auto isl_model = gm.metadata().get<cv::gimpl::IslandModel>().model;
+    cv::gimpl::GIslandModel::ConstGraph gim(*isl_model);
+
+    const auto is_island = [&](ade::NodeHandle nh) {
+        return (cv::gimpl::NodeKind::ISLAND
+                == gim.metadata(nh).get<cv::gimpl::NodeKind>().k);
+    };
+    const std::size_t num_isl = std::count_if(gim.nodes().begin(),
+                                              gim.nodes().end(),
+                                              is_island);
+    EXPECT_EQ(1u, num_isl);
+
+    auto isl_nh  = cv::gimpl::GIslandModel::producerOf(gim, out_nh);
+    auto isl_obj = gim.metadata(isl_nh).get<cv::gimpl::FusedIsland>().object;
+
+    EXPECT_TRUE(ade::util::contains(isl_obj->contents(), tmp_nh));
+
+    EXPECT_EQ(2u, static_cast<std::size_t>(isl_nh->inNodes().size()));
+    EXPECT_EQ(1u, static_cast<std::size_t>(isl_nh->outNodes().size()));
+}
+
+TEST(IslandFusion, DataNodeUsedDifferentBackend)
+{
+    // Define a computation:
+    //
+    //           internal isl            isl0
+    //             ...........................
+    //    (in1) -> :J::Foo--> (tmp) -> J::Foo: --> (out0)
+    //             :............|............:
+    //                          |     ........
+    //                          `---->:S::Baz: --> (out1)
+    //                                :......:
+
+    // Check that the node is not dropped out of the island
+    // just because it is consumed by a kernel from another backend
+
+    namespace J = Jupiter;
+    namespace S = Saturn;
+
+    cv::GMat in, tmp, out0;
+    cv::GScalar out1;
+    tmp  = I::Foo::on(in);
+    out0 = I::Foo::on(tmp);
+    out1 = I::Baz::on(tmp);
+
+    cv::GComputation cc(cv::GIn(in), cv::GOut(out0, out1));
+
+    // Prepare compilation parameters manually
+    const auto in_meta = cv::GMetaArg(cv::GMatDesc{CV_8U,1,cv::gapi::own::Size(32,32)});
+    const auto pkg     = cv::gapi::kernels<J::Foo, S::Baz>();
+
+    // Directly instantiate G-API graph compiler and run partial compilation
+    cv::gimpl::GCompiler compiler(cc, {in_meta}, cv::compile_args(pkg));
+    cv::gimpl::GCompiler::GPtr graph = compiler.generateGraph();
+    compiler.runPasses(*graph);
+
+    // Inspect the graph and verify the islands configuration
+    cv::gimpl::GModel::ConstGraph gm(*graph);
+
+    auto in_nh   = cv::gimpl::GModel::dataNodeOf(gm, in);
+    auto tmp_nh  = cv::gimpl::GModel::dataNodeOf(gm, tmp);
+    auto out0_nh = cv::gimpl::GModel::dataNodeOf(gm, out0);
+    auto out1_nh = cv::gimpl::GModel::dataNodeOf(gm, out1);
+
+    EXPECT_TRUE(gm.metadata(tmp_nh).contains<cv::gimpl::Island>());
+
+    auto isl_model = gm.metadata().get<cv::gimpl::IslandModel>().model;
+    cv::gimpl::GIslandModel::ConstGraph gim(*isl_model);
+
+    auto isl_nh  = cv::gimpl::GIslandModel::producerOf(gim, tmp_nh);
+    auto isl_obj = gim.metadata(isl_nh).get<cv::gimpl::FusedIsland>().object;
+
+    EXPECT_TRUE(ade::util::contains(isl_obj->contents(), tmp_nh));
+
+    EXPECT_EQ(2u, static_cast<std::size_t>(isl_nh->outNodes().size()));
+    EXPECT_EQ(7u, static_cast<std::size_t>(gm.nodes().size()));
+    EXPECT_EQ(6u, static_cast<std::size_t>(gim.nodes().size()));
+}
+
+TEST(IslandFusion, LoopBetweenDifferentBackends)
+{
+    // Define a computation:
+    //
+    //
+    //            .............................
+    //    (in) -> :J::Baz -> (tmp0) -> J::Quux: -> (out0)
+    //      |     :............|..........^....
+    //      |     ........     |          |         ........
+    //      `---->:S::Foo:     `----------|-------->:S::Qux:-> (out1)
+    //            :....|.:                |         :....^.:
+    //                 |                  |              |
+    //                 `-------------- (tmp1) -----------`
+
+    // Kernels S::Foo and S::Qux cannot be merged, because that would create a cycle between islands
+
+    namespace J = Jupiter;
+    namespace S = Saturn;
+
+    cv::GScalar tmp0;
+    cv::GMat in, tmp1, out0, out1;
+
+    tmp0 = I::Baz::on(in);
+    tmp1 = I::Foo::on(in);
+    out1 = I::Qux::on(tmp1, tmp0);
+    out0 = I::Quux::on(tmp0, tmp1);
+
+    cv::GComputation cc(cv::GIn(in), cv::GOut(out1, out0));
+
+    // Prepare compilation parameters manually
+    const auto in_meta = cv::GMetaArg(cv::GMatDesc{CV_8U,1,cv::gapi::own::Size(32,32)});
+    const auto pkg     = cv::gapi::kernels<J::Baz, J::Quux, S::Foo, S::Qux>();
+
+    // Directly instantiate G-API graph compiler and run partial compilation
+    cv::gimpl::GCompiler compiler(cc, {in_meta}, cv::compile_args(pkg));
+    cv::gimpl::GCompiler::GPtr graph = compiler.generateGraph();
+    compiler.runPasses(*graph);
+
+    cv::gimpl::GModel::ConstGraph gm(*graph);
+    auto isl_model = gm.metadata().get<cv::gimpl::IslandModel>().model;
+    cv::gimpl::GIslandModel::ConstGraph gim(*isl_model);
+
+    auto in_nh   = cv::gimpl::GModel::dataNodeOf(gm, in);
+    auto tmp0_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp0);
+    auto tmp1_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp1);
+    auto out0_nh = cv::gimpl::GModel::dataNodeOf(gm, out0);
+    auto out1_nh = cv::gimpl::GModel::dataNodeOf(gm, out1);
+
+    EXPECT_FALSE(gm.metadata(in_nh ).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out0_nh).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out1_nh).contains<cv::gimpl::Island>());
+    // tmp1 is kept out of the island so that no cycle is formed
+    EXPECT_FALSE(gm.metadata(tmp1_nh).contains<cv::gimpl::Island>());
+
+    EXPECT_TRUE(gm.metadata(tmp0_nh).contains<cv::gimpl::Island>());
+
+    // There should be three islands in the GIslandModel
+    const auto is_island = [&](ade::NodeHandle nh) {
+        return (cv::gimpl::NodeKind::ISLAND
+                == gim.metadata(nh).get<cv::gimpl::NodeKind>().k);
+    };
+    const std::size_t num_isl = std::count_if(gim.nodes().begin(),
+                                              gim.nodes().end(),
+                                              is_island);
+    EXPECT_EQ(3u, num_isl);
+}
+
+TEST(IslandFusion, PartitionOverlapUserIsland)
+{
+    // Define a computation:
+    //
+    //           internal isl            isl0
+    //             ........            ........
+    //    (in0) -> :J::Foo:--> (tmp) ->:J::Bar: --> (out)
+    //             :......:            :......:
+    //                                    ^
+    //                                    |
+    //    (in1) --------------------------`
+
+    // Check that internal islands don't overlap the user-specified island
+
+    namespace J = Jupiter;
+    namespace S = Saturn;
+
+    GMat in[2];
+    GMat tmp = I::Foo::on(in[0]);
+    GMat out = I::Bar::on(tmp, in[1]);
+
+    cv::gapi::island("isl0", cv::GIn(tmp, in[1]), cv::GOut(out));
+    cv::GComputation cc(cv::GIn(in[0], in[1]), cv::GOut(out));
+
+    // Prepare compilation parameters manually
+    cv::GMetaArgs in_metas = {GMetaArg(cv::GMatDesc{CV_8U,1,cv::gapi::own::Size(32,32)}),
+                              GMetaArg(cv::GMatDesc{CV_8U,1,cv::gapi::own::Size(32,32)})};
+    const auto pkg = cv::gapi::kernels<J::Foo, J::Bar>();
+
+    // Directly instantiate G-API graph compiler and run partial compilation
+    cv::gimpl::GCompiler compiler(cc, std::move(in_metas), cv::compile_args(pkg));
+    cv::gimpl::GCompiler::GPtr graph = compiler.generateGraph();
+    compiler.runPasses(*graph);
+
+    cv::gimpl::GModel::ConstGraph gm(*graph);
+    auto isl_model = gm.metadata().get<cv::gimpl::IslandModel>().model;
+    cv::gimpl::GIslandModel::ConstGraph gim(*isl_model);
+
+    auto in0_nh = cv::gimpl::GModel::dataNodeOf(gm, in[0]);
+    auto in1_nh = cv::gimpl::GModel::dataNodeOf(gm, in[1]);
+    auto tmp_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp);
+    auto out_nh = cv::gimpl::GModel::dataNodeOf(gm, out);
+
+    auto foo_nh  = cv::gimpl::GIslandModel::producerOf(gim, tmp_nh);
+    auto foo_obj = gim.metadata(foo_nh).get<cv::gimpl::FusedIsland>().object;
+
+    auto bar_nh  = cv::gimpl::GIslandModel::producerOf(gim, out_nh);
+    auto bar_obj = gim.metadata(bar_nh).get<cv::gimpl::FusedIsland>().object;
+
+    EXPECT_FALSE(gm.metadata(in0_nh ).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(in1_nh ).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out_nh).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(tmp_nh).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(foo_obj->is_user_specified());
+    EXPECT_TRUE(bar_obj->is_user_specified());
+}
+
+TEST(IslandFusion, DISABLED_IslandContainsDifferentBackends)
+{
+    // Define a computation:
+    //
+    //                       isl0
+    //             ............................
+    //    (in0) -> :J::Foo:--> (tmp) -> S::Bar: --> (out)
+    //             :..........................:
+    //                                    ^
+    //                                    |
+    //    (in1) --------------------------`
+
+    // Try to create an island that contains kernels from different backends
+
+    namespace J = Jupiter;
+    namespace S = Saturn;
+
+    GMat in[2];
+    GMat tmp = I::Foo::on(in[0]);
+    GMat out = I::Bar::on(tmp, in[1]);
+
+    cv::gapi::island("isl0", cv::GIn(in[0], in[1]), cv::GOut(out));
+    cv::GComputation cc(cv::GIn(in[0], in[1]), cv::GOut(out));
+
+    // Prepare compilation parameters manually
+    cv::GMetaArgs in_metas = {GMetaArg(cv::GMatDesc{CV_8U,1,cv::gapi::own::Size(32,32)}),
+                              GMetaArg(cv::GMatDesc{CV_8U,1,cv::gapi::own::Size(32,32)})};
+    const auto pkg = cv::gapi::kernels<J::Foo, S::Bar>();
+
+    // Directly instantiate G-API graph compiler and run partial compilation
+    cv::gimpl::GCompiler compiler(cc, std::move(in_metas), cv::compile_args(pkg));
+    cv::gimpl::GCompiler::GPtr graph = compiler.generateGraph();
+    EXPECT_ANY_THROW(compiler.runPasses(*graph));
+}
+
+TEST(IslandFusion, WithLoop)
+{
+    namespace J = Jupiter; // see mock_kernels.cpp
+
+    // Define a computation:
+    //
+    //    (in) -> J::Foo --> (tmp0) -> J::Foo --> (tmp1) -> J::Qux -> (out)
+    //                            :                        ^
+    //                            '--> J::Baz --> (scl0) --'
+    //
+    // The whole thing should be merged to a single island
+    // Note the cycle hazard: if Foo/Foo/Qux were merged first, the resulting
+    // island would both produce data for Baz and consume data from Baz.
+    // Such a cycle must be avoided by the merging code.
+    //
+    cv::GMat    in;
+    cv::GMat    tmp0 = I::Foo::on(in);
+    cv::GMat    tmp1 = I::Foo::on(tmp0);
+    cv::GScalar scl0 = I::Baz::on(tmp0);
+    cv::GMat    out  = I::Qux::on(tmp1, scl0);
+    cv::GComputation cc(in, out);
+
+    // Prepare compilation parameters manually
+    const auto in_meta = cv::GMetaArg(cv::GMatDesc{CV_8U,1,cv::gapi::own::Size(32,32)});
+    const auto pkg     = cv::gapi::kernels<J::Foo, J::Baz, J::Qux>();
+
+    // Directly instantiate G-API graph compiler and run partial compilation
+    cv::gimpl::GCompiler compiler(cc, {in_meta}, cv::compile_args(pkg));
+    cv::gimpl::GCompiler::GPtr graph = compiler.generateGraph();
+    compiler.runPasses(*graph);
+
+    // Inspect the graph and verify the islands configuration
+    cv::gimpl::GModel::ConstGraph gm(*graph);
+
+    auto in_nh   = cv::gimpl::GModel::dataNodeOf(gm, in);
+    auto tmp0_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp0);
+    auto tmp1_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp1);
+    auto scl0_nh = cv::gimpl::GModel::dataNodeOf(gm, scl0);
+    auto out_nh  = cv::gimpl::GModel::dataNodeOf(gm, out);
+
+    // in/out mats shouldn't be assigned to any Island
+    EXPECT_FALSE(gm.metadata(in_nh ).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out_nh).contains<cv::gimpl::Island>());
+
+    // tmp0/tmp1/scl should be assigned to island
+    EXPECT_TRUE(gm.metadata(tmp0_nh).contains<cv::gimpl::Island>());
+    EXPECT_TRUE(gm.metadata(tmp1_nh).contains<cv::gimpl::Island>());
+    EXPECT_TRUE(gm.metadata(scl0_nh).contains<cv::gimpl::Island>());
+
+    // Check that there's a single island object and it contains all
+    // that data object handles
+
+    cv::gimpl::GModel::ConstGraph cg(*graph);
+    auto isl_model = cg.metadata().get<cv::gimpl::IslandModel>().model;
+    cv::gimpl::GIslandModel::ConstGraph gim(*isl_model);
+
+    const auto is_island = [&](ade::NodeHandle nh) {
+        return (cv::gimpl::NodeKind::ISLAND
+                == gim.metadata(nh).get<cv::gimpl::NodeKind>().k);
+    };
+    const std::size_t num_isl = std::count_if(gim.nodes().begin(),
+                                              gim.nodes().end(),
+                                              is_island);
+    EXPECT_EQ(1u, num_isl);
+
+    auto isl_nh  = cv::gimpl::GIslandModel::producerOf(gim, out_nh);
+    auto isl_obj = gim.metadata(isl_nh).get<cv::gimpl::FusedIsland>().object;
+    EXPECT_TRUE(ade::util::contains(isl_obj->contents(), tmp0_nh));
+    EXPECT_TRUE(ade::util::contains(isl_obj->contents(), tmp1_nh));
+    EXPECT_TRUE(ade::util::contains(isl_obj->contents(), scl0_nh));
+}
+
+TEST(IslandFusion, Regression_ShouldFuseAll)
+{
+    // Initially the merge procedure didn't work as expected and
+    // stopped fusion even if it could be continued (e.g. full
+    // GModel graph could be fused into a single GIsland node).
+    // An example of this is the custom RGB-to-YUV pipeline shown below:
+
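+    // (BT.601 RGB-to-YUV conversion, expressed with element-wise GMat arithmetic.)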
+    cv::GMat r, g, b;
+    cv::GMat y = 0.299f*r + 0.587f*g + 0.114f*b;
+    cv::GMat u = 0.492f*(b - y);
+    cv::GMat v = 0.877f*(r - y);
+
+    cv::GComputation customCvt({r, g, b}, {y, u, v});
+
+    const auto in_meta = cv::GMetaArg(cv::GMatDesc{CV_8U,1,cv::Size(32,32)});
+
+    // Directly instantiate G-API graph compiler and run partial compilation
+    cv::gimpl::GCompiler compiler(customCvt, {in_meta,in_meta,in_meta}, cv::compile_args());
+    cv::gimpl::GCompiler::GPtr graph = compiler.generateGraph();
+    compiler.runPasses(*graph);
+
+    cv::gimpl::GModel::ConstGraph cg(*graph);
+    auto isl_model = cg.metadata().get<cv::gimpl::IslandModel>().model;
+    cv::gimpl::GIslandModel::ConstGraph gim(*isl_model);
+
+    std::vector<ade::NodeHandle> data_nhs;
+    std::vector<ade::NodeHandle> isl_nhs;
+    for (auto &&nh : gim.nodes())
+    {
+        if (gim.metadata(nh).contains<cv::gimpl::FusedIsland>())
+            isl_nhs.push_back(std::move(nh));
+        else if (gim.metadata(nh).contains<cv::gimpl::DataSlot>())
+            data_nhs.push_back(std::move(nh));
+        else FAIL() << "GIslandModel node with unexpected metadata type";
+    }
+
+    EXPECT_EQ(6u, data_nhs.size()); // 3 input nodes + 3 output nodes
+    EXPECT_EQ(1u, isl_nhs.size());  // 1 island
+}
+
+// FIXME: add more tests on mixed (hetero) graphs
+// ADE-222, ADE-223
+
+// FIXME: add test on combination of user-specified island
+// which should be heterogeneous (based on kernel availability)
+// but as we don't support this, compilation should fail
+
+// FIXME: add tests on automatic inferred islands which are
+// connected via 1) gmat 2) gscalar 3) garray,
+// check the case with executor
+// check the case when this 1/2/3 interim object is also gcomputation output
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_island_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_island_tests.cpp
new file mode 100644 (file)
index 0000000..09f1880
--- /dev/null
@@ -0,0 +1,653 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+#include "compiler/gmodel.hpp"
+#include "compiler/gcompiled_priv.hpp"
+
+namespace opencv_test
+{
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests on a plain graph
+//
+// (in) -> Blur1 -> (tmp0) -> Blur2 -> (tmp1) -> Blur3 -> (tmp2) -> Blur4 -> (out)
+//
+namespace
+{
+    struct PlainIslandsFixture
+    {
+        cv::GMat in;
+        cv::GMat tmp[3];
+        cv::GMat out;
+
+        PlainIslandsFixture()
+        {
+            tmp[0] = cv::gapi::boxFilter(in,     -1, cv::Size(3,3));
+            tmp[1] = cv::gapi::boxFilter(tmp[0], -1, cv::Size(3,3));
+            tmp[2] = cv::gapi::boxFilter(tmp[1], -1, cv::Size(3,3));
+            out    = cv::gapi::boxFilter(tmp[2], -1, cv::Size(3,3));
+        }
+    };
+
+    struct Islands: public ::testing::Test, public PlainIslandsFixture {};
+
+    using GIntArray = GArray<int>;
+
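+    // The definitions below follow G-API's custom kernel pattern:
+    // G_TYPED_KERNEL declares the operation interface (its signature plus an
+    // outMeta() function used for output metadata inference), while
+    // GAPI_OCV_KERNEL provides the OpenCV-backend implementation.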
+    G_TYPED_KERNEL(CreateMatWithDiag, <GMat(GIntArray)>, "test.array.create_mat_with_diag")
+    {
+        static GMatDesc outMeta(const GArrayDesc&) { return cv::GMatDesc{CV_32S, 1,{3, 3}}; }
+    };
+
+    GAPI_OCV_KERNEL(CreateMatWithDiagImpl, CreateMatWithDiag)
+    {
+        static void run(const std::vector<int> &in, cv::Mat& out)
+        {
+            auto size = static_cast<int>(in.size());
+            out = Mat::zeros(size, size, CV_32SC1);
+            for(int i = 0; i < out.rows; i++)
+            {
+                auto* row = out.ptr<int>(i);
+                row[i] = in[i];
+            }
+        }
+    };
+
+    G_TYPED_KERNEL(Mat2Array, <GIntArray(GMat)>, "test.array.mat2array")
+    {
+        static GArrayDesc outMeta(const GMatDesc&) { return empty_array_desc(); }
+    };
+
+    GAPI_OCV_KERNEL(Mat2ArrayImpl, Mat2Array)
+    {
+        static void run(const cv::Mat& in, std::vector<int> &out)
+        {
+            GAPI_Assert(in.depth() == CV_32S && in.isContinuous());
+            out.reserve(in.cols * in.rows);
+            out.assign((int*)in.datastart, (int*)in.dataend);
+        }
+    };
+}
+
+TEST_F(Islands, SmokeTest)
+{
+    // (in) -> Blur1 -> (tmp0) -> Blur2 -> (tmp1) -> Blur3 -> (tmp2) -> Blur4 -> (out)
+    //                         :        "test"             :
+    //                         :<------------------------->:
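+    // cv::gapi::island() tags the subgraph between GIn and GOut with the
+    // given island name; the assignment is verified on the model below.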
+    cv::gapi::island("test", cv::GIn(tmp[0]), cv::GOut(tmp[2]));
+    auto cc = cv::GComputation(in, out).compile(cv::GMatDesc{CV_8U,1,{640,480}});
+
+    const auto &gm = cc.priv().model();
+    const auto tmp0_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[0]);
+    const auto tmp1_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[1]);
+    const auto tmp2_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[2]);
+
+    // tmp0 and tmp2 are not part of any island
+    EXPECT_FALSE(gm.metadata(tmp0_nh).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(tmp2_nh).contains<cv::gimpl::Island>());
+
+    // tmp1 is part of the "test" island
+    EXPECT_TRUE(gm.metadata(tmp1_nh).contains<cv::gimpl::Island>());
+    EXPECT_EQ("test", gm.metadata(tmp1_nh).get<cv::gimpl::Island>().island);
+}
+
+TEST_F(Islands, TwoIslands)
+{
+    // (in) -> Blur1 -> (tmp0) -> Blur2 -> (tmp1) -> Blur3 -> (tmp2) -> Blur4 -> (out)
+    //       :  "test1"                     :  : "test2"                          :
+    //       :<---------------------------->:  :<--------------------------------->
+    EXPECT_NO_THROW(cv::gapi::island("test1", cv::GIn(in),     cv::GOut(tmp[1])));
+    EXPECT_NO_THROW(cv::gapi::island("test2", cv::GIn(tmp[1]), cv::GOut(out)));
+
+    auto cc = cv::GComputation(in, out).compile(cv::GMatDesc{CV_8U,1,{640,480}});
+    const auto &gm = cc.priv().model();
+    const auto in_nh   = cv::gimpl::GModel::dataNodeOf(gm, in);
+    const auto tmp0_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[0]);
+    const auto tmp1_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[1]);
+    const auto tmp2_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[2]);
+    const auto out_nh  = cv::gimpl::GModel::dataNodeOf(gm, out);
+
+    // Only tmp0 and tmp2 should be listed in islands.
+    EXPECT_TRUE (gm.metadata(tmp0_nh).contains<cv::gimpl::Island>());
+    EXPECT_TRUE (gm.metadata(tmp2_nh).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(in_nh)  .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(tmp1_nh).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out_nh) .contains<cv::gimpl::Island>());
+
+    EXPECT_EQ("test1", gm.metadata(tmp0_nh).get<cv::gimpl::Island>().island);
+    EXPECT_EQ("test2", gm.metadata(tmp2_nh).get<cv::gimpl::Island>().island);
+}
+
+// FIXME: Disabled since currently the merge procedure merges the two
+// islands into one successfully
+TEST_F(Islands, DISABLED_Two_Islands_With_Same_Name_Should_Fail)
+{
+    // (in) -> Blur1 -> (tmp0) -> Blur2 -> (tmp1) -> Blur3 -> (tmp2) -> Blur4 -> (out)
+    //       :  "test1"                     :  : "test1"                          :
+    //       :<---------------------------->:  :<--------------------------------->
+
+    EXPECT_NO_THROW(cv::gapi::island("test1", cv::GIn(in),     cv::GOut(tmp[1])));
+    EXPECT_NO_THROW(cv::gapi::island("test1", cv::GIn(tmp[1]), cv::GOut(out)));
+
+    EXPECT_ANY_THROW(cv::GComputation(in, out).compile(cv::GMatDesc{CV_8U,1,{640,480}}));
+}
+
+
+// (in) -> Blur1 -> (tmp0) -> Blur2 -> (tmp1) -> Blur3 -> (tmp2) -> Blur4 -> (out)
+//       :          "test1":            :              :
+//       :<----------------:----------->:              :
+//                         :                           :
+//                         :        "test2"            :
+//                         :<------------------------->:
+TEST_F(Islands, OverlappingIslands1)
+{
+    EXPECT_NO_THROW (cv::gapi::island("test1", cv::GIn(in),     cv::GOut(tmp[1])));
+    EXPECT_ANY_THROW(cv::gapi::island("test2", cv::GIn(tmp[0]), cv::GOut(tmp[2])));
+}
+
+TEST_F(Islands, OverlappingIslands2)
+{
+    EXPECT_NO_THROW (cv::gapi::island("test2", cv::GIn(tmp[0]), cv::GOut(tmp[2])));
+    EXPECT_ANY_THROW(cv::gapi::island("test1", cv::GIn(in),     cv::GOut(tmp[1])));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests on a complex graph
+//
+// (in0) -> Not  -> (tmp0) --> Add ---------> (tmp2) --> AddC -------> (out0)
+//                             ^                         ^
+// (in1) -> Blur -> (tmp1) ----'--> Sum ----> (scl0) ----'
+//                   :
+//                   `------------> Median -> (tmp3) --> Blur -------> (out1)
+//
+namespace
+{
+    struct ComplexIslandsFixture
+    {
+        cv::GMat    in[2];
+        cv::GMat    tmp[4];
+        cv::GScalar scl;
+        cv::GMat    out[2];
+
+        ComplexIslandsFixture()
+        {
+            tmp[0] = cv::gapi::bitwise_not(in[0]);
+            tmp[1] = cv::gapi::boxFilter(in[1], -1, cv::Size(3,3));
+            tmp[2] = tmp[0] + tmp[1]; // FIXME: handle tmp[2] = tmp[0]+tmp[2] typo
+            scl    = cv::gapi::sum(tmp[1]);
+            tmp[3] = cv::gapi::medianBlur(tmp[1], 3);
+            out[0] = tmp[2] + scl;
+            out[1] = cv::gapi::boxFilter(tmp[3], -1, cv::Size(3,3));
+        }
+    };
+
+    struct ComplexIslands: public ::testing::Test, public ComplexIslandsFixture {};
+} // namespace
+
+TEST_F(ComplexIslands, SmokeTest)
+{
+    //       isl0                                          #internal1
+    //       ...........................                   ........
+    // (in0) -> Not  -> (tmp0) --> Add ---------> (tmp2) --> AddC -------> (out0)
+    //       :............ ........^...:                   :.^....:
+    //                   ...       :                         :
+    // (in1) -> Blur -> (tmp1) ----'--> Sum ----> (scl0) ----'
+    //                   :                                     isl1
+    //                   :           ..............................
+    //                   `------------> Median -> (tmp3) --> Blur -------> (out1)
+    //                               :............................:
+
+    cv::gapi::island("isl0", cv::GIn(in[0], tmp[1]),  cv::GOut(tmp[2]));
+    cv::gapi::island("isl1", cv::GIn(tmp[1]), cv::GOut(out[1]));
+    auto cc = cv::GComputation(cv::GIn(in[0], in[1]), cv::GOut(out[0], out[1]))
+        .compile(cv::GMatDesc{CV_8U,1,{640,480}},
+                 cv::GMatDesc{CV_8U,1,{640,480}});
+    const auto &gm = cc.priv().model();
+    const auto in0_nh  = cv::gimpl::GModel::dataNodeOf(gm, in[0]);
+    const auto in1_nh  = cv::gimpl::GModel::dataNodeOf(gm, in[1]);
+    const auto tmp0_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[0]);
+    const auto tmp1_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[1]);
+    const auto tmp2_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[2]);
+    const auto tmp3_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[3]);
+    const auto scl_nh  = cv::gimpl::GModel::dataNodeOf(gm, scl);
+    const auto out0_nh = cv::gimpl::GModel::dataNodeOf(gm, out[0]);
+    const auto out1_nh = cv::gimpl::GModel::dataNodeOf(gm, out[1]);
+
+    // tmp0, tmp3 are in islands, others are not
+    EXPECT_TRUE(gm.metadata(tmp0_nh) .contains<cv::gimpl::Island>()); // isl0
+    EXPECT_TRUE(gm.metadata(tmp3_nh) .contains<cv::gimpl::Island>()); // isl1
+    EXPECT_FALSE(gm.metadata(in0_nh) .contains<cv::gimpl::Island>()); // (input is never fused)
+    EXPECT_FALSE(gm.metadata(in1_nh) .contains<cv::gimpl::Island>()); // (input is never fused)
+    EXPECT_TRUE (gm.metadata(tmp1_nh).contains<cv::gimpl::Island>()); // <internal island>
+    EXPECT_FALSE(gm.metadata(tmp2_nh).contains<cv::gimpl::Island>()); // #not fused as cycle-causing#
+    EXPECT_FALSE(gm.metadata(scl_nh) .contains<cv::gimpl::Island>()); // #not fused as cycle-causing#
+    EXPECT_FALSE(gm.metadata(out0_nh).contains<cv::gimpl::Island>()); // (output is never fused)
+    EXPECT_FALSE(gm.metadata(out1_nh).contains<cv::gimpl::Island>()); // (output is never fused)
+
+    EXPECT_EQ("isl0", gm.metadata(tmp0_nh).get<cv::gimpl::Island>().island);
+    EXPECT_EQ("isl1", gm.metadata(tmp3_nh).get<cv::gimpl::Island>().island);
+
+    EXPECT_NE("isl0", gm.metadata(tmp1_nh).get<cv::gimpl::Island>().island);
+    EXPECT_NE("isl1", gm.metadata(tmp1_nh).get<cv::gimpl::Island>().island);
+
+    // FIXME: Add a test with same graph for Fusion and check GIslandModel
+}
+
+TEST_F(ComplexIslands, DistinctIslandsWithSameName)
+{
+    //       isl0
+    //       ...........................
+    // (in0) -> Not  -> (tmp0) --> Add ---------> (tmp2) --> AddC -------> (out0)
+    //       :............ ........^...:                     ^
+    //                   ...       :                         :
+    // (in1) -> Blur -> (tmp1) ----'--> Sum ----> (scl0) ----'
+    //                   :                                     isl0
+    //                   :           ..............................
+    //                   `------------> Median -> (tmp3) --> Blur -------> (out1)
+    //                               :............................:
+
+    cv::gapi::island("isl0", cv::GIn(in[0], tmp[1]),  cv::GOut(tmp[2]));
+    cv::gapi::island("isl0", cv::GIn(tmp[1]), cv::GOut(out[1]));
+
+    auto cc = cv::GComputation(cv::GIn(in[0], in[1]), cv::GOut(out[0], out[1]));
+
+    EXPECT_ANY_THROW(cc.compile(cv::GMatDesc{CV_8U,1,{640,480}},
+                                cv::GMatDesc{CV_8U,1,{640,480}}));
+}
+
+TEST_F(ComplexIslands, FullGraph)
+{
+    cv::gapi::island("isl0",   cv::GIn(in[0], in[1]), cv::GOut(out[0], out[1]));
+    auto cc = cv::GComputation(cv::GIn(in[0], in[1]), cv::GOut(out[0], out[1]))
+        .compile(cv::GMatDesc{CV_8U,1,{640,480}},
+                 cv::GMatDesc{CV_8U,1,{640,480}});
+    const auto &gm = cc.priv().model();
+    std::vector<ade::NodeHandle> handles_inside = {
+        cv::gimpl::GModel::dataNodeOf(gm, tmp[0]),
+        cv::gimpl::GModel::dataNodeOf(gm, tmp[1]),
+        cv::gimpl::GModel::dataNodeOf(gm, tmp[2]),
+        cv::gimpl::GModel::dataNodeOf(gm, tmp[3]),
+        cv::gimpl::GModel::dataNodeOf(gm, scl),
+    };
+    std::vector<ade::NodeHandle> handles_outside = {
+        cv::gimpl::GModel::dataNodeOf(gm, in[0]),
+        cv::gimpl::GModel::dataNodeOf(gm, in[1]),
+        cv::gimpl::GModel::dataNodeOf(gm, out[0]),
+        cv::gimpl::GModel::dataNodeOf(gm, out[1]),
+    };
+
+    for (auto nh_inside : handles_inside)
+    {
+        EXPECT_EQ("isl0", gm.metadata(nh_inside).get<cv::gimpl::Island>().island);
+    }
+    for (auto nh_outside : handles_outside)
+    {
+        EXPECT_FALSE(gm.metadata(nh_outside).contains<cv::gimpl::Island>());
+    }
+}
+
+TEST_F(ComplexIslands, ViaScalar)
+{
+    //
+    //        .........................................#internal0.
+    // (in0) -> Not  -> (tmp0) --> Add ---------> (tmp2) --> AddC -------> (out0)
+    //        :....................^.........................^...:
+    //                             :                         :
+    //        .....................:.........(isl0).         :
+    // (in1) -> Blur -> (tmp1) ----'--> Sum ----> (scl0) ----'
+    //        :..........:.........................:
+    //                   :
+    //                   :            ..................#internal1.
+    //                   `------------> Median -> (tmp3) --> Blur -------> (out1)
+    //                                :...........................:
+
+    cv::gapi::island("isl0",   cv::GIn(in[1]), cv::GOut(scl));
+    auto cc = cv::GComputation(cv::GIn(in[0], in[1]), cv::GOut(out[0], out[1]))
+        .compile(cv::GMatDesc{CV_8U,1,{640,480}},
+                 cv::GMatDesc{CV_8U,1,{640,480}});
+    const auto &gm = cc.priv().model();
+
+    const auto tmp0_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[0]);
+    const auto tmp1_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[1]);
+    const auto tmp2_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[2]);
+    const auto tmp3_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[3]);
+
+    EXPECT_NE("isl0", gm.metadata(tmp0_nh).get<cv::gimpl::Island>().island); // <internal>
+    EXPECT_EQ("isl0", gm.metadata(tmp1_nh).get<cv::gimpl::Island>().island); // isl0
+    EXPECT_NE("isl0", gm.metadata(tmp2_nh).get<cv::gimpl::Island>().island); // <internal>
+    EXPECT_NE("isl0", gm.metadata(tmp3_nh).get<cv::gimpl::Island>().island); // <internal>
+
+    std::vector<ade::NodeHandle> handles_outside = {
+        cv::gimpl::GModel::dataNodeOf(gm, in[0]),
+        cv::gimpl::GModel::dataNodeOf(gm, in[1]),
+        cv::gimpl::GModel::dataNodeOf(gm, scl),
+        cv::gimpl::GModel::dataNodeOf(gm, out[0]),
+        cv::gimpl::GModel::dataNodeOf(gm, out[1]),
+    };
+    for (auto nh_outside : handles_outside)
+    {
+        EXPECT_FALSE(gm.metadata(nh_outside).contains<cv::gimpl::Island>());
+    }
+}
+
+TEST_F(ComplexIslands, BorderDataIsland)
+{
+    //       .................................(isl0)..
+    //       :                                       :
+    // (in0) -> Not  -> (tmp0) --> Add ---------> (tmp2) --> AddC -------> (out0)
+    //       :                     ^                 :       ^
+    //       :                     :                 :       :
+    // (in1) -> Blur -> (tmp1) ----'--> Sum ----> (scl0) ----'
+    //       :...........:...........................:
+    //                :  :  :
+    //                :  :  :.........................................(isl1)..
+    //                :  `------------> Median -> (tmp3) --> Blur -------> (out1)
+    //                :                                                      :
+    //                :......................................................:
+
+    cv::gapi::island("isl0", cv::GIn(in[0],  in[1]), cv::GOut(tmp[2], scl));
+    cv::gapi::island("isl1", cv::GIn(tmp[1]),        cv::GOut(out[1]));
+
+    auto cc = cv::GComputation(cv::GIn(in[0], in[1]), cv::GOut(out[0], out[1]))
+        .compile(cv::GMatDesc{CV_8U,1,{640,480}},
+                 cv::GMatDesc{CV_8U,1,{640,480}});
+    const auto &gm = cc.priv().model();
+    const auto in0_nh  = cv::gimpl::GModel::dataNodeOf(gm, in[0]);
+    const auto in1_nh  = cv::gimpl::GModel::dataNodeOf(gm, in[1]);
+    const auto tmp0_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[0]);
+    const auto tmp1_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[1]);
+    const auto tmp2_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[2]);
+    const auto tmp3_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[3]);
+    const auto scl_nh  = cv::gimpl::GModel::dataNodeOf(gm, scl);
+    const auto out0_nh = cv::gimpl::GModel::dataNodeOf(gm, out[0]);
+    const auto out1_nh = cv::gimpl::GModel::dataNodeOf(gm, out[1]);
+
+    // Check handles inside isl0
+    EXPECT_EQ("isl0", gm.metadata(tmp0_nh).get<cv::gimpl::Island>().island);
+    EXPECT_EQ("isl0", gm.metadata(tmp1_nh).get<cv::gimpl::Island>().island);
+    // ^^^ Important - tmp1 is assigned to isl0, not isl1
+
+    // Check handles inside isl1
+    EXPECT_EQ("isl1", gm.metadata(tmp3_nh).get<cv::gimpl::Island>().island);
+
+    // Check outside handles
+    EXPECT_FALSE(gm.metadata(in0_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(in1_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(tmp2_nh).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(scl_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out0_nh).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out1_nh).contains<cv::gimpl::Island>());
+}
+
+
+TEST_F(ComplexIslands, IncompleteSpec)
+{
+    //       isl0
+    //       ...........................
+    // (in0) -> Not  -> (tmp0) --> Add ---------> (tmp2) --> AddC -------> (out0)
+    //       :...........xxx.......^...:                     ^
+    //                             :                         :
+    // (in1) -> Blur -> (tmp1) ----'--> Sum ----> (scl0) ----'
+    //                   :
+    //                   :
+    //                   `------------> Median -> (tmp3) --> Blur -------> (out1)
+    //
+
+    // tmp1 is missing in the spec below
+    EXPECT_ANY_THROW(cv::gapi::island("isl0", cv::GIn(in[0]),  cv::GOut(tmp[2])));
+
+    // empty range
+    EXPECT_ANY_THROW(cv::gapi::island("isl1", cv::GIn(tmp[2]),  cv::GOut(tmp[2])));
+}
+
+TEST_F(ComplexIslands, InputOperationFromDifferentIslands)
+{
+    //       isl1
+    //       ...........................                   ........
+    // (in0)--> Not  -> (tmp0) --> Add :--------> (tmp2)-->: AddC : -------> (out0)
+    //       :......................^..:                   :  ^   :
+    //       isl0                   :                      :  :   :
+    //       .......................:.......................  :   :
+    // (in1) :-> Blur -> (tmp1) ----'--> Sum ----> (scl0) -----   :
+    //       :....................................................:
+    //       isl0        :
+    //                   `------------> Median -> (tmp3) --> Blur -------> (out1)
+    //
+
+    cv::gapi::island("isl0", cv::GIn(in[1], tmp[2]), cv::GOut(out[0]));
+    cv::gapi::island("isl1", cv::GIn(in[0], tmp[1]), cv::GOut(tmp[2]));
+    auto cc = cv::GComputation(cv::GIn(in[0], in[1]), cv::GOut(out[0], out[1]))
+        .compile(cv::GMatDesc{CV_8U,1,{640,480}},
+                cv::GMatDesc{CV_8U,1,{640,480}});
+
+    const auto &gm = cc.priv().model();
+    const auto tmp0_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[0]);
+    const auto tmp1_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[1]);
+    const auto tmp2_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[2]);
+
+    EXPECT_EQ("isl1", gm.metadata(tmp0_nh).get<cv::gimpl::Island>().island);
+    EXPECT_EQ("isl0", gm.metadata(tmp1_nh).get<cv::gimpl::Island>().island);
+    EXPECT_FALSE(gm.metadata(tmp2_nh).contains<cv::gimpl::Island>());
+}
+
+TEST_F(ComplexIslands, NoWayBetweenNodes)
+{
+    // (in0) -> Not  -> (tmp0) --> Add ---------> (tmp2) --> AddC -------> (out0)
+    //                             ^                         ^
+    // (in1) -> Blur -> (tmp1) ----'--> Sum ----> (scl0) ----'
+    //                   :
+    //                   `------------> Median -> (tmp3) --> Blur -------> (out1)
+
+    EXPECT_ANY_THROW(cv::gapi::island("isl0", cv::GIn(in[1]), cv::GOut(tmp[0])));
+}
+
+TEST_F(ComplexIslands, IslandsContainUnusedPart)
+{
+    // Unused part of the graph
+    // x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x
+    // x                                                                               x
+    // x(in0) -> Not  -> (tmp0) --> Add ---------> (tmp2)---> AddC ---------> (out0)   x
+    // x                             ^                         ^                       x
+    // x x x x x x x x x x x x x x x | x x                     |                       x
+    //                               |   x                     |                       x
+    //          ......               |   x                     |                       x
+    // (in1) -> :Blur:----------> (tmp1) x-----> Sum ------> (scl0)                    x
+    //          ......    :              x x x x x x x x x x x x x x x x x x x x x x x x
+    //          isl0
+    //                    :
+    //                    `------------> Median -> (tmp3) --> Blur -------> (out1)
+
+    cv::gapi::island("isl0", cv::GIn(in[1]), cv::GOut(scl));
+    auto cc = cv::GComputation(cv::GIn(in[1]), cv::GOut(out[1]))
+        .compile(cv::GMatDesc{CV_8U,1,{640,480}});
+
+    const auto &gm = cc.priv().model();
+    const auto tmp1_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[1]);
+
+    // Output 0 is not requested from the graph, which means the scl node
+    // is never created, so tmp1 is not assigned to the island.
+    // FIXME: after island fusion, check that Blur is assigned to the
+    // island via producerOf()
+    EXPECT_FALSE(gm.metadata(tmp1_nh) .contains<cv::gimpl::Island>());
+}
+
+TEST_F(ComplexIslands, FullGraphInTwoIslands)
+{
+    //       isl0
+    //          ..................................................
+    // (in0) -> :Not -> (tmp0) --> Add ---------> (tmp2) --> AddC: -------> (out0)
+    //          ...................^....                     ^   :
+    //          ...............    |   :                     :   :
+    // (in1) -> :Blur-> (tmp1):----'-->:Sum ----> (scl0) ----'   :
+    //          ........ |    :        ...........................
+    //          isl1   : |    :............................................
+    //                 : `------------> Median -> (tmp3) --> Blur ------->:(out1)
+    //                 ....................................................
+
+    cv::gapi::island("isl0", cv::GIn(in[0], tmp[1]), cv::GOut(out[0]));
+    cv::gapi::island("isl1", cv::GIn(in[1]), cv::GOut(out[1]));
+    auto cc = cv::GComputation(cv::GIn(in[0], in[1]), cv::GOut(out[0], out[1]))
+        .compile(cv::GMatDesc{CV_8U,1,{640,480}},
+                cv::GMatDesc{CV_8U,1,{640,480}});
+
+    const auto &gm = cc.priv().model();
+    const auto in0_nh  = cv::gimpl::GModel::dataNodeOf(gm, in[0]);
+    const auto in1_nh  = cv::gimpl::GModel::dataNodeOf(gm, in[1]);
+    const auto tmp0_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[0]);
+    const auto tmp1_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[1]);
+    const auto tmp2_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[2]);
+    const auto tmp3_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[3]);
+    const auto scl_nh  = cv::gimpl::GModel::dataNodeOf(gm, scl);
+    const auto out0_nh = cv::gimpl::GModel::dataNodeOf(gm, out[0]);
+    const auto out1_nh = cv::gimpl::GModel::dataNodeOf(gm, out[1]);
+
+    // Check handles inside isl0
+    EXPECT_EQ("isl0", gm.metadata(tmp0_nh).get<cv::gimpl::Island>().island);
+    EXPECT_EQ("isl0", gm.metadata(tmp2_nh).get<cv::gimpl::Island>().island);
+    EXPECT_EQ("isl0", gm.metadata(scl_nh).get<cv::gimpl::Island>().island);
+
+    // Check handles inside isl1
+    EXPECT_EQ("isl1", gm.metadata(tmp1_nh).get<cv::gimpl::Island>().island);
+    EXPECT_EQ("isl1", gm.metadata(tmp3_nh).get<cv::gimpl::Island>().island);
+
+    // Check outside handles
+    EXPECT_FALSE(gm.metadata(in0_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(in1_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out0_nh).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out1_nh).contains<cv::gimpl::Island>());
+}
+
+TEST_F(ComplexIslands, OnlyOperationsAssignedToIslands)
+{
+    cv::gapi::island("isl0", cv::GIn(in[1]), cv::GOut(tmp[1]));
+    cv::gapi::island("isl1", cv::GIn(tmp[1]), cv::GOut(scl));
+    cv::gapi::island("isl2", cv::GIn(scl, tmp[2]), cv::GOut(out[0]));
+    cv::gapi::island("isl3", cv::GIn(in[0]), cv::GOut(tmp[0]));
+    cv::gapi::island("isl4", cv::GIn(tmp[0], tmp[1]), cv::GOut(tmp[2]));
+    cv::gapi::island("isl5", cv::GIn(tmp[1]), cv::GOut(tmp[3]));
+    cv::gapi::island("isl6", cv::GIn(tmp[3]), cv::GOut(out[1]));
+
+    auto cc = cv::GComputation(cv::GIn(in[0], in[1]), cv::GOut(out[0], out[1]))
+        .compile(cv::GMatDesc{CV_8U,1,{640,480}},
+                cv::GMatDesc{CV_8U,1,{640,480}});
+
+    const auto &gm = cc.priv().model();
+    //FIXME: Check that operation handles are really assigned to isl0..isl6
+    const auto in0_nh  = cv::gimpl::GModel::dataNodeOf(gm, in[0]);
+    const auto in1_nh  = cv::gimpl::GModel::dataNodeOf(gm, in[1]);
+    const auto tmp0_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[0]);
+    const auto tmp1_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[1]);
+    const auto tmp2_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[2]);
+    const auto tmp3_nh = cv::gimpl::GModel::dataNodeOf(gm, tmp[3]);
+    const auto scl_nh  = cv::gimpl::GModel::dataNodeOf(gm, scl);
+    const auto out0_nh = cv::gimpl::GModel::dataNodeOf(gm, out[0]);
+    const auto out1_nh = cv::gimpl::GModel::dataNodeOf(gm, out[1]);
+
+    EXPECT_FALSE(gm.metadata(in0_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(in1_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(tmp0_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(tmp1_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(tmp2_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(tmp3_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(scl_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out0_nh).contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out1_nh).contains<cv::gimpl::Island>());
+}
+
+namespace
+{
+    struct IslandStructureWithGArray
+    {
+        GIntArray in, out;
+        GMat tmp;
+
+        IslandStructureWithGArray()
+        {
+            tmp = CreateMatWithDiag::on(in);
+            out = Mat2Array::on(tmp);
+        }
+    };
+
+    struct IslandsWithGArray: public ::testing::Test, public IslandStructureWithGArray {};
+} // namespace
+
+TEST_F(IslandsWithGArray, IslandWithGArrayAsInput)
+{
+    cv::gapi::island("isl0", cv::GIn(in), cv::GOut(tmp));
+
+    const auto pkg = cv::gapi::kernels<CreateMatWithDiagImpl, Mat2ArrayImpl>();
+    auto cc = cv::GComputation(cv::GIn(in), GOut(out)).compile(cv::empty_array_desc(), cv::compile_args(pkg));
+    const auto &gm = cc.priv().model();
+
+    const auto in_nh   = cv::gimpl::GModel::dataNodeOf(gm, in.strip());
+    const auto out_nh  = cv::gimpl::GModel::dataNodeOf(gm, out.strip());
+    const auto tmp_nh  = cv::gimpl::GModel::dataNodeOf(gm, tmp);
+    GAPI_Assert(tmp_nh->inNodes().size() == 1);
+    const auto create_diag_mat_nh = tmp_nh->inNodes().front();
+
+    EXPECT_EQ("isl0", gm.metadata(create_diag_mat_nh).get<cv::gimpl::Island>().island);
+    EXPECT_FALSE(gm.metadata(in_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(tmp_nh) .contains<cv::gimpl::Island>());
+}
+
+TEST_F(IslandsWithGArray, IslandWithGArrayAsOutput)
+{
+    cv::gapi::island("isl0", cv::GIn(tmp), cv::GOut(out));
+
+    const auto pkg = cv::gapi::kernels<CreateMatWithDiagImpl, Mat2ArrayImpl>();
+    auto cc = cv::GComputation(cv::GIn(in), GOut(out)).compile(cv::empty_array_desc(), cv::compile_args(pkg));
+    const auto &gm = cc.priv().model();
+
+    const auto in_nh   = cv::gimpl::GModel::dataNodeOf(gm, in.strip());
+    const auto out_nh  = cv::gimpl::GModel::dataNodeOf(gm, out.strip());
+    const auto tmp_nh  = cv::gimpl::GModel::dataNodeOf(gm, tmp);
+    GAPI_Assert(out_nh->inNodes().size() == 1);
+    const auto mat2array_nh = out_nh->inNodes().front();
+
+    EXPECT_EQ("isl0", gm.metadata(mat2array_nh).get<cv::gimpl::Island>().island);
+    EXPECT_FALSE(gm.metadata(in_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(out_nh) .contains<cv::gimpl::Island>());
+    EXPECT_FALSE(gm.metadata(tmp_nh) .contains<cv::gimpl::Island>());
+}
+////////////////////////////////////////////////////////////////////////////////
+// Wrong input tests on island name
+//
+namespace
+{
+    struct CheckName : public TestWithParam<std::tuple<bool, const char*> >,
+                       public PlainIslandsFixture
+    {
+        void assignIsland(const std::string &s)
+        {
+            cv::gapi::island(s, cv::GIn(tmp[0]), cv::GOut(tmp[2]));
+        }
+    };
+    TEST_P(CheckName, Test)
+    {
+        bool correct = false;
+        const char *name = "";
+        std::tie(correct, name) = GetParam();
+        if (correct) EXPECT_NO_THROW(assignIsland(name));
+        else EXPECT_ANY_THROW(assignIsland(name));
+    }
+} // namespace
+INSTANTIATE_TEST_CASE_P(IslandName, CheckName,
+                        Values(std::make_tuple(true,  "name"),
+                               std::make_tuple(true,  " name "),
+                               std::make_tuple(true,  " n a m e "),
+                               std::make_tuple(true,  " 123 $$ %%"),
+                               std::make_tuple(true,  ".: -"),
+                               std::make_tuple(false, ""),
+                               std::make_tuple(false, " "),
+                               std::make_tuple(false, " \t "),
+                               std::make_tuple(false, "  \t \t   ")));
+
+// FIXME: add <internal> test on unrollExpr() use for islands
+
+} // opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_recompilation_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_recompilation_test.cpp
new file mode 100644 (file)
index 0000000..252af9c
--- /dev/null
@@ -0,0 +1,233 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "api/gcomputation_priv.hpp"
+
+#include "opencv2/gapi/fluid/gfluidkernel.hpp"
+#include "opencv2/gapi/fluid/core.hpp"
+#include "opencv2/gapi/fluid/imgproc.hpp"
+
+namespace opencv_test
+{
+
+TEST(GComputationCompile, NoRecompileWithSameMeta)
+{
+    cv::GMat in;
+    cv::GComputation cc(in, in+in);
+
+    cv::Mat in_mat1 = cv::Mat::eye  (32, 32, CV_8UC1);
+    cv::Mat in_mat2 = cv::Mat::zeros(32, 32, CV_8UC1);
+    cv::Mat out_mat;
+
+    cc.apply(in_mat1, out_mat);
+    auto comp1 = cc.priv().m_lastCompiled;
+
+    cc.apply(in_mat2, out_mat);
+    auto comp2 = cc.priv().m_lastCompiled;
+
+    // Both compiled objects are actually the same unique executable
+    EXPECT_EQ(&comp1.priv(), &comp2.priv());
+}
+
+TEST(GComputationCompile, NoRecompileWithWrongMeta)
+{
+    cv::GMat in;
+    cv::GComputation cc(in, in+in);
+
+    cv::Mat in_mat1 = cv::Mat::eye  (32, 32, CV_8UC1);
+    cv::Mat out_mat;
+
+    cc.apply(in_mat1, out_mat);
+    auto comp1 = cc.priv().m_lastCompiled;
+
+    EXPECT_THROW(cc.apply(cv::gin(cv::Scalar(128)), cv::gout(out_mat)), std::logic_error);
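+    // The failed apply() must not replace the cached executable.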
+    auto comp2 = cc.priv().m_lastCompiled;
+
+    // Both compiled objects are actually the same unique executable
+    EXPECT_EQ(&comp1.priv(), &comp2.priv());
+}
+
+TEST(GComputationCompile, RecompileWithDifferentMeta)
+{
+    cv::GMat in;
+    cv::GComputation cc(in, in+in);
+
+    cv::Mat in_mat1 = cv::Mat::eye  (32, 32, CV_8UC1);
+    cv::Mat in_mat2 = cv::Mat::zeros(64, 64, CV_32F);
+    cv::Mat out_mat;
+
+    cc.apply(in_mat1, out_mat);
+    auto comp1 = cc.priv().m_lastCompiled;
+
+    cc.apply(in_mat2, out_mat);
+    auto comp2 = cc.priv().m_lastCompiled;
+
+    // Both compiled objects are different
+    EXPECT_NE(&comp1.priv(), &comp2.priv());
+}
+
+TEST(GComputationCompile, FluidReshapeWithDifferentDims)
+{
+    cv::GMat in;
+    cv::GComputation cc(in, in+in);
+
+    cv::Mat in_mat1 = cv::Mat::eye  (32, 32, CV_8UC1);
+    cv::Mat in_mat2 = cv::Mat::zeros(64, 64, CV_8UC1);
+    cv::Mat out_mat;
+
+    cc.apply(in_mat1, out_mat, cv::compile_args(cv::gapi::core::fluid::kernels()));
+    auto comp1 = cc.priv().m_lastCompiled;
+
+    cc.apply(in_mat2, out_mat);
+    auto comp2 = cc.priv().m_lastCompiled;
+
+    // Both compiled objects are actually the same unique executable
+    EXPECT_EQ(&comp1.priv(), &comp2.priv());
+}
+
+TEST(GComputationCompile, FluidReshapeResizeDownScale)
+{
+    cv::Size szOut(4, 4);
+    cv::GMat in;
+    cv::GComputation cc(in, cv::gapi::resize(in, szOut));
+
+    cv::Mat in_mat1( 8,  8, CV_8UC3);
+    cv::Mat in_mat2(16, 16, CV_8UC3);
+    cv::randu(in_mat1, cv::Scalar::all(0), cv::Scalar::all(255));
+    cv::randu(in_mat2, cv::Scalar::all(0), cv::Scalar::all(255));
+    cv::Mat out_mat1, out_mat2;
+
+    cc.apply(in_mat1, out_mat1, cv::compile_args(cv::gapi::core::fluid::kernels()));
+    auto comp1 = cc.priv().m_lastCompiled;
+
+    cc.apply(in_mat2, out_mat2);
+    auto comp2 = cc.priv().m_lastCompiled;
+
+    // Both compiled objects are actually the same unique executable
+    EXPECT_EQ(&comp1.priv(), &comp2.priv());
+
+    cv::Mat cv_out_mat1, cv_out_mat2;
+    cv::resize(in_mat1, cv_out_mat1, szOut);
+    cv::resize(in_mat2, cv_out_mat2, szOut);
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat1 != cv_out_mat1));
+    EXPECT_EQ(0, cv::countNonZero(out_mat2 != cv_out_mat2));
+}
+
+TEST(GComputationCompile, FluidReshapeSwitchToUpscaleFromDownscale)
+{
+    cv::Size szOut(4, 4);
+    cv::GMat in;
+    cv::GComputation cc(in, cv::gapi::resize(in, szOut));
+
+    cv::Mat in_mat1( 8,  8, CV_8UC3);
+    cv::Mat in_mat2( 2,  2, CV_8UC3);
+    cv::Mat in_mat3(16, 16, CV_8UC3);
+    cv::randu(in_mat1, cv::Scalar::all(0), cv::Scalar::all(255));
+    cv::randu(in_mat2, cv::Scalar::all(0), cv::Scalar::all(255));
+    cv::randu(in_mat3, cv::Scalar::all(0), cv::Scalar::all(255));
+    cv::Mat out_mat1, out_mat2, out_mat3;
+
+    cc.apply(in_mat1, out_mat1, cv::compile_args(cv::gapi::core::fluid::kernels()));
+    auto comp1 = cc.priv().m_lastCompiled;
+
+    cc.apply(in_mat2, out_mat2);
+    auto comp2 = cc.priv().m_lastCompiled;
+
+    cc.apply(in_mat3, out_mat3);
+    auto comp3 = cc.priv().m_lastCompiled;
+
+    EXPECT_EQ(&comp1.priv(), &comp2.priv());
+    EXPECT_EQ(&comp1.priv(), &comp3.priv());
+
+    cv::Mat cv_out_mat1, cv_out_mat2, cv_out_mat3;
+    cv::resize(in_mat1, cv_out_mat1, szOut);
+    cv::resize(in_mat2, cv_out_mat2, szOut);
+    cv::resize(in_mat3, cv_out_mat3, szOut);
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat1 != cv_out_mat1));
+    EXPECT_EQ(0, cv::countNonZero(out_mat2 != cv_out_mat2));
+    EXPECT_EQ(0, cv::countNonZero(out_mat3 != cv_out_mat3));
+}
+
+TEST(GComputationCompile, ReshapeBlur)
+{
+    cv::Size kernelSize{3, 3};
+    cv::GMat in;
+    cv::GComputation cc(in, cv::gapi::blur(in, kernelSize));
+
+    cv::Mat in_mat1( 8,  8, CV_8UC1);
+    cv::Mat in_mat2(16, 16, CV_8UC1);
+    cv::randu(in_mat1, cv::Scalar::all(0), cv::Scalar::all(255));
+    cv::randu(in_mat2, cv::Scalar::all(0), cv::Scalar::all(255));
+    cv::Mat out_mat1, out_mat2;
+
+    cc.apply(in_mat1, out_mat1, cv::compile_args(cv::gapi::imgproc::fluid::kernels()));
+    auto comp1 = cc.priv().m_lastCompiled;
+
+    cc.apply(in_mat2, out_mat2);
+    auto comp2 = cc.priv().m_lastCompiled;
+
+    // Both compiled objects are actually the same unique executable
+    EXPECT_EQ(&comp1.priv(), &comp2.priv());
+
+    cv::Mat cv_out_mat1, cv_out_mat2;
+    cv::blur(in_mat1, cv_out_mat1, kernelSize);
+    cv::blur(in_mat2, cv_out_mat2, kernelSize);
+
+    EXPECT_EQ(0, cv::countNonZero(out_mat1 != cv_out_mat1));
+    EXPECT_EQ(0, cv::countNonZero(out_mat2 != cv_out_mat2));
+}
+
+TEST(GComputationCompile, ReshapeRois)
+{
+    cv::Size kernelSize{3, 3};
+    cv::Size szOut(8, 8);
+    cv::GMat in;
+    auto blurred = cv::gapi::blur(in, kernelSize);
+    cv::GComputation cc(in, cv::gapi::resize(blurred, szOut));
+
+    cv::Mat first_in_mat(8, 8, CV_8UC3);
+    cv::randn(first_in_mat, cv::Scalar::all(127), cv::Scalar::all(40.f));
+    cv::Mat first_out_mat;
+    auto fluidKernels = cv::gapi::combine(gapi::imgproc::fluid::kernels(),
+                                          gapi::core::fluid::kernels(),
+                                          cv::unite_policy::REPLACE);
+    cc.apply(first_in_mat, first_out_mat, cv::compile_args(fluidKernels));
+    auto first_comp = cc.priv().m_lastCompiled;
+
+    constexpr int niter = 4;
+    for (int i = 0; i < niter; i++)
+    {
+        int width  = 4 + 2*i;
+        int height = width;
+        cv::Mat in_mat(width, height, CV_8UC3);
+        cv::randn(in_mat, cv::Scalar::all(127), cv::Scalar::all(40.f));
+        cv::Mat out_mat = cv::Mat::zeros(szOut, CV_8UC3);
+
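+        // Each iteration requests only the i-th horizontal stripe of the
+        // output via GFluidOutputRois, while the input size keeps changing.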
+        int x = 0;
+        int y = szOut.height * i / niter;
+        int roiW = szOut.width;
+        int roiH = szOut.height / niter;
+        cv::Rect roi{x, y, roiW, roiH};
+
+        cc.apply(in_mat, out_mat, cv::compile_args(cv::GFluidOutputRois{{to_own(roi)}}));
+        auto comp = cc.priv().m_lastCompiled;
+
+        EXPECT_EQ(&first_comp.priv(), &comp.priv());
+
+        cv::Mat blur_mat, cv_out_mat;
+        cv::blur(in_mat, blur_mat, kernelSize);
+        cv::resize(blur_mat, cv_out_mat, szOut);
+
+        EXPECT_EQ(0, cv::countNonZero(out_mat(roi) != cv_out_mat(roi)));
+    }
+}
+
+} // opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_resolve_kernel_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_resolve_kernel_test.cpp
new file mode 100644 (file)
index 0000000..d4b16f6
--- /dev/null
@@ -0,0 +1,119 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+#include "gapi_mock_kernels.hpp"
+
+namespace opencv_test
+{
+
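+// cv::gapi::lookup_order() builds a backend preference list: kernel lookup
+// walks the backends in the listed order and picks the first match.
+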
+TEST(Lookup, CreateOrder)
+{
+    const auto order = cv::gapi::lookup_order({Jupiter::backend(),
+                                               Saturn::backend()});
+    EXPECT_EQ(2u, order.size());
+    EXPECT_EQ(Jupiter::backend(), order[0]);
+    EXPECT_EQ(Saturn ::backend(), order[1]);
+}
+
+TEST(Lookup, NoOrder)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    const auto pkg = cv::gapi::kernels<J::Foo, J::Bar, J::Baz,
+                                       S::Foo, S::Bar, S::Baz>();
+
+    EXPECT_NO_THROW (pkg.lookup<I::Foo>());
+    EXPECT_NO_THROW (pkg.lookup<I::Bar>());
+    EXPECT_NO_THROW (pkg.lookup<I::Baz>());
+    EXPECT_ANY_THROW(pkg.lookup<I::Qux>());
+}
+
+TEST(Lookup, Only_Jupiter)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    const auto pkg = cv::gapi::kernels<J::Foo, J::Bar, J::Baz,
+                                       S::Foo, S::Bar, S::Baz>();
+
+    auto order = cv::gapi::lookup_order({J::backend()});
+
+    EXPECT_EQ(J::backend(), pkg.lookup<I::Foo>(order));
+    EXPECT_EQ(J::backend(), pkg.lookup<I::Bar>(order));
+    EXPECT_EQ(J::backend(), pkg.lookup<I::Baz>(order));
+    EXPECT_ANY_THROW(pkg.lookup<I::Qux>(order));
+}
+
+TEST(Lookup, Only_Saturn)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    const auto pkg = cv::gapi::kernels<J::Foo, J::Bar, J::Baz,
+                                       S::Foo, S::Bar, S::Baz>();
+
+    auto order = cv::gapi::lookup_order({S::backend()});
+
+    EXPECT_EQ(S::backend(), pkg.lookup<I::Foo>(order));
+    EXPECT_EQ(S::backend(), pkg.lookup<I::Bar>(order));
+    EXPECT_EQ(S::backend(), pkg.lookup<I::Baz>(order));
+    EXPECT_ANY_THROW(pkg.lookup<I::Qux>(order));
+}
+
+TEST(Lookup, With_Order)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    const auto pkg = cv::gapi::kernels<J::Foo, J::Bar, J::Baz,
+                                       S::Foo, S::Bar, S::Baz>();
+
+    auto prefer_j = cv::gapi::lookup_order({J::backend(), S::backend()});
+    EXPECT_EQ(J::backend(), pkg.lookup<I::Foo>(prefer_j));
+    EXPECT_EQ(J::backend(), pkg.lookup<I::Bar>(prefer_j));
+    EXPECT_EQ(J::backend(), pkg.lookup<I::Baz>(prefer_j));
+    EXPECT_ANY_THROW(pkg.lookup<I::Qux>(prefer_j));
+
+    auto prefer_s = cv::gapi::lookup_order({S::backend(), J::backend()});
+    EXPECT_EQ(S::backend(), pkg.lookup<I::Foo>(prefer_s));
+    EXPECT_EQ(S::backend(), pkg.lookup<I::Bar>(prefer_s));
+    EXPECT_EQ(S::backend(), pkg.lookup<I::Baz>(prefer_s));
+    EXPECT_ANY_THROW(pkg.lookup<I::Qux>(prefer_s));
+}
+
+TEST(Lookup, NoOverlap)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    const auto pkg = cv::gapi::kernels<J::Foo, J::Bar, S::Baz, S::Qux>();
+    EXPECT_EQ(J::backend(), pkg.lookup<I::Foo>());
+    EXPECT_EQ(J::backend(), pkg.lookup<I::Bar>());
+    EXPECT_EQ(S::backend(), pkg.lookup<I::Baz>());
+    EXPECT_EQ(S::backend(), pkg.lookup<I::Qux>());
+}
+
+TEST(Lookup, ExtraBackend)
+{
+    namespace J = Jupiter;
+    namespace S = Saturn;
+    const auto pkg = cv::gapi::kernels<J::Foo, J::Bar, J::Baz>();
+
+    // Even though pkg contains no S kernels while S is the preferred
+    // backend, the lookup should still succeed via J.
+    const auto prefer_sj = cv::gapi::lookup_order({S::backend(), J::backend()});
+    EXPECT_EQ(J::backend(), pkg.lookup<I::Foo>(prefer_sj));
+    EXPECT_EQ(J::backend(), pkg.lookup<I::Bar>(prefer_sj));
+    EXPECT_EQ(J::backend(), pkg.lookup<I::Baz>(prefer_sj));
+
+    // If the search scope is limited to S only, neither J nor S kernels
+    // should be found
+    const auto only_s = cv::gapi::lookup_order({S::backend()});
+    EXPECT_ANY_THROW(pkg.lookup<I::Foo>(only_s));
+    EXPECT_ANY_THROW(pkg.lookup<I::Bar>(only_s));
+    EXPECT_ANY_THROW(pkg.lookup<I::Baz>(only_s));
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_vectorref_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_int_vectorref_test.cpp
new file mode 100644 (file)
index 0000000..1b14e06
--- /dev/null
@@ -0,0 +1,207 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+
+namespace opencv_test
+{
+
+typedef ::testing::Types<int, cv::Point, cv::Rect> VectorRef_Test_Types;
+
+template<typename T> struct VectorRefT: public ::testing::Test { using Type = T; };
+
+TYPED_TEST_CASE(VectorRefT, VectorRef_Test_Types);
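+
+// Storage modes referenced in the comments below (as used by VectorRefT):
+//   RW_OWN - the ref owns an internal vector (created via reset());
+//   RW_EXT - the ref wraps an external non-const vector (read/write);
+//   RO_EXT - the ref wraps an external const vector (read-only).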
+
+TYPED_TEST(VectorRefT, Reset_Valid)
+{
+    using T = typename TestFixture::Type;
+    cv::detail::VectorRefT<T> ref;       // vector ref created empty
+    EXPECT_NO_THROW(ref.reset());        // 1st reset is OK (initializes)
+    EXPECT_NO_THROW(ref.reset());        // 2nd reset is also OK (resets)
+}
+
+TYPED_TEST(VectorRefT, Reset_Invalid)
+{
+    using T = typename TestFixture::Type;
+    std::vector<T> vec(42);              // create a std::vector of 42 elements
+    cv::detail::VectorRefT<T> ref(vec);  // RO_EXT (since reference is const)
+    EXPECT_ANY_THROW(ref.reset());       // data-bound vector ref can't be reset
+}
+
+TYPED_TEST(VectorRefT, ReadRef_External)
+{
+    using T = typename TestFixture::Type;
+    const std::vector<T> vec(42);        // create a std::vector of 42 elements
+    cv::detail::VectorRefT<T> ref(vec);  // RO_EXT (since reference is const)
+    auto &vref = ref.rref();
+    EXPECT_EQ(vec.data(), vref.data());
+    EXPECT_EQ(vec.size(), vref.size());
+}
+
+TYPED_TEST(VectorRefT, ReadRef_Internal)
+{
+    using T = typename TestFixture::Type;
+    cv::detail::VectorRefT<T> ref;
+    ref.reset();                         // RW_OWN (reset on empty ref)
+    auto &vref = ref.rref();             // read access is valid for RW_OWN
+    EXPECT_EQ(0u, vref.size());          // by default vector is empty
+}
+
+TYPED_TEST(VectorRefT, WriteRef_External)
+{
+    using T = typename TestFixture::Type;
+    std::vector<T> vec(42);               // create a std::vector of 42 elements
+    cv::detail::VectorRefT<T> ref(vec);   // RW_EXT (since reference is not const)
+    auto &vref = ref.wref();              // write access is valid with RW_EXT
+    EXPECT_EQ(vec.data(), vref.data());
+    EXPECT_EQ(vec.size(), vref.size());
+}
+
+TYPED_TEST(VectorRefT, WriteRef_Internal)
+{
+    using T = typename TestFixture::Type;
+    cv::detail::VectorRefT<T> ref;
+    ref.reset();                          // RW_OWN (reset on empty ref)
+    auto &vref = ref.wref();              // write access is valid for RW_OWN
+    EXPECT_EQ(0u, vref.size());           // empty vector by default
+}
+
+TYPED_TEST(VectorRefT, WriteToRO)
+{
+    using T = typename TestFixture::Type;
+    const std::vector<T> vec(42);        // create a std::vector of 42 elements
+    cv::detail::VectorRefT<T> ref(vec);  // RO_EXT (since reference is const)
+    EXPECT_ANY_THROW(ref.wref());
+}
+
+TYPED_TEST(VectorRefT, ReadAfterWrite)
+{
+    using T = typename TestFixture::Type;
+    std::vector<T> vec;                        // Initial data holder (empty vector)
+    cv::detail::VectorRefT<T> writer(vec);     // RW_EXT
+
+    const auto& ro_ref = vec;
+    cv::detail::VectorRefT<T> reader(ro_ref);  // RO_EXT
+
+    EXPECT_EQ(0u, writer.wref().size()); // Check the initial state
+    EXPECT_EQ(0u, reader.rref().size());
+
+    writer.wref().emplace_back();        // Check that the write is successful
+    EXPECT_EQ(1u, writer.wref().size());
+
+    EXPECT_EQ(1u, vec.size());           // Check that changes are reflected in the original container
+    EXPECT_EQ(1u, reader.rref().size()); // Check that changes are reflected in the reader's view
+
+    EXPECT_EQ(T(), vec.at(0));           // Check the value (must be default-initialized)
+    EXPECT_EQ(T(), reader.rref().at(0));
+    EXPECT_EQ(T(), writer.wref().at(0));
+}
+
+template<typename T> struct VectorRefU: public ::testing::Test { using Type = T; };
+
+TYPED_TEST_CASE(VectorRefU, VectorRef_Test_Types);
+
+template<class T> struct custom_struct { T a; T b; };
+
+TYPED_TEST(VectorRefU, Reset_Valid)
+{
+    using T = typename TestFixture::Type;
+    cv::detail::VectorRef ref;           // vector ref created empty
+    EXPECT_NO_THROW(ref.reset<T>());     // 1st reset is OK (initializes)
+    EXPECT_NO_THROW(ref.reset<T>());     // 2nd reset is also OK (resets)
+
+    EXPECT_ANY_THROW(ref.reset<custom_struct<T> >()); // type change is not allowed
+}
+
+TYPED_TEST(VectorRefU, Reset_Invalid)
+{
+    using T = typename TestFixture::Type;
+    std::vector<T> vec(42);              // create a std::vector of 42 elements
+    cv::detail::VectorRef ref(vec);      // RO_EXT (since reference is const)
+    EXPECT_ANY_THROW(ref.reset<T>());    // data-bound vector ref can't be reset
+}
+
+TYPED_TEST(VectorRefU, ReadRef_External)
+{
+    using T = typename TestFixture::Type;
+    const std::vector<T> vec(42);        // create a std::vector of 42 elements
+    cv::detail::VectorRef ref(vec);      // RO_EXT (since reference is const)
+    auto &vref = ref.rref<T>();
+    EXPECT_EQ(vec.data(), vref.data());
+    EXPECT_EQ(vec.size(), vref.size());
+}
+
+TYPED_TEST(VectorRefU, ReadRef_Internal)
+{
+    using T = typename TestFixture::Type;
+    cv::detail::VectorRef ref;
+    ref.reset<T>();                      // RW_OWN (reset on empty ref)
+    auto &vref = ref.rref<T>();          // read access is valid for RW_OWN
+    EXPECT_EQ(0u, vref.size());          // by default vector is empty
+}
+
+TYPED_TEST(VectorRefU, WriteRef_External)
+{
+    using T = typename TestFixture::Type;
+    std::vector<T> vec(42);             // create a std::vector of 42 elements
+    cv::detail::VectorRef ref(vec);     // RW_EXT (since reference is not const)
+    auto &vref = ref.wref<T>();         // write access is valid with RW_EXT
+    EXPECT_EQ(vec.data(), vref.data());
+    EXPECT_EQ(vec.size(), vref.size());
+}
+
+TYPED_TEST(VectorRefU, WriteRef_Internal)
+{
+    using T = typename TestFixture::Type;
+    cv::detail::VectorRef ref;
+    ref.reset<T>();                     // RW_OWN (reset on empty ref)
+    auto &vref = ref.wref<T>();         // write access is valid for RW_OWN
+    EXPECT_EQ(0u, vref.size());         // empty vector by default
+}
+
+TYPED_TEST(VectorRefU, WriteToRO)
+{
+    using T = typename TestFixture::Type;
+    const std::vector<T> vec(42);       // create a std::vector of 42 elements
+    cv::detail::VectorRef ref(vec);     // RO_EXT (since reference is const)
+    EXPECT_ANY_THROW(ref.wref<T>());
+}
+
+TYPED_TEST(VectorRefU, ReadAfterWrite)
+{
+    using T = typename TestFixture::Type;
+    std::vector<T> vec;                     // Initial data holder (empty vector)
+    cv::detail::VectorRef writer(vec);      // RW_EXT
+
+    const auto& ro_ref = vec;
+    cv::detail::VectorRef reader(ro_ref);   // RO_EXT
+
+    EXPECT_EQ(0u, writer.wref<T>().size()); // Check the initial state
+    EXPECT_EQ(0u, reader.rref<T>().size());
+
+    writer.wref<T>().emplace_back();        // Check that the write is successful
+    EXPECT_EQ(1u, writer.wref<T>().size());
+
+    EXPECT_EQ(1u, vec.size());              // Check that changes are reflected in the original container
+    EXPECT_EQ(1u, reader.rref<T>().size()); // Check that changes are reflected in the reader's view
+
+    EXPECT_EQ(T(), vec.at(0));              // Check the value (must be default-initialized)
+    EXPECT_EQ(T(), reader.rref<T>().at(0));
+    EXPECT_EQ(T(), writer.wref<T>().at(0));
+}
+
+TEST(VectorRefU, TypeCheck)
+{
+    cv::detail::VectorRef ref;
+    ref.reset<int>(); // RW_OWN
+
+    EXPECT_ANY_THROW(ref.reset<char>());
+    EXPECT_ANY_THROW(ref.rref<char>());
+    EXPECT_ANY_THROW(ref.wref<char>());
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_transactions_test.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/internal/gapi_transactions_test.cpp
new file mode 100644 (file)
index 0000000..f550340
--- /dev/null
@@ -0,0 +1,222 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include <ade/graph.hpp>
+#include "compiler/transactions.hpp"
+
+namespace opencv_test
+{
+namespace
+{
+
+bool contains(const ade::Graph& graph, const ade::NodeHandle& node)
+{
+    auto nodes = graph.nodes();
+    return nodes.end() != std::find(nodes.begin(), nodes.end(), node);
+}
+
+bool connected(const ade::NodeHandle& src_node, const ade::NodeHandle& dst_node)
+{
+    auto nodes = src_node->outNodes();
+    return nodes.end() != std::find(nodes.begin(), nodes.end(), dst_node);
+}
+
+struct SimpleGraph
+{
+    //       ehs[0]      ehs[1]     ehs[2]     ehs[3]
+    // nhs[0] --> nhs[1] --> nhs[2] --> nhs[3] --> nhs[4]
+
+    enum { node_nums = 5 };
+    ade::Graph        graph;
+    ade::NodeHandle   fused_nh;                     /* Used to check that the fused node is connected
+                                                       to the producer's inputs and the consumer's outputs */
+    std::array<ade::NodeHandle, node_nums>     nhs;
+    std::array<ade::EdgeHandle, node_nums - 1> ehs;
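+
+    // Journal of pending graph mutations: commit() replays them onto the
+    // graph, while rollback() undoes them (in reverse order).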
+    Change::List changes;
+
+    SimpleGraph()
+    {
+        nhs[0] = graph.createNode();
+        for (int i = 1; i < node_nums; ++i)
+        {
+            nhs[i    ] = graph.createNode();
+            ehs[i - 1] = graph.link(nhs[i - 1], nhs[i]);
+        }
+    }
+
+    void fuse()
+    {
+        // nhs[0] --> fused_nh --> nhs[4]
+
+        fused_nh = graph.createNode();
+        changes.enqueue<Change::NodeCreated>(fused_nh);
+        changes.enqueue<Change::NewLink> (graph, nhs[0],    fused_nh);
+        changes.enqueue<Change::DropLink>(graph, nhs[1],    ehs[0]);
+        changes.enqueue<Change::NewLink> (graph, fused_nh, nhs[4]);
+        changes.enqueue<Change::DropLink>(graph, nhs[3],    ehs[3]);
+        changes.enqueue<Change::DropLink>(graph, nhs[1],    ehs[1]);
+        changes.enqueue<Change::DropLink>(graph, nhs[2],    ehs[2]);
+        changes.enqueue<Change::DropNode>(nhs[1]);
+        changes.enqueue<Change::DropNode>(nhs[2]);
+        changes.enqueue<Change::DropNode>(nhs[3]);
+    }
+
+    void commit()   { changes.commit(graph);   }
+    void rollback() { changes.rollback(graph); }
+
+};
+
+struct Transactions: public ::testing::Test, public SimpleGraph {};
+
+} // anonymous namespace
+
+TEST_F(Transactions, NodeCreated_Create)
+{
+    auto new_nh = graph.createNode();
+    Change::NodeCreated node_created(new_nh);
+
+    EXPECT_EQ(6u, static_cast<std::size_t>(graph.nodes().size()));
+    EXPECT_TRUE(contains(graph, new_nh));
+}
+
+TEST_F(Transactions, NodeCreated_RollBack)
+{
+    auto new_nh = graph.createNode();
+    Change::NodeCreated node_created(new_nh);
+
+    node_created.rollback(graph);
+
+    EXPECT_EQ(5u, static_cast<std::size_t>(graph.nodes().size()));
+    EXPECT_FALSE(contains(graph, new_nh));
+}
+
+TEST_F(Transactions, NodeCreated_Commit)
+{
+    auto new_nh = graph.createNode();
+    Change::NodeCreated node_created(new_nh);
+
+    node_created.commit(graph);
+
+    EXPECT_EQ(6u, static_cast<std::size_t>(graph.nodes().size()));
+    EXPECT_TRUE(contains(graph, new_nh));
+}
+
+TEST_F(Transactions, DropLink_Create)
+{
+    Change::DropLink drop_link(graph, nhs[0], ehs[0]);
+
+    EXPECT_FALSE(connected(nhs[0], nhs[1]));
+}
+
+TEST_F(Transactions, DropLink_RollBack)
+{
+    Change::DropLink drop_link(graph, nhs[0], ehs[0]);
+
+    drop_link.rollback(graph);
+
+    EXPECT_TRUE(connected(nhs[0], nhs[1]));
+}
+
+TEST_F(Transactions, DropLink_Commit)
+{
+    Change::DropLink drop_link(graph, nhs[0], ehs[0]);
+
+    drop_link.commit(graph);
+
+    EXPECT_FALSE(connected(nhs[0], nhs[1]));
+}
+
+TEST_F(Transactions, NewLink_Create)
+{
+    auto new_nh = graph.createNode();
+    Change::NewLink new_link(graph, new_nh, nhs[0]);
+
+    EXPECT_TRUE(connected(new_nh, nhs[0]));
+}
+
+TEST_F(Transactions, NewLink_RollBack)
+{
+    auto new_nh = graph.createNode();
+    Change::NewLink new_link(graph, new_nh, nhs[0]);
+
+    new_link.rollback(graph);
+
+    EXPECT_FALSE(connected(new_nh, nhs[0]));
+}
+
+TEST_F(Transactions, NewLink_Commit)
+{
+    auto new_nh = graph.createNode();
+    Change::NewLink new_link(graph, new_nh, nhs[0]);
+
+    new_link.commit(graph);
+
+    EXPECT_TRUE(connected(new_nh, nhs[0]));
+}
+
+TEST_F(Transactions, DropNode_Create)
+{
+    auto new_nh = graph.createNode();
+    Change::DropNode drop_node(new_nh);
+
+    EXPECT_EQ(6u, static_cast<std::size_t>(graph.nodes().size()));
+    EXPECT_TRUE(contains(graph, new_nh));
+}
+
+TEST_F(Transactions, DropNode_RollBack)
+{
+    auto new_nh = graph.createNode();
+    Change::DropNode drop_node(new_nh);
+
+    drop_node.rollback(graph);
+
+    EXPECT_EQ(6u, static_cast<std::size_t>(graph.nodes().size()));
+    EXPECT_TRUE(contains(graph, new_nh));
+}
+
+TEST_F(Transactions, DropNode_Commit)
+{
+    auto new_nh = graph.createNode();
+    Change::DropNode drop_node(new_nh);
+
+    drop_node.commit(graph);
+
+    EXPECT_EQ(5u, static_cast<std::size_t>(graph.nodes().size()));
+    EXPECT_FALSE(contains(graph, new_nh));
+}
+
+TEST_F(Transactions, Fusion_Commit)
+{
+    namespace C = Change;
+
+    fuse();
+    commit();
+
+    EXPECT_EQ(3u, static_cast<std::size_t>(graph.nodes().size()));
+    EXPECT_TRUE(connected(nhs[0]   , fused_nh));
+    EXPECT_TRUE(connected(fused_nh, nhs[4]));
+}
+
+TEST_F(Transactions, Fusion_RollBack)
+{
+    namespace C = Change;
+
+    fuse();
+    rollback();
+
+    EXPECT_EQ(static_cast<std::size_t>(node_nums),
+              static_cast<std::size_t>(graph.nodes().size()));
+    EXPECT_FALSE(contains(graph, fused_nh));
+
+    for (int i = 0; i < static_cast<int>(node_nums) - 1; ++i)
+    {
+        EXPECT_TRUE(connected(nhs[i], nhs[i + 1]));
+    }
+}
+
+} // opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/own/gapi_types_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/own/gapi_types_tests.cpp
new file mode 100644 (file)
index 0000000..c254357
--- /dev/null
@@ -0,0 +1,159 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "opencv2/gapi/own/types.hpp"
+
+namespace opencv_test
+{
+
+TEST(Point, CreateEmpty)
+{
+    cv::gapi::own::Point p;
+
+    EXPECT_EQ(0, p.x);
+    EXPECT_EQ(0, p.y);
+}
+
+TEST(Point, CreateWithParams)
+{
+    cv::gapi::own::Point p = {1, 2};
+
+    EXPECT_EQ(1, p.x);
+    EXPECT_EQ(2, p.y);
+}
+
+TEST(Rect, CreateEmpty)
+{
+    cv::gapi::own::Rect r;
+
+    EXPECT_EQ(0, r.x);
+    EXPECT_EQ(0, r.y);
+    EXPECT_EQ(0, r.width);
+    EXPECT_EQ(0, r.height);
+}
+
+TEST(Rect, CreateWithParams)
+{
+    cv::gapi::own::Rect r(1, 2, 3, 4);
+
+    EXPECT_EQ(1, r.x);
+    EXPECT_EQ(2, r.y);
+    EXPECT_EQ(3, r.width);
+    EXPECT_EQ(4, r.height);
+}
+
+TEST(Rect, CompareEqual)
+{
+    cv::gapi::own::Rect r1(1, 2, 3, 4);
+
+    cv::gapi::own::Rect r2(1, 2, 3, 4);
+
+    EXPECT_TRUE(r1 == r2);
+}
+
+TEST(Rect, CompareDefaultEqual)
+{
+    cv::gapi::own::Rect r1;
+
+    cv::gapi::own::Rect r2;
+
+    EXPECT_TRUE(r1 == r2);
+}
+
+TEST(Rect, CompareNotEqual)
+{
+    cv::gapi::own::Rect r1(1, 2, 3, 4);
+
+    cv::gapi::own::Rect r2;
+
+    EXPECT_TRUE(r1 != r2);
+}
+
+TEST(Rect, Intersection)
+{
+    cv::gapi::own::Rect r1(2, 2, 3, 3);
+    cv::gapi::own::Rect r2(3, 1, 3, 3);
+
+    cv::gapi::own::Rect intersect = r1 & r2;
+
+    EXPECT_EQ(3, intersect.x);
+    EXPECT_EQ(2, intersect.y);
+    EXPECT_EQ(2, intersect.width);
+    EXPECT_EQ(2, intersect.height);
+}
+
+TEST(Rect, AssignIntersection)
+{
+    cv::gapi::own::Rect r1(2, 2, 3, 3);
+    cv::gapi::own::Rect r2(3, 1, 3, 3);
+
+    r1 &= r2;
+
+    EXPECT_EQ(3, r1.x);
+    EXPECT_EQ(2, r1.y);
+    EXPECT_EQ(2, r1.width);
+    EXPECT_EQ(2, r1.height);
+}
+
+TEST(Size, CreateEmpty)
+{
+    cv::gapi::own::Size s;
+
+    EXPECT_EQ(0, s.width);
+    EXPECT_EQ(0, s.height);
+}
+
+TEST(Size, CreateWithParams)
+{
+    cv::gapi::own::Size s(640, 480);
+
+    EXPECT_EQ(640, s.width);
+    EXPECT_EQ(480, s.height);
+}
+
+TEST(Size, AdditionAssignment)
+{
+    cv::gapi::own::Size s1(1, 2);
+    cv::gapi::own::Size s2(2, 3);
+
+    s1 += s2;
+
+    EXPECT_EQ(3, s1.width);
+    EXPECT_EQ(5, s1.height);
+}
+
+TEST(Size, CompareEqual)
+{
+    cv::gapi::own::Size s1(1, 2);
+
+    cv::gapi::own::Size s2(1, 2);
+
+    EXPECT_TRUE(s1 == s2);
+    EXPECT_FALSE(s1 != s2);
+}
+
+TEST(Size, CompareDefaultEqual)
+{
+    cv::gapi::own::Size s1;
+    cv::gapi::own::Size s2;
+
+    EXPECT_TRUE(s1 == s2);
+    EXPECT_FALSE(s1 != s2);
+}
+
+TEST(Size, CompareNotEqual)
+{
+    cv::gapi::own::Size s1(1, 2);
+
+    cv::gapi::own::Size s2(3, 4);
+
+    EXPECT_FALSE(s1 == s2);
+    EXPECT_TRUE(s1 != s2);
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/own/mat_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/own/mat_tests.cpp
new file mode 100644 (file)
index 0000000..ba2cd2d
--- /dev/null
@@ -0,0 +1,387 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "opencv2/gapi/own/mat.hpp"
+#include <opencv2/gapi/util/compiler_hints.hpp> //suppress_unused_warning
+
+namespace opencv_test
+{
+using Mat = cv::gapi::own::Mat;
+
+TEST(OwnMat, DefaultConstruction)
+{
+    Mat m;
+    ASSERT_EQ(m.data, nullptr);
+    ASSERT_EQ(m.cols, 0);
+    ASSERT_EQ(m.rows, 0);
+    ASSERT_EQ(m.type(), 0);
+    ASSERT_EQ(m.depth(), 0);
+}
+
+TEST(OwnMat, Create)
+{
+    auto size = cv::gapi::own::Size{32,16};
+    Mat m;
+    m.create(size, CV_8UC1);
+
+    ASSERT_NE(m.data, nullptr);
+    ASSERT_EQ((cv::gapi::own::Size{m.cols, m.rows}), size);
+
+    ASSERT_EQ(m.total(), static_cast<size_t>(size.height*size.width));
+    ASSERT_EQ(m.type(), CV_8UC1);
+    ASSERT_EQ(m.depth(), CV_8U);
+    ASSERT_EQ(m.channels(), 1);
+    ASSERT_EQ(m.elemSize(), sizeof(uint8_t));
+    ASSERT_EQ(m.step,   sizeof(uint8_t) * m.cols);
+}
+
+TEST(OwnMat, CreateOverload)
+{
+    auto size = cv::gapi::own::Size{32,16};
+    Mat m;
+    m.create(size.height,size.width, CV_8UC1);
+
+    ASSERT_NE(m.data, nullptr);
+    ASSERT_EQ((cv::gapi::own::Size{m.cols, m.rows}), size);
+
+    ASSERT_EQ(m.total(), static_cast<size_t>(size.height*size.width));
+    ASSERT_EQ(m.type(), CV_8UC1);
+    ASSERT_EQ(m.depth(), CV_8U);
+    ASSERT_EQ(m.channels(), 1);
+    ASSERT_EQ(m.elemSize(), sizeof(uint8_t));
+    ASSERT_EQ(m.step,   sizeof(uint8_t) * m.cols);
+}
+TEST(OwnMat, Create3chan)
+{
+    auto size = cv::gapi::own::Size{32,16};
+    Mat m;
+    m.create(size, CV_8UC3);
+
+    ASSERT_NE(m.data, nullptr);
+    ASSERT_EQ((cv::gapi::own::Size{m.cols, m.rows}), size);
+
+    ASSERT_EQ(m.type(), CV_8UC3);
+    ASSERT_EQ(m.depth(), CV_8U);
+    ASSERT_EQ(m.channels(), 3);
+    ASSERT_EQ(m.elemSize(), 3 * sizeof(uint8_t));
+    ASSERT_EQ(m.step,       3*  sizeof(uint8_t) * m.cols);
+}
+
+struct NonEmptyMat {
+    cv::gapi::own::Size size{32,16};
+    Mat m;
+    NonEmptyMat() {
+        m.create(size, CV_8UC1);
+    }
+};
+
+struct OwnMatSharedSemantics : NonEmptyMat, ::testing::Test {};
+
+
+namespace {
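+    // Captures the externally observable state of a Mat into a tuple so that
+    // complete states can be compared with a single EXPECT_EQ.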
+    auto state_of = [](Mat const& mat) {
+        return std::make_tuple(
+                mat.data,
+                cv::Size{mat.cols, mat.rows},
+                mat.type(),
+                mat.depth(),
+                mat.channels()
+        );
+    };
+
+    void ensure_mats_are_same(Mat const& copy, Mat const& m){
+        EXPECT_NE(copy.data, nullptr);
+        EXPECT_EQ(state_of(copy), state_of(m));
+    }
+}
+TEST_F(OwnMatSharedSemantics, CopyConstruction)
+{
+    Mat copy(m);
+    ensure_mats_are_same(copy, m);
+}
+
+TEST_F(OwnMatSharedSemantics, CopyAssignment)
+{
+    Mat copy;
+    copy = m;
+    ensure_mats_are_same(copy, m);
+}
+
+struct OwnMatMoveSemantics : NonEmptyMat, ::testing::Test {
+    Mat& moved_from = m;
+    decltype(state_of(moved_from)) initial_state = state_of(moved_from);
+
+    void ensure_state_moved_to(Mat const& moved_to)
+    {
+        EXPECT_EQ(state_of(moved_to),     initial_state);
+        EXPECT_EQ(state_of(moved_from),   state_of(Mat{}));
+    }
+};
+
+TEST_F(OwnMatMoveSemantics, MoveConstruction)
+{
+    Mat moved_to(std::move(moved_from));
+
+    ensure_state_moved_to(moved_to);
+}
+
+TEST_F(OwnMatMoveSemantics, MoveAssignment)
+{
+    Mat moved_to;
+    moved_to = std::move(moved_from);
+    ensure_state_moved_to(moved_to);
+}
+
+struct OwnMatNonOwningView : NonEmptyMat, ::testing::Test {
+    decltype(state_of(m)) initial_state = state_of(m);
+
+    void TearDown() override {
+        EXPECT_EQ(state_of(m), initial_state) << "State of the source matrix changed?";
+        // ASAN should complain if this memory was freed (e.g. by a bug in the non-owning logic of own::Mat)
+        volatile uchar dummy = m.data[0];
+        cv::util::suppress_unused_warning(dummy);
+    }
+
+};
+
+TEST_F(OwnMatNonOwningView, Construction)
+{
+    Mat non_owning_view(m.rows, m.cols, m.type(), static_cast<void*>(m.data));
+
+    ensure_mats_are_same(non_owning_view, m);
+}
+
+TEST_F(OwnMatNonOwningView, CopyConstruction)
+{
+    Mat non_owning_view{m.rows, m.cols, m.type(), static_cast<void*>(m.data)};
+
+    Mat non_owning_view_copy = non_owning_view;
+    ensure_mats_are_same(non_owning_view_copy, m);
+}
+
+TEST_F(OwnMatNonOwningView, Assignment)
+{
+    Mat non_owning_view{m.rows, m.cols, m.type(), static_cast<void*>(m.data)};
+    Mat non_owning_view_copy;
+
+    non_owning_view_copy = non_owning_view;
+    ensure_mats_are_same(non_owning_view_copy, m);
+}
+
+TEST(OwnMatConversion, WithStep)
+{
+    constexpr int width  = 8;
+    constexpr int height = 8;
+    constexpr int stepInPixels = 16;
+
+    std::array<int, height * stepInPixels> data;
+    for (size_t i = 0; i < data.size(); i++)
+    {
+        data[i] = static_cast<int>(i);
+    }
+    cv::Mat cvMat(cv::Size{width, height}, CV_32S, data.data(), stepInPixels * sizeof(int));
+
+    auto ownMat = to_own(cvMat);
+    auto cvMatFromOwn = cv::gapi::own::to_ocv(ownMat);
+
+    EXPECT_EQ(0, cv::countNonZero(cvMat != cvMatFromOwn))
+    << cvMat << std::endl
+    << (cvMat != cvMatFromOwn);
+}
+
+TEST(OwnMat, PtrWithStep)
+{
+    constexpr int width  = 8;
+    constexpr int height = 8;
+    constexpr int stepInPixels = 16;
+
+    std::array<int, height * stepInPixels> data;
+    for (size_t i = 0; i < data.size(); i++)
+    {
+        data[i] = static_cast<int>(i);
+    }
+    Mat mat(height, width, CV_32S, data.data(), stepInPixels * sizeof(int));
+
+    EXPECT_EQ(& data[0],                reinterpret_cast<int*>(mat.ptr(0)));
+    EXPECT_EQ(& data[1],                reinterpret_cast<int*>(mat.ptr(0, 1)));
+    EXPECT_EQ(& data[stepInPixels],     reinterpret_cast<int*>(mat.ptr(1)));
+    EXPECT_EQ(& data[stepInPixels +1],  reinterpret_cast<int*>(mat.ptr(1,1)));
+
+    auto const& cmat = mat;
+
+    EXPECT_EQ(& data[0],                reinterpret_cast<const int*>(cmat.ptr(0)));
+    EXPECT_EQ(& data[1],                reinterpret_cast<const int*>(cmat.ptr(0, 1)));
+    EXPECT_EQ(& data[stepInPixels],     reinterpret_cast<const int*>(cmat.ptr(1)));
+    EXPECT_EQ(& data[stepInPixels +1],  reinterpret_cast<const int*>(cmat.ptr(1,1)));
+}
+
+TEST(OwnMat, CopyToWithStep)
+{
+    constexpr int width  = 8;
+    constexpr int height = 8;
+    constexpr int stepInPixels = 16;
+
+    std::array<int, height * stepInPixels> data;
+    for (size_t i = 0; i < data.size(); i++)
+    {
+        data[i] = static_cast<int>(i);
+    }
+    Mat mat(height, width, CV_32S, data.data(), stepInPixels * sizeof(int));
+
+    Mat dst;
+    mat.copyTo(dst);
+
+    EXPECT_NE(mat.data, dst.data);
+    EXPECT_EQ(0, cv::countNonZero(to_ocv(mat) != to_ocv(dst)))
+    << to_ocv(mat) << std::endl
+    << (to_ocv(mat) != to_ocv(dst));
+}
+
+TEST(OwnMat, ScalarAssign32SC1)
+{
+    constexpr int width  = 8;
+    constexpr int height = 8;
+    constexpr int stepInPixels = 16;
+
+    std::array<int, height * stepInPixels> data;
+    for (size_t i = 0; i < data.size(); i++)
+    {
+        data[i] = static_cast<int>(i);
+    }
+    Mat mat(height, width, CV_32S, data.data(), stepInPixels * sizeof(data[0]));
+
+    mat = cv::gapi::own::Scalar{-1};
+
+    std::array<int, height * stepInPixels> expected;
+
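+    // Only the first `width` pixels of each row are assigned by the scalar;
+    // the step padding past them must keep its original values.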
+    for (size_t row = 0; row < height; row++)
+    {
+        for (size_t col = 0; col < stepInPixels; col++)
+        {
+            auto index = row*stepInPixels + col;
+            expected[index] = col < width ? -1 : static_cast<int>(index);
+        }
+    }
+
+    auto cmp_result_mat = (cv::Mat{height, stepInPixels, CV_32S, data.data()} != cv::Mat{height, stepInPixels, CV_32S, expected.data()});
+    EXPECT_EQ(0, cv::countNonZero(cmp_result_mat))
+    << cmp_result_mat << std::endl;
+}
+
+TEST(OwnMat, ScalarAssign8UC1)
+{
+    constexpr int width  = 8;
+    constexpr int height = 8;
+    constexpr int stepInPixels = 16;
+
+    std::array<uchar, height * stepInPixels> data;
+    for (size_t i = 0; i < data.size(); i++)
+    {
+        data[i] = static_cast<uchar>(i);
+    }
+    Mat mat(height, width, CV_8U, data.data(), stepInPixels * sizeof(data[0]));
+
+    mat = cv::gapi::own::Scalar{-1};
+
+    std::array<uchar, height * stepInPixels> expected;
+
+    for (size_t row = 0; row < height; row++)
+    {
+        for (size_t col = 0; col < stepInPixels; col++)
+        {
+            auto index = row*stepInPixels + col;
+            expected[index] = col < width ? cv::saturate_cast<uchar>(-1) : static_cast<uchar>(index);
+        }
+    }
+
+    auto cmp_result_mat = (cv::Mat{height, stepInPixels, CV_8U, data.data()} != cv::Mat{height, stepInPixels, CV_8U, expected.data()});
+    EXPECT_EQ(0, cv::countNonZero(cmp_result_mat))
+    << cmp_result_mat << std::endl;
+}
+
+TEST(OwnMat, ScalarAssign8SC3)
+{
+    constexpr auto cv_type = CV_8SC3;
+    constexpr int channels = 3;
+    constexpr int width  = 8;
+    constexpr int height = 8;
+    constexpr int stepInPixels = 16;
+
+    std::array<schar, height * stepInPixels * channels> data;
+    for (size_t i = 0; i < data.size(); i+= channels)
+    {
+        data[i + 0] = static_cast<schar>(10 * i + 0);
+        data[i + 1] = static_cast<schar>(10 * i + 1);
+        data[i + 2] = static_cast<schar>(10 * i + 2);
+    }
+
+    Mat mat(height, width, cv_type, data.data(), channels * stepInPixels * sizeof(data[0]));
+
+    mat = cv::gapi::own::Scalar{-10, -11, -12};
+
+    std::array<schar, data.size()> expected;
+
+    for (size_t row = 0; row < height; row++)
+    {
+        for (size_t col = 0; col < stepInPixels; col++)
+        {
+            int index = static_cast<int>(channels * (row*stepInPixels + col));
+            expected[index + 0] = static_cast<schar>(col < width ? -10 : 10 * index + 0);
+            expected[index + 1] = static_cast<schar>(col < width ? -11 : 10 * index + 1);
+            expected[index + 2] = static_cast<schar>(col < width ? -12 : 10 * index + 2);
+        }
+    }
+
+    auto cmp_result_mat = (cv::Mat{height, stepInPixels, cv_type, data.data()} != cv::Mat{height, stepInPixels, cv_type, expected.data()});
+    EXPECT_EQ(0, cv::countNonZero(cmp_result_mat))
+    << cmp_result_mat << std::endl
+    << "data : " << std::endl
+    << cv::Mat{height, stepInPixels, cv_type, data.data()}     << std::endl
+    << "expected : " << std::endl
+    << cv::Mat{height, stepInPixels, cv_type, expected.data()} << std::endl;
+}
+
+TEST(OwnMat, ROIView)
+{
+    constexpr int width  = 8;
+    constexpr int height = 8;
+    constexpr int stepInPixels = 16;
+
+    std::array<uchar, height * stepInPixels> data;
+    for (size_t i = 0; i < data.size(); i++)
+    {
+        data[i] = static_cast<uchar>(i);
+    }
+
+    std::array<uchar, 4 * 4> expected;
+
+    for (size_t row = 0; row < 4; row++)
+    {
+        for (size_t col = 0; col < 4; col++)
+        {
+            expected[row*4 +col] = static_cast<uchar>(stepInPixels * (2 + row) + 2 + col);
+        }
+    }
+
+    Mat mat(height, width, CV_8U, data.data(), stepInPixels * sizeof(data[0]));
+    Mat roi_view (mat, cv::gapi::own::Rect{2,2,4,4});
+
+    auto expected_cv_mat = cv::Mat{4, 4, CV_8U, expected.data()};
+
+    auto cmp_result_mat = (to_ocv(roi_view) != expected_cv_mat);
+    EXPECT_EQ(0, cv::countNonZero(cmp_result_mat))
+    << cmp_result_mat   << std::endl
+    << to_ocv(roi_view) << std::endl
+    << expected_cv_mat  << std::endl;
+}
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/own/scalar_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/own/scalar_tests.cpp
new file mode 100644 (file)
index 0000000..a9c5c01
--- /dev/null
@@ -0,0 +1,44 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "opencv2/gapi/own/scalar.hpp"
+
+namespace opencv_test
+{
+
+TEST(Scalar, CreateEmpty)
+{
+    cv::gapi::own::Scalar s;
+
+    for (int i = 0; i < 4; ++i)
+    {
+        EXPECT_EQ(s[i], 0.0);
+    }
+}
+
+TEST(Scalar, CreateFromVal)
+{
+    cv::gapi::own::Scalar s(5.0);
+
+    EXPECT_EQ(s[0], 5.0);
+    EXPECT_EQ(s[1], 0.0);
+    EXPECT_EQ(s[2], 0.0);
+    EXPECT_EQ(s[3], 0.0);
+}
+
+TEST(Scalar, CreateFromVals)
+{
+    cv::gapi::own::Scalar s(5.3, 3.3, 4.1, -2.0);
+
+    EXPECT_EQ(s[0], 5.3);
+    EXPECT_EQ(s[1], 3.3);
+    EXPECT_EQ(s[2], 4.1);
+    EXPECT_EQ(s[3], -2.0);
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/test_main.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/test_main.cpp
new file mode 100644 (file)
index 0000000..fa5862f
--- /dev/null
@@ -0,0 +1,12 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+// FIXME: OpenCV license header
+
+#include "test_precomp.hpp"
+
+CV_TEST_MAIN("gapi")
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/test_precomp.hpp b/inference-engine/thirdparty/fluid/modules/gapi/test/test_precomp.hpp
new file mode 100644 (file)
index 0000000..bcab803
--- /dev/null
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+// FIXME: OpenCV header
+
+#ifndef __OPENCV_GAPI_TEST_PRECOMP_HPP__
+#define __OPENCV_GAPI_TEST_PRECOMP_HPP__
+
+#include <cstdint>
+#include <vector>
+
+#include "opencv2/ts.hpp"
+#include "opencv2/gapi.hpp"
+#include "opencv2/gapi/imgproc.hpp"
+#include "opencv2/gapi/core.hpp"
+#include "opencv2/gapi/cpu/gcpukernel.hpp"
+#include "opencv2/gapi/gpu/ggpukernel.hpp"
+#include "opencv2/gapi/gcompoundkernel.hpp"
+#include "opencv2/gapi/operators.hpp"
+#include "opencv2/gapi/fluid/imgproc.hpp"
+#include "opencv2/gapi/fluid/core.hpp"
+
+#endif // __OPENCV_GAPI_TEST_PRECOMP_HPP__
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/util/any_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/util/any_tests.cpp
new file mode 100644 (file)
index 0000000..60bbcc1
--- /dev/null
@@ -0,0 +1,121 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "opencv2/gapi/util/any.hpp"
+
+namespace opencv_test
+{
+
+TEST(Any, basic)
+{
+   using namespace util;
+   any a(8);
+   auto casted_pointer =  any_cast<int>(&a);
+   ASSERT_NE(nullptr, casted_pointer);
+   ASSERT_EQ(*casted_pointer, 8);
+
+   *casted_pointer = 7;
+   ASSERT_EQ(any_cast<int>(a), 7);
+}
+
+TEST(Any, any_cast_ref_throws_on_empty)
+{
+   using namespace util;
+   any a;
+
+   ASSERT_THROW(util::any_cast<int>(a), bad_any_cast);
+}
+
+TEST(Any, copy)
+{
+   using namespace util;
+   any a(8);
+
+   ASSERT_EQ(any_cast<int>(a), 8);
+
+   any b (a);
+
+   ASSERT_NE(nullptr, any_cast<int>(&b));
+   ASSERT_EQ(8      , any_cast<int>(b));
+   ASSERT_EQ(8      , any_cast<int>(a));
+}
+
+TEST(Any, copy_empty)
+{
+   using namespace util;
+   any a;
+
+   ASSERT_EQ(nullptr, any_cast<int>(&a));
+
+   any b (a);
+
+   ASSERT_EQ(nullptr, any_cast<int>(&a));
+   ASSERT_EQ(nullptr, any_cast<int>(&b));
+}
+
+TEST(Any, move)
+{
+   using namespace util;
+   any a(8);
+
+   ASSERT_EQ(any_cast<int>(a), 8);
+
+   any b (std::move(a));
+
+   ASSERT_NE(nullptr,  any_cast<int>(&b));
+   ASSERT_EQ(8      ,  any_cast<int>(b));
+   ASSERT_EQ(nullptr,  any_cast<int>(&a));
+}
+
+TEST(Any, swap)
+{
+   using namespace util;
+   any a(8);
+   any b(7);
+
+   ASSERT_EQ(7, any_cast<int>(b));
+   ASSERT_EQ(8, any_cast<int>(a));
+
+   swap(a,b);
+
+   ASSERT_EQ(8, any_cast<int>(b));
+   ASSERT_EQ(7, any_cast<int>(a));
+}
+
+TEST(Any, move_assign)
+{
+   using namespace util;
+   any a(8);
+   any b;
+
+   ASSERT_EQ(any_cast<int>(a), 8);
+
+   b = (std::move(a));
+
+   ASSERT_NE(nullptr,  any_cast<int>(&b));
+   ASSERT_EQ(8      ,  any_cast<int>(b));
+   ASSERT_EQ(nullptr,  any_cast<int>(&a));
+}
+
+TEST(Any, copy_assign)
+{
+   using namespace util;
+   any a(8);
+   any b;
+
+   ASSERT_EQ(any_cast<int>(a), 8);
+   ASSERT_EQ(nullptr,  any_cast<int>(&b));
+
+   b = a;
+
+   ASSERT_NE(nullptr, any_cast<int>(&b));
+   ASSERT_EQ(8      , any_cast<int>(b));
+   ASSERT_EQ(8      , any_cast<int>(a));
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/util/optional_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/util/optional_tests.cpp
new file mode 100644 (file)
index 0000000..b7fabd5
--- /dev/null
@@ -0,0 +1,175 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "opencv2/gapi/util/optional.hpp"
+#include <opencv2/gapi/util/compiler_hints.hpp> //suppress_unused_warning
+
+namespace opencv_test
+{
+
+TEST(Optional, EmptyCtor)
+{
+    util::optional<int> o;
+    EXPECT_FALSE(o.has_value());
+    EXPECT_FALSE(static_cast<bool>(o));
+}
+
+TEST(Optional, ValueCtor)
+{
+    util::optional<int> o(42);
+    EXPECT_TRUE(o.has_value());
+    EXPECT_TRUE(static_cast<bool>(o));
+}
+
+TEST(Optional, MoveCtor)
+{
+    util::optional<std::string> os1(std::string("text"));
+    EXPECT_TRUE(os1.has_value());
+
+    util::optional<std::string> os2(std::move(os1));
+    EXPECT_FALSE(os1.has_value());
+    EXPECT_TRUE(os2.has_value());
+    EXPECT_EQ("text", os2.value());
+}
+
+TEST(Optional, EmptyThrows)
+{
+    struct foo { int bar; };
+    util::optional<foo> om;
+    const util::optional<foo> oc;
+
+    int dummy;
+
+    EXPECT_THROW(dummy = om->bar,    util::bad_optional_access);
+    EXPECT_THROW(dummy = oc->bar,    util::bad_optional_access);
+    cv::util::suppress_unused_warning(dummy);
+    EXPECT_THROW(*om,        util::bad_optional_access);
+    EXPECT_THROW(*oc,        util::bad_optional_access);
+    EXPECT_THROW(om.value(), util::bad_optional_access);
+    EXPECT_THROW(oc.value(), util::bad_optional_access);
+}
+
+TEST(Optional, ValueNoThrow)
+{
+    struct foo { int bar; };
+    util::optional<foo> om(foo{42});
+    const util::optional<foo> oc(foo{42});
+
+    int dummy;
+    EXPECT_NO_THROW(dummy = om->bar);
+    EXPECT_NO_THROW(dummy = oc->bar);
+    cv::util::suppress_unused_warning(dummy);
+    EXPECT_NO_THROW(*om);
+    EXPECT_NO_THROW(*oc);
+    EXPECT_NO_THROW(om.value());
+    EXPECT_NO_THROW(oc.value());
+}
+
+TEST(Optional, Value)
+{
+    util::optional<int> oi(42);
+
+    struct foo { int bar; };
+    util::optional<foo> of(foo{42});
+
+    EXPECT_EQ(42, oi.value());
+    EXPECT_EQ(42, *oi);
+
+    EXPECT_EQ(42, of.value().bar);
+    EXPECT_EQ(42, of->bar);
+}
+
+TEST(Optional, Mutable)
+{
+    util::optional<int> oi(42);
+    *oi = 43;
+    EXPECT_EQ(43, *oi);
+
+    struct foo { int bar; int baz; };
+    util::optional<foo> of(foo{11,22});
+
+    (*of).bar = 42;
+    EXPECT_EQ(42, of->bar);
+    EXPECT_EQ(22, of->baz);
+
+    of->baz = 33;
+    EXPECT_EQ(42, of->bar);
+    EXPECT_EQ(33, of->baz);
+}
+
+TEST(Optional, MoveAssign)
+{
+    util::optional<int> e, i(42);
+
+    EXPECT_FALSE(e.has_value());
+    EXPECT_TRUE(i.has_value());
+    EXPECT_EQ(42, *i);
+
+    e = std::move(i);
+    EXPECT_TRUE(e.has_value());
+    EXPECT_FALSE(i.has_value());
+    EXPECT_EQ(42, *e);
+}
+
+TEST(Optional, CopyAssign)
+{
+    util::optional<int> e;
+    const util::optional<int> i(42);
+
+    EXPECT_FALSE(e.has_value());
+    EXPECT_TRUE(i.has_value());
+    EXPECT_EQ(42, *i);
+
+    e = i;
+    EXPECT_TRUE(e.has_value());
+    EXPECT_TRUE(i.has_value());
+    EXPECT_EQ(42, *e);
+    EXPECT_EQ(42, *i);
+}
+
+TEST(Optional, ValueOr)
+{
+    util::optional<int> e;
+    EXPECT_FALSE(e.has_value());
+    EXPECT_EQ(42, e.value_or(42));
+    EXPECT_EQ(42, e.value_or(42.1));
+}
+
+TEST(Optional, Swap)
+{
+    util::optional<int> e, i(42);
+
+    EXPECT_FALSE(e.has_value());
+    EXPECT_TRUE(i.has_value());
+    EXPECT_EQ(42, *i);
+
+    e.swap(i);
+
+    EXPECT_TRUE(e.has_value());
+    EXPECT_FALSE(i.has_value());
+    EXPECT_EQ(42, *e);
+}
+
+TEST(Optional, Reset)
+{
+    util::optional<int> i(42);
+    EXPECT_TRUE(i.has_value());
+
+    i.reset();
+    EXPECT_FALSE(i.has_value());
+}
+
+TEST(Optional, MakeOptional)
+{
+    std::string s("text");
+    auto os = util::make_optional(s);
+    EXPECT_TRUE(os.has_value());
+    EXPECT_EQ(s, os.value());
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/modules/gapi/test/util/variant_tests.cpp b/inference-engine/thirdparty/fluid/modules/gapi/test/util/variant_tests.cpp
new file mode 100644 (file)
index 0000000..a95b6aa
--- /dev/null
@@ -0,0 +1,386 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2018 Intel Corporation
+
+
+#include "test_precomp.hpp"
+#include "opencv2/gapi/util/variant.hpp"
+#include <cstddef> //std::max_align_t
+
+namespace opencv_test
+{
+
+namespace
+{
+    typedef util::variant<int, std::string> TestVar;
+    typedef util::variant<int, float>       TestVar2;
+}
+
+TEST(Variant, EmptyCTor)
+{
+    util::variant<int> vi;
+    EXPECT_EQ(0,  util::get<int>(vi));
+
+    util::variant<int, std::string> vis;
+    EXPECT_EQ(0,  util::get<int>(vis));
+
+    util::variant<std::string> vs;
+    EXPECT_EQ("", util::get<std::string>(vs));
+
+    util::variant<std::string, int> vsi;
+    EXPECT_EQ("", util::get<std::string>(vsi));
+}
+
+TEST(Variant, ValueMoveCTor)
+{
+    util::variant<int> vi(42);
+    EXPECT_EQ(0u,     vi.index());
+    EXPECT_EQ(42,     util::get<int>(vi));
+
+    util::variant<int, std::string> vis(2017);
+    EXPECT_EQ(0u,     vis.index());
+    EXPECT_EQ(2017,   util::get<int>(vis));
+
+    util::variant<int, std::string> vis2(std::string("2017"));
+    EXPECT_EQ(1u,     vis2.index());
+    EXPECT_EQ("2017", util::get<std::string>(vis2));
+
+    util::variant<std::string> vs(std::string("2017"));
+    EXPECT_EQ(0u,     vs.index());
+    EXPECT_EQ("2017", util::get<std::string>(vs));
+
+    util::variant<std::string, int> vsi(std::string("2017"));
+    EXPECT_EQ(0u,     vsi.index());
+    EXPECT_EQ("2017", util::get<std::string>(vsi));
+
+    util::variant<std::string, int> vsi2(42);
+    EXPECT_EQ(1u,     vsi2.index());
+    EXPECT_EQ(42,     util::get<int>(vsi2));
+}
+
+TEST(Variant, ValueCopyCTor)
+{
+    const int i42         = 42;
+    const int i17         = 2017;
+    const std::string s17 = "2017";
+
+    util::variant<int> vi(i42);
+    EXPECT_EQ(0u,     vi.index());
+    EXPECT_EQ(i42,    util::get<int>(vi));
+
+    util::variant<int, std::string> vis(i17);
+    EXPECT_EQ(0u,     vis.index());
+    EXPECT_EQ(i17,    util::get<int>(vis));
+
+    util::variant<int, std::string> vis2(s17);
+    EXPECT_EQ(1u,     vis2.index());
+    EXPECT_EQ(s17,    util::get<std::string>(vis2));
+
+    util::variant<std::string> vs(s17);
+    EXPECT_EQ(0u,     vs.index());
+    EXPECT_EQ(s17,    util::get<std::string>(vs));
+
+    util::variant<std::string, int> vsi(s17);
+    EXPECT_EQ(0u,     vsi.index());
+    EXPECT_EQ(s17, util::get<std::string>(vsi));
+
+    util::variant<std::string, int> vsi2(i42);
+    EXPECT_EQ(1u,     vsi2.index());
+    EXPECT_EQ(i42,    util::get<int>(vsi2));
+}
+
+TEST(Variant, CopyMoveCTor)
+{
+    const TestVar tvconst(std::string("42"));
+
+    TestVar tv = tvconst;
+    EXPECT_EQ( 1u,  tv.index());
+    EXPECT_EQ("42", util::get<std::string>(tv));
+
+    TestVar tv2(TestVar(40+2));
+    EXPECT_EQ( 0u,  tv2.index());
+    EXPECT_EQ( 42,  util::get<int>(tv2));
+}
+
+TEST(Variant, Assign_Basic)
+{
+    TestVar vis;
+    EXPECT_EQ(0u, vis.index());
+    EXPECT_EQ(0,  util::get<int>(vis));
+
+    vis = 42;
+    EXPECT_EQ(0u, vis.index());
+    EXPECT_EQ(42, util::get<int>(vis));
+}
+
+TEST(Variant, Assign_ValueUpdate_SameType)
+{
+    TestVar vis(42);
+
+    EXPECT_EQ(0u, vis.index());
+    EXPECT_EQ(42, util::get<int>(vis));
+
+    vis = 43;
+    EXPECT_EQ(0u, vis.index());
+    EXPECT_EQ(43, util::get<int>(vis));
+}
+
+TEST(Variant, Assign_ValueUpdate_DiffType)
+{
+    TestVar vis(42);
+
+    EXPECT_EQ(0u, vis.index());
+    EXPECT_EQ(42, util::get<int>(vis));
+
+    vis = std::string("42");
+    EXPECT_EQ(1u, vis.index());
+    EXPECT_EQ("42", util::get<std::string>(vis));
+}
+
+TEST(Variant, Assign_ValueUpdate_Const)
+{
+    TestVar va(42);
+    const TestVar vb(43);
+
+    EXPECT_EQ(0u, va.index());
+    EXPECT_EQ(42, util::get<int>(va));
+
+    EXPECT_EQ(0u, vb.index());
+    EXPECT_EQ(43, util::get<int>(vb));
+
+    va = vb;
+
+    EXPECT_EQ(0u, va.index());
+    EXPECT_EQ(43, util::get<int>(va));
+}
+
+TEST(Variant, Assign_ValueUpdate_Const_DiffType)
+{
+    TestVar va(42);
+    const TestVar vb(std::string("42"));
+
+    EXPECT_EQ(0u, va.index());
+    EXPECT_EQ(42, util::get<int>(va));
+
+    EXPECT_EQ(1u, vb.index());
+    EXPECT_EQ("42", util::get<std::string>(vb));
+
+    va = vb;
+
+    EXPECT_EQ(1u,   va.index());
+    EXPECT_EQ("42", util::get<std::string>(va));
+}
+
+TEST(Variant, Assign_Move)
+{
+    TestVar va(42);
+    TestVar vb(std::string("42"));
+    TestVar vc(43);
+
+    EXPECT_EQ(0u, va.index());
+    EXPECT_EQ(42, util::get<int>(va));
+
+    EXPECT_EQ(1u, vb.index());
+    EXPECT_EQ("42", util::get<std::string>(vb));
+
+    EXPECT_EQ(0u, vc.index());
+    EXPECT_EQ(43, util::get<int>(vc));
+
+    va = std::move(vb);
+    EXPECT_EQ(1u, va.index());
+    EXPECT_EQ("42", util::get<std::string>(va));
+
+    va = std::move(vc);
+    EXPECT_EQ(0u, va.index());
+    EXPECT_EQ(43, util::get<int>(va));
+}
+
+TEST(Variant, Swap_SameIndex)
+{
+    TestVar tv1(42);
+    TestVar tv2(43);
+
+    EXPECT_EQ(0u, tv1.index());
+    EXPECT_EQ(42, util::get<int>(tv1));
+
+    EXPECT_EQ(0u, tv2.index());
+    EXPECT_EQ(43, util::get<int>(tv2));
+
+    tv1.swap(tv2);
+
+    EXPECT_EQ(0u, tv1.index());
+    EXPECT_EQ(43, util::get<int>(tv1));
+
+    EXPECT_EQ(0u, tv2.index());
+    EXPECT_EQ(42, util::get<int>(tv2));
+}
+
+TEST(Variant, Swap_DiffIndex)
+{
+    TestVar2 tv1(42);
+    TestVar2 tv2(3.14f);
+
+    EXPECT_EQ(0u, tv1.index());
+    EXPECT_EQ(42, util::get<int>(tv1));
+
+    EXPECT_EQ(1u, tv2.index());
+    EXPECT_EQ(3.14f, util::get<float>(tv2));
+
+    tv1.swap(tv2);
+
+    EXPECT_EQ(0u, tv2.index());
+    EXPECT_EQ(42, util::get<int>(tv2));
+
+    EXPECT_EQ(1u, tv1.index());
+    EXPECT_EQ(3.14f, util::get<float>(tv1));
+}
+
+TEST(Variant, Get)
+{
+    const TestVar cv(42);
+
+    // Test const& get()
+    EXPECT_EQ(42, util::get<int>(cv));
+    EXPECT_THROW(util::get<std::string>(cv), util::bad_variant_access);
+
+    // Test &get
+    TestVar cv2(std::string("42"));
+    EXPECT_EQ("42", util::get<std::string>(cv2));
+    EXPECT_THROW(util::get<int>(cv2), util::bad_variant_access);
+}
+
+TEST(Variant, GetWrite)
+{
+    util::variant<int, std::string> v(42);
+    EXPECT_EQ(42, util::get<int>(v));
+
+    util::get<int>(v) = 43;
+    EXPECT_EQ(43, util::get<int>(v));
+}
+
+TEST(Variant, NoDefaultCtor)
+{
+    struct MyType
+    {
+        int m_a;
+        MyType() = delete;
+    };
+
+    // This code MUST compile
+    util::variant<int, MyType> var;
+    SUCCEED() << "Code compiled";
+
+    // At the same time, util::variant<MyType, ...> MUST NOT.
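+    // e.g. this hypothetical line should fail to compile, because variant's
+    // default constructor default-constructs the first alternative:
+    // util::variant<MyType, int> must_not_compile;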
+}
+
+TEST(Variant, MonoState)
+{
+    struct MyType
+    {
+        int m_a;
+        explicit MyType(int a) : m_a(a) {}
+        MyType() = delete;
+    };
+
+    util::variant<util::monostate, MyType> var;
+    EXPECT_EQ(0u, var.index());
+
+    var = MyType{42};
+    EXPECT_EQ(1u, var.index());
+    EXPECT_EQ(42, util::get<MyType>(var).m_a);
+}
+
+
+TEST(Variant, Eq)
+{
+    TestVar v1(42), v2(std::string("42"));
+    TestVar v3(v1), v4(v2);
+
+    EXPECT_TRUE(v1 == v3);
+    EXPECT_TRUE(v2 == v4);
+    EXPECT_TRUE(v1 != v2);
+    EXPECT_TRUE(v3 != v4);
+
+    EXPECT_FALSE(v1 == v2);
+    EXPECT_FALSE(v3 == v4);
+    EXPECT_FALSE(v1 != v3);
+    EXPECT_FALSE(v2 != v4);
+}
+
+TEST(Variant, Eq_Monostate)
+{
+    using TestVar3 = util::variant<util::monostate, int>;
+    TestVar3 v1;
+    TestVar3 v2(42);
+
+    EXPECT_NE(v1, v2);
+
+    v2 = util::monostate{};
+    EXPECT_EQ(v1, v2);
+}
+
+TEST(Variant, VectorOfVariants)
+{
+    std::vector<TestVar> vv1(1024);
+    std::vector<TestVar> vv2(1024);
+
+    EXPECT_TRUE(vv1 == vv2);
+
+    std::vector<TestVar> vv3(2048, TestVar(std::string("42")));
+
+    // Just test that the code below compiles:
+    // 1: internal copy of variants from one vector to another,
+    //    with probable reallocation of the destination vector to host all elements
+    std::copy(vv1.begin(), vv1.end(), std::back_inserter(vv2));
+    EXPECT_EQ(2048u, vv2.size());
+
+    // 2: truncation of vector, with probable destruction of its tail memory
+    vv2.resize(1024);
+    EXPECT_EQ(1024u, vv2.size());
+
+    // 3. vector assignment, with overwriting underlying variants
+    vv2 = vv3;
+    EXPECT_EQ(2048u, vv2.size());
+    EXPECT_TRUE(vv2 == vv3);
+}
+
+TEST(Variant, HoldsAlternative)
+{
+    TestVar v(42);
+    EXPECT_TRUE (util::holds_alternative<int>        (v));
+    EXPECT_FALSE(util::holds_alternative<std::string>(v));
+
+    v = std::string("42");
+    EXPECT_FALSE(util::holds_alternative<int>        (v));
+    EXPECT_TRUE (util::holds_alternative<std::string>(v));
+}
+
+TEST(Variant, Sizeof)
+{
+    //variant has to store index of the contained type as well as the type itself
+    EXPECT_EQ(2 * sizeof(size_t), (sizeof(util::variant<int, char>)));
+#if !defined(__GNUG__) || __GNUG__ >= 5
+    // GCC versions prior to 5.0 have limited C++11 support, e.g.
+    // no std::max_align_t defined
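+    // The expected size is the storage for the largest alternative
+    // (std::max_align_t) plus the index field, which is padded up to the
+    // storage's alignment - hence the std::max().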
+    EXPECT_EQ((sizeof(std::max_align_t) + std::max(sizeof(size_t), alignof(std::max_align_t))), (sizeof(util::variant<std::max_align_t, char>)));
+#endif
+}
+
+TEST(Variant, EXT_IndexOf)
+{
+    struct MyType{};
+    class MyClass{};
+
+    using V = util::variant<util::monostate, int, double, char, float, MyType, MyClass>;
+    static_assert(0u == V::index_of<util::monostate>(), "Index is incorrect");
+    static_assert(1u == V::index_of<int    >(), "Index is incorrect");
+    static_assert(2u == V::index_of<double >(), "Index is incorrect");
+    static_assert(3u == V::index_of<char   >(), "Index is incorrect");
+    static_assert(4u == V::index_of<float  >(), "Index is incorrect");
+    static_assert(5u == V::index_of<MyType >(), "Index is incorrect");
+    static_assert(6u == V::index_of<MyClass>(), "Index is incorrect");
+}
+
+} // namespace opencv_test
diff --git a/inference-engine/thirdparty/fluid/revision.txt b/inference-engine/thirdparty/fluid/revision.txt
new file mode 100644 (file)
index 0000000..e088afd
--- /dev/null
@@ -0,0 +1 @@
+a3df05d93b188d4e86e23ffd1e988dbec0fc9211
diff --git a/inference-engine/thirdparty/fluid/update.sh b/inference-engine/thirdparty/fluid/update.sh
new file mode 100644 (file)
index 0000000..5f4c053
--- /dev/null
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+
+REVISION=""
+TARGET_DIR=$(pwd)
+
+case "$#" in
+    "0") echo "Using latest master..."
+         REVISION="master"
+         ;;
+    "1") REVISION=$1
+         echo "Using revision ${REVISION}..."
+         ;;
+    *) echo "Usage: ${0} [REVISION]
+
+    Update Fluid to OpenCV source tree at the given REVISION.
+    If no revision specified, the most recent 'master' commit is used.
+"
+       exit 1 ;;
+esac
+
+# Before doing anything, verify that this snapshot has not been modified
+./check.sh
+if [ $? -ne 0 ]; then
+    echo "Consistency check failed, please reset this subtree to its initial state first!"
+    exit 1
+fi
+
+# Download the stuff...
+URL="https://github.com/opencv/opencv/archive/${REVISION}.zip"
+wget -c ${URL}
+if [ $? -ne 0 ]; then
+    echo "Failed to download ${URL}!"
+    exit 1
+fi
+
+unzip -qq ${REVISION}.zip
+
+# Remove current files
+if [ -d modules ]; then
+    find modules -type f | xargs git rm
+    find modules -type f | xargs rm
+    rm -vd modules
+fi
+
+# Put in a new copy. Extend this section if needed;
+# the BOM mechanism could probably help here.
+pushd "opencv-${REVISION}"
+cp -rv --parent modules/gapi ${TARGET_DIR}
+popd
+# Note "-f" is used to add files like doc/ which are omitted
+# now by IE's current .gitignore - it breaks checksum otherwise.
+git add -f modules/gapi
+
+# Clean-up files
+rm -rf "opencv-${REVISION}"
+rm "${REVISION}.zip"
+
+# Calculate and store checksum
+./checksum.sh > checksum.txt
+git add checksum.txt
+
+# Store revision
+if [ "${REVISION}" == "master" ]; then
+    REVISION="${REVISION} / $(date +%F)"
+fi
+echo ${REVISION} > revision.txt
+git add revision.txt
+
+# Display status
+git status
+
+# Fin
+echo "Done!"
index d89a672..939c81f 100644 (file)
 
 cmake_minimum_required(VERSION 2.8)
 
-# Use old-style version handling
-if(POLICY CMP0048)
-    cmake_policy(SET CMP0048 OLD)
-    cmake_policy(SET CMP0054 OLD)
+if(POLICY CMP0054)
+    cmake_policy(SET CMP0054 NEW)
 endif()
 
 # Enable RPATH on MacOS/OSX
@@ -42,11 +40,20 @@ endif()
 
 set(PROJECT_NAME "Intel(R) MKL-DNN")
 set(PROJECT_FULL_NAME "Intel(R) Math Kernel Library for Deep Neural Networks (Intel(R) MKL-DNN)")
-set(PROJECT_VERSION "0.16")
+set(PROJECT_VERSION "0.17")
 
 set(LIB_NAME mkldnn)
 
-project(${PROJECT_NAME} C CXX)
+if (CMAKE_VERSION VERSION_LESS 3.0)
+    project(${PROJECT_NAME} C CXX)
+else()
+    cmake_policy(SET CMP0048 NEW)
+    project(${PROJECT_NAME} VERSION "${PROJECT_VERSION}" LANGUAGES C CXX)
+endif()
+
+if (NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
+    message("FATAL_ERROR" "Intel(R) MKL-DNN supports 64 bit platforms only")
+endif()
 
 if("${CMAKE_BUILD_TYPE}" STREQUAL "")
     message(STATUS "CMAKE_BUILD_TYPE is unset, defaulting to Release")
@@ -59,9 +66,9 @@ set(CMAKE_TEST_CCXX_FLAGS)      # TESTS specifics
 
 include("cmake/utils.cmake")
 include("cmake/options.cmake")
-include("cmake/platform.cmake")
 include("cmake/OpenMP.cmake")
 include("cmake/TBB.cmake")
+include("cmake/platform.cmake")
 include("cmake/SDL.cmake")
 include("cmake/MKL.cmake")
 include("cmake/Doxygen.cmake")
index 704589e..2a5b29e 100644 (file)
@@ -1,9 +1,5 @@
-> Intel MKL-DNN repository migrated to [https://github.com/intel/mkl-dnn](https://github.com/intel/mkl-dnn).
-> The old address will continue to be available and will redirect to the new repo.
-> Please update your links.
-
 # Intel(R) Math Kernel Library for Deep Neural Networks (Intel(R) MKL-DNN)
-![v0.16 beta](https://img.shields.io/badge/v0.16-beta-orange.svg)
+![v0.17 beta](https://img.shields.io/badge/v0.17-beta-orange.svg)
 
 Intel(R) Math Kernel Library for Deep Neural Networks (Intel(R) MKL-DNN) is
 an open source performance library for deep learning applications. The library
@@ -29,6 +25,7 @@ of the following deep learning topologies and variations of these.
 | Speech Recognition (experimental)         | DeepSpeech
 | Adversarial Networks                      | DCGAN, 3DGAN
 | Reinforcement Learning                    | A3C
+| Text-to-Speech                            | WaveNet
 
 Intel MKL-DNN is used in the following software products:
 * [Caffe\* Optimized for Intel Architecture](https://github.com/intel/caffe)
@@ -36,11 +33,13 @@ Intel MKL-DNN is used in the following software products:
 * [DeepBench](https://github.com/baidu-research/DeepBench)
 * [PaddlePaddle\*](http://www.paddlepaddle.org)
 * [Tensorflow\*](https://www.tensorflow.org)
-* [Microsoft\* Cognitive Toolkit (CNTK)](https://www.microsoft.com/en-us/cognitive-toolkit/)
-* [Apache\* MXNet](https://mxnet.apache.org/)
-* [OpenVINO(TM) toolkit](https://software.intel.com/en-us/openvino-toolkit)
+* [Microsoft\* Cognitive Toolkit (CNTK)](https://docs.microsoft.com/en-us/cognitive-toolkit)
+* [Apache\* MXNet](https://mxnet.apache.org)
+* [OpenVINO(TM) toolkit](https://01.org/openvinotoolkit)
 * [Intel(R) Nervana(TM) Graph](https://github.com/NervanaSystems/ngraph)
 * [Menoh\*](https://github.com/pfnet-research/menoh)
+* [DeepLearning4J\*](https://deeplearning4j.org)
+* [BigDL](https://github.com/intel-analytics/BigDL)
 
 ## License
 Intel MKL-DNN is licensed under
@@ -68,7 +67,7 @@ without prior notification in future releases:
 * Convolutions with `s16` data type in source, weights or destination
 * Convolutions and auxiliary primitives for 3D spatial data
 * RNN, LSTM and GRU primitives
-* Intel Threading Building (Intel TBB\*) support
+* Intel Threading Building Blocks (Intel TBB\*) support
 
 ## How to Contribute
 We welcome community contributions to Intel MKL-DNN. If you have an idea how to improve the library:
@@ -87,7 +86,7 @@ request will be merged the repository.
 Intel MKL-DNN supports Intel(R) 64 architecture and compatible architectures.
 The library is optimized for the systems based on
 * Intel Atom(R) processor with Intel(R) SSE4.1 support
-* 4th, 5th, 6th and 7th generation Intel(R) Core processor
+* 4th, 5th, 6th, 7th and 8th generation Intel(R) Core processor
 * Intel(R) Xeon(R) processor E5 v3 family (formerly Haswell)
 * Intel Xeon processor E5 v4 family (formerly Broadwell)
 * Intel Xeon Platinum processor family (formerly Skylake)
@@ -100,22 +99,31 @@ The software dependencies are:
 * [Cmake](https://cmake.org/download/) 2.8.0 or later
 * [Doxygen](http://www.stack.nl/~dimitri/doxygen/download.html#srcbin) 1.8.5 or later
 * C++ compiler with C++11 standard support
+* Optional dependencies:
+  * GNU OpenMP\*, LLVM OpenMP\*, or Intel OpenMP
+  * Threading Building Blocks (TBB)
+  * Intel MKL or Intel MKL small libraries
+
+> **Note**
+> Building Intel MKL-DNN with optional dependencies may introduce additional
+> runtime dependencies for the library. Please refer to the corresponding
+> software's system requirements for details.
 
 The software was validated on RedHat\* Enterprise Linux 7 with
-* GNU\* Compiler Collection 4.8, 5.2, 6.1 and 7.2
+* GNU\* Compiler Collection 4.8, 5.4, 6.1, 7.2 and 8.1
 * Clang\* 3.8.0
 * [Intel(R) C/C++ Compiler](https://software.intel.com/en-us/intel-parallel-studio-xe)
-  17.0 and 18.0
+  17.0, 18.0 and 19.0
 
 on Windows Server\* 2012 R2 with
 * Microsoft\* Visual C++ 14.0 (Visual Studio 2015)
 * [Intel(R) C/C++ Compiler](https://software.intel.com/en-us/intel-parallel-studio-xe)
-  17.0 and 18.0
+  17.0 and 19.0
 
 on macOS\* 10.13 (High Sierra) with
-* Apple LLVM version 9.0.0 (XCode 9.0.0)
+* Apple LLVM version 9.2 (XCode 9.2)
 * [Intel C/C++ Compiler](https://software.intel.com/en-us/intel-parallel-studio-xe)
-  18.0 (XCode 8.3.2)
+  18.0 and 19.0
 
 The implementation uses OpenMP\* 4.0 SIMD extensions. We recommend using
 Intel(R) Compiler for the best performance results.
@@ -155,14 +163,20 @@ You might need to set `MKLROOT` environment variable to the path where full
 Intel MKL is installed to help cmake locate the library.
 
 You can choose to build Intel MKL-DNN without binary dependency. The resulting
-version will be fully functional, however performance of certain convolution
-shapes and sizes and inner product relying on SGEMM function may be suboptimal.
+version will be fully functional; however, the performance of convolutions
+relying on the GEMM-based algorithm, of inner product, and of the mkldnn_?gemm
+functionality may be suboptimal.
 
 > **Note**
 >
 > Using Intel MKL small libraries currently work for Intel MKL-DNN built with
 > OpenMP\* only. Building with Intel TBB requires either full Intel MKL library
 > or standalone build.
+>
+> Using Intel MKL or Intel MKL small libraries will introduce additional
+> runtime dependencies. Please refer to Intel MKL 
+> [system requirements](https://software.intel.com/en-us/articles/intel-math-kernel-library-intel-mkl-2019-system-requirements)
+> for additional information.
 
 Intel MKL-DNN uses a CMake-based build system
 
@@ -174,17 +188,29 @@ Here `$CMAKE_OPTIONS` are options to control the build. Along with the standard
 cmake options such as `CMAKE_INSTALL_PREFIX` or `CMAKE_BUILD_TYPE`,
 user can also pass Intel MKL-DNN specific ones:
 
-|Option                 | Possible Values (defaults in bold) | Description
-|:---                   |:---                | :---
-|MKLDNN_LIBRARY_TYPE    | **SHARED**, STATIC | Defines resulting library type
-|MKLDNN_THREADING       | **OMP**, TBB       | Defines threading type
-|WITH_EXAMPLE           | **ON**, OFF        | Controls building examples
-|WITH_TEST              | **ON**, OFF        | Controls building tests
-|VTUNEROOT              | *path*             | Enables integration with Intel(R) Vtune(tm) Amplifier
+|Option                 | Possible Values (defaults in bold)   | Description
+|:---                   |:---                                  | :---
+|MKLDNN_LIBRARY_TYPE    | **SHARED**, STATIC                   | Defines resulting library type
+|MKLDNN_THREADING       | **OMP**, OMP:INTEL, OMP:COMP, TBB    | Defines threading type
+|MKLDNN_USE_MKL         | **DEF**, NONE, ML, FULL, FULL:STATIC | Defines binary dependency on Intel MKL
+|WITH_EXAMPLE           | **ON**, OFF                          | Controls building examples
+|WITH_TEST              | **ON**, OFF                          | Controls building tests
+|ARCH_OPT_FLAGS (\*)    | *compiler flags*                     | Specifies compiler optimization flags
+|VTUNEROOT              | *path*                               | Enables integration with Intel(R) Vtune(tm) Amplifier
 
 Please check [cmake/options.cmake](cmake/options.cmake) for more options
 and details.
 
+> (\*) **WARNING**
+>
+> By default Intel MKL-DNN is built specifically for the processor type of the
+> compiling machine (e.g. `-march=native` in case of GCC). While this option
+> gives better performance, the resulting library can only be run on systems
+> that are instruction-set compatible with the compiling machine.
+>
+> Hence if Intel MKL-DNN is to be shipped to other platforms (e.g. built by
+> Linux distribution maintainers), consider setting ARCH_OPT_FLAGS to "".
+
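+For example, a typical out-of-source build that sets a few of the options from
+the table above might look like this (the values below are only an
+illustration, not a recommended configuration):
+
+```
+       mkdir -p build && cd build
+       cmake -DCMAKE_BUILD_TYPE=Release \
+             -DMKLDNN_THREADING=TBB \
+             -DMKLDNN_USE_MKL=NONE \
+             -DARCH_OPT_FLAGS="" ..
+       make -j
+```
+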
 Intel MKL-DNN includes unit tests implemented using the googletest framework. To validate your build, run:
 
 ```
@@ -270,7 +296,8 @@ Intel MKL-DNN built with Intel TBB doesn't require special handling:
        g++ -std=c++11 -I${MKLDNNROOT}/include -L${MKLDNNROOT}/lib simple_net.cpp -lmkldnn -ltbb
 ```
 
-Please note that Intel MKL-DNN has limited optimizations done for Intel TBB
+Please note that Intel MKL-DNN requires Intel TBB 2017 or above.
+Also, Intel MKL-DNN has only limited optimizations for Intel TBB
 and has some functional limitations if built with Intel TBB.
 
 Functional limitations:
index 9111810..bb02059 100644 (file)
@@ -23,15 +23,81 @@ if(MKL_cmake_included)
 endif()
 set(MKL_cmake_included true)
 
+# set SKIP_THIS_MKL to true if the given configuration is not supported
+function(maybe_skip_this_mkl LIBNAME)
+    # Optimism...
+    set(SKIP_THIS_MKL False PARENT_SCOPE)
+
+    # Both mklml_intel and mklml_gnu are OpenMP based.
+    # So in the TBB case, link with Intel MKL (the RT library) and either set:
+    #   MKL_THREADING_LAYER=tbb
+    # to make Intel MKL use TBB threading as well, or
+    #   MKL_THREADING_LAYER=sequential
+    # to make Intel MKL sequential.
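+    # At run time this looks like, e.g. (illustration only):
+    #   MKL_THREADING_LAYER=tbb ./app_linked_with_mkl_rt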
+    if (MKLDNN_THREADING STREQUAL "TBB" AND LIBNAME MATCHES "mklml")
+        set(SKIP_THIS_MKL True PARENT_SCOPE)
+    endif()
+
+    # user doesn't want Intel MKL at all
+    if (MKLDNN_USE_MKL STREQUAL "NONE")
+        set(SKIP_THIS_MKL True PARENT_SCOPE)
+    endif()
+
+    # user specifies Intel MKL-ML should be used
+    if (MKLDNN_USE_MKL STREQUAL "ML")
+        if (LIBNAME STREQUAL "mkl_rt")
+            set(SKIP_THIS_MKL True PARENT_SCOPE)
+        endif()
+    endif()
+
+    # user specifies full Intel MKL should be used
+    if (MKLDNN_USE_MKL MATCHES "FULL")
+        if (LIBNAME MATCHES "mklml")
+            set(SKIP_THIS_MKL True PARENT_SCOPE)
+        endif()
+    endif()
+
+    # avoid using Intel MKL-ML that is not compatible with compiler's OpenMP RT
+    if (MKLDNN_THREADING STREQUAL "OMP:COMP")
+        if ((LIBNAME STREQUAL "mklml_intel" OR LIBNAME STREQUAL "mklml")
+                AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel"))
+            set(SKIP_THIS_MKL True PARENT_SCOPE)
+        elseif (LIBNAME STREQUAL "mklml_gnu"
+                AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
+            set(SKIP_THIS_MKL True PARENT_SCOPE)
+        endif()
+    elseif (MKLDNN_THREADING STREQUAL "OMP:INTEL")
+       if (LIBNAME STREQUAL "mklml_gnu")
+           set(SKIP_THIS_MKL True PARENT_SCOPE)
+       endif()
+    endif()
+endfunction()
+
 function(detect_mkl LIBNAME)
     if(HAVE_MKL)
         return()
     endif()
 
-    message(STATUS "Detecting Intel(R) MKL: trying ${LIBNAME}")
+    maybe_skip_this_mkl(${LIBNAME})
+    set_if(SKIP_THIS_MKL MAYBE_SKIP_MSG "... skipped")
+    message(STATUS "Detecting Intel(R) MKL: trying ${LIBNAME}${MAYBE_SKIP_MSG}")
+
+    if (SKIP_THIS_MKL)
+        return()
+    endif()
 
     find_path(MKLINC mkl_cblas.h
         HINTS ${MKLROOT}/include $ENV{MKLROOT}/include)
+
+    # skip full Intel MKL while looking for Intel MKL-ML
+    if (MKLINC AND LIBNAME MATCHES "mklml")
+        get_filename_component(__mklinc_root "${MKLINC}" PATH)
+        find_library(tmp_MKLLIB NAMES "mkl_rt"
+            HINTS ${__mklinc_root}/lib/intel64)
+        set_if(tmp_MKLLIB MKLINC "")
+        unset(tmp_MKLLIB CACHE)
+    endif()
+
     if(NOT MKLINC)
         file(GLOB_RECURSE MKLINC
                 ${CMAKE_CURRENT_SOURCE_DIR}/external/*/mkl_cblas.h)
@@ -42,14 +108,8 @@ function(detect_mkl LIBNAME)
             if(MKLINCLEN GREATER 1)
                 list(SORT MKLINC)
                 list(REVERSE MKLINC)
-                # message(STATUS "MKLINC found ${MKLINCLEN} files:")
-                # foreach(LOCN IN LISTS MKLINC)
-                #     message(STATUS "       ${LOCN}")
-                # endforeach()
                 list(GET MKLINC 0 MKLINCLST)
                 set(MKLINC "${MKLINCLST}")
-                # message(WARNING "MKLINC guessing... ${MKLINC}.  "
-                #     "Please check that above dir has the desired mkl_cblas.h")
             endif()
             get_filename_component(MKLINC ${MKLINC} PATH)
         endif()
@@ -79,14 +139,6 @@ function(detect_mkl LIBNAME)
         endif()
     endif()
 
-    if(UNIX AND LIBNAME MATCHES "mklml.*")
-        # Although MKL-ML depends on shared object functions such as dlopen and
-        # dladdr it is not linked against libdl. This causes link failures when
-        # MKL-DNN is build with the gold linker (e.g. -fuse-ld=gold).
-        list(APPEND EXTRA_LIBS dl)
-        set(EXTRA_LIBS "${EXTRA_LIBS}" PARENT_SCOPE)
-    endif()
-
     if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
         get_filename_component(MKLLIBPATH ${MKLLIB} PATH)
         find_library(MKLIOMP5LIB
@@ -150,29 +202,45 @@ function(detect_mkl LIBNAME)
     set(HAVE_MKL TRUE PARENT_SCOPE)
     set(MKLINC ${MKLINC} PARENT_SCOPE)
     set(MKLLIB "${MKLLIB}" PARENT_SCOPE)
+    set(MKLDLL "${MKLDLL}" PARENT_SCOPE)
 
-    if(WIN32)
-        set(MKLDLL "${MKLDLL}" PARENT_SCOPE)
-    endif()
-    if(MKLIOMP5LIB)
-        set(MKLIOMP5LIB "${MKLIOMP5LIB}" PARENT_SCOPE)
+    set(MKLIOMP5LIB "${MKLIOMP5LIB}" PARENT_SCOPE)
+    set(MKLIOMP5DLL "${MKLIOMP5DLL}" PARENT_SCOPE)
+endfunction()
+
+function(set_static_mkl_libs libpath)
+    set_ternary(lib WIN32 "" "lib")
+    set_ternary(a WIN32 ".lib" ".a")
+
+    if (MKLDNN_THREADING STREQUAL "TBB")
+        set(thr_name "tbb_thread")
+    elseif (MKLDNN_THREADING STREQUAL "OMP:COMP" AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+        set(thr_name "gnu_thread")
+    else()
+        set(thr_name "intel_thread")
     endif()
-    if(WIN32 AND MKLIOMP5DLL)
-        set(MKLIOMP5DLL "${MKLIOMP5DLL}" PARENT_SCOPE)
+
+    find_library(mkl_iface NAMES "${lib}mkl_intel_lp64${a}" HINTS ${libpath})
+    find_library(mkl_thr   NAMES "${lib}mkl_${thr_name}${a}" HINTS ${libpath})
+    find_library(mkl_core  NAMES "${lib}mkl_core${a}" HINTS ${libpath})
+
+    set(MKLLIB "${mkl_iface};${mkl_thr};${mkl_core}")
+    if (UNIX AND NOT APPLE)
+        list(APPEND MKLLIB "${mkl_iface};${mkl_thr};${mkl_core}")
     endif()
+    set_if(UNIX MKLLIB "${MKLLIB};m;dl")
+    set(MKLLIB "${MKLLIB}" PARENT_SCOPE)
 endfunction()
 
-# Both mklml_intel and mklml_gnu are OpenMP based.
-# So in case of TBB link with Intel MKL (RT library) and either set:
-#   MKL_THREADING_LAYER=tbb
-# to make Intel MKL use TBB threading as well, or
-#   MKL_THREADING_LAYER=sequential
-# to make Intel MKL be sequential.
-if(NOT MKLDNN_THREADING STREQUAL "TBB")
-    detect_mkl("mklml_intel")
-    detect_mkl("mklml")
-endif()
+detect_mkl("mklml_intel")
+detect_mkl("mklml_gnu")
+detect_mkl("mklml")
 detect_mkl("mkl_rt")
+if (MKLDNN_USE_MKL STREQUAL "FULL:STATIC" AND HAVE_MKL)
+    set(MKLDLL "")
+    get_filename_component(MKLLIBPATH "${MKLLIB}" PATH)
+    set_static_mkl_libs(${MKLLIBPATH})
+endif ()
 
 if(HAVE_MKL)
     add_definitions(-DUSE_MKL -DUSE_CBLAS)
@@ -182,20 +250,18 @@ if(HAVE_MKL)
     set(MSG "Intel(R) MKL:")
     message(STATUS "${MSG} include ${MKLINC}")
     message(STATUS "${MSG} lib ${MKLLIB}")
-    if(MKLIOMP5LIB)
-        message(STATUS "${MSG} OpenMP lib ${MKLIOMP5LIB}")
-    else()
-        message(STATUS "${MSG} OpenMP lib provided by compiler")
-    endif()
-    if(WIN32)
+    if(WIN32 AND MKLDLL)
         message(STATUS "${MSG} dll ${MKLDLL}")
-        if(MKLIOMP5DLL)
-            message(STATUS "${MSG} OpenMP dll ${MKLIOMP5DLL}")
-        else()
-            message(STATUS "${MSG} OpenMP dll provided by compiler")
-        endif()
     endif()
 else()
+    if (MKLDNN_USE_MKL STREQUAL "NONE")
+        return()
+    endif()
+
+    if (NOT MKLDNN_USE_MKL STREQUAL "DEF")
+        set(FAIL_WITHOUT_MKL True)
+    endif()
+
     if(DEFINED ENV{FAIL_WITHOUT_MKL} OR DEFINED FAIL_WITHOUT_MKL)
         set(SEVERITY "FATAL_ERROR")
     else()
index d325d18..f9c3620 100644 (file)
@@ -22,16 +22,68 @@ if(OpenMP_cmake_included)
 endif()
 set(OpenMP_cmake_included true)
 
-if(NOT MKLDNN_THREADING STREQUAL "OMP")
-    return()
+include("cmake/Threading.cmake")
+include("cmake/MKL.cmake")
+
+if (APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    # OSX Clang doesn't have OpenMP by default.
+    # But we still want to build the library.
+    set(_omp_severity "WARNING")
+else()
+    set(_omp_severity "FATAL_ERROR")
 endif()
 
-include("cmake/MKL.cmake")
+
+macro(forbid_link_compiler_omp_rt)
+    if (NOT WIN32)
+        set_if(OpenMP_C_FOUND CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OpenMP_C_FLAGS})
+        set_if(OpenMP_CXX_FOUND CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OpenMP_CXX_FLAGS})
+        if (NOT APPLE)
+            set (CMAKE_SHARED_LINKER_FLAGS "-Wl,--as-needed")
+        endif()
+    endif()
+endmacro()
+
+macro(use_intel_omp_rt)
+    # fast return
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+        return()
+    endif()
+
+    # Do not link with compiler-native OpenMP library if Intel MKL is present.
+    # Rationale: Intel MKL comes with Intel OpenMP library which is compatible
+    # with all libraries shipped with compilers that Intel MKL-DNN supports.
+    if(HAVE_MKL)
+        forbid_link_compiler_omp_rt()
+        if (UNIX AND NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+            # For some reason Clang ignores the `-fopenmp=libiomp5` switch and
+            # links against libomp.so anyway.
+            # The workaround is to set the full path to libiomp5.so
+            add_library(libiomp5 SHARED IMPORTED)
+            set_property(TARGET libiomp5 PROPERTY IMPORTED_LOCATION "${MKLIOMP5LIB}")
+            list(APPEND EXTRA_LIBS libiomp5)
+        else()
+            list(APPEND EXTRA_LIBS ${MKLIOMP5LIB})
+        endif()
+    else()
+        if (MKLDNN_THREADING STREQUAL "OMP:INTEL")
+            message(${_omp_severity} "Intel OpenMP runtime could not be found. "
+                "Please either use OpenMP runtime that comes with the compiler "
+                "(via -DMKLDNN_THREADING={OMP,OMP:COMP}), or "
+                "install Intel MKL / Intel MKL-ML (e.g. scripts/prepare_mkl.sh)")
+        endif()
+    endif()
+endmacro()
 
 if(WIN32 AND ${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC)
     add_definitions(/Qpar)
     add_definitions(/openmp)
-    add_definitions(-DMKLDNN_THR=MKLDNN_THR_OMP)
+    set(OpenMP_CXX_FOUND true)
+elseif(MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    append(CMAKE_C_FLAGS "-Xclang -fopenmp")
+    append(CMAKE_CXX_FLAGS "-Xclang -fopenmp")
+    set(OpenMP_CXX_FOUND true)
+    list(APPEND EXTRA_LIBS ${MKLIOMP5LIB})
 else()
     find_package(OpenMP)
     #newer version for findOpenMP (>= v. 3.9)
@@ -44,30 +96,34 @@ else()
         set(OpenMP_C_FOUND true)
         set(OpenMP_CXX_FOUND true)
     endif()
-    if(OpenMP_C_FOUND)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-    endif()
-    if(OpenMP_CXX_FOUND)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-        add_definitions(-DMKLDNN_THR=MKLDNN_THR_OMP)
-    endif()
+    append_if(OpenMP_C_FOUND CMAKE_C_FLAGS "${OpenMP_C_FLAGS}")
+    append_if(OpenMP_CXX_FOUND CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS}")
 endif()
 
-# Do not link with compiler-native OpenMP library if MKL is present.
-# Rationale: MKL comes with Intel OpenMP library which is compatible with all
-# libraries shipped with compilers that MKL-DNN supports.
-if(HAVE_MKL AND NOT WIN32 AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-    if(OpenMP_C_FOUND)
-        set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OpenMP_C_FLAGS})
+if (MKLDNN_THREADING MATCHES "OMP")
+    if (OpenMP_CXX_FOUND)
+        set_threading("OMP")
+    else()
+        message(${_omp_severity} "OpenMP library could not be found. "
+            "Proceeding might lead to highly sub-optimal performance.")
     endif()
-    if(OpenMP_CXX_FOUND)
-        set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OpenMP_CXX_FLAGS})
+
+    if (MKLDNN_THREADING STREQUAL "OMP:COMP")
+        set(MKLIOMP5LIB "")
+        set(MKLIOMP5DLL "")
+    else()
+        use_intel_omp_rt()
     endif()
-    list(APPEND EXTRA_LIBS ${MKLIOMP5LIB})
+else()
+    # Compilation happens with OpenMP to enable `#pragma omp simd`,
+    # but the OpenMP dependency should be avoided during linking
+    forbid_link_compiler_omp_rt()
+    return()
 endif()
 
-if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Xclang -fopenmp")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -fopenmp")
-    list(APPEND EXTRA_LIBS ${MKLIOMP5LIB})
+set_ternary(_omp_lib_msg MKLIOMP5LIB "${MKLIOMP5LIB}" "provided by compiler")
+message(STATUS "OpenMP lib: ${_omp_lib_msg}")
+if(WIN32)
+    set_ternary(_omp_dll_msg MKLIOMP5DLL "${MKLIOMP5DLL}" "provided by compiler")
+    message(STATUS "OpenMP dll: ${_omp_dll_msg}")
 endif()
index 52b9e33..fb0cdc1 100644 (file)
@@ -42,7 +42,7 @@ elseif(UNIX)
     find_package(TBB REQUIRED tbb HINTS cmake/lnx)
 endif()
 
-add_definitions(-DMKLDNN_THR=MKLDNN_THR_TBB)
+set_threading("TBB")
 list(APPEND mkldnn_LINKER_LIBS ${TBB_IMPORTED_TARGETS})
 
 message(STATUS "Intel(R) TBB: ${TBBROOT}")
diff --git a/inference-engine/thirdparty/mkl-dnn/cmake/Threading.cmake b/inference-engine/thirdparty/mkl-dnn/cmake/Threading.cmake
new file mode 100644 (file)
index 0000000..f509c79
--- /dev/null
@@ -0,0 +1,39 @@
+#===============================================================================
+# Copyright 2018 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+# Utils for managing threading-related configuration
+#===============================================================================
+
+if(Threading_cmake_included)
+    return()
+endif()
+set(Threading_cmake_included true)
+
+# Replace existing define for threading (if any) with a new one
+macro(set_threading threading)
+    if(MKLDNN_THR_CURRENT)
+        remove_definitions(-DMKLDNN_THR=${MKLDNN_THR_CURRENT})
+    endif()
+    set(MKLDNN_THR_CURRENT MKLDNN_THR_${threading})
+    add_definitions(-DMKLDNN_THR=${MKLDNN_THR_CURRENT})
+endmacro()
+
+# While MKL-DNN defaults to OpenMP (if _OPENMP is defined) without CMake, here
+# we default to sequential threading and let OpenMP.cmake and TBB.cmake
+# figure things out. This is especially important because OpenMP is used both
+# for threading and for vectorization via #pragma omp simd.
+set_threading("SEQ")
+
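A minimal usage sketch of the set_threading macro above (illustrative only, not
part of the patch): each call removes the previously added define, so the last
component included wins.

    # Hypothetical sequence of calls during configuration:
    set_threading("SEQ")   # adds -DMKLDNN_THR=MKLDNN_THR_SEQ (the default above)
    set_threading("OMP")   # drops the SEQ define, adds -DMKLDNN_THR=MKLDNN_THR_OMP
    set_threading("TBB")   # drops the OMP define, adds -DMKLDNN_THR=MKLDNN_THR_TBB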
index 1e727e1..320a77e 100644 (file)
@@ -36,8 +36,6 @@ if (NOT _tbbmalloc_proxy_ix EQUAL -1)
     endif()
 endif()
 
-set(TBB_INTERFACE_VERSION 10005)
-
 # Intel MKL-DNN changes: use TBBROOT to locate Intel TBB
 # get_filename_component(_tbb_root "${CMAKE_CURRENT_LIST_FILE}" PATH)
 # get_filename_component(_tbb_root "${_tbb_root}" PATH)
@@ -70,16 +68,29 @@ if (NOT _tbb_compiler_id STREQUAL "GNU")
     unset(_tbb_gcc_ver_output)
 endif()
 
-set(_tbb_compiler_subdir gcc4.1)
-foreach (_tbb_gcc_version 4.1 4.4 4.7)
-    if (NOT _tbb_compiler_ver VERSION_LESS ${_tbb_gcc_version})
-        set(_tbb_compiler_subdir gcc${_tbb_gcc_version})
+set(_tbb_lib ${_tbb_root}/lib/${_tbb_arch_subdir} )
+file(GLOB _tbb_gcc_versions_available RELATIVE ${_tbb_lib} ${_tbb_lib}/*)
+# shall we check that _tbb_gcc_versions_available is not empty?
+foreach (_tbb_gcc_version ${_tbb_gcc_versions_available})
+    string(SUBSTRING ${_tbb_gcc_version} 3 -1 _tbb_gcc_version_number)
+    if (NOT _tbb_compiler_ver VERSION_LESS _tbb_gcc_version_number)
+        set(_tbb_compiler_subdir ${_tbb_gcc_version})
     endif()
 endforeach()
 
+unset(_tbb_gcc_version_number)
 unset(_tbb_compiler_id)
 unset(_tbb_compiler_ver)
 
+
+# we need to check the version of tbb
+file(READ "${_tbb_root}/include/tbb/tbb_stddef.h" _tbb_stddef)
+string(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1" TBB_INTERFACE_VERSION "${_tbb_stddef}")
+if (${TBB_INTERFACE_VERSION} VERSION_LESS 9100)
+    message(FATAL_ERROR "MKL-DNN requires TBB version 2017 or above")
+endif()
+
+# Now we check that all the needed components are present
 get_filename_component(_tbb_lib_path "${_tbb_root}/lib/${_tbb_arch_subdir}/${_tbb_compiler_subdir}" ABSOLUTE)
 
 foreach (_tbb_component ${TBB_FIND_COMPONENTS})
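For reference, a sketch of what the version extraction above does on a header
line such as `#define TBB_INTERFACE_VERSION 9107` (hypothetical value; TBB 2017
corresponds to interface version 9100):

    set(_tbb_stddef "#define TBB_INTERFACE_VERSION 9107")
    string(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1"
           TBB_INTERFACE_VERSION "${_tbb_stddef}")
    message(STATUS "TBB interface version: ${TBB_INTERFACE_VERSION}")  # prints 9107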
index 41c8637..568791f 100644 (file)
@@ -36,8 +36,6 @@ if (NOT _tbbmalloc_proxy_ix EQUAL -1)
     endif()
 endif()
 
-set(TBB_INTERFACE_VERSION 10005)
-
 # Intel MKL-DNN changes: use TBBROOT to locate Intel TBB
 # get_filename_component(_tbb_root "${CMAKE_CURRENT_LIST_FILE}" PATH)
 # get_filename_component(_tbb_root "${_tbb_root}" PATH)
@@ -54,6 +52,13 @@ endif()
 
 set(_tbb_compiler_subdir .)
 
+# we need to check the version of tbb
+file(READ "${_tbb_root}/include/tbb/tbb_stddef.h" _tbb_stddef)
+string(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1" TBB_INTERFACE_VERSION "${_tbb_stddef}")
+if (${TBB_INTERFACE_VERSION} VERSION_LESS 9100)
+    message(FATAL_ERROR "MKL-DNN requires TBB version 2017 or above")
+endif()
+
 get_filename_component(_tbb_lib_path "${_tbb_root}/lib/${_tbb_arch_subdir}/${_tbb_compiler_subdir}" ABSOLUTE)
 
 foreach (_tbb_component ${TBB_FIND_COMPONENTS})
index a83517d..e6ff249 100644 (file)
@@ -47,11 +47,49 @@ option(WITH_EXAMPLE "builds examples"  ON)
 option(WITH_TEST "builds tests" ON)
 
 set(MKLDNN_THREADING "OMP" CACHE STRING
-    "specifies threading type; supports OMP (default), or TBB.
-    If Intel(R) Threading Building Blocks (Intel(R) TBB) one should also
-    set TBBROOT (either environement variable or CMake option) to the library
+    "specifies threading type; supports OMP (default), OMP:COMP, OMP:INTEL, or TBB.
+
+    When OpenMP is used, a user can choose which runtime to use:
+    - the native OpenMP runtime that comes with the compiler (OMP:COMP), or
+    - the Intel OpenMP runtime, which is compatible with all the compilers
+      that Intel MKL-DNN supports (OMP:INTEL). This option requires Intel MKL
+      to be installed or the Intel MKL-ML library to be downloaded. It doesn't
+      work with MSVC (w/o the Intel Compiler).
+    The default option is OMP, which prefers OMP:INTEL but falls back to
+    OMP:COMP if neither Intel MKL is installed nor Intel MKL-ML is available.
+
+    To use Intel(R) Threading Building Blocks (Intel(R) TBB) one should also
+    set TBBROOT (either environment variable or CMake option) to the library
     location")
 
+set(MKLDNN_USE_MKL "DEF" CACHE STRING
+    "specifies what Intel MKL library to use.
+    Supports DEF (default), NONE, ML, FULL, FULL:STATIC.
+
+    By default (DEF) CMake tries to find the Intel MKL-ML library, then the
+    full Intel MKL library, and otherwise builds Intel MKL-DNN w/o any binary
+    dependency.
+
+    To build Intel MKL-DNN w/o any dependencies on Intel MKL / Intel MKL-ML
+    use NONE. Note that the build system would then not be able to use the
+    Intel OpenMP runtime that comes with Intel MKL or Intel MKL-ML; that
+    runtime would be available only if the Intel Compiler is used.
+
+    To force Intel MKL-DNN to use Intel MKL-ML, use ML. Depending on the
+    threading, the build system would choose between libmklml_intel and
+    libmklml_gnu.
+
+    To force Intel MKL-DNN to use the full Intel MKL pass FULL or FULL:STATIC
+    to cmake. The former option would make Intel MKL-DNN link against
+    Intel MKL RT (libmkl_rt). The latter one would link against static
+    Intel MKL. Use static linking to reduce the size of the resulting library
+    (including its dependencies).
+    Caution: Intel MKL RT allows setting the threading layer using environment
+             variable MKL_THREADING_LAYER. By default Intel MKL would use
+             OpenMP. If Intel MKL-DNN is built with TBB it is recommended to
+             set MKL_THREADING_LAYER to `tbb` or `sequential`, to avoid
+             conflict between OpenMP and TBB thread pools.")
+
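As a usage sketch (illustrative, not part of the patch): MKL.cmake above returns
early for NONE and sets FAIL_WITHOUT_MKL for any other non-DEF value, so:

    set(MKLDNN_USE_MKL "NONE" CACHE STRING "" FORCE)  # no Intel MKL; MKL.cmake returns early
    set(MKLDNN_USE_MKL "ML" CACHE STRING "" FORCE)    # non-DEF value: a missing
                                                      # Intel MKL-ML becomes a FATAL_ERROR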
 # =============
 # Optimizations
 # =============
@@ -67,7 +105,7 @@ set(ARCH_OPT_FLAGS "HostOpts" CACHE STRING
       architectures.
 
     - For GNU* Compiler Collection version 5 and newer the default options are
-      `-march=native -mtune=native` which behaves similarly to the descriprion
+      `-march=native -mtune=native` which behaves similarly to the description
       above.
 
     - For all other cases there are no special optimizations flags.
@@ -90,6 +128,15 @@ set(VTUNEROOT "" CACHE STRING
 # Miscellaneous
 # =============
 
+set(MKLDNN_USE_CLANG_SANITIZER "" CACHE STRING
+    "instructs build system to use a Clang sanitizer. Possible values:
+    Address: enables MemorySanitizer
+    Memory: enables MemorySanitizer
+    MemoryWithOrigin: enables MemorySanitizer with origin tracking
+    Undefined: enables UndefinedBehaviourSanitizer
+    This feature is experimental and is only available on Linux.")
+
+
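A configuration sketch for the sanitizer option (illustrative values; Linux and
Clang are assumed, as the option requires):

    set(CMAKE_C_COMPILER   "clang"   CACHE STRING "" FORCE)
    set(CMAKE_CXX_COMPILER "clang++" CACHE STRING "" FORCE)
    set(MKLDNN_USE_CLANG_SANITIZER "Undefined" CACHE STRING "" FORCE)
    # For the Memory* values, also set MKLDNN_THREADING to SEQ to avoid the
    # MSan/OpenMP false-positive warning emitted by os_flags.cmake.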
 option(BENCHDNN_USE_RDPMC
     "enables rdpms counter to report precise cpu frequency in benchdnn.
      CAUTION: may not work on all cpus (hence disabled by default)"
index 55f197f..3597970 100644 (file)
@@ -35,78 +35,107 @@ set(DEF_ARCH_OPT_FLAGS)
 if(MSVC)
     set(USERCONFIG_PLATFORM "x64")
     if(${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC)
-        set(CMAKE_CCXX_FLAGS "${CMAKE_CCXX_FLAGS} /MP")
+        append(CMAKE_CCXX_FLAGS "/MP")
         # int -> bool
-        set(CMAKE_CCXX_NOWARN_FLAGS "${CMAKE_CCXX_NOWARN_FLAGS} /wd4800")
+        append(CMAKE_CCXX_NOWARN_FLAGS "/wd4800")
         # unknown pragma
-        set(CMAKE_CCXX_NOWARN_FLAGS "${CMAKE_CCXX_NOWARN_FLAGS} /wd4068")
+        append(CMAKE_CCXX_NOWARN_FLAGS "/wd4068")
         # double -> float
-        set(CMAKE_CCXX_NOWARN_FLAGS "${CMAKE_CCXX_NOWARN_FLAGS} /wd4305")
+        append(CMAKE_CCXX_NOWARN_FLAGS "/wd4305")
         # UNUSED(func)
-        set(CMAKE_CCXX_NOWARN_FLAGS "${CMAKE_CCXX_NOWARN_FLAGS} /wd4551")
+        append(CMAKE_CCXX_NOWARN_FLAGS "/wd4551")
     endif()
     if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-        set(CMAKE_CCXX_FLAGS "${CMAKE_CCXX_FLAGS} /MP")
+        append(CMAKE_CCXX_FLAGS "/MP")
         set(DEF_ARCH_OPT_FLAGS "-QxHOST")
         # disable: loop was not vectorized with "simd"
-        set(CMAKE_CCXX_NOWARN_FLAGS
-            "${CMAKE_CCXX_NOWARN_FLAGS} -Qdiag-disable:15552")
+        append(CMAKE_CCXX_NOWARN_FLAGS "-Qdiag-disable:15552")
     endif()
     if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
         # Clang cannot vectorize some loops with #pragma omp simd and gets
         # very upset. Tell it that it's okay and that we love it
-        # unconditionnaly.
-        set(CMAKE_CCXX_NOWARN_FLAGS
-            "${CMAKE_CCXX_NOWARN_FLAGS} -Wno-pass-failed")
+        # unconditionally.
+        append(CMAKE_CCXX_FLAGS "-Wno-pass-failed")
     endif()
 elseif(UNIX OR MINGW)
-    set(CMAKE_CCXX_FLAGS "${CMAKE_CCXX_FLAGS} -Wall -Werror -Wno-unknown-pragmas")
-    set(CMAKE_CCXX_FLAGS "${CMAKE_CCXX_FLAGS} -fvisibility=internal")
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fvisibility-inlines-hidden")
+    append(CMAKE_CCXX_FLAGS "-Wall -Werror -Wno-unknown-pragmas")
+    append(CMAKE_CCXX_FLAGS "-fvisibility=internal")
+    append(CMAKE_C_FLAGS "-std=c99")
+    append(CMAKE_CXX_FLAGS "-std=c++11 -fvisibility-inlines-hidden")
     # compiler specific settings
     if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
         # Clang cannot vectorize some loops with #pragma omp simd and gets
         # very upset. Tell it that it's okay and that we love it
-        # unconditionnaly.
-        set(CMAKE_CCXX_NOWARN_FLAGS
-            "${CMAKE_CCXX_NOWARN_FLAGS} -Wno-pass-failed")
+        # unconditionally.
+        append(CMAKE_CCXX_NOWARN_FLAGS "-Wno-pass-failed")
+        if(MKLDNN_USE_CLANG_SANITIZER MATCHES "Memory(WithOrigin)?")
+            if(NOT MKLDNN_THREADING STREQUAL "SEQ")
+                message(WARNING "Clang OpenMP is not compatible with MSan! "
+                    "Expect a lot of false positives!")
+            endif()
+            append(CMAKE_CCXX_SANITIZER_FLAGS "-fsanitize=memory")
+            if(MKLDNN_USE_CLANG_SANITIZER STREQUAL "MemoryWithOrigin")
+                append(CMAKE_CCXX_SANITIZER_FLAGS
+                    "-fsanitize-memory-track-origins=2")
+                append(CMAKE_CCXX_SANITIZER_FLAGS
+                    "-fno-omit-frame-pointer")
+            endif()
+            set(MKLDNN_ENABLED_CLANG_SANITIZER "${MKLDNN_USE_CLANG_SANITIZER}")
+        elseif(MKLDNN_USE_CLANG_SANITIZER STREQUAL "Undefined")
+            append(CMAKE_CCXX_SANITIZER_FLAGS "-fsanitize=undefined")
+            append(CMAKE_CCXX_SANITIZER_FLAGS
+                "-fno-sanitize=function,vptr")  # work around linking problems
+            append(CMAKE_CCXX_SANITIZER_FLAGS "-fno-omit-frame-pointer")
+            set(MKLDNN_ENABLED_CLANG_SANITIZER "${MKLDNN_USE_CLANG_SANITIZER}")
+        elseif(MKLDNN_USE_CLANG_SANITIZER STREQUAL "Address")
+            append(CMAKE_CCXX_SANITIZER_FLAGS "-fsanitize=address")
+            set(MKLDNN_ENABLED_CLANG_SANITIZER "${MKLDNN_USE_CLANG_SANITIZER}")
+        elseif(MKLDNN_USE_CLANG_SANITIZER STREQUAL "Thread")
+            append(CMAKE_CCXX_SANITIZER_FLAGS "-fsanitize=thread")
+            set(MKLDNN_ENABLED_CLANG_SANITIZER "${MKLDNN_USE_CLANG_SANITIZER}")
+        elseif(MKLDNN_USE_CLANG_SANITIZER STREQUAL "Leak")
+            append(CMAKE_CCXX_SANITIZER_FLAGS "-fsanitize=leak")
+            set(MKLDNN_ENABLED_CLANG_SANITIZER "${MKLDNN_USE_CLANG_SANITIZER}")
+        elseif(NOT MKLDNN_USE_CLANG_SANITIZER STREQUAL "")
+            message(FATAL_ERROR
+                "Unsupported Clang sanitizer '${MKLDNN_USE_CLANG_SANITIZER}'")
+        endif()
+        if(MKLDNN_ENABLED_CLANG_SANITIZER)
+            message(STATUS
+                "Using Clang ${MKLDNN_ENABLED_CLANG_SANITIZER} "
+                "sanitizer (experimental!)")
+            append(CMAKE_CCXX_SANITIZER_FLAGS "-g -fno-omit-frame-pointer")
+        endif()
     elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
         if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
             set(DEF_ARCH_OPT_FLAGS "-march=native -mtune=native")
         endif()
-        if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.0)
-            # suppress warning on assumptions made regarding overflow (#146)
-            set(CMAKE_CCXX_NOWARN_FLAGS
-                "${CMAKE_CCXX_NOWARN_FLAGS} -Wno-strict-overflow")
-        endif()
+        # suppress warning on assumptions made regarding overflow (#146)
+        append(CMAKE_CCXX_NOWARN_FLAGS "-Wno-strict-overflow")
     elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
         set(DEF_ARCH_OPT_FLAGS "-xHOST")
         # workaround for Intel Compiler 16.0 that produces error caused
         # by pragma omp simd collapse(..)
         if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "17.0")
-            set(CMAKE_CCXX_NOWARN_FLAGS
-                "${CMAKE_CCXX_NOWARN_FLAGS} -diag-disable:13379")
+            append(CMAKE_CCXX_NOWARN_FLAGS "-diag-disable:13379")
         endif()
-        set(CMAKE_CCXX_NOWARN_FLAGS
-            "${CMAKE_CCXX_NOWARN_FLAGS} -diag-disable:15552")
+        append(CMAKE_CCXX_NOWARN_FLAGS "-diag-disable:15552")
         # disable `was not vectorized: vectorization seems inefficient` remark
-        set(CMAKE_CCXX_NOWARN_FLAGS
-            "${CMAKE_CCXX_NOWARN_FLAGS} -diag-disable:15335")
+        append(CMAKE_CCXX_NOWARN_FLAGS "-diag-disable:15335")
     endif()
 endif()
 
 if(WIN32)
-    set(CTESTCONFIG_PATH "$ENV{PATH}")
-    string(REPLACE ";" "\;" CTESTCONFIG_PATH "${CTESTCONFIG_PATH}")
+    string(REPLACE ";" "\;" ENV_PATH "$ENV{PATH}")
+    set(CTESTCONFIG_PATH "${CTESTCONFIG_PATH}\;${MKLDLLPATH}\;${ENV_PATH}")
 endif()
 
 if(UNIX OR MINGW)
     if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
         # Link Intel libraries statically (except for iomp5)
-        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -liomp5 -static-intel")
+        append(CMAKE_SHARED_LINKER_FLAGS "-liomp5 -static-intel")
         # Tell linker to not complain about missing static libraries
-        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -diag-disable:10237")
+        append(CMAKE_SHARED_LINKER_FLAGS "-diag-disable:10237")
     endif()
 endif()
 
@@ -114,8 +143,8 @@ if(ARCH_OPT_FLAGS STREQUAL "HostOpts")
     set(ARCH_OPT_FLAGS "${DEF_ARCH_OPT_FLAGS}")
 endif()
 
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_CCXX_FLAGS} ${ARCH_OPT_FLAGS}")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CCXX_FLAGS} ${ARCH_OPT_FLAGS}")
+append(CMAKE_C_FLAGS "${CMAKE_CCXX_FLAGS} ${ARCH_OPT_FLAGS}")
+append(CMAKE_CXX_FLAGS "${CMAKE_CCXX_FLAGS} ${ARCH_OPT_FLAGS}")
 
 if(APPLE)
     set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
@@ -123,7 +152,7 @@ if(APPLE)
     # paths to rpath (like Intel compiler...)
     foreach(_ ${CMAKE_C_IMPLICIT_LINK_DIRECTORIES})
         set(_rpath "-Wl,-rpath,${_}")
-        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${_rpath}")
+        append(CMAKE_SHARED_LINKER_FLAGS "${_rpath}")
         set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${_rpath}")
     endforeach()
 endif()
index f159d78..c531d84 100644 (file)
@@ -22,11 +22,7 @@ set(profiling_cmake_included true)
 if("${VTUNEROOT}" STREQUAL "")
     message(STATUS "VTune profiling environment is unset")
 else()
-    if (MSVC)
-        set(JITPROFLIB "jitprofiling.lib")
-    else()
-        set(JITPROFLIB "libjitprofiling.a")
-    endif()
+    set_ternary(JITPROFLIB MSVC "jitprofiling.lib" "libjitprofiling.a")
     list(APPEND EXTRA_LIBS "${VTUNEROOT}/lib64/${JITPROFLIB}")
     message(STATUS "VTune profiling environment is set")
 endif()
index 9268bf1..d8680b7 100644 (file)
@@ -32,9 +32,41 @@ function(register_exe name srcs test)
     target_link_libraries(${name} ${LIB_NAME} ${EXTRA_LIBS} ${ARGV3})
     if("${test}" STREQUAL "test")
         add_test(${name} ${name})
-    endif()
-    if(WIN32 OR MINGW)
-        set_property(TEST ${name} PROPERTY ENVIRONMENT "PATH=${CTESTCONFIG_PATH};$ENV{PATH}")
-        configure_file(${CMAKE_SOURCE_DIR}/config_template.vcxproj.user ${name}.vcxproj.user @ONLY)
+        if(WIN32 OR MINGW)
+            set_property(TEST ${name} PROPERTY ENVIRONMENT "PATH=${CTESTCONFIG_PATH};$ENV{PATH}")
+            configure_file(${CMAKE_SOURCE_DIR}/config_template.vcxproj.user ${name}.vcxproj.user @ONLY)
+        endif()
     endif()
 endfunction()
+
+# Append to a variable
+#   var = var + value
+macro(append var value)
+    set(${var} "${${var}} ${value}")
+endmacro()
+
+# Set variable depending on condition:
+#   var = cond ? val_if_true : val_if_false
+macro(set_ternary var condition val_if_true val_if_false)
+    if (${condition})
+        set(${var} "${val_if_true}")
+    else()
+        set(${var} "${val_if_false}")
+    endif()
+endmacro()
+
+# Conditionally set a variable
+#   if (cond) var = value
+macro(set_if condition var value)
+    if (${condition})
+        set(${var} "${value}")
+    endif()
+endmacro()
+
+# Conditionally append
+#   if (cond) var = var + value
+macro(append_if condition var value)
+    if (${condition})
+        append(${var} "${value}")
+    endif()
+endmacro()
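A short sketch of how these helpers compose (illustrative values only):

    set(FLAGS "-O2")
    append(FLAGS "-g")                     # FLAGS is now "-O2 -g"
    set_ternary(LIBEXT WIN32 ".lib" ".a")  # pick an extension per platform
    set_if(UNIX LIBM m)                    # LIBM is "m" on UNIX, unset elsewhere
    append_if(MSVC FLAGS "/MP")            # append only when MSVC is true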
index bb4fdb9..5159fc6 100644 (file)
@@ -36,8 +36,6 @@ if (NOT _tbbmalloc_proxy_ix EQUAL -1)
     endif()
 endif()
 
-set(TBB_INTERFACE_VERSION 10005)
-
 # Intel MKL-DNN changes: use TBBROOT to locate Intel TBB
 # get_filename_component(_tbb_root "${CMAKE_CURRENT_LIST_FILE}" PATH)
 # get_filename_component(_tbb_root "${_tbb_root}" PATH)
@@ -75,11 +73,21 @@ if (WINDOWS_STORE)
     set(_tbb_compiler_subdir ${_tbb_compiler_subdir}_ui)
 endif()
 
-get_filename_component(_tbb_lib_path "${_tbb_root}/bin/${_tbb_arch_subdir}/${_tbb_compiler_subdir}" ABSOLUTE)
+# set convenience variables to locate TBB files (these are used for a PSXE install)
+get_filename_component(_tbb_lib_path "${_tbb_root}/lib/${_tbb_arch_subdir}/${_tbb_compiler_subdir}" ABSOLUTE)
+get_filename_component(_tbb_inc_path "${_tbb_root}/include/" ABSOLUTE)
+
+
+# we need to check the version of tbb
+file(READ "${_tbb_inc_path}/tbb/tbb_stddef.h" _tbb_stddef)
+string(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1" TBB_INTERFACE_VERSION "${_tbb_stddef}")
+if (${TBB_INTERFACE_VERSION} VERSION_LESS 9100)
+    message(FATAL_ERROR "MKL-DNN requires TBB version 2017 or above")
+endif()
 
 foreach (_tbb_component ${TBB_FIND_COMPONENTS})
-    set(_tbb_release_lib "${_tbb_lib_path}/${_tbb_component}.dll")
-    set(_tbb_debug_lib "${_tbb_lib_path}/${_tbb_component}_debug.dll")
+    set(_tbb_release_lib "${_tbb_lib_path}/${_tbb_component}.lib")
+    set(_tbb_debug_lib "${_tbb_lib_path}/${_tbb_component}_debug.lib")
 
     if (EXISTS "${_tbb_release_lib}" AND EXISTS "${_tbb_debug_lib}")
         add_library(TBB::${_tbb_component} SHARED IMPORTED)
@@ -87,9 +95,9 @@ foreach (_tbb_component ${TBB_FIND_COMPONENTS})
                               IMPORTED_CONFIGURATIONS "RELEASE;DEBUG"
                               IMPORTED_LOCATION_RELEASE     "${_tbb_release_lib}"
                               IMPORTED_LOCATION_DEBUG       "${_tbb_debug_lib}"
-                              INTERFACE_INCLUDE_DIRECTORIES "${_tbb_root}/include"
-                              IMPORTED_IMPLIB_RELEASE       "${_tbb_root}/lib/${_tbb_arch_subdir}/${_tbb_compiler_subdir}/${_tbb_component}.lib"
-                              IMPORTED_IMPLIB_DEBUG         "${_tbb_root}/lib/${_tbb_arch_subdir}/${_tbb_compiler_subdir}/${_tbb_component}_debug.lib"
+                              INTERFACE_INCLUDE_DIRECTORIES "${_tbb_inc_path}"
+                              IMPORTED_IMPLIB_RELEASE       "${_tbb_release_lib}"
+                              IMPORTED_IMPLIB_DEBUG         "${_tbb_debug_lib}"
                               INTERFACE_COMPILE_DEFINITIONS "__TBB_NO_IMPLICIT_LINKAGE=1")
 
         # Add internal dependencies for imported targets: TBB::tbbmalloc_proxy -> TBB::tbbmalloc
index f833331..799f5c4 100644 (file)
@@ -44,7 +44,7 @@ linear (1D) memory address space and why this is important for
 Let's first focus on data formats for activations (images).
 
 Activations consist of channels (aka feature maps) and a spatial domain,
-either 2D or 3D. Spatial domain together with channels form an image.
+1D, 2D or 3D. The spatial domain together with the channels forms an image.
 During the training phase images are typically grouped together in batches.
 Even if there is only one image, we would still assume there is a batch
 with batch size equal to 1.
index 8cf6ae0..2a0c7a8 100644 (file)
@@ -18,12 +18,13 @@ The table below summarizes the list of supported functions and their variants.
 
 | Primitive class   | Primitive                | fp32 training | fp32 inference | int8 inference |
 | :---------------- | :----------------------- | :-----------: | :------------: | :------------: |
-| Convolution       | 2D direct convolution    | x             | x              | x              |
+| Convolution       | 1D direct convolution    | x             | x              |                |
+|                   | 2D direct convolution    | x             | x              | x              |
 |                   | 2D direct deconvolution  | x             | x              | x              |
 |                   | 2D winograd convolution  | x             | x              | x              |
 |                   | 3D direct convolution    | x             | x              |                |
 |                   | 3D direct deconvolution  | x             | x              |                |
-| Inner Product     | 2D inner product         | x             | x              |                |
+| Inner Product     | 2D inner product         | x             | x              | x              |
 |                   | 3D inner product         | x             | x              |                |
 | RNN (experimental)| Vanilla RNN cell         | x             | x              |                |
 |                   | LSTM cell                | x             | x              |                |
@@ -47,6 +48,7 @@ The table below summarizes the list of supported functions and their variants.
 |                   | Sum                      | x             | x              | x              |
 |                   | Concat                   | x             | x              | x              |
 |                   | Elementwise operations   |               | x              |                |
+|                   | Channel Shuffle          | x             | x              | x              |
 
 ## Programming Model
 
index c917955..3d05855 100644 (file)
@@ -19,16 +19,19 @@ if (NOT WITH_EXAMPLE)
 endif()
 
 # propagate EXAMPLE specific flags
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_EXAMPLE_CCXX_FLAGS}")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_EXAMPLE_CCXX_FLAGS}")
+append(CMAKE_C_FLAGS "${CMAKE_EXAMPLE_CCXX_FLAGS}")
+append(CMAKE_CXX_FLAGS "${CMAKE_EXAMPLE_CCXX_FLAGS}")
+
+# propagate sanitizer flags
+append(CMAKE_C_FLAGS "${CMAKE_CCXX_SANITIZER_FLAGS}")
+append(CMAKE_CXX_FLAGS "${CMAKE_CCXX_SANITIZER_FLAGS}")
 
 include_directories(${CMAKE_SOURCE_DIR}/include)
 
+set_if(UNIX LIBM m)
+
 register_exe(simple-net-c simple_net.c "test")
 register_exe(simple-net-cpp simple_net.cpp "test")
-if(UNIX)
-    set(LIBM m)
-endif()
 register_exe(simple-training-net-c simple_training_net.c "test" ${LIBM})
 register_exe(simple-training-net-cpp simple_training_net.cpp "test" ${LIBM})
 register_exe(simple-net-int8-cpp simple_net_int8.cpp "test")
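The set_if(UNIX LIBM m) line above replaces the old if(UNIX) block: on UNIX the
math library is passed to register_exe as an extra link dependency. A usage
sketch with a hypothetical target:

    set_if(UNIX LIBM m)                                    # LIBM is "m" on UNIX
    register_exe(my-example my_example.c "test" ${LIBM})   # links libm where needed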
index 42180da..6a4e78a 100644 (file)
 #endif
 
 #define BATCH 8
+#define IC 3
+#define OC 96
+#define CONV_IH 227
+#define CONV_IW 227
+#define CONV_OH 55
+#define CONV_OW 55
+#define CONV_STRIDE 4
+#define CONV_PAD 0
+#define POOL_OH 27
+#define POOL_OW 27
+#define POOL_STRIDE 2
+#define POOL_PAD 0
 
 #define CHECK(f) do { \
     mkldnn_status_t s = f; \
@@ -132,52 +144,55 @@ mkldnn_status_t prepare_reorder(
     return mkldnn_success;
 }
 
-mkldnn_status_t simple_net(){
+mkldnn_status_t simple_net() {
 
     mkldnn_engine_t engine;
     CHECK(mkldnn_engine_create(&engine, mkldnn_cpu, 0 /* idx */));
 
-    float *net_src = (float*)aligned_malloc(BATCH*3*227*227*sizeof(float), 64);
-    float *net_dst = (float*)aligned_malloc(BATCH*96*27*27*sizeof(float), 64);
+    float *net_src = (float *)aligned_malloc(
+            BATCH * IC * CONV_IH * CONV_IW * sizeof(float), 64);
+    float *net_dst = (float *)aligned_malloc(
+            BATCH * OC * POOL_OH * POOL_OW * sizeof(float), 64);
 
     /* AlexNet: conv
-     * {BATCH, 3, 227, 227} (x) {96, 3, 11, 11} -> {BATCH, 96, 55, 55}
-     * strides: {4, 4}
+     * {BATCH, IC, CONV_IH, CONV_IW} (x) {OC, IC, CONV_KH, CONV_KW} ->
+     * {BATCH, OC, CONV_OH, CONV_OW}
+     * strides: {CONV_STRIDE, CONV_STRIDE}
      */
-    int conv_src_sizes[4] = {BATCH, 3, 227, 227};
-    int conv_weights_sizes[4] = {96, 3, 11, 11};
-    int conv_bias_sizes[4] = {96};
-    int conv_dst_sizes[4] = {BATCH, 96, 55, 55};
-    int conv_strides[2] = {4, 4};
-    int conv_padding[2] = {0, 0};
+    int conv_user_src_sizes[4] = { BATCH, IC, CONV_IH, CONV_IW };
+    int conv_user_weights_sizes[4] = { OC, IC, 11, 11 };
+    int conv_bias_sizes[4] = { OC };
+    int conv_user_dst_sizes[4] = { BATCH, OC, CONV_OH, CONV_OW };
+    int conv_strides[2] = { CONV_STRIDE, CONV_STRIDE };
+    int conv_padding[2] = { CONV_PAD, CONV_PAD };
 
     float *conv_src = net_src;
-    float *conv_weights =
-       (float*)aligned_malloc(product(conv_weights_sizes, 4)*sizeof(float), 64);
-    float *conv_bias =
-       (float*)aligned_malloc(product(conv_bias_sizes, 1)*sizeof(float), 64);
+    float *conv_weights = (float *)aligned_malloc(
+            product(conv_user_weights_sizes, 4) * sizeof(float), 64);
+    float *conv_bias = (float *)aligned_malloc(
+            product(conv_bias_sizes, 1) * sizeof(float), 64);
 
     /* create memory for user data */
     mkldnn_primitive_t conv_user_src_memory, conv_user_weights_memory,
         conv_user_bias_memory;
-    init_data_memory(4, conv_src_sizes, mkldnn_nchw, mkldnn_f32, engine,
-        conv_src, &conv_user_src_memory);
-    init_data_memory(4, conv_weights_sizes, mkldnn_oihw, mkldnn_f32, engine,
-        conv_weights, &conv_user_weights_memory);
+    init_data_memory(4, conv_user_src_sizes, mkldnn_nchw, mkldnn_f32, engine,
+            conv_src, &conv_user_src_memory);
+    init_data_memory(4, conv_user_weights_sizes, mkldnn_oihw, mkldnn_f32,
+            engine, conv_weights, &conv_user_weights_memory);
     init_data_memory(1, conv_bias_sizes, mkldnn_x, mkldnn_f32, engine,
-        conv_bias, &conv_user_bias_memory);
+            conv_bias, &conv_user_bias_memory);
 
     /* create data descriptors for convolution w/ no specified format */
 
     mkldnn_memory_desc_t conv_src_md, conv_weights_md, conv_bias_md,
         conv_dst_md;
-    CHECK(mkldnn_memory_desc_init(&conv_src_md, 4, conv_src_sizes,
+    CHECK(mkldnn_memory_desc_init(&conv_src_md, 4, conv_user_src_sizes,
         mkldnn_f32, mkldnn_any));
-    CHECK(mkldnn_memory_desc_init(&conv_weights_md, 4, conv_weights_sizes,
+    CHECK(mkldnn_memory_desc_init(&conv_weights_md, 4, conv_user_weights_sizes,
         mkldnn_f32, mkldnn_any));
     CHECK(mkldnn_memory_desc_init(&conv_bias_md, 1, conv_bias_sizes,
         mkldnn_f32, mkldnn_x));
-    CHECK(mkldnn_memory_desc_init(&conv_dst_md, 4, conv_dst_sizes,
+    CHECK(mkldnn_memory_desc_init(&conv_dst_md, 4, conv_user_dst_sizes,
         mkldnn_f32, mkldnn_any));
 
     /* create a convolution */
@@ -194,20 +209,13 @@ mkldnn_status_t simple_net(){
     mkldnn_primitive_t conv_internal_src_memory, conv_internal_weights_memory,
         conv_internal_dst_memory;
 
-    float *conv_src_buffer =
-        (float*)aligned_malloc(product(conv_src_sizes, 4)*sizeof(float), 64);
-    float *conv_weights_buffer =
-        (float*)aligned_malloc(product(conv_weights_sizes, 4)*sizeof(float), 64);
-    float *conv_dst_buffer =
-        (float*)aligned_malloc(product(conv_dst_sizes, 4)*sizeof(float), 64);
-    memset(conv_src_buffer, 0, product(conv_src_sizes, 4)*sizeof(float));
-    memset(conv_weights_buffer, 0, product(conv_weights_sizes, 4)*sizeof(float));
-    memset(conv_dst_buffer, 0, product(conv_dst_sizes, 4)*sizeof(float));
-
     /* create memory for dst data, we don't need to reorder it to user data */
-    CHECK(mkldnn_primitive_create(&conv_internal_dst_memory,
-            mkldnn_primitive_desc_query_pd(conv_pd, mkldnn_query_dst_pd, 0),
-            NULL, NULL));
+    const_mkldnn_primitive_desc_t dst_pd
+            = mkldnn_primitive_desc_query_pd(conv_pd, mkldnn_query_dst_pd, 0);
+    CHECK(mkldnn_primitive_create(
+            &conv_internal_dst_memory, dst_pd, NULL, NULL));
+    size_t conv_dst_size = mkldnn_memory_primitive_desc_get_size(dst_pd);
+    float *conv_dst_buffer = (float *)aligned_malloc(conv_dst_size, 64);
     CHECK(mkldnn_memory_set_data_handle(
             conv_internal_dst_memory, conv_dst_buffer));
 
@@ -217,12 +225,16 @@ mkldnn_status_t simple_net(){
 
     const_mkldnn_primitive_desc_t src_pd = mkldnn_primitive_desc_query_pd(
             conv_pd, mkldnn_query_src_pd, 0);
+    size_t conv_src_size = mkldnn_memory_primitive_desc_get_size(src_pd);
+    float *conv_src_buffer = (float *)aligned_malloc(conv_src_size, 64);
     CHECK(prepare_reorder(&conv_user_src_memory, &src_pd, 1,
-            &conv_internal_src_memory, &conv_reorder_src,
-            conv_src_buffer));
+            &conv_internal_src_memory, &conv_reorder_src, conv_src_buffer));
 
     const_mkldnn_primitive_desc_t weights_pd = mkldnn_primitive_desc_query_pd(
             conv_pd, mkldnn_query_weights_pd, 0);
+    size_t conv_weights_size
+            = mkldnn_memory_primitive_desc_get_size(weights_pd);
+    float *conv_weights_buffer = (float *)aligned_malloc(conv_weights_size, 64);
     CHECK(prepare_reorder(&conv_user_weights_memory, &weights_pd, 1,
             &conv_internal_weights_memory, &conv_reorder_weights,
             conv_weights_buffer));
@@ -245,17 +257,12 @@ mkldnn_status_t simple_net(){
     CHECK(mkldnn_primitive_create(&conv, conv_pd, conv_srcs, conv_dsts));
 
     /* AlexNet: relu
-     * {BATCH, 96, 55, 55} -> {BATCH, 96, 55, 55}
+     * {BATCH, OC, CONV_OH, CONV_OW} -> {BATCH, OC, CONV_OH, CONV_OW}
      */
     float negative_slope = 1.0f;
 
-    int *relu_dst_sizes = conv_dst_sizes;
-    float *relu_dst_buffer =
-        (float*)aligned_malloc(product(relu_dst_sizes, 4)*sizeof(float), 64);
-    memset(relu_dst_buffer, 0, product(relu_dst_sizes, 4)*sizeof(float));
-
     /* create relu memory descriptor on dst memory descriptor
-     * from previos primitive */
+     * from previous primitive */
     const_mkldnn_primitive_desc_t conv_dst_pd = mkldnn_primitive_desc_query_pd(
             conv_pd, mkldnn_query_dst_pd, 0);
     const mkldnn_memory_desc_t *relu_src_md =
@@ -273,6 +280,8 @@ mkldnn_status_t simple_net(){
     const_mkldnn_primitive_desc_t relu_dst_pd = mkldnn_primitive_desc_query_pd(
             relu_pd, mkldnn_query_dst_pd, 0);
     CHECK(mkldnn_primitive_create(&relu_dst_memory, relu_dst_pd, NULL, NULL));
+    size_t relu_dst_size = mkldnn_memory_primitive_desc_get_size(relu_dst_pd);
+    float *relu_dst_buffer = (float *)aligned_malloc(relu_dst_size, 64);
     CHECK(mkldnn_memory_set_data_handle(relu_dst_memory, relu_dst_buffer));
 
     /* finally create a relu primitive */
@@ -283,7 +292,7 @@ mkldnn_status_t simple_net(){
     CHECK(mkldnn_primitive_create(&relu, relu_pd, &relu_srcs, relu_dsts));
 
     /* AlexNet: lrn
-     * {BATCH, 96, 55, 55} -> {BATCH, 96, 55, 55}
+     * {BATCH, OC, CONV_OH, CONV_OW} -> {BATCH, OC, CONV_OH, CONV_OW}
      * local size: 5
      * alpha: 0.0001
      * beta: 0.75
@@ -293,14 +302,8 @@ mkldnn_status_t simple_net(){
     float beta = 0.75f;
     float k = 1.0f;
 
-    int32_t *lrn_dst_sizes = relu_dst_sizes;
-
-    float *lrn_dst_buffer =
-        (float*)aligned_malloc(product(lrn_dst_sizes, 4)*sizeof(float), 64);
-    memset(lrn_dst_buffer, 0, product(lrn_dst_sizes, 4)*sizeof(float));
-
     /* create lrn memory descriptor on dst memory descriptor
-     *  from previos primitive */
+     *  from previous primitive */
     const mkldnn_memory_desc_t *lrn_src_md =
         mkldnn_primitive_desc_query_memory_d(relu_dst_pd);
 
@@ -317,6 +320,8 @@ mkldnn_status_t simple_net(){
     const_mkldnn_primitive_desc_t lrn_dst_pd = mkldnn_primitive_desc_query_pd(
             lrn_pd, mkldnn_query_dst_pd, 0);
     CHECK(mkldnn_primitive_create(&lrn_dst_memory, lrn_dst_pd, NULL, NULL));
+    size_t lrn_dst_size = mkldnn_memory_primitive_desc_get_size(lrn_dst_pd);
+    float *lrn_dst_buffer = (float *)aligned_malloc(lrn_dst_size, 64);
     CHECK(mkldnn_memory_set_data_handle(lrn_dst_memory, lrn_dst_buffer));
 
     mkldnn_primitive_t lrn_scratch_memory;
@@ -327,7 +332,6 @@ mkldnn_status_t simple_net(){
     size_t lrn_scratch_size =
         mkldnn_memory_primitive_desc_get_size(lrn_scratch_pd);
     float *lrn_scratch_buffer = (float*)aligned_malloc(lrn_scratch_size, 64);
-    memset(lrn_scratch_buffer, 0, lrn_scratch_size);
     CHECK(mkldnn_memory_set_data_handle(lrn_scratch_memory,
             lrn_scratch_buffer));
 
@@ -341,28 +345,25 @@ mkldnn_status_t simple_net(){
     CHECK(mkldnn_primitive_create(&lrn, lrn_pd, &lrn_srcs, lrn_dsts));
 
     /* AlexNet: pool
-     * {BATCH, 96, 55, 55} -> {BATCH, 96, 27, 27}
+     * {BATCH, OC, CONV_OH, CONV_OW} -> {BATCH, OC, POOL_OH, POOL_OW}
      * kernel: {3, 3}
-     * strides: {2, 2}
+     * strides: {POOL_STRIDE, POOL_STRIDE}
      */
-    int32_t pool_dst_sizes[4] = {BATCH, 96, 27, 27};
-    int32_t pool_kernel[2] = {3, 3};
-    int32_t pool_strides[2] = {2, 2};
-    int32_t pool_padding[2] = {0, 0};
 
-    float *pool_dst_buffer =
-        (float*)aligned_malloc(product(pool_dst_sizes, 4)*sizeof(float), 64);
-    memset(pool_dst_buffer, 0, product(pool_dst_sizes, 4)*sizeof(float));
+    int32_t pool_dst_sizes[4] = { BATCH, OC, POOL_OH, POOL_OW };
+    int32_t pool_kernel[2] = { 3, 3 };
+    int32_t pool_strides[2] = { POOL_STRIDE, POOL_STRIDE };
+    int32_t pool_padding[2] = { POOL_PAD, POOL_PAD };
 
     /* create pooling memory descriptor on dst descriptor
-     *  from previos primitive */
+     *  from previous primitive */
     const mkldnn_memory_desc_t *pool_src_md =
         mkldnn_primitive_desc_query_memory_d(lrn_dst_pd);
 
     /* create descriptors for dst pooling data */
     mkldnn_memory_desc_t pool_dst_md;
-    CHECK(mkldnn_memory_desc_init(&pool_dst_md, 4, pool_dst_sizes, mkldnn_f32,
-            mkldnn_any));
+    CHECK(mkldnn_memory_desc_init(
+            &pool_dst_md, 4, pool_dst_sizes, mkldnn_f32, mkldnn_any));
 
     /* create memory for user data */
     mkldnn_primitive_t pool_user_dst_memory;
@@ -387,7 +388,6 @@ mkldnn_status_t simple_net(){
     size_t pool_indices_size =
         mkldnn_memory_primitive_desc_get_size(pool_indices_pd);
     float *pool_indices_buffer = (float*)aligned_malloc(pool_indices_size, 64);
-    memset(pool_indices_buffer, 0, pool_indices_size);
     CHECK(mkldnn_memory_set_data_handle(pool_indices_memory,
             pool_indices_buffer));
 
@@ -398,6 +398,8 @@ mkldnn_status_t simple_net(){
     mkldnn_primitive_t pool_reorder_dst, pool_internal_dst_memory;
     const_mkldnn_primitive_desc_t pool_dst_pd =
         mkldnn_primitive_desc_query_pd(pool_pd, mkldnn_query_dst_pd, 0);
+    size_t pool_dst_size = mkldnn_memory_primitive_desc_get_size(pool_dst_pd);
+    float *pool_dst_buffer = (float *)aligned_malloc(pool_dst_size, 64);
     CHECK(prepare_reorder(&pool_user_dst_memory, &pool_dst_pd, 0,
             &pool_internal_dst_memory, &pool_reorder_dst, pool_dst_buffer));
 
index e008100..d63e675 100644 (file)
@@ -545,7 +545,7 @@ void simple_net() {
     );
     auto leftmost_bwd_prim_desc
         = mkldnn::rnn_backward::primitive_desc(
-            leftmost_layer_bwd_desc, cpu_engine);
+            leftmost_layer_bwd_desc, cpu_engine, leftmost_prim_desc);
 
     // As the batch dimensions are different between leftmost and rightmost
     // we need to do the views. rightmost needs less memory, so it will view
@@ -585,7 +585,7 @@ void simple_net() {
     );
     auto rightmost_bwd_prim_desc
         = mkldnn::rnn_backward::primitive_desc(
-            rightmost_layer_bwd_desc, cpu_engine);
+            rightmost_layer_bwd_desc, cpu_engine, rightmost_prim_desc);
 
     //
     // Memory primitives for backward pass
index 95f052d..dbe1ac0 100644 (file)
 #endif
 
 #define BATCH 32
+#define IC 3
+#define OC 96
+#define CONV_IH 227
+#define CONV_IW 227
+#define CONV_OH 55
+#define CONV_OW 55
+#define CONV_STRIDE 4
+#define CONV_PAD 0
+#define POOL_OH 27
+#define POOL_OW 27
+#define POOL_STRIDE 2
+#define POOL_PAD 0
 
 #define CHECK(f)                                                               \
     do {                                                                       \
@@ -165,8 +177,8 @@ mkldnn_status_t simple_net()
     mkldnn_engine_t engine;
     CHECK(mkldnn_engine_create(&engine, mkldnn_cpu, 0 /* idx */));
 
-    int net_src_sizes[4] = { BATCH, 3, 227, 227 };
-    int net_dst_sizes[4] = { BATCH, 96, 27, 27 };
+    int net_src_sizes[4] = { BATCH, IC, CONV_IH, CONV_IW };
+    int net_dst_sizes[4] = { BATCH, OC, POOL_OH, POOL_OW };
 
     float *net_src =
         (float *)aligned_malloc(product(net_src_sizes,4)*sizeof(float), 64);
@@ -179,47 +191,47 @@ mkldnn_status_t simple_net()
     /*----------------------------------------------------------------------*/
     /*----------------- Forward Stream -------------------------------------*/
     /* AlexNet: conv
-     * {BATCH, 3, 227, 227} (x) {96, 3, 11, 11} -> {BATCH, 96, 55, 55}
-     * strides: {4, 4}
+     * {BATCH, IC, CONV_IH, CONV_IW} (x) {OC, IC, 11, 11} ->
+     * {BATCH, OC, CONV_OH, CONV_OW}
+     * strides: {CONV_STRIDE, CONV_STRIDE}
      */
-    int *conv_src_sizes = net_src_sizes;
-    int conv_weights_sizes[4] = { 96, 3, 11, 11 };
-    int conv_bias_sizes[4] = { 96 };
-    int conv_dst_sizes[4] = { BATCH, 96, 55, 55 };
-    int conv_strides[2] = { 4, 4 };
-    int conv_padding[2] = { 0, 0 };
+    int *conv_user_src_sizes = net_src_sizes;
+    int conv_user_weights_sizes[4] = { OC, IC, 11, 11 };
+    int conv_bias_sizes[4] = { OC };
+    int conv_user_dst_sizes[4] = { BATCH, OC, CONV_OH, CONV_OW };
+    int conv_strides[2] = { CONV_STRIDE, CONV_STRIDE };
+    int conv_padding[2] = { CONV_PAD, CONV_PAD };
 
     float *conv_src = net_src;
-    float *conv_weights =
-        (float *)aligned_malloc(product(conv_weights_sizes, 4)*sizeof(float),
-                                64);
-    float *conv_bias =
-         (float *)aligned_malloc(product(conv_bias_sizes, 1)*sizeof(float), 64);
+    float *conv_weights = (float *)aligned_malloc(
+            product(conv_user_weights_sizes, 4) * sizeof(float), 64);
+    float *conv_bias = (float *)aligned_malloc(
+            product(conv_bias_sizes, 1) * sizeof(float), 64);
 
-    init_net_data(conv_weights, 4, conv_weights_sizes);
+    init_net_data(conv_weights, 4, conv_user_weights_sizes);
     init_net_data(conv_bias, 1, conv_bias_sizes);
 
     /* create memory for user data */
     mkldnn_primitive_t conv_user_src_memory, conv_user_weights_memory,
             conv_user_bias_memory;
-    init_data_memory(4, conv_src_sizes, mkldnn_nchw, mkldnn_f32, engine,
+    init_data_memory(4, conv_user_src_sizes, mkldnn_nchw, mkldnn_f32, engine,
                      conv_src, &conv_user_src_memory);
-    init_data_memory(4, conv_weights_sizes, mkldnn_oihw, mkldnn_f32, engine,
-                     conv_weights, &conv_user_weights_memory);
+    init_data_memory(4, conv_user_weights_sizes, mkldnn_oihw, mkldnn_f32,
+            engine, conv_weights, &conv_user_weights_memory);
     init_data_memory(1, conv_bias_sizes, mkldnn_x, mkldnn_f32, engine,
-                     conv_bias, &conv_user_bias_memory);
+            conv_bias, &conv_user_bias_memory);
 
     /* create data descriptors for convolution w/ no specified format */
     mkldnn_memory_desc_t conv_src_md, conv_weights_md, conv_bias_md,
             conv_dst_md;
-    CHECK(mkldnn_memory_desc_init(&conv_src_md, 4, conv_src_sizes, mkldnn_f32,
-                                  mkldnn_any));
-    CHECK(mkldnn_memory_desc_init(&conv_weights_md, 4, conv_weights_sizes,
-                                  mkldnn_f32, mkldnn_any));
-    CHECK(mkldnn_memory_desc_init(&conv_bias_md, 1, conv_bias_sizes, mkldnn_f32,
-                                  mkldnn_x));
-    CHECK(mkldnn_memory_desc_init(&conv_dst_md, 4, conv_dst_sizes, mkldnn_f32,
-                                  mkldnn_any));
+    CHECK(mkldnn_memory_desc_init(
+            &conv_src_md, 4, conv_user_src_sizes, mkldnn_f32, mkldnn_any));
+    CHECK(mkldnn_memory_desc_init(&conv_weights_md, 4, conv_user_weights_sizes,
+            mkldnn_f32, mkldnn_any));
+    CHECK(mkldnn_memory_desc_init(
+            &conv_bias_md, 1, conv_bias_sizes, mkldnn_f32, mkldnn_x));
+    CHECK(mkldnn_memory_desc_init(
+            &conv_dst_md, 4, conv_user_dst_sizes, mkldnn_f32, mkldnn_any));
 
     /* create a convolution */
     mkldnn_convolution_desc_t conv_any_desc;
@@ -234,48 +246,43 @@ mkldnn_status_t simple_net()
     mkldnn_primitive_t conv_internal_src_memory, conv_internal_weights_memory,
             conv_internal_dst_memory;
 
-    float *conv_src_buffer =
-        (float *)aligned_malloc(product(conv_src_sizes, 4)*sizeof(float), 64);
-    float *conv_weights_buffer =
-        (float *)aligned_malloc(product(conv_weights_sizes, 4)*sizeof(float),
-                                64);
-    float *conv_dst_buffer =
-        (float *)aligned_malloc(product(conv_dst_sizes, 4)*sizeof(float), 64);
-    memset(conv_src_buffer, 0, product(conv_src_sizes, 4)*sizeof(float));
-    memset(conv_weights_buffer, 0,
-        product(conv_weights_sizes, 4)*sizeof(float));
-    memset(conv_dst_buffer, 0, product(conv_dst_sizes, 4)*sizeof(float));
-
     /* create memory for dst data, we don't need to reorder it to user data */
+    const_mkldnn_primitive_desc_t conv_dst_pd
+            = mkldnn_primitive_desc_query_pd(conv_pd, mkldnn_query_dst_pd, 0);
     CHECK(mkldnn_primitive_create(
-            &conv_internal_dst_memory,
-            mkldnn_primitive_desc_query_pd(conv_pd, mkldnn_query_dst_pd, 0),
-            NULL, NULL));
-    CHECK(mkldnn_memory_set_data_handle(conv_internal_dst_memory,
-                                        conv_dst_buffer));
+            &conv_internal_dst_memory, conv_dst_pd, NULL, NULL));
+    size_t conv_dst_size = mkldnn_memory_primitive_desc_get_size(conv_dst_pd);
+    float *conv_dst_buffer = (float *)aligned_malloc(conv_dst_size, 64);
+    CHECK(mkldnn_memory_set_data_handle(
+            conv_internal_dst_memory, conv_dst_buffer));
 
     /* create reorder primitives between user data and convolution srcs
      * if required */
     mkldnn_primitive_t conv_reorder_src, conv_reorder_weights;
 
-    const_mkldnn_primitive_desc_t src_pd
+    const_mkldnn_primitive_desc_t conv_src_pd
             = mkldnn_primitive_desc_query_pd(conv_pd, mkldnn_query_src_pd, 0);
-    CHECK(prepare_reorder(&conv_user_src_memory, &src_pd, 1,
-                          &conv_internal_src_memory, &conv_reorder_src,
-                          conv_src_buffer));
-
-    const_mkldnn_primitive_desc_t weights_pd = mkldnn_primitive_desc_query_pd(
-            conv_pd, mkldnn_query_weights_pd, 0);
-    CHECK(prepare_reorder(&conv_user_weights_memory, &weights_pd, 1,
-                          &conv_internal_weights_memory, &conv_reorder_weights,
-                          conv_weights_buffer));
+    size_t conv_src_size = mkldnn_memory_primitive_desc_get_size(conv_src_pd);
+    float *conv_src_buffer = (float *)aligned_malloc(conv_src_size, 64);
+    CHECK(prepare_reorder(&conv_user_src_memory, &conv_src_pd, 1,
+        &conv_internal_src_memory, &conv_reorder_src, conv_src_buffer));
+
+    const_mkldnn_primitive_desc_t conv_weights_pd
+            = mkldnn_primitive_desc_query_pd(
+                    conv_pd, mkldnn_query_weights_pd, 0);
+    size_t conv_weights_size
+            = mkldnn_memory_primitive_desc_get_size(conv_weights_pd);
+    float *conv_weights_buffer = (float *)aligned_malloc(conv_weights_size, 64);
+    CHECK(prepare_reorder(&conv_user_weights_memory, &conv_weights_pd, 1,
+            &conv_internal_weights_memory, &conv_reorder_weights,
+            conv_weights_buffer));
 
     mkldnn_primitive_t conv_src_memory = conv_internal_src_memory
-                                                 ? conv_internal_src_memory
-                                                 : conv_user_src_memory;
-    mkldnn_primitive_t conv_weights_memory
-            = conv_internal_weights_memory ? conv_internal_weights_memory
-                                           : conv_user_weights_memory;
+                                                ? conv_internal_src_memory
+                                                : conv_user_src_memory;
+    mkldnn_primitive_t conv_weights_memory = conv_internal_weights_memory
+                                                ? conv_internal_weights_memory
+                                                : conv_user_weights_memory;
 
     mkldnn_primitive_at_t conv_srcs[]
             = { mkldnn_primitive_at(conv_src_memory, 0),
@@ -289,19 +296,10 @@ mkldnn_status_t simple_net()
     CHECK(mkldnn_primitive_create(&conv, conv_pd, conv_srcs, conv_dsts));
 
     /* AlexNet: relu
-     * {BATCH, 96, 55, 55} -> {BATCH, 96, 55, 55}
+     * {BATCH, OC, CONV_OH, CONV_OW} -> {BATCH, OC, CONV_OH, CONV_OW}
      */
     float negative_slope = 1.0f;
 
-    int *relu_dst_sizes = conv_dst_sizes;
-    float *relu_dst_buffer =
-        (float *)aligned_malloc(product(relu_dst_sizes, 4)*sizeof(float), 64);
-    memset(relu_dst_buffer, 0, product(relu_dst_sizes, 4)*sizeof(float));
-
-    /* create relu src memory descriptor using dst memory descriptor
-     * from previos primitive */
-    const_mkldnn_primitive_desc_t conv_dst_pd
-            = mkldnn_primitive_desc_query_pd(conv_pd, mkldnn_query_dst_pd, 0);
     /* keep memory format of source same as the format of convolution
        * output in order to avoid reorder */
     const mkldnn_memory_desc_t *relu_src_md
@@ -320,6 +318,8 @@ mkldnn_status_t simple_net()
     const_mkldnn_primitive_desc_t relu_dst_pd
             = mkldnn_primitive_desc_query_pd(relu_pd, mkldnn_query_dst_pd, 0);
     CHECK(mkldnn_primitive_create(&relu_dst_memory, relu_dst_pd, NULL, NULL));
+    size_t relu_dst_size = mkldnn_memory_primitive_desc_get_size(relu_dst_pd);
+    float *relu_dst_buffer = (float *)aligned_malloc(relu_dst_size, 64);
     CHECK(mkldnn_memory_set_data_handle(relu_dst_memory, relu_dst_buffer));
 
     /* finally create a relu primitive */
@@ -330,7 +330,7 @@ mkldnn_status_t simple_net()
     CHECK(mkldnn_primitive_create(&relu, relu_pd, &relu_srcs, relu_dsts));
 
     /* AlexNet: lrn
-     * {BATCH, 96, 55, 55} -> {BATCH, 96, 55, 55}
+     * {BATCH, OC, CONV_OH, CONV_OW} -> {BATCH, OC, CONV_OH, CONV_OW}
      * local size: 5
      * alpha: 0.0001
      * beta: 0.75
@@ -341,14 +341,8 @@ mkldnn_status_t simple_net()
     float beta = 0.75f;
     float k = 1.0f;
 
-    int32_t *lrn_dst_sizes = relu_dst_sizes;
-
-    float *lrn_dst_buffer =
-        (float *)aligned_malloc(product(lrn_dst_sizes, 4)*sizeof(float), 64);
-    memset(lrn_dst_buffer, 0, product(lrn_dst_sizes, 4)*sizeof(float));
-
     /* create lrn src memory descriptor using dst memory descriptor
-     *  from previos primitive */
+     *  from previous primitive */
     const mkldnn_memory_desc_t *lrn_src_md
             = mkldnn_primitive_desc_query_memory_d(relu_dst_pd);
 
@@ -367,6 +361,8 @@ mkldnn_status_t simple_net()
     const_mkldnn_primitive_desc_t lrn_dst_pd
             = mkldnn_primitive_desc_query_pd(lrn_pd, mkldnn_query_dst_pd, 0);
     CHECK(mkldnn_primitive_create(&lrn_dst_memory, lrn_dst_pd, NULL, NULL));
+    size_t lrn_dst_size = mkldnn_memory_primitive_desc_get_size(lrn_dst_pd);
+    float *lrn_dst_buffer = (float *)aligned_malloc(lrn_dst_size, 64);
     CHECK(mkldnn_memory_set_data_handle(lrn_dst_memory, lrn_dst_buffer));
 
     /* create workspace only in training and only for forward primitive*/
@@ -394,22 +390,17 @@ mkldnn_status_t simple_net()
     CHECK(mkldnn_primitive_create(&lrn, lrn_pd, &lrn_srcs, lrn_dsts));
 
     /* AlexNet: pool
-     * {BATCH, 96, 55, 55} -> {BATCH, 96, 27, 27}
+     * {BATCH, OC, CONV_OH, CONV_OW} -> {BATCH, OC, POOL_OH, POOL_OW}
      * kernel: {3, 3}
-     * strides: {2, 2}
+     * strides: {POOL_STRIDE, POOL_STRIDE}
      */
-    int32_t pool_src_sizes[4] = { BATCH, 96, 55, 55 };
     int32_t *pool_dst_sizes = net_dst_sizes;
     int32_t pool_kernel[2] = { 3, 3 };
-    int32_t pool_strides[2] = { 2, 2 };
-    int32_t pool_padding[2] = { 0, 0 };
-
-    float *pool_dst_buffer =
-        (float *)aligned_malloc(product(pool_dst_sizes, 4)*sizeof(float), 64);
-    memset(pool_dst_buffer, 0, product(pool_dst_sizes, 4)*sizeof(float));
+    int32_t pool_strides[2] = { POOL_STRIDE, POOL_STRIDE };
+    int32_t pool_padding[2] = { POOL_PAD, POOL_PAD };
 
     /* create pooling src memory descriptor using dst descriptor
-     *  from previos primitive */
+     *  from previous primitive */
     const mkldnn_memory_desc_t *pool_src_md
             = mkldnn_primitive_desc_query_memory_d(lrn_dst_pd);
 
@@ -455,6 +446,8 @@ mkldnn_status_t simple_net()
     mkldnn_primitive_t pool_reorder_dst, pool_internal_dst_memory;
     const_mkldnn_primitive_desc_t pool_dst_pd
             = mkldnn_primitive_desc_query_pd(pool_pd, mkldnn_query_dst_pd, 0);
+    size_t pool_dst_size = mkldnn_memory_primitive_desc_get_size(pool_dst_pd);
+    float *pool_dst_buffer = (float *)aligned_malloc(pool_dst_size, 64);
     CHECK(prepare_reorder(&pool_user_dst_memory, &pool_dst_pd, 0,
                           &pool_internal_dst_memory, &pool_reorder_dst,
                           pool_dst_buffer));
@@ -486,16 +479,13 @@ mkldnn_status_t simple_net()
     if (pool_reorder_dst)
         net_fwd[n_fwd++] = pool_reorder_dst;
 
-    mkldnn_stream_t stream_fwd;
-    CHECK(mkldnn_stream_create(&stream_fwd, mkldnn_eager));
-
     void *net_output = NULL; // output from forward stream:
 
     /*----------------------------------------------------------------------*/
     /*----------------- Backward Stream -------------------------------------*/
     /* ... user diff_data ...*/
-    float *net_diff_dst =
-        (float *)aligned_malloc(product(pool_dst_sizes, 4)*sizeof(float), 64);
+    float *net_diff_dst = (float *)aligned_malloc(
+        product(pool_dst_sizes, 4) * sizeof(float), 64);
 
     init_net_data(net_diff_dst, 4, pool_dst_sizes);
 
@@ -505,11 +495,6 @@ mkldnn_status_t simple_net()
                      net_diff_dst, &pool_user_diff_dst_memory);
 
     /* Pooling Backward */
-    float *pool_diff_src_buffer =
-        (float *)aligned_malloc(product(pool_src_sizes, 4)*sizeof(float), 64);
-    memset(pool_diff_src_buffer, 0,
-        product(pool_src_sizes, 4)*sizeof(float));
-
     /* pooling diff src memory descriptor */
     const mkldnn_memory_desc_t *pool_diff_src_md
             = mkldnn_primitive_desc_query_memory_d(lrn_dst_pd);
@@ -533,14 +518,14 @@ mkldnn_status_t simple_net()
     /* create reorder primitive between user diff dst and pool diff dst
      * if required*/
     mkldnn_primitive_t pool_diff_dst_memory;
-    float *pool_diff_dst_buffer =
-        (float *)aligned_malloc(product(pool_dst_sizes, 4)*sizeof(float), 64);
-    memset(pool_diff_dst_buffer, 0,
-        product(pool_dst_sizes, 4)*sizeof(float));
     mkldnn_primitive_t pool_reorder_diff_dst, pool_internal_diff_dst_memory;
     const_mkldnn_primitive_desc_t pool_diff_dst_pd
             = mkldnn_primitive_desc_query_pd(pool_bwd_pd,
                                              mkldnn_query_diff_dst_pd, 0);
+    size_t pool_diff_dst_size
+        = mkldnn_memory_primitive_desc_get_size(pool_diff_dst_pd);
+    float *pool_diff_dst_buffer
+        = (float *)aligned_malloc(pool_diff_dst_size, 64);
     CHECK(prepare_reorder(&pool_user_diff_dst_memory, &pool_diff_dst_pd, 1,
                           &pool_internal_diff_dst_memory,
                           &pool_reorder_diff_dst, pool_diff_dst_buffer));
@@ -554,8 +539,12 @@ mkldnn_status_t simple_net()
     const_mkldnn_primitive_desc_t pool_diff_src_pd
             = mkldnn_primitive_desc_query_pd(pool_bwd_pd,
                                              mkldnn_query_diff_src_pd, 0);
-    CHECK(mkldnn_primitive_create(&pool_diff_src_memory, pool_diff_src_pd, NULL,
-                                  NULL));
+    size_t pool_diff_src_size
+            = mkldnn_memory_primitive_desc_get_size(pool_diff_src_pd);
+    float *pool_diff_src_buffer
+            = (float *)aligned_malloc(pool_diff_src_size, 64);
+    CHECK(mkldnn_primitive_create(
+            &pool_diff_src_memory, pool_diff_src_pd, NULL, NULL));
     CHECK(mkldnn_memory_set_data_handle(pool_diff_src_memory,
                                         pool_diff_src_buffer));
 
@@ -585,17 +574,13 @@ mkldnn_status_t simple_net()
                                        lrn_pd));
 
     /* create memory primitives for lrn diff src */
-    int32_t *lrn_diff_src_sizes = relu_dst_sizes;
-    float *lrn_diff_src_buffer =
-        (float *)aligned_malloc(product(lrn_diff_src_sizes, 4)*sizeof(float),
-                                64);
-    memset(lrn_diff_src_buffer, 0,
-        product(lrn_diff_src_sizes, 4)*sizeof(float));
-
     mkldnn_primitive_t lrn_diff_src_memory;
     const_mkldnn_primitive_desc_t lrn_diff_src_pd
             = mkldnn_primitive_desc_query_pd(lrn_bwd_pd,
                                              mkldnn_query_diff_src_pd, 0);
+    size_t lrn_diff_src_size
+            = mkldnn_memory_primitive_desc_get_size(lrn_diff_src_pd);
+    float *lrn_diff_src_buffer = (float *)aligned_malloc(lrn_diff_src_size, 64);
     CHECK(mkldnn_primitive_create(&lrn_diff_src_memory, lrn_diff_src_pd, NULL,
                                   NULL));
     CHECK(mkldnn_memory_set_data_handle(lrn_diff_src_memory,
@@ -628,18 +613,16 @@ mkldnn_status_t simple_net()
     CHECK(mkldnn_primitive_desc_create(&relu_bwd_pd, &relu_bwd_desc, engine,
                                        relu_pd));
 
-    /* create memory primities for relu diff src */
-    int32_t *relu_diff_src_sizes = conv_dst_sizes;
-    float *relu_diff_src_buffer =
-        (float *)aligned_malloc(product(relu_diff_src_sizes, 4)*sizeof(float),
-                                64);
-    memset(relu_diff_src_buffer, 0,
-        product(relu_diff_src_sizes, 4)*sizeof(float));
-
+    /* create memory primitives for relu diff src */
     mkldnn_primitive_t relu_diff_src_memory;
     const_mkldnn_primitive_desc_t relu_diff_src_pd
             = mkldnn_primitive_desc_query_pd(relu_bwd_pd,
                                              mkldnn_query_diff_src_pd, 0);
+    size_t relu_diff_src_size
+            = mkldnn_memory_primitive_desc_get_size(relu_diff_src_pd);
+    float *relu_diff_src_buffer
+            = (float *)aligned_malloc(relu_diff_src_size, 64);
+
     CHECK(mkldnn_primitive_create(&relu_diff_src_memory, relu_diff_src_pd, NULL,
                                   NULL));
     CHECK(mkldnn_memory_set_data_handle(relu_diff_src_memory,
@@ -657,55 +640,37 @@ mkldnn_status_t simple_net()
                                   relu_diff_srcs));
 
     /* Backward convolution with respect to weights */
-    float *conv_diff_weights_buffer =
-        (float *)aligned_malloc(product(conv_weights_sizes, 4)*sizeof(float),
-                                64);
-    float *conv_diff_bias_buffer =
-        (float *)aligned_malloc(product(conv_bias_sizes, 1)*sizeof(float), 64);
-    float *conv_user_diff_weights_buffer =
-        (float *)aligned_malloc(product(conv_weights_sizes, 4)*sizeof(float),
-                                64);
-    float *conv_bwd_src_buffer =
-        (float *)aligned_malloc(product(conv_src_sizes, 4)*sizeof(float), 64);
-    float *conv_diff_dst_buffer =
-        (float *)aligned_malloc(product(conv_dst_sizes, 4)*sizeof(float), 64);
-    memset(conv_diff_weights_buffer, 0,
-        product(conv_weights_sizes, 4)*sizeof(float));
-    memset(conv_diff_bias_buffer, 0,
-        product(conv_bias_sizes, 1)*sizeof(float));
-    memset(conv_user_diff_weights_buffer, 0,
-        product(conv_weights_sizes, 4)*sizeof(float));
-    memset(conv_bwd_src_buffer, 0,
-        product(conv_src_sizes, 4)*sizeof(float));
-    memset(conv_diff_dst_buffer, 0,
-       product(conv_dst_sizes, 4)*sizeof(float));
+    float *conv_diff_bias_buffer = (float *)aligned_malloc(
+            product(conv_bias_sizes, 1) * sizeof(float), 64);
+    float *conv_user_diff_weights_buffer = (float *)aligned_malloc(
+            product(conv_user_weights_sizes, 4) * sizeof(float), 64);
 
     /* initialize memory for diff weights in user format */
     mkldnn_primitive_t conv_user_diff_weights_memory;
-    init_data_memory(4, conv_weights_sizes, mkldnn_nchw, mkldnn_f32, engine,
-                     conv_user_diff_weights_buffer,
-                     &conv_user_diff_weights_memory);
+    init_data_memory(4, conv_user_weights_sizes, mkldnn_nchw, mkldnn_f32,
+            engine, conv_user_diff_weights_buffer,
+            &conv_user_diff_weights_memory);
 
     /* memory descriptors should be in format `any` to allow backward
      * convolution for weights to choose the format it prefers for best
      * performance */
-    mkldnn_memory_desc_t conv_bwd_src_md, conv_diff_weights_md,
+    mkldnn_memory_desc_t conv_diff_src_md, conv_diff_weights_md,
             conv_diff_bias_md, conv_diff_dst_md;
-    CHECK(mkldnn_memory_desc_init(&conv_bwd_src_md, 4, conv_src_sizes,
-                                  mkldnn_f32, mkldnn_any));
-    CHECK(mkldnn_memory_desc_init(&conv_diff_weights_md, 4, conv_weights_sizes,
-                                  mkldnn_f32, mkldnn_any));
-    CHECK(mkldnn_memory_desc_init(&conv_diff_bias_md, 1, conv_bias_sizes,
-                                  mkldnn_f32, mkldnn_x));
-    CHECK(mkldnn_memory_desc_init(&conv_diff_dst_md, 4, conv_dst_sizes,
-                                  mkldnn_f32, mkldnn_any));
+    CHECK(mkldnn_memory_desc_init(
+            &conv_diff_src_md, 4, conv_user_src_sizes, mkldnn_f32, mkldnn_any));
+    CHECK(mkldnn_memory_desc_init(&conv_diff_weights_md, 4,
+            conv_user_weights_sizes, mkldnn_f32, mkldnn_any));
+    CHECK(mkldnn_memory_desc_init(
+            &conv_diff_bias_md, 1, conv_bias_sizes, mkldnn_f32, mkldnn_x));
+    CHECK(mkldnn_memory_desc_init(
+            &conv_diff_dst_md, 4, conv_user_dst_sizes, mkldnn_f32, mkldnn_any));
 
     /* create backward convolution descriptor */
     mkldnn_convolution_desc_t conv_bwd_weights_desc;
-    CHECK(mkldnn_convolution_backward_weights_desc_init(
-            &conv_bwd_weights_desc, mkldnn_convolution_direct, &conv_bwd_src_md,
-            &conv_diff_weights_md, &conv_diff_bias_md, &conv_diff_dst_md,
-            conv_strides, conv_padding, conv_padding, mkldnn_padding_zero));
+    CHECK(mkldnn_convolution_backward_weights_desc_init(&conv_bwd_weights_desc,
+            mkldnn_convolution_direct, &conv_diff_src_md, &conv_diff_weights_md,
+            &conv_diff_bias_md, &conv_diff_dst_md, conv_strides, conv_padding,
+            conv_padding, mkldnn_padding_zero));
 
     mkldnn_primitive_desc_t conv_bwd_weights_pd;
     CHECK(mkldnn_primitive_desc_create(
@@ -718,14 +683,18 @@ mkldnn_status_t simple_net()
     /* create reorder primitives for src from forward convolution to the
      * format chosen by backward convolution */
     mkldnn_primitive_t conv_bwd_reorder_src, conv_bwd_internal_src_memory;
-    const_mkldnn_primitive_desc_t conv_bwd_src_pd
+    const_mkldnn_primitive_desc_t conv_diff_src_pd
             = mkldnn_primitive_desc_query_pd(conv_bwd_weights_pd,
                                              mkldnn_query_src_pd, 0);
-    CHECK(prepare_reorder(&conv_src_memory, &conv_bwd_src_pd, 1,
-                          &conv_bwd_internal_src_memory, &conv_bwd_reorder_src,
-                          conv_bwd_src_buffer));
-
-    mkldnn_primitive_t conv_bwd_src_memory
+    size_t conv_diff_src_size
+            = mkldnn_memory_primitive_desc_get_size(conv_diff_src_pd);
+    float *conv_diff_src_buffer
+            = (float *)aligned_malloc(conv_diff_src_size, 64);
+    CHECK(prepare_reorder(&conv_src_memory, &conv_diff_src_pd, 1,
+            &conv_bwd_internal_src_memory, &conv_bwd_reorder_src,
+            conv_diff_src_buffer));
+
+    mkldnn_primitive_t conv_diff_src_memory
             = conv_bwd_internal_src_memory ? conv_bwd_internal_src_memory
                                            : conv_src_memory;
 
@@ -735,6 +704,11 @@ mkldnn_status_t simple_net()
     const_mkldnn_primitive_desc_t conv_diff_dst_pd
             = mkldnn_primitive_desc_query_pd(conv_bwd_weights_pd,
                                              mkldnn_query_diff_dst_pd, 0);
+    size_t conv_diff_dst_size
+            = mkldnn_memory_primitive_desc_get_size(conv_diff_dst_pd);
+    float *conv_diff_dst_buffer
+            = (float *)aligned_malloc(conv_diff_dst_size, 64);
+
     CHECK(prepare_reorder(&relu_diff_src_memory, &conv_diff_dst_pd, 1,
                           &conv_internal_diff_dst_memory,
                           &conv_reorder_diff_dst, conv_diff_dst_buffer));
@@ -749,6 +723,10 @@ mkldnn_status_t simple_net()
     const_mkldnn_primitive_desc_t conv_diff_weights_pd
             = mkldnn_primitive_desc_query_pd(conv_bwd_weights_pd,
                                              mkldnn_query_diff_weights_pd, 0);
+    size_t conv_diff_weights_size
+            = mkldnn_memory_primitive_desc_get_size(conv_diff_weights_pd);
+    float *conv_diff_weights_buffer
+            = (float *)aligned_malloc(conv_diff_weights_size, 64);
     CHECK(prepare_reorder(&conv_user_diff_weights_memory, &conv_diff_weights_pd,
                           0, &conv_internal_diff_weights_memory,
                           &conv_reorder_diff_weights,
@@ -770,7 +748,7 @@ mkldnn_status_t simple_net()
                                         conv_diff_bias_buffer));
 
     mkldnn_primitive_at_t conv_diff_dsts[]
-            = { mkldnn_primitive_at(conv_bwd_src_memory, 0),
+            = { mkldnn_primitive_at(conv_diff_src_memory, 0),
                 mkldnn_primitive_at(conv_diff_dst_memory, 0) };
 
     const_mkldnn_primitive_t conv_diff_weights[]
@@ -798,19 +776,18 @@ mkldnn_status_t simple_net()
     if (conv_reorder_diff_weights)
         net_bwd[n_bwd++] = conv_reorder_diff_weights;
 
-    mkldnn_stream_t stream_bwd;
-    CHECK(mkldnn_stream_create(&stream_bwd, mkldnn_eager));
-
     // output from backward stream
     void *net_diff_weights = NULL;
     void *net_diff_bias = NULL;
 
-    int n_iter = 1; //number of iterations for training.
+    int n_iter = 10; // number of iterations for training.
     /* Execute the net */
-    while (n_iter) {
-        /* Forward pass */
+    for (int i = 0; i < n_iter; i++) {
+        mkldnn_stream_t stream_fwd;
+        CHECK(mkldnn_stream_create(&stream_fwd, mkldnn_eager));
         CHECK(mkldnn_stream_submit(stream_fwd, n_fwd, net_fwd, NULL));
         CHECK(mkldnn_stream_wait(stream_fwd, n_fwd, NULL));
+        CHECK(mkldnn_stream_destroy(stream_fwd));
 
         /* Update net_diff_dst */
         CHECK(mkldnn_memory_get_data_handle(pool_user_dst_memory, &net_output));
@@ -818,8 +795,11 @@ mkldnn_status_t simple_net()
         // some user defined func update_diff_dst(net_diff_dst, net_output)
 
         /* Backward pass */
+        mkldnn_stream_t stream_bwd;
+        CHECK(mkldnn_stream_create(&stream_bwd, mkldnn_eager));
         CHECK(mkldnn_stream_submit(stream_bwd, n_bwd, net_bwd, NULL));
         CHECK(mkldnn_stream_wait(stream_bwd, n_bwd, NULL));
+        CHECK(mkldnn_stream_destroy(stream_bwd));
 
         /*... update weights ... */
         CHECK(mkldnn_memory_get_data_handle(conv_user_diff_weights_memory,
@@ -830,8 +810,6 @@ mkldnn_status_t simple_net()
         // some user defined func update_weights(conv_user_weights_memory,
         // conv_bias_memory,
         //      net_diff_weights, net_diff_bias);
-
-        --n_iter;
     }
 
     /* Cleanup forward */
@@ -840,8 +818,6 @@ mkldnn_status_t simple_net()
     CHECK(mkldnn_primitive_desc_destroy(relu_pd));
     CHECK(mkldnn_primitive_desc_destroy(conv_pd));
 
-    mkldnn_stream_destroy(stream_fwd);
-
     _free(net_src);
     _free(net_dst);
 
@@ -890,8 +866,6 @@ mkldnn_status_t simple_net()
     CHECK(mkldnn_primitive_desc_destroy(conv_diff_bias_pd));
     CHECK(mkldnn_primitive_desc_destroy(conv_bwd_weights_pd));
 
-    mkldnn_stream_destroy(stream_bwd);
-
     mkldnn_primitive_destroy(pool_user_diff_dst_memory);
     mkldnn_primitive_destroy(pool_diff_src_memory);
     mkldnn_primitive_destroy(pool_internal_diff_dst_memory);
@@ -925,7 +899,7 @@ mkldnn_status_t simple_net()
     _free(conv_diff_weights_buffer);
     _free(conv_diff_bias_buffer);
     _free(conv_user_diff_weights_buffer);
-    _free(conv_bwd_src_buffer);
+    _free(conv_diff_src_buffer);
     _free(conv_diff_dst_buffer);
 
     mkldnn_engine_destroy(engine);
index 8d186bd..73853ad 100644 (file)
@@ -1073,6 +1073,42 @@ mkldnn_status_t MKLDNN_API mkldnn_dilated_deconvolution_backward_weights_desc_in
 
 /** @} */
 
+/** @addtogroup c_api_shuffle Shuffle
+ * A primitive to shuffle data along a specified axis.
+ * @{ */
+
+/** Initializes a @p shuffle_desc for forward propagation using @p prop_kind,
+ * memory descriptor @p data_desc, @p axis, and @p group_size.
+ *
+ * Order of inputs:
+ *  - src (#mkldnn_query_src_pd, 0)
+ *
+ * Order of outputs:
+ *  - dst (#mkldnn_query_dst_pd, 0)
+ *
+ */
+mkldnn_status_t MKLDNN_API mkldnn_shuffle_forward_desc_init(
+        mkldnn_shuffle_desc_t *shuffle_desc, mkldnn_prop_kind_t prop_kind,
+        const mkldnn_memory_desc_t *data_desc, int axis, int group_size);
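+
+/* Usage sketch (illustrative, not part of this patch): shuffle the channel
+ * axis (axis = 1) of a 4D f32 tensor into groups of 4. Here `data_md` is
+ * assumed to be an already initialized mkldnn_memory_desc_t whose axis-1
+ * size is divisible by the group size:
+ *
+ *     mkldnn_shuffle_desc_t shuffle_desc;
+ *     mkldnn_status_t s = mkldnn_shuffle_forward_desc_init(&shuffle_desc,
+ *             mkldnn_forward_training, &data_md, 1, 4);
+ *     if (s != mkldnn_success) return s;
+ */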
+
+/** Initializes a @p shuffle_desc for backward propagation using memory
+ * descriptor @p diff_data_desc, @p axis, and @p group_size.
+ *
+ * Order of inputs:
+ *  - diff_dst (#mkldnn_query_diff_dst_pd, 0)
+ *
+ * Order of outputs:
+ *  - diff_src (#mkldnn_query_diff_src_pd, 0)
+ *
+ */
+mkldnn_status_t MKLDNN_API mkldnn_shuffle_backward_desc_init(
+        mkldnn_shuffle_desc_t *shuffle_desc,
+        const mkldnn_memory_desc_t *diff_data_desc, int axis, int group_size);
+
+/** @} */
+
 /** @addtogroup c_api_eltwise Eltwise
 * A primitive to compute element-wise operations like the parametric
 * rectified linear unit (ReLU).
@@ -1803,6 +1839,36 @@ mkldnn_status_t MKLDNN_API mkldnn_sgemm(const char *transa, const char *transb,
         const float *B, const int *ldb,
         const float *beta, float *C, const int *ldc);
 
+/** gemm_s8u8s32 and gemm_s8s8s32 perform a matrix-matrix multiplication
+ * and add the result to a scalar-matrix product. To get the final result,
+ * a vector is added to each row or column of the output matrix.
+ * The operation is defined as:
+ *
+ * C := alpha*(op(A) + A_offset) * (op(B) + B_offset) + beta*C + C_offset,
+ *
+ * where op(X) = X or op(X) = X**T,
+ * A_offset is an m-by-k matrix with every element equal to the value ao,
+ * B_offset is a k-by-n matrix with every element equal to the value bo,
+ * C_offset is an m-by-n matrix defined by the co array of size len:
+ *  - if offsetc = F: len must be at least 1,
+ *  - if offsetc = C: len must be at least max(1, m),
+ *  - if offsetc = R: len must be at least max(1, n),
+ * and alpha and beta are scalars, while A, B, and C are matrices, with op(A)
+ * an m-by-k matrix, op(B) a k-by-n matrix, and C an m-by-n matrix.
+ * @note
+ *      This API differs from the standard BLAS routines in that it returns
+ *      mkldnn_status_t for error handling. XERBLA is not supported: no
+ *      error message is printed in case of incorrect parameters. */
+mkldnn_status_t MKLDNN_API mkldnn_gemm_s8u8s32(const char *transa,
+        const char *transb, const char *offsetc, const int *M, const int *N,
+        const int *K, const float *alpha, const int8_t *A, const int *lda,
+        const int8_t *ao, const uint8_t *B, const int *ldb, const int8_t *bo,
+        const float *beta, int32_t *c, const int *ldc, const int32_t *co);
+
+mkldnn_status_t MKLDNN_API mkldnn_gemm_s8s8s32(const char *transa,
+        const char *transb, const char *offsetc, const int *M, const int *N,
+        const int *K, const float *alpha, const int8_t *A, const int *lda,
+        const int8_t *ao, const int8_t *B, const int *ldb, const int8_t *bo,
+        const float *beta, int32_t *c, const int *ldc, const int32_t *co);
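+
+/* Usage sketch (illustrative only): a column-major s8/u8 GEMM, C := A * B,
+ * with zero input offsets and a single zero offset for the whole C matrix
+ * (offsetc = 'F'):
+ *
+ *     const char transa = 'N', transb = 'N', offsetc = 'F';
+ *     const int M = 2, N = 3, K = 4, lda = M, ldb = K, ldc = M;
+ *     const float alpha = 1.0f, beta = 0.0f;
+ *     int8_t A[2 * 4] = {0}; uint8_t B[4 * 3] = {0}; int32_t C[2 * 3];
+ *     const int8_t ao = 0, bo = 0; const int32_t co = 0;
+ *     mkldnn_status_t s = mkldnn_gemm_s8u8s32(&transa, &transb, &offsetc,
+ *             &M, &N, &K, &alpha, A, &lda, &ao, B, &ldb, &bo, &beta,
+ *             C, &ldc, &co);
+ */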
 /** @} */
 
 /** @} */
index c2aa7c9..b0869e7 100644 (file)
@@ -58,6 +58,9 @@ private:
     handle(const handle &&) = delete;
     handle &operator=(const handle &&other) = delete;
 protected:
+    bool operator==(const T other) const { return other == _data.get(); }
+    bool operator!=(const T other) const { return !(*this == other); }
+public:
     /// Constructs a C handle wrapper.
     /// @param t The C handle to wrap.
     /// @param weak A flag to specify whether to construct a weak wrapper.
@@ -65,9 +68,6 @@ protected:
         reset(t, weak);
     }
 
-    bool operator==(const T other) const { return other == _data.get(); }
-    bool operator!=(const T other) const { return !(*this == other); }
-public:
     handle(const handle &other): _data(other._data) {}
     handle &operator=(const handle &other) {
         _data = other._data;
@@ -96,6 +96,10 @@ template <> struct handle_traits<mkldnn_primitive_desc_t> {
 template <> struct handle_traits<mkldnn_primitive_t> {
     static constexpr auto destructor = &mkldnn_primitive_destroy;
 };
+
+template <> struct handle_traits<mkldnn_primitive_desc_iterator_t> {
+    static constexpr auto destructor = &mkldnn_primitive_desc_iterator_destroy;
+};
 #endif
 
 /// Base class for all computational primitives.
@@ -116,6 +120,7 @@ public:
         sum = mkldnn_sum,
         convolution = mkldnn_convolution,
         deconvolution = mkldnn_deconvolution,
+        shuffle = mkldnn_shuffle,
         eltwise = mkldnn_eltwise,
         depthwise = mkldnn_depthwise,
         relu = mkldnn_relu,
@@ -184,7 +189,7 @@ struct error: public std::exception {
     ///                        caused the error.
 
     static void wrap_c_api(mkldnn_status_t status,
-            std::string message,
+            const std::string &message,
             mkldnn_primitive_t *error_primitive = 0)
     {
         if (status != mkldnn_success) {
@@ -325,9 +330,11 @@ enum query {
 
     impl_info_str = mkldnn_query_impl_info_str,
 
+    op_d = mkldnn_query_op_d,
     memory_d = mkldnn_query_memory_d,
     convolution_d = mkldnn_query_convolution_d,
     deconvolution_d = mkldnn_query_deconvolution_d,
+    shuffle_d = mkldnn_query_shuffle_d,
     eltwise_d = mkldnn_query_eltwise_d,
     depthwise_d = mkldnn_query_depthwise_d,
     relu_d = mkldnn_query_relu_d,
@@ -587,7 +594,7 @@ private:
 
 /// @}
 
-/// @addtogroup cpp_api_primitives Primitives
+/// @addtogroup cpp_api_memory_related Memory and memory related operations
 /// @{
 
 /// @addtogroup cpp_api_memory Memory
@@ -629,9 +636,13 @@ struct memory: public primitive  {
         blocked = mkldnn_blocked,
         x = mkldnn_x,
         nc = mkldnn_nc,
+        ncw = mkldnn_ncw,
+        nwc = mkldnn_nwc,
+        nCw16c = mkldnn_nCw16c,
         nchw = mkldnn_nchw,
         nhwc = mkldnn_nhwc,
         chwn = mkldnn_chwn,
+        nCw8c = mkldnn_nCw8c,
         nChw8c = mkldnn_nChw8c,
         nChw16c = mkldnn_nChw16c,
         ncdhw = mkldnn_ncdhw,
@@ -640,9 +651,22 @@ struct memory: public primitive  {
         nCdhw16c = mkldnn_nCdhw16c,
         oi = mkldnn_oi,
         io = mkldnn_io,
+        oiw = mkldnn_oiw,
+        wio = mkldnn_wio,
+        Owi8o = mkldnn_Owi8o,
+        OIw8o8i = mkldnn_OIw8o8i,
+        OIw8i8o = mkldnn_OIw8i8o,
+        OIw16i16o = mkldnn_OIw16i16o,
+        OIw16o16i = mkldnn_OIw16o16i,
+        Oiw16o = mkldnn_Oiw16o,
+        Owi16o = mkldnn_Owi16o,
+        OIw8i16o2i = mkldnn_OIw8i16o2i,
+        OIw8o16i2o = mkldnn_OIw8o16i2o,
+        IOw16o16i = mkldnn_IOw16o16i,
         oihw = mkldnn_oihw,
         ihwo = mkldnn_ihwo,
         hwio = mkldnn_hwio,
+        hwio_s8s8 = mkldnn_hwio_s8s8,
         dhwio = mkldnn_dhwio,
         oidhw = mkldnn_oidhw,
         OIdhw8i8o = mkldnn_OIdhw8i8o,
@@ -665,13 +689,28 @@ struct memory: public primitive  {
         OIdhw8i16o2i = mkldnn_OIdhw8i16o2i,
         OIhw8o16i2o = mkldnn_OIhw8o16i2o,
         OIhw4i16o4i = mkldnn_OIhw4i16o4i,
+        OIhw4i16o4i_s8s8 = mkldnn_OIhw4i16o4i_s8s8,
         Oihw8o = mkldnn_Oihw8o,
         Oihw16o = mkldnn_Oihw16o,
         Ohwi8o = mkldnn_Ohwi8o,
         Ohwi16o = mkldnn_Ohwi16o,
         OhIw16o4i = mkldnn_OhIw16o4i,
+        OhIw8o4i = mkldnn_OhIw8o4i,
+        OhIw8o4i_s8s8 = mkldnn_OhIw8o4i_s8s8,
+        goiw = mkldnn_goiw,
+        gOwi8o = mkldnn_gOwi8o,
+        gOIw8o8i = mkldnn_gOIw8o8i,
+        gOIw8i8o = mkldnn_gOIw8i8o,
+        gOIw16i16o = mkldnn_gOIw16i16o,
+        gOIw16o16i = mkldnn_gOIw16o16i,
+        gOiw16o = mkldnn_gOiw16o,
+        gOwi16o = mkldnn_gOwi16o,
+        gOIw8i16o2i = mkldnn_gOIw8i16o2i,
+        gIOw16o16i = mkldnn_gIOw16o16i,
+        gOIw8o16i2o = mkldnn_gOIw8o16i2o,
         goihw = mkldnn_goihw,
         hwigo = mkldnn_hwigo,
+        hwigo_s8s8 = mkldnn_hwigo_s8s8,
         gOIdhw8i8o = mkldnn_gOIdhw8i8o,
         gOIdhw8o8i = mkldnn_gOIdhw8o8i,
         gOdhwi8o = mkldnn_gOdhwi8o,
@@ -681,6 +720,7 @@ struct memory: public primitive  {
         gOIdhw8i16o2i = mkldnn_gOIdhw8i16o2i,
         gOIhw8o16i2o = mkldnn_gOIhw8o16i2o,
         gOIhw4i16o4i = mkldnn_gOIhw4i16o4i,
+        gOIhw4i16o4i_s8s8 = mkldnn_gOIhw4i16o4i_s8s8,
         gOihw8o = mkldnn_gOihw8o,
         gOihw16o = mkldnn_gOihw16o,
         gOhwi8o = mkldnn_gOhwi8o,
@@ -691,6 +731,8 @@ struct memory: public primitive  {
         gOIhw16o16i = mkldnn_gOIhw16o16i,
         gIOhw16o16i = mkldnn_gIOhw16o16i,
         gOhIw16o4i = mkldnn_gOhIw16o4i,
+        gOhIw8o4i = mkldnn_gOhIw8o4i,
+        gOhIw8o4i_s8s8 = mkldnn_gOhIw8o4i_s8s8,
         goidhw = mkldnn_goidhw,
         gOIdhw16i16o = mkldnn_gOIdhw16i16o,
         gOIdhw16o16i = mkldnn_gOIdhw16o16i,
@@ -1139,6 +1181,11 @@ struct sum : public primitive {
 
             auto c_api_inputs = cpp_to_c(inputs);
 
+            error::wrap_c_api(
+                scales.size() == inputs.size() ? mkldnn_success
+                                               : mkldnn_invalid_arguments,
+                "number of scales not equal to number of inputs");
+
             error::wrap_c_api(mkldnn_sum_primitive_desc_create(
                     &result, &output.data, (int)c_api_inputs.size(),
                     &scales[0], &c_api_inputs[0]),
@@ -1152,6 +1199,11 @@ struct sum : public primitive {
 
             auto c_api_inputs = cpp_to_c(inputs);
 
+            error::wrap_c_api(
+                scales.size() == inputs.size() ? mkldnn_success
+                                               : mkldnn_invalid_arguments,
+                "number of scales not equal to number of inputs");
+
             error::wrap_c_api(mkldnn_sum_primitive_desc_create(
                     &result, nullptr, (int)c_api_inputs.size(), &scales[0],
                     &c_api_inputs[0]),
@@ -1233,6 +1285,111 @@ private:
 
 /// @}
 
+/// @}
+
+/// @addtogroup cpp_api_primitives Primitives
+/// @{
+
+/// @addtogroup cpp_api_primitive_descriptors Primitive descriptors
+/// @{
+
+/// A base class for all primitive descriptors
+struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
+    primitive_desc(const_mkldnn_op_desc_t desc, const primitive_attr *attr,
+            const engine &e, const_mkldnn_primitive_desc_t hint_fwd_pd) {
+        mkldnn_primitive_desc_iterator_t iterator = nullptr;
+        mkldnn_status_t status = mkldnn_primitive_desc_iterator_create_v2(
+                &iterator, desc, attr ? attr->get() : nullptr, e.get(),
+                hint_fwd_pd);
+        error::wrap_c_api(status,
+                "could not create a primitive descriptor iterator");
+        pd_iterator.reset(iterator);
+        fetch_impl();
+    }
+
+    engine get_engine() { return engine::query(*this); }
+
+    primitive_attr get_primitive_attr() const {
+        const_mkldnn_primitive_attr_t const_cattr;
+        error::wrap_c_api(mkldnn_primitive_desc_get_attr(get(), &const_cattr),
+                "could not get attributes");
+        mkldnn_primitive_attr_t cattr;
+        error::wrap_c_api(mkldnn_primitive_attr_clone(&cattr, const_cattr),
+                "could not clone attributes");
+
+        primitive_attr attr;
+        attr.reset(cattr);
+        return attr;
+    }
+
+    /// Returns the implementation name.
+    const char *impl_info_str() const {
+        const char *res;
+        error::wrap_c_api(mkldnn_primitive_desc_query(get(),
+                    mkldnn_query_impl_info_str, 0, &res),
+                "could not query implementation info string");
+        return res;
+    }
+
+    /// Advances to the next implementation for the given op descriptor.
+    ///
+    /// Returns:
+    /// - @c true on success
+    /// - @c false if the last implementation has already been reached, in
+    ///   which case the primitive descriptor itself is kept unchanged
+    bool next_impl() {
+        mkldnn_status_t status = mkldnn_primitive_desc_iterator_next(
+                pd_iterator.get());
+        if (status == mkldnn_iterator_ends) return false;
+        error::wrap_c_api(status, "primitive descriptor iterator next failed");
+
+        fetch_impl();
+        return true;
+    }
+
+    /// Queries and returns the requested memory primitive descriptor.
+    memory::primitive_desc query_mpd(query what, int idx = 0) const {
+        std::vector<query> valid_w{input_pd, output_pd, src_pd, diff_src_pd,
+            weights_pd, diff_weights_pd, dst_pd, diff_dst_pd, workspace_pd};
+        if (!std::any_of(valid_w.cbegin(), valid_w.cend(),
+                    [=](query q) { return what == q; }))
+            throw error(mkldnn_invalid_arguments, "invalid memory query");
+
+        const_mkldnn_primitive_desc_t const_cdesc
+            = mkldnn_primitive_desc_query_pd(get(),
+                    mkldnn::convert_to_c(what), idx);
+
+        // TODO: is there a better way to inform about this?
+        if (const_cdesc == nullptr)
+            throw error(mkldnn_not_required, "queried memory is not required");
+
+        mkldnn_primitive_desc_t cdesc;
+        error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
+                "could not clone a memory primitive descriptor");
+
+        memory::primitive_desc ret;
+        ret.reset(cdesc);
+        return ret;
+    }
+
+    // register specialized queries, e.g. src_primitive_desc()
+#   define REG_QUERY_MPD(name, what, idx) \
+    memory::primitive_desc name ## _primitive_desc() const \
+    { return query_mpd(what ## _pd, idx); }
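+    // For example, REG_QUERY_MPD(src, src, 0) expands to:
+    //     memory::primitive_desc src_primitive_desc() const
+    //     { return query_mpd(src_pd, 0); }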
+
+  private:
+    handle<mkldnn_primitive_desc_iterator_t> pd_iterator;
+    void fetch_impl() {
+        mkldnn_primitive_desc_t pd = mkldnn_primitive_desc_iterator_fetch(
+                pd_iterator.get());
+        error::wrap_c_api(pd != nullptr ? mkldnn_success : mkldnn_runtime_error,
+                "could not fetch a primitive descriptor from the iterator");
+        reset(pd);
+    }
+};
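+
+/// Usage sketch (illustrative, not part of this patch): given a primitive
+/// descriptor `pd` of any of the derived types below, walk the available
+/// implementations and report each one (assumes <cstdio> for printf):
+///
+///     do {
+///         printf("implementation: %s\n", pd.impl_info_str());
+///     } while (pd.next_impl());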
+
+/// @}
+
 /// @addtogroup cpp_api_convolution Convolution
 /// A primitive to compute convolution using different algorithms.
 ///
@@ -1325,74 +1482,18 @@ struct convolution_forward: public primitive {
                     "could not create a dilated convolution forward descriptor");
         }
     };
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                        &result, &adesc.data, aengine.get(), nullptr),
-                    "could not create a convolution forward primitive descriptor");
-            reset(result);
-        }
-
-        primitive_desc(const desc &adesc, const primitive_attr &aattr,
-                const engine &aengine) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create_v2(
-                        &result, &adesc.data, aattr.get(),
-                        aengine.get(), nullptr),
-                    "could not create a convolution forward primitive descriptor");
-            reset(result);
-        }
-
-        memory::primitive_desc src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a src primititve descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc weights_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
 
-        memory::primitive_desc bias_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a bias primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {}
 
-        memory::primitive_desc dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a dst primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, nullptr) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(weights, weights, 0);
+        REG_QUERY_MPD(bias, weights, 1);
+        REG_QUERY_MPD(dst, dst, 0);
     };
 
     convolution_forward(const primitive_desc &aprimitive_desc,
@@ -1466,54 +1567,19 @@ struct convolution_backward_data : public primitive {
                     "could not create a convolution backward data descriptor");
         }
     };
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine,
-                const convolution_forward::primitive_desc
-                    &hint_fwd_primitive_desc) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                        &result, &adesc.data, aengine.get(),
-                        hint_fwd_primitive_desc.get()),
-                    "could not create a convolution backward data primitive descriptor");
-            reset(result);
-        }
-        memory::primitive_desc diff_src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff_src primititve descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
 
-        memory::primitive_desc weights_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e,
+                const convolution_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, hint_fwd_pd.get()) {}
 
-        memory::primitive_desc diff_dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff_dst primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e,
+                const convolution_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, hint_fwd_pd.get()) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(diff_src, diff_src, 0);
+        REG_QUERY_MPD(weights, weights, 0);
+        REG_QUERY_MPD(diff_dst, diff_dst, 0);
     };
 
     convolution_backward_data(const primitive_desc &aprimitive_desc,
@@ -1617,66 +1683,19 @@ struct convolution_backward_weights : public primitive {
 
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine,
-                const convolution_forward::primitive_desc
-                    &hint_fwd_primitive_desc) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                        &result, &adesc.data, aengine.get(),
-                        hint_fwd_primitive_desc.get()),
-                    "could not create a convolution backward weights primitive descriptor");
-            reset(result);
-        }
-        memory::primitive_desc src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a src primititve descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc diff_weights_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff_weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc diff_bias_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_weights_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff_bias primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e,
+                const convolution_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, hint_fwd_pd.get()) {}
 
-        memory::primitive_desc diff_dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff_dst primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e,
+                const convolution_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, hint_fwd_pd.get()) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(diff_weights, diff_weights, 0);
+        REG_QUERY_MPD(diff_bias, diff_weights, 1);
+        REG_QUERY_MPD(diff_dst, diff_dst, 0);
     };
 
     convolution_backward_weights(const primitive_desc &aprimitive_desc,
@@ -1724,16 +1743,14 @@ struct convolution_relu_forward : public primitive {
         }
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                    &result, &adesc.data, aengine.get(), nullptr),
-                "could not create a convolution relu forward descriptor");
-            reset(result);
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(weights, weights, 0);
+        REG_QUERY_MPD(bias, weights, 1);
+        REG_QUERY_MPD(dst, dst, 0);
     };
 
     /// @deprecated consider using convolution_forward + post_ops
@@ -1860,74 +1877,18 @@ struct deconvolution_forward: public primitive {
                     "could not create a dilated deconvolution forward descriptor");
         }
     };
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                        &result, &adesc.data, aengine.get(), nullptr),
-                    "could not create a deconvolution forward primitive descriptor");
-            reset(result);
-        }
 
-        primitive_desc(const desc &adesc, const primitive_attr &aattr,
-                const engine &aengine) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create_v2(
-                        &result, &adesc.data, aattr.get(),
-                        aengine.get(), nullptr),
-                    "could not create a deconvolution forward primitive descriptor");
-            reset(result);
-        }
-
-        memory::primitive_desc src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a src primititve descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc weights_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc bias_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a bias primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {}
 
-        memory::primitive_desc dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a dst primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, nullptr) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(weights, weights, 0);
+        REG_QUERY_MPD(bias, weights, 1);
+        REG_QUERY_MPD(dst, dst, 0);
     };
 
     deconvolution_forward(const primitive_desc &aprimitive_desc,
@@ -2002,54 +1963,19 @@ struct deconvolution_backward_data : public primitive {
                     "could not create a dilated deconvolution backward data descriptor");
         }
     };
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine,
-                const deconvolution_forward::primitive_desc
-                    &hint_fwd_primitive_desc) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                        &result, &adesc.data, aengine.get(),
-                        hint_fwd_primitive_desc.get()),
-                    "could not create a deconvolution backward data primitive descriptor");
-            reset(result);
-        }
-        memory::primitive_desc diff_src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff_src primititve descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
 
-        memory::primitive_desc weights_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e,
+                const deconvolution_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, hint_fwd_pd.get()) {}
 
-        memory::primitive_desc diff_dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff_dst primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e,
+                const deconvolution_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, hint_fwd_pd.get()) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(diff_src, diff_src, 0);
+        REG_QUERY_MPD(weights, weights, 0);
+        REG_QUERY_MPD(diff_dst, diff_dst, 0);
     };
 
     deconvolution_backward_data(const primitive_desc &aprimitive_desc,
@@ -2152,66 +2078,19 @@ struct deconvolution_backward_weights : public primitive {
         }
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine,
-                const deconvolution_forward::primitive_desc
-                    &hint_fwd_primitive_desc) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                        &result, &adesc.data, aengine.get(),
-                        hint_fwd_primitive_desc.get()),
-                    "could not create a deconvolution backward weights primitive descriptor");
-            reset(result);
-        }
-        memory::primitive_desc src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a src primititve descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e,
+                const deconvolution_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, hint_fwd_pd.get()) {}
 
-        memory::primitive_desc diff_weights_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff_weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e,
+                const deconvolution_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, hint_fwd_pd.get()) {}
 
-        memory::primitive_desc diff_bias_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_weights_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff_bias primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc diff_dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff_dst primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(diff_weights, diff_weights, 0);
+        REG_QUERY_MPD(diff_bias, diff_weights, 1);
+        REG_QUERY_MPD(diff_dst, diff_dst, 0);
     };
 
     deconvolution_backward_weights(const primitive_desc &aprimitive_desc,
@@ -2327,52 +2206,16 @@ struct lrn_forward : public primitive {
         }
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                    &result, &adesc.data, aengine.get(), nullptr),
-                "could not create a lrn forward primitive descriptor");
-            reset(result);
-        }
-
-        memory::primitive_desc src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a src primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc workspace_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t ldesc;
-            const_mkldnn_primitive_desc_t const_ldesc =
-                    mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(workspace_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&ldesc, const_ldesc),
-                    "could not clone a workspace primitive descriptor");
-            adesc.reset(ldesc);
-            return adesc;
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {}
 
-        memory::primitive_desc dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a dst primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, nullptr) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(dst, dst, 0);
+        REG_QUERY_MPD(workspace, workspace, 0);
     };
 
     lrn_forward(const primitive_desc &aprimitive_desc,
@@ -2427,54 +2270,18 @@ struct lrn_backward : public primitive {
         }
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine,
-        const lrn_forward::primitive_desc &hint_fwd_primitive_desc) {
-        mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                        &result, &adesc.data, aengine.get(),
-                        hint_fwd_primitive_desc.get()),
-                    "could not create a backward lrn primitive descriptor");
-            reset(result);
-        }
-
-        memory::primitive_desc diff_src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff_src primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc workspace_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t ldesc;
-            const_mkldnn_primitive_desc_t const_ldesc =
-                    mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(workspace_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&ldesc, const_ldesc),
-                    "could not clone a workspace primitive descriptor");
-            adesc.reset(ldesc);
-            return adesc;
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e,
+                const lrn_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, hint_fwd_pd.get()) {}
 
-        memory::primitive_desc diff_dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff_dst primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e,
+                const lrn_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, hint_fwd_pd.get()) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(diff_src, diff_src, 0);
+        REG_QUERY_MPD(diff_dst, diff_dst, 0);
+        REG_QUERY_MPD(workspace, workspace, 0);
     };
 
     lrn_backward(const primitive_desc &aprimitive_desc,
@@ -2539,52 +2346,16 @@ struct pooling_forward : public primitive {
         }
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine) {
-        mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                        &result, &adesc.data, aengine.get(), nullptr),
-                    "could not create a forward pooling primitive descriptor");
-            reset(result);
-        }
-
-        memory::primitive_desc workspace_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(workspace_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a workspace primititve descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a dst primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {}
 
-        memory::primitive_desc src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a src primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, nullptr) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(dst, dst, 0);
+        REG_QUERY_MPD(workspace, workspace, 0);
     };
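+    // A minimal construction sketch (`d`, `attr`, and `eng` are assumed to
+    // be a pooling_forward::desc, a primitive_attr, and an engine created
+    // elsewhere):
+    //     pooling_forward::primitive_desc pd(d, eng);         // default attributes
+    //     pooling_forward::primitive_desc pd_a(d, attr, eng); // explicit attributes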
 
     pooling_forward(const primitive_desc &aprimitive_desc, const primitive::at &src,
@@ -2637,42 +2408,18 @@ struct pooling_backward : public primitive {
         }
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine,
-        const pooling_forward::primitive_desc &hint_fwd_primitive_desc) {
-        mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                        &result, &adesc.data, aengine.get(),
-                        hint_fwd_primitive_desc.get()),
-                    "could not create a backward pooling primitive descriptor");
-            reset(result);
-        }
-
-        memory::primitive_desc diff_src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff src primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e,
+                const pooling_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, hint_fwd_pd.get()) {}
 
-        memory::primitive_desc diff_dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff dst primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e,
+                const pooling_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, hint_fwd_pd.get()) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(diff_src, diff_src, 0);
+        REG_QUERY_MPD(diff_dst, diff_dst, 0);
+        REG_QUERY_MPD(workspace, workspace, 0);
     };
 
     pooling_backward(const primitive_desc &aprimitive_desc, const primitive::at &diff_dst,
@@ -2730,29 +2477,15 @@ struct eltwise_forward : public primitive {
         : desc(aprop_kind, eltwise_relu, src_desc, negative_slope) {}
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                        &result, &adesc.data, aengine.get(), nullptr),
-                    "could not create a eltwise forward primitive descriptor");
-            reset(result);
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {}
 
-        memory::primitive_desc dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                        mkldnn::convert_to_c(dst_pd), 0);
-            error::wrap_c_api(
-                    mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a dst primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, nullptr) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(dst, dst, 0);
     };
 
     eltwise_forward(const primitive_desc &aprimitive_desc,
@@ -2792,30 +2525,18 @@ struct eltwise_backward : public primitive {
                 negative_slope) {}
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine,
-        const eltwise_forward::primitive_desc &hint_fwd_primitive_desc) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                        &result, &adesc.data, aengine.get(),
-                        hint_fwd_primitive_desc.get()),
-                    "could not create a eltwise backward primitive descriptor");
-            reset(result);
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e,
+                const eltwise_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, hint_fwd_pd.get()) {}
 
-        memory::primitive_desc diff_src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff src primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e,
+                const eltwise_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, hint_fwd_pd.get()) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(diff_src, diff_src, 0);
+        REG_QUERY_MPD(diff_dst, diff_dst, 0);
     };
 
     eltwise_backward(const primitive_desc &aprimitive_desc,
@@ -2922,16 +2643,15 @@ struct softmax_forward : public primitive {
         }
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                    &result, &adesc.data, aengine.get(), nullptr),
-                "could not create a softmax forward primitive descriptor");
-            reset(result);
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {}
 
-        engine get_engine() { return engine::query(*this); }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, nullptr) {}
+
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(dst, dst, 0);
     };
 
     softmax_forward(const primitive_desc &aprimitive_desc,
@@ -2958,31 +2678,19 @@ struct softmax_backward : public primitive {
         }
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine,
-                const softmax_forward::primitive_desc &hint_fwd_primitive_desc)
-        {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                        &result, &adesc.data, aengine.get(),
-                        hint_fwd_primitive_desc.get()),
-                    "could not create a backward softmax primitive descriptor");
-            reset(result);
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e,
+                const softmax_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, hint_fwd_pd.get()) {}
 
-        memory::primitive_desc diff_src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                        mkldnn::convert_to_c(diff_src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff src primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e,
+                const softmax_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, hint_fwd_pd.get()) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(dst, dst, 0);
+        REG_QUERY_MPD(diff_src, diff_src, 0);
+        REG_QUERY_MPD(diff_dst, diff_dst, 0);
+        REG_QUERY_MPD(workspace, workspace, 0);
     };
 
     softmax_backward(const primitive_desc &aprimitive_desc,
@@ -3020,105 +2728,32 @@ struct batch_normalization_forward : public primitive {
         }
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                &result, &adesc.data, aengine.get(), nullptr),
-        "could not create a batch normalization forward primitive descriptor");
-            reset(result);
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {}
 
-        primitive_desc(const desc &adesc, const primitive_attr &aattr,
-                const engine &aengine) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create_v2(
-                        &result, &adesc.data, aattr.get(), aengine.get(),
-                        nullptr),
-                    "could not create a batch normalization forward "
-                    "primitive descriptor");
-            reset(result);
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, nullptr) {}
 
-        memory::primitive_desc weights_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t bndesc;
-            const_mkldnn_primitive_desc_t const_bndesc =
-                    mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&bndesc,
-                        const_bndesc),
-                    "could not clone a weights primitive descriptor");
-            adesc.reset(bndesc);
-            return adesc;
-        }
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(weights, weights, 0);
+        REG_QUERY_MPD(dst, dst, 0);
+        REG_QUERY_MPD(workspace, workspace, 0);
 
-        memory::primitive_desc mean_primitive_desc() const {
-            memory::primitive_desc aprimitive_desc;
-            mkldnn_primitive_desc_t bndesc;
-            mkldnn_batch_normalization_desc_t *p;
-            error::wrap_c_api(mkldnn_primitive_desc_query(
-                    get(), mkldnn::convert_to_c(batch_normalization_d), 0, &p),
-                    "could not get a batch-normalization descriptor");
-            const_mkldnn_primitive_desc_t const_bndesc =
-                (p->flags & use_global_stats) ?
-                    mkldnn_primitive_desc_query_pd(get(),
-                        mkldnn::convert_to_c(src_pd), 1) :
-                    mkldnn_primitive_desc_query_pd(get(),
-                        mkldnn::convert_to_c(dst_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&bndesc,
-                        const_bndesc),
-                    "could not clone a mean primitive descriptor");
-            aprimitive_desc.reset(bndesc);
-            return aprimitive_desc;
-        }
+        memory::primitive_desc mean_primitive_desc() const
+        { return stat_primitive_desc(mean); }
+        memory::primitive_desc variance_primitive_desc() const
+        { return stat_primitive_desc(var); }
 
-        memory::primitive_desc variance_primitive_desc() const {
-            memory::primitive_desc aprimitive_desc;
-            mkldnn_primitive_desc_t bndesc;
+    private:
+        enum { mean = 1, var = 2, };
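+        // Mean and variance are queried as extra inputs (src_pd, indices 1
+        // and 2) when the primitive uses global statistics, and as extra
+        // outputs (dst_pd, the same indices) when it computes them itself.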
+        memory::primitive_desc stat_primitive_desc(int kind) const {
             mkldnn_batch_normalization_desc_t *p;
             error::wrap_c_api(mkldnn_primitive_desc_query(
                     get(), mkldnn::convert_to_c(batch_normalization_d), 0, &p),
                     "could not get a batch-normalization descriptor");
-            const_mkldnn_primitive_desc_t const_bndesc =
-                (p->flags & use_global_stats) ?
-                    mkldnn_primitive_desc_query_pd(get(),
-                        mkldnn::convert_to_c(src_pd), 2) :
-                    mkldnn_primitive_desc_query_pd(get(),
-                        mkldnn::convert_to_c(dst_pd), 2);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&bndesc,
-                        const_bndesc),
-                    "could not clone a variance primitive descriptor");
-            aprimitive_desc.reset(bndesc);
-            return aprimitive_desc;
-        }
-
-        memory::primitive_desc workspace_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(workspace_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a workspace primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc,
-                        const_cdesc),
-                    "could not clone a dst primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
+            return query_mpd(p->flags & use_global_stats ? src_pd : dst_pd, kind);
         }
-
-        engine get_engine() { return engine::query(*this); }
     };
 
     batch_normalization_forward(const primitive_desc &aprimitive_desc,
@@ -3290,97 +2925,26 @@ struct batch_normalization_backward : public primitive {
         }
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine,
-                const batch_normalization_forward::primitive_desc
-                    &hint_fwd_primitive_desc) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                &result, &adesc.data, aengine.get(),
-                hint_fwd_primitive_desc.get()),
-        "could not create a batch normalization backward primitive descriptor");
-            reset(result);
-        }
-
-        memory::primitive_desc weights_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t bndesc;
-            const_mkldnn_primitive_desc_t const_bndesc =
-                    mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&bndesc,
-                        const_bndesc),
-                    "could not clone a weights primitive descriptor");
-            adesc.reset(bndesc);
-            return adesc;
-        }
-
-        memory::primitive_desc diff_weights_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t bndesc;
-            const_mkldnn_primitive_desc_t const_bndesc =
-                    mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&bndesc,
-                        const_bndesc),
-                    "could not clone a diff_weights primitive descriptor");
-            adesc.reset(bndesc);
-            return adesc;
-        }
-
-        memory::primitive_desc mean_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t bndesc;
-            const_mkldnn_primitive_desc_t const_bndesc =
-                    mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(src_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&bndesc,
-                        const_bndesc),
-                    "could not clone a mean primitive descriptor");
-            adesc.reset(bndesc);
-            return adesc;
-        }
-
-        memory::primitive_desc variance_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t bndesc;
-            const_mkldnn_primitive_desc_t const_bndesc =
-                    mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(src_pd), 2);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&bndesc,
-                        const_bndesc),
-                    "could not clone a variance primitive descriptor");
-            adesc.reset(bndesc);
-            return adesc;
-        }
-
-        memory::primitive_desc workspace_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(workspace_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a workspace primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc,
-                        const_cdesc),
-                    "could not clone a dst primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        engine get_engine() { return engine::query(*this); }
-    };
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e,
+                const batch_normalization_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, hint_fwd_pd.get()) {}
+
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e,
+                const batch_normalization_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, hint_fwd_pd.get()) {}
+
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(mean, src, 1);
+        REG_QUERY_MPD(variance, src, 2);
+        REG_QUERY_MPD(weights, weights, 0);
+        REG_QUERY_MPD(dst, dst, 0);
+        REG_QUERY_MPD(diff_dst, diff_dst, 0);
+        REG_QUERY_MPD(workspace, workspace, 0);
+
+        REG_QUERY_MPD(diff_src, diff_src, 0);
+        REG_QUERY_MPD(diff_weights, diff_weights, 0);
+    };
 
     // Prop_kind == backward
     batch_normalization_backward(const primitive_desc &aprimitive_desc,
@@ -3491,74 +3055,17 @@ struct inner_product_forward: public primitive {
         }
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                &result, &adesc.data, aengine.get(), nullptr),
-        "could not create a inner product forward primitive descriptor");
-            reset(result);
-        }
-
-        primitive_desc(const desc &adesc, const primitive_attr &aattr,
-                const engine &aengine) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create_v2(
-                &result, &adesc.data, aattr.get(), aengine.get(), nullptr),
-                    "could not create a inner product "
-                    "forward primitive descriptor");
-            reset(result);
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {}
 
-        memory::primitive_desc src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a src primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc weights_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, nullptr) {}
 
-        memory::primitive_desc bias_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a bias primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a dst primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(weights, weights, 0);
+        REG_QUERY_MPD(bias, weights, 1);
+        REG_QUERY_MPD(dst, dst, 0);
     };
 
     inner_product_forward(const primitive_desc &aprimitive_desc,
@@ -3605,54 +3112,18 @@ struct inner_product_backward_data: public primitive {
         }
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine,
-                const inner_product_forward::primitive_desc
-                    &hint_fwd_primitive_desc) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(&result,
-                    &adesc.data, aengine.get(), hint_fwd_primitive_desc.get()),
-        "could not create a inner product backward data primitive descriptor");
-            reset(result);
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e,
+                const inner_product_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, hint_fwd_pd.get()) {}
 
-        memory::primitive_desc diff_dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff dst primititve descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc weights_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc diff_src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff src primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e,
+                const inner_product_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, hint_fwd_pd.get()) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(diff_src, diff_src, 0);
+        REG_QUERY_MPD(weights, weights, 0);
+        REG_QUERY_MPD(diff_dst, diff_dst, 0);
     };
 
     inner_product_backward_data(const primitive_desc &aprimitive_desc,
@@ -3694,66 +3165,19 @@ struct inner_product_backward_weights: public primitive {
         }
     };
 
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine,
-                const inner_product_forward::primitive_desc
-                    &hint_fwd_primitive_desc) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(&result,
-                    &adesc.data, aengine.get(), hint_fwd_primitive_desc.get()),
-        "could not create a inner product backward weights primitive descriptor");
-            reset(result);
-        }
-
-        memory::primitive_desc diff_dst_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff dst primititve descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc diff_weights_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc diff_bias_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_weights_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff bias primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e,
+                const inner_product_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, hint_fwd_pd.get()) {}
 
-        memory::primitive_desc src_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a src primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e,
+                const inner_product_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, hint_fwd_pd.get()) {}
 
-        engine get_engine() { return engine::query(*this); }
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(diff_weights, diff_weights, 0);
+        REG_QUERY_MPD(diff_bias, diff_weights, 1);
+        REG_QUERY_MPD(diff_dst, diff_dst, 0);
     };
 
     inner_product_backward_weights(const primitive_desc &aprimitive_desc,
@@ -3858,112 +3282,22 @@ struct rnn_forward : public primitive {
         }
 
     };
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                    &result, &adesc.data, aengine.get(), nullptr),
-                "could not create an RNN forward primitive descriptor");
-            reset(result);
-        }
-
-        memory::primitive_desc src_layer_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone an src layer primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc src_iter_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(src_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a src iter primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc weights_layer_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc weights_iter_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc bias_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 2);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a bias primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc workspace_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t ldesc;
-            const_mkldnn_primitive_desc_t const_ldesc =
-                    mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(workspace_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&ldesc, const_ldesc),
-                    "could not clone a workspace primitive descriptor");
-            adesc.reset(ldesc);
-            return adesc;
-        }
-
-        memory::primitive_desc dst_layer_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a dst last layer primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
 
-        memory::primitive_desc dst_iter_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(dst_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a dst last iteration primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        engine get_engine() { return engine::query(*this); }
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {}
+
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, nullptr) {}
+
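+        // Multi-tensor RNN arguments share one base pd, distinguished by
+        // index: src_layer/src_iter are src_pd at indices 0/1, bias is
+        // weights_pd at index 2, and dst_layer/dst_iter are dst_pd at 0/1.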
+        REG_QUERY_MPD(src_layer, src, 0);
+        REG_QUERY_MPD(src_iter, src, 1);
+        REG_QUERY_MPD(weights_layer, weights, 0);
+        REG_QUERY_MPD(weights_iter, weights, 1);
+        REG_QUERY_MPD(bias, weights, 2);
+        REG_QUERY_MPD(dst_layer, dst, 0);
+        REG_QUERY_MPD(dst_iter, dst, 1);
+        REG_QUERY_MPD(workspace, workspace, 0);
     };
 
     rnn_forward(const primitive_desc &aprimitive_desc,
@@ -4029,197 +3363,38 @@ struct rnn_backward : public primitive {
         }
 
     };
-    struct primitive_desc : public handle<mkldnn_primitive_desc_t> {
-        primitive_desc(const desc &adesc, const engine &aengine) {
-            mkldnn_primitive_desc_t result;
-            error::wrap_c_api(mkldnn_primitive_desc_create(
-                    &result, &adesc.data, aengine.get(), nullptr),
-                "could not create an RNN backward primitive descriptor");
-            reset(result);
-        }
-
-        memory::primitive_desc src_layer_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone an src layer primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc src_iter_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(src_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a src iter primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc weights_layer_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc weights_iter_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc bias_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(weights_pd), 2);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a bias primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc dst_layer_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a dst last layer primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc dst_iter_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(dst_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a dst last iteration primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc diff_src_layer_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_src_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone an src_layer primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
 
-        memory::primitive_desc diff_src_iter_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_src_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a diff_src iter primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc diff_weights_layer_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_weights_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc diff_weights_iter_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_weights_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a weights primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc diff_bias_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_weights_pd), 2);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a bias primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc diff_dst_layer_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_dst_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a dst last layer primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc diff_dst_iter_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t cdesc;
-            const_mkldnn_primitive_desc_t const_cdesc =
-                mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(diff_dst_pd), 1);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
-                    "could not clone a dst last iteration primitive descriptor");
-            adesc.reset(cdesc);
-            return adesc;
-        }
-
-        memory::primitive_desc workspace_primitive_desc() const {
-            memory::primitive_desc adesc;
-            mkldnn_primitive_desc_t ldesc;
-            const_mkldnn_primitive_desc_t const_ldesc =
-                    mkldnn_primitive_desc_query_pd(get(),
-                               mkldnn::convert_to_c(workspace_pd), 0);
-            error::wrap_c_api(mkldnn_primitive_desc_clone(&ldesc, const_ldesc),
-                    "could not clone a workspace primitive descriptor");
-            adesc.reset(ldesc);
-            return adesc;
-        }
-
-        engine get_engine() { return engine::query(*this); }
+    struct primitive_desc : public mkldnn::primitive_desc {
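+        // Constructing without the forward primitive_desc hint is
+        // deprecated; prefer the constructors below that take `hint_fwd_pd`.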
+        MKLDNN_DEPRECATED
+        primitive_desc(const desc &desc, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {}
+
+        primitive_desc(const desc &desc, const engine &e,
+                const rnn_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, hint_fwd_pd.get()) {}
+
+        primitive_desc(const desc &desc, const primitive_attr &attr, const engine &e,
+                const rnn_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, &attr, e, hint_fwd_pd.get()) {}
+
+        REG_QUERY_MPD(src_layer, src, 0);
+        REG_QUERY_MPD(src_iter, src, 1);
+        REG_QUERY_MPD(weights_layer, weights, 0);
+        REG_QUERY_MPD(weights_iter, weights, 1);
+        REG_QUERY_MPD(bias, weights, 2);
+        REG_QUERY_MPD(dst_layer, dst, 0);
+        REG_QUERY_MPD(dst_iter, dst, 1);
+        REG_QUERY_MPD(workspace, workspace, 0);
+
+        REG_QUERY_MPD(diff_src_layer, diff_src, 0);
+        REG_QUERY_MPD(diff_src_iter, diff_src, 1);
+        REG_QUERY_MPD(diff_weights_layer, diff_weights, 0);
+        REG_QUERY_MPD(diff_weights_iter, diff_weights, 1);
+        REG_QUERY_MPD(diff_bias, diff_weights, 2);
+        REG_QUERY_MPD(diff_dst_layer, diff_dst, 0);
+        REG_QUERY_MPD(diff_dst_iter, diff_dst, 1);
     };
+
     // With last iteration (with and without input src_iter)
     rnn_backward(const primitive_desc &aprimitive_desc,
                  const primitive::at &src_layer,
@@ -4271,6 +3446,80 @@ struct rnn_backward : public primitive {
 };
 
 /// @}
+
+/// @addtogroup cpp_api_shuffle Shuffle
+/// A primitive to shuffle data along an axis.
+///
+/// @sa @ref c_api_shuffle in @ref c_api
+/// @{
+
+struct shuffle_forward : public primitive {
+    struct desc {
+        mkldnn_shuffle_desc_t data;
+        desc(prop_kind aprop_kind, const memory::desc &data_desc,
+                int axis, int group_size) {
+            error::wrap_c_api(mkldnn_shuffle_forward_desc_init(&data,
+                        mkldnn::convert_to_c(aprop_kind), &data_desc.data,
+                        axis, group_size),
+                    "could not create a shuffle forward descriptor");
+        }
+    };
+
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, nullptr) {}
+
+        REG_QUERY_MPD(src, src, 0);
+        REG_QUERY_MPD(dst, dst, 0);
+    };
+
+    shuffle_forward(const primitive_desc &aprimitive_desc,
+            const primitive::at &src, const memory &dst) {
+        mkldnn_primitive_t result;
+        mkldnn_primitive_at_t inputs[] = { src.data };
+        const_mkldnn_primitive_t outputs[] = { dst.get() };
+        check_num_parameters(aprimitive_desc.get(), 1, 1, "shuffle forward");
+        error::wrap_c_api(mkldnn_primitive_create(&result,
+            aprimitive_desc.get(), inputs, outputs),
+            "could not create a shuffle forward primitive");
+        reset(result);
+    }
+};
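+
+// A usage sketch (illustrative values; `data_md` and `eng` stand for a
+// memory::desc and engine created elsewhere): shuffle 16 channels of an
+// NCHW tensor along the channel axis in groups of 4:
+//     shuffle_forward::desc d(prop_kind::forward_inference, data_md,
+//             /* axis = */ 1, /* group_size = */ 4);
+//     shuffle_forward::primitive_desc pd(d, eng);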
+
+struct shuffle_backward : public primitive {
+    struct desc {
+        mkldnn_shuffle_desc_t data;
+        desc(const memory::desc &diff_data_desc, int axis, int group_size) {
+            error::wrap_c_api(mkldnn_shuffle_backward_desc_init(&data,
+                        &diff_data_desc.data, axis, group_size),
+                    "could not create a shuffle backward descriptor");
+        }
+    };
+
+    struct primitive_desc : public mkldnn::primitive_desc {
+        primitive_desc(const desc &desc, const engine &e,
+                const shuffle_forward::primitive_desc &hint_fwd_pd)
+            : mkldnn::primitive_desc(&desc.data, nullptr, e, hint_fwd_pd.get()) {}
+
+        REG_QUERY_MPD(diff_src, diff_src, 0);
+        REG_QUERY_MPD(diff_dst, diff_dst, 0);
+    };
+
+    shuffle_backward(const primitive_desc &aprimitive_desc,
+            const primitive::at &diff_dst, const memory &diff_src) {
+        mkldnn_primitive_t result;
+        mkldnn_primitive_at_t inputs[] = { diff_dst.data };
+        const_mkldnn_primitive_t outputs[] = { diff_src.get() };
+        check_num_parameters(aprimitive_desc.get(), 1, 1, "shuffle backward");
+        error::wrap_c_api(mkldnn_primitive_create(&result,
+            aprimitive_desc.get(), inputs, outputs),
+            "could not create a shuffle backward primitive");
+        reset(result);
+    }
+};
+
+/// @}
+
 /// @} Primitives
 
 /// @addtogroup cpp_api_stream Stream
@@ -4355,6 +3604,8 @@ struct stream: public handle<mkldnn_stream_t> {
     }
 };
 
+#undef REG_QUERY_MPD
+
 /// @}
 
 /// @} C++ API
index 2a58622..b0ea527 100644 (file)
@@ -84,13 +84,39 @@ typedef enum {
 
 /** Memory format specification.
  *
- * Intel(R) MKL-DNN uses the following notation for memory format names:
+ * Intel MKL-DNN formats describe the physical data layout. The physical
+ * layout is described as a sequence of the dimensions as they are laid out
+ * in memory (from the outermost to the innermost). Note that this order
+ * does not affect the logical order of the dimensions, which is kept in the
+ * `dims` field of the mkldnn_memory_desc_t structure. The logical order of
+ * the dimensions is determined by the tensor type.
+ *
+ * For example, CNN 5D tensor always has its logical dimensions in order
+ * `(batch, channels, depth, height, width)`, while physical layout might
+ * be #mkldnn_ncdhw or #mkldnn_ndhwc:
+ *
+ * ~~~cpp
+ * int batch = 2, channels = 16, depth = 13, height = 13, width = 13;
+ *
+ * int ndims = 5; // 5D tensor
+ * mkldnn_dims_t dims = {batch, channels, depth, height, width};
+ *
+ * mkldnn_memory_desc_t data_in_ncdhw;
+ * mkldnn_memory_desc_init(&data_in_ncdhw, ndims, dims, mkldnn_f32, mkldnn_ncdhw);
+ *
+ * // note that in both cases the same dims are passed
+ * mkldnn_memory_desc_t data_in_ndhwc;
+ * mkldnn_memory_desc_init(&data_in_ndhwc, ndims, dims, mkldnn_f32, mkldnn_ndhwc);
+ * ~~~
+ *
+ * The following notation is used for memory format names:
  *  - @c 'n' denotes the mini-batch dimension
  *  - @c 'c' denotes a channels dimension
  *  - When there are multiple channel dimensions (for example, in convolution
  *    weights tensor), @c 'i' and @c 'o' denote dimensions of input and output
  *    channels
- *  - @c 'h' and @c 'w' denote spatial width and height
+ *  - @c 'd', @c 'h', and @c 'w' denote spatial depth, height, and width
+ *    respectively
  *  - Upper-case letters indicate that the data is laid out in blocks
  *    for a particular dimension. In such cases, the format name contains both
  *    upper- and lower-case letters for that dimension with lower-case letter
@@ -103,6 +129,8 @@ typedef enum {
  *    Channel designations can be different. For example: both the @c
  *    'mkldnn_nc' and @c 'mkldnn_io' formats can be used to describe a 2D
  *    tensor.
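+ *
+ *    As an illustration, @c mkldnn_nChw8c describes a 4D data tensor whose
+ *    channel dimension is split into blocks of 8 elements that are laid out
+ *    contiguously in memory.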
+ *
+ * @sa @ref understanding_memory_formats
  */
 typedef enum {
     /** Undefined memory format, used for empty memory descriptors. */
@@ -118,254 +146,229 @@ typedef enum {
     mkldnn_x,
     /** 2D data tensor. */
     mkldnn_nc,
-    /** 4D data tensor in the @c nchw format typically used in Caffe. */
+    /** 3D data tensor with the physical layout @c ncw.
+     * Logical dimensions come in the order: (n, c, w) */
+    mkldnn_ncw,
+    /** 3D data tensor with the physical layout @c nwc.
+     * Logical dimensions come in the order: (n, c, w) */
+    mkldnn_nwc,
+    /** 4D data tensor with the physical layout @c nchw, used in Caffe.
+     * Logical dimensions come in the order: (n, c, h, w) */
     mkldnn_nchw,
-    /** 4D data tensor in the @c nhwc format typically used in TensorFlow. */
+    /** 4D data tensor with the physical layout @c nhwc, used in TensorFlow.
+     * Logical dimensions come in the order: (n, c, h, w) */
     mkldnn_nhwc,
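+    /* For example, in #mkldnn_nhwc the value at logical position
+     * (n, c, h, w) is stored at linear offset ((n*H + h)*W + w)*C + c,
+     * whereas in #mkldnn_nchw it is at ((n*C + c)*H + h)*W + w. */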
-    /** 4D data tensor in the @c chwn format typically used in Neon. */
+    /** 4D data tensor with the physical layout @c chwn, used in Neon.
+     * Logical dimensions come in the order: (n, c, h, w) */
     mkldnn_chwn,
-    /** 4D data tensor in the @c nchw format with channels data laid out in
-     * memory in 8-element blocks. */
-    mkldnn_nChw8c,
-    /** 4D data tensor in the @c nchw format with channels data laid out in
-     * memory in 16-element blocks. */
-    mkldnn_nChw16c,
-    /** 5D data tensor in the @c ncdhw format. */
+    /** 5D data tensor with the physical layout @c ncdhw.
+     * Logical dimensions come in the order: (n, c, d, h, w) */
     mkldnn_ncdhw,
-    /** 5D data tensor in the @c ndhwc format typically used in TensorFlow. */
+    /** 5D data tensor with the physical layout @c ndhwc, used in TensorFlow.
+     * Logical dimensions come in the order: (n, c, d, h, w) */
     mkldnn_ndhwc,
-    /** 5D data tensor in the @c ncdhw format with channels data laid out in
-     * memory in 8-element blocks. */
-    mkldnn_nCdhw8c,
-    /** 5D data tensor in the @c ncdhw format with channels data laid out in
-     * memory in 16-element blocks. */
-    mkldnn_nCdhw16c,
-    /** 2D weights tensor in the format (input channels, output channels). */
+    /** 2D weights tensor with physical layout @c oi.
+     * Logical dimensions come in the order: (o, i) */
     mkldnn_oi,
-    /** 2D weights tensor in the format (input channels, output channels). */
+    /** 2D weights tensor with physical layout @c io.
+     * Logical dimensions come in the order: (o, i) */
     mkldnn_io,
-    /** 4D weights tensor in the format (input channels, output channels,
-     * width, height). */
+    /** 3D weights tensor with physical layout @c oiw.
+     * Logical dimensions come in the order: (o, i, w) */
+    mkldnn_oiw,
+    /** 3D weights tensor with physical layout @c wio.
+     * Logical dimensions come in the order: (o, i, w) */
+    mkldnn_wio,
+    /** 4D weights tensor with physical layout @c oihw, used in Caffe.
+     * Logical dimensions come in the order: (o, i, h, w) */
     mkldnn_oihw,
-    /** 4D weights tensor in the format (input channels, height, width,
-     * output channels). */
-    mkldnn_ihwo,
-    /** 4D weights tensor in the format (height, width, input channels,
-     * output channels). */
+    /** 4D weights tensor with physical layout @c hwio, used in TensorFlow.
+     * Logical dimensions come in the order: (o, i, h, w) */
     mkldnn_hwio,
-    /** 5D weights tensor in the format (depth, height, width, input channels,
-     * output channels). */
-    mkldnn_dhwio,
-    /** 5D weight tensor in the @c oidhw format. */
+    /** 4D weights tensor with physical layout @c ihwo.
+     * Logical dimensions come in the order: (o, i, h, w) */
+    mkldnn_ihwo,
+    /** 5D weights tensor with physical layout @c oidhw, used in Caffe.
+     * Logical dimensions come in the order: (o, i, d, h, w) */
     mkldnn_oidhw,
-   /** 6D weights tensor in the @c oidhw format with output channels data
-    * laid out in memory in 8-element blocks and input channels data
-     * laid out in memory in 8-element blocks blocked by quadruple. */
-    mkldnn_OIdhw8i8o,
-    /** 6D weights tensor in the @c oihw format with both input and output
-     * channels data laid out in memory in 8-element blocks. */
-    mkldnn_OIdhw8o8i,
-    /** 5D weights tensor in the blocked version of @c oidhw format with output
-     * channels data laid out in memory in 8-element blocks. */
-    mkldnn_Odhwi8o,
-    /** 4D weights tensor in the @c oihw format with both input and output
-     * channels data laid out in memory in 8-element blocks. */
-   /** 6D weights tensor in the @c oidhw format with output channels data
-    * laid out in memory in 16-element blocks and input channels data
-     * laid out in memory in 4-element blocks blocked by quadruple. */
-    mkldnn_OIdhw16i16o,
-    /** 6D weights tensor in the @c oihw format with both input and output
-     * channels data laid out in memory in 16-element blocks. */
-    mkldnn_OIdhw16o16i,
-    /** 5D weights tensor in the blocked version of @c oidhw format with output
-     * channels data laid out in memory in 16-element blocks. */
-    mkldnn_Oidhw16o,
-    /** 5D weights tensor in the blocked version of @c oidhw format with output
-     * channels data laid out in memory in 16-element blocks. */
-    mkldnn_Odhwi16o,
-    /** 4D weights tensor in the @c oihw format with both input and output
-     * channels data laid out in memory in 8-element blocks. */
-    mkldnn_OIhw8i8o,
-    /** 4D weights tensor in the @c oihw format with both input and output
-     * channels data laid out in memory in 16-element blocks. */
-    mkldnn_OIhw16i16o,
-    /** 4D weights tensor in the @c oihw format with output channels data
-     * laid out in memory in 16-element blocks and input channels data
-     * laid out in memory in 4-element blocks blocked by quadruple. */
-    mkldnn_OIhw4i16o4i,
-    /** 4D weights tensor in the @c oihw format with output channels data
-     * laid out in memory in 16-element blocks and input channels data
-     * laid out in memory in 8-element blocks blocked by pairs. */
-    mkldnn_OIhw8i16o2i,
-    /** 5D weights tensor in the @c oidhw format with output channels data
-     * laid out in memory in 16-element blocks and input channels data
-     * laid out in memory in 8-element blocks blocked by pairs. */
-    mkldnn_OIdhw8i16o2i,
-    /** 4D weights tensor in the @c oihw format with input channels data
-     * laid out in memory in 16-element blocks and output channels data
-     * laid out in memory in 8-element blocks blocked by pairs. */
-    mkldnn_OIhw8o16i2o,
-    /** 4D weights tensor in the @c oihw format with both input and output
-     * channels data laid out in memory in 8-element blocks. */
-    mkldnn_OIhw8o8i,
-    /** 4D weights tensor in the @c oihw format with both input and output
-     * channels data laid out in memory in 16-element blocks. */
-    mkldnn_OIhw16o16i,
-    /** 4D weights tensor in the @c oihw format with both input and output
-     * channels data laid out in memory in 16-element blocks. */
-    mkldnn_IOhw16o16i,
-    /** 4D weights tensor in the format (output channels, input channels,
-     * height, width) with output channels data laid out in memory in 8-element
-     * blocks. */
-    mkldnn_Oihw8o,
-    /** 4D weights tensor in the format (output channels, input channels,
-     * height, width) with output channels data laid out in memory in
-     * 16-element blocks. */
-    mkldnn_Oihw16o,
-    /** 4D weights tensor in the format (output channels, width, height, input
-     * channels) with output channels data laid out in memory in 8-element
-     * blocks. */
-    mkldnn_Ohwi8o,
-    /** 4D weights tensor in the format (output channels, width, height, input
-     * channels) with output channels data laid out in memory in 16-element
-     * blocks. */
-    mkldnn_Ohwi16o,
-    /** 4D weights tensor in the @c oihw format with both input and output
-     * channels data laid out in memory in 16-element and 4-element blocks. */
-    mkldnn_OhIw16o4i,
-    /** 5D weights tensor in the @c oihw format with extra outer dimension for
-     * groups. */
+    /** 5D weights tensor with physical layout @c dhwio, used in TensorFlow.
+     * Logical dimensions come in the order: (o, i, d, h, w) */
+    mkldnn_dhwio,
+    /** 4D grouped weights tensor with the physical layout @c goiw.
+     * Logical dimensions come in the order: (g, o, i, w) */
+    mkldnn_goiw,
+    /** 5D grouped weights tensor with the physical layout @c goihw,
+     * used in Caffe.
+     * Logical dimensions come in the order: (g, o, i, h, w) */
     mkldnn_goihw,
-    /** 5D weights tensor in the @c hwio format with extra dimension for
-     * groups that comes after the output channels. */
+    /** 5D grouped weights tensor with the physical layout @c hwigo,
+     * used in TensorFlow.
+     * Logical dimensions come in the order: (g, o, i, h, w) */
     mkldnn_hwigo,
-    /** 6D weights tensor in the @c oidhw format with output channels data
-     * laid out in memory in 8-element blocks and input channels data
-     * laid out in memory in 8-element blocks blocked by quadruple. */
-    mkldnn_gOIdhw8i8o,
-    /** 6D weights tensor in the @c oihw format with both input and output
-     * channels data laid out in memory in 8-element blocks. */
-    mkldnn_gOIdhw8o8i,
-    /** 5D weights tensor in the blocked version of @c oidhw format with output
-     * channels data laid out in memory in 8-element blocks. */
-    mkldnn_gOdhwi8o,
-    /** 5D weights tensor in the blocked version of @c goihw format with both
-     * input and output channels data laid out in memory in 8-element blocks.
-     */
-    mkldnn_gOIhw8i8o,
-    /** 5D weights tensor in the blocked version of @c goihw format with both
-     * input and output channels data laid out in memory in 16-element blocks.
-     */
-    mkldnn_gOIhw16i16o,
-    /** 5D weights tensor in the @c oihw format with output channels data
-     * laid out in memory in 16-element blocks and input channels data
-     * laid out in memory in 4-element blocks blocked by quadruple. */
-    mkldnn_gOIhw4i16o4i,
-    /** 5D weights tensor in the @c oihw format with output channels data
-     * laid out in memory in 16-element blocks and input channels data
-     * laid out in memory in 8-element blocks blocked by pairs. */
-    mkldnn_gOIhw8i16o2i,
-    /** 6D weights tensor in the @c oidhw format with output channels data
-     * laid out in memory in 16-element blocks and input channels data
-     * laid out in memory in 8-element blocks blocked by pairs. */
-    mkldnn_gOIdhw8i16o2i,
-    /** 5D weights tensor in the @c oihw format with input channels data
-     * laid out in memory in 16-element blocks and output channels data
-     * laid out in memory in 8-element blocks blocked by pairs. */
-    mkldnn_gOIhw8o16i2o,
-    /** 5D weights tensor in the blocked version of @c goihw format with both
-     * input and output channels data laid out in memory in 8-element blocks.
-     */
-    mkldnn_gOIhw8o8i,
-    /** 5D weights tensor in the blocked version of @c goihw format with both
-     * input and output channels data laid out in memory in 16-element blocks.
-     */
-    mkldnn_gOIhw16o16i,
-    /** 5D weights tensor in the blocked version of @c goihw format with both
-     * input and output channels data laid out in memory in 16-element blocks.
-     */
-    mkldnn_gIOhw16o16i,
-    /** 5D weights tensor in the blocked version of @c goihw format with output
-     * channels data laid out in memory in 8-element blocks. */
-    mkldnn_gOihw8o,
-    /** 5D weights tensor in the blocked version of @c goihw format with output
-     * channels data laid out in memory in 16-element blocks. */
-    mkldnn_gOihw16o,
-    /** 5D weights tensor in the blocked version of @c goihw format with output
-     * channels data laid out in memory in 8-element blocks. */
-    mkldnn_gOhwi8o,
-    /** 5D weights tensor in the blocked version of @c goihw format with output
-     * channels data laid out in memory in 16-element blocks. */
-    mkldnn_gOhwi16o,
-    /** 5D weights tensor in the blocked version of @c goihw format with group
-     * data laid out in memory in 8-element blocks. */
-    mkldnn_Goihw8g,
-    /** 5D weights tensor in the blocked version of @c goihw format with group
-     * data laid out in memory in 16-element blocks. */
-    mkldnn_Goihw16g,
-    /** 5D weights tensor in the @c goihw format with both input and output
-     * channels data laid out in memory in 16-element and 4-element blocks. */
-    mkldnn_gOhIw16o4i,
-    /** 6D weight tensor in the @c goidhw format with extra dimension for
-     * groups */
+    /** 6D grouped weights tensor with the physical layout @c goidhw,
+     * used in Caffe.
+     * Logical dimensions come in the order: (g, o, i, d, h, w) */
     mkldnn_goidhw,
-   /** 6D weights tensor in the @c oidhw format with output channels data
-    * laid out in memory in 16-element blocks and input channels data
-     * laid out in memory in 4-element blocks blocked by quadruple. */
-    mkldnn_gOIdhw16i16o,
-    /** 6D weights tensor in the blocked version of @c goihw format with both
-     * input and output channels data laid out in memory in 16-element blocks.
-     */
-    mkldnn_gOIdhw16o16i,
-    /** 6D weights tensor in the blocked version of @c goidhw format with output
-     * channels data laid out in memory in 16-element blocks. */
-    mkldnn_gOidhw16o,
-    /** 6D weights tensor in the blocked version of @c goidhw format with output
-     * channels data laid out in memory in 16-element blocks. */
-    mkldnn_gOdhwi16o,
-    /** 3D data tensor in the format (batch, seq_length, input channels). */
+    /** 3D RNN data tensor in the format (batch, seq_length, input channels). */
     mkldnn_ntc,
-    /** 3D data tensor in the format (seq_length, batch, input channels). */
+    /** 3D RNN data tensor in the format (seq_length, batch, input channels). */
     mkldnn_tnc,
-    /** 5D states tensor in the format (num_layers, num_directions, num_states,
-     * batch, state channels). */
+    /** 5D RNN states tensor in the format (num_layers, num_directions,
+     * num_states, batch, state channels). */
     mkldnn_ldsnc,
-    /** 5D weights tensor in the format (num_layers, num_directions,
-     *  input_chanels, num_gates, output_channels).
-     *  For LSTM cells, the gates order is forget, input, output and candidate gate.
-     *  For GRU cells, the gates order is update, reset and output gate. */
+    /** 5D RNN weights tensor in the format (num_layers, num_directions,
+     *  input_channels, num_gates, output_channels).
+     *
+     *  - For LSTM cells, the gates order is input, forget, candidate
+     *    and output gate.
+     *  - For GRU cells, the gates order is update, reset and output gate. */
     mkldnn_ldigo,
-    /** 5D weights tensor in the blocked format. */
-    mkldnn_ldigo_p,
-    /** 5D weights tensor in the format (num_layers, num_directions, num_gates,
-     *  output_channels, input_chanels).
-     *  For LSTM cells, the gates order is forget, input, output and candidate gate.
-     *  For GRU cells, the gates order is update, reset and output gate. */
+    /** 5D RNN weights tensor in the format (num_layers, num_directions,
+     * num_gates, output_channels, input_channels).
+     *
+     *  - For LSTM cells, the gates order is input, forget, candidate
+     *    and output gate.
+     *  - For GRU cells, the gates order is update, reset and output gate. */
     mkldnn_ldgoi,
-    /** 5D weights tensor in the blocked format. */
-    mkldnn_ldgoi_p,
-    /** 4D bias tensor in the format (num_layers, num_directions, num_gates,
-     * output_channels).
-     * For LSTM cells, the gates order is forget, input, output and candidate gate.
-     * For GRU cells, the gates order is update, reset and output gate. */
+    /** 4D RNN bias tensor in the format (num_layers, num_directions,
+     * num_gates, output_channels).
+     *
+     *  - For LSTM cells, the gates order is input, forget, candidate
+     *    and output gate.
+     *  - For GRU cells, the gates order is update, reset and output gate. */
     mkldnn_ldgo,
-    /** General tensor format for integer 8bit winograd convolution. */
-    mkldnn_wino_fmt,
+
+    /* Opaque memory formats, not to be used explicitly */
+
+    /* data */
+    mkldnn_nCw8c /** blocked data format */,
+    mkldnn_nCw16c /** blocked data format */,
+    mkldnn_nChw8c /** blocked data format */,
+    mkldnn_nChw16c /** blocked data format */,
+    mkldnn_nCdhw8c /** blocked data format */,
+    mkldnn_nCdhw16c /** blocked data format */,
+
+    /* weights, 3D */
+    mkldnn_Owi8o /** blocked weights format */,
+    mkldnn_OIw8i8o /** blocked weights format */,
+    mkldnn_OIw8o8i /** blocked weights format */,
+    mkldnn_OIw16i16o /** blocked weights format */,
+    mkldnn_OIw16o16i /** blocked weights format */,
+    mkldnn_Oiw16o /** blocked weights format */,
+    mkldnn_Owi16o /** blocked weights format */,
+    mkldnn_OIw8i16o2i /** blocked weights format */,
+    mkldnn_OIw8o16i2o /** blocked weights format */,
+    mkldnn_IOw16o16i /** blocked weights format */,
+
+    /* weights, 4D */
+    /** weights format with an additional buffer of size equal to the
+     * number of output channels, containing the values:
+     * O[i:0,OC] = -128 * SUM(j:0,IC;h:0,H;w:0,W)(weights(i,j,h,w)) */
+    mkldnn_hwio_s8s8,
+    mkldnn_oIhw8i /** blocked weights format */,
+    mkldnn_oIhw16i /** blocked weights format */,
+    mkldnn_OIhw8i8o /** blocked weights format */,
+    mkldnn_OIhw16i16o /** blocked weights format */,
+    mkldnn_OIhw4i16o4i /** blocked weights format */,
+    /** blocked weights format with an additional buffer of size equal to
+     * the number of output channels, containing the values:
+     * O[i:0,OC] = -128 * SUM(j:0,IC;h:0,H;w:0,W)(weights(i,j,h,w)) */
+    mkldnn_OIhw4i16o4i_s8s8,
+    mkldnn_OIhw8i16o2i /** blocked weights format */,
+    mkldnn_OIhw8o16i2o /** blocked weights format */,
+    mkldnn_OIhw8o8i /** blocked weights format */,
+    mkldnn_OIhw16o16i /** blocked weights format */,
+    mkldnn_IOhw16o16i /** blocked weights format */,
+    mkldnn_Oihw8o /** blocked weights format */,
+    mkldnn_Oihw16o /** blocked weights format */,
+    mkldnn_Ohwi8o /** blocked weights format */,
+    mkldnn_Ohwi16o /** blocked weights format */,
+    mkldnn_OhIw16o4i /** blocked weights format */,
+    mkldnn_OhIw8o4i /** blocked weights format */,
+    /** blocked weights format with an additional buffer of size equal to
+     * the number of output channels, containing the values:
+     * O[i:0,OC] = -128 * SUM(j:0,IC;h:0,H;w:0,W)(weights(i,j,h,w)) */
+    mkldnn_OhIw8o4i_s8s8,
+
+    /* weights, 5D */
+    mkldnn_oIdhw8i /** blocked weights format */,
+    mkldnn_oIdhw16i /** blocked weights format */,
+    mkldnn_OIdhw8i8o /** blocked weights format */,
+    mkldnn_OIdhw8o8i /** blocked weights format */,
+    mkldnn_Odhwi8o /** blocked weights format */,
+    mkldnn_OIdhw16i16o /** blocked weights format */,
+    mkldnn_OIdhw16o16i /** blocked weights format */,
+    mkldnn_Oidhw16o /** blocked weights format */,
+    mkldnn_Odhwi16o /** blocked weights format */,
+    mkldnn_OIdhw8i16o2i /** blocked weights format */,
+
+    /* weights w/ groups, 4D */
+    mkldnn_gOwi8o /** blocked weights format */,
+    mkldnn_gOIw8o8i /** blocked weights format */,
+    mkldnn_gOIw8i8o /** blocked weights format */,
+    mkldnn_gOIw16i16o /** blocked weights format */,
+    mkldnn_gOIw16o16i /** blocked weights format */,
+    mkldnn_gOiw16o /** blocked weights format */,
+    mkldnn_gOwi16o /** blocked weights format */,
+    mkldnn_gOIw8i16o2i /** blocked weights format */,
+    mkldnn_gOIw8o16i2o /** blocked weights format */,
+    mkldnn_gIOw16o16i /** blocked weights format */,
+
+    /* weights w/ groups, 5D */
+    /** weights format with an additional buffer of size equal to the
+     * number of output channels multiplied by the number of groups,
+     * containing the values:
+     * O[i:0,G*OC] = -128 * SUM(j:0,IC;h:0,H;w:0,W)(weights(i,j,h,w)) */
+    mkldnn_hwigo_s8s8,
+    mkldnn_gOIhw8i8o /** blocked weights format */,
+    mkldnn_gOIhw16i16o /** blocked weights format */,
+    mkldnn_gOIhw4i16o4i /** blocked weights format */,
+    /** blocked weights format with an additional buffer of size equal to
+     * the number of output channels multiplied by the number of groups,
+     * containing the values:
+     * O[i:0,G*OC] = -128 * SUM(j:0,IC;h:0,H;w:0,W)(weights(i,j,h,w)) */
+    mkldnn_gOIhw4i16o4i_s8s8,
+    mkldnn_gOIhw8i16o2i /** blocked weights format */,
+    mkldnn_gOIhw8o16i2o /** blocked weights format */,
+    mkldnn_gOIhw8o8i /** blocked weights format */,
+    mkldnn_gOIhw16o16i /** blocked weights format */,
+    mkldnn_gIOhw16o16i /** blocked weights format */,
+    mkldnn_gOihw8o /** blocked weights format */,
+    mkldnn_gOihw16o /** blocked weights format */,
+    mkldnn_gOhwi8o /** blocked weights format */,
+    mkldnn_gOhwi16o /** blocked weights format */,
+    mkldnn_Goihw8g /** blocked weights format */,
+    mkldnn_Goihw16g /** blocked weights format */,
+    mkldnn_gOhIw16o4i /** blocked weights format */,
+    mkldnn_gOhIw8o4i /** blocked weights format */,
+    /** blocked weights format with an additional buffer of size equal to
+     * the number of output channels multiplied by the number of groups,
+     * containing the values:
+     * O[i:0,G*OC] = -128 * SUM(j:0,IC;h:0,H;w:0,W)(weights(i,j,h,w)) */
+    mkldnn_gOhIw8o4i_s8s8,
+
+    /* weights w/ groups, 6D */
+    mkldnn_gOIdhw8i8o /** blocked weights format */,
+    mkldnn_gOIdhw8o8i /** blocked weights format */,
+    mkldnn_gOdhwi8o /** blocked weights format */,
+    mkldnn_gOIdhw8i16o2i /** blocked weights format */,
+    mkldnn_gOIdhw16i16o /** blocked weights format */,
+    mkldnn_gOIdhw16o16i /** blocked weights format */,
+    mkldnn_gOidhw16o /** blocked weights format */,
+    mkldnn_gOdhwi16o /** blocked weights format */,
+
+    mkldnn_wino_fmt /** Weights format used in 8bit Winograd convolution */,
+
+    /* RNN packed weights */
+    mkldnn_ldigo_p /** RNN packed weights (unused) */,
+    mkldnn_ldgoi_p /** RNN packed weights (unused) */,
+
     /** Just a sentinel, not a real memory format. Must be changed after a new
      * format is added. */
     mkldnn_format_last,
-    /** 4D weights tensor in the oihw format with input channels data laid out
-     * in memory in 8-element blocks. */
-    mkldnn_oIhw8i = mkldnn_nChw8c,
-    /** 4D weights tensor in the oihw format with input channels data laid out
-     * in memory in 16-element blocks. */
-    mkldnn_oIhw16i = mkldnn_nChw16c,
-    /** 5D weights tensor in the oihw format with input channels data laid out
-     * in memory in 8-element blocks. */
-    mkldnn_oIdhw8i = mkldnn_nCdhw8c,
-    /** 5D weights tensor in the oihw format with input channels data laid out
-     * in memory in 16-element blocks. */
-    mkldnn_oIdhw16i = mkldnn_nCdhw16c,
 } mkldnn_memory_format_t;
 
 /** Kinds of padding. Define how to interpret the data in padding regions. */
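To make the format naming above concrete: plain formats are spelled out explicitly, while the opaque blocked formats (such as @c nChw8c, i.e. @c nchw with the channel dimension blocked by 8) are normally obtained by passing @c mkldnn_any and letting the consuming primitive choose. A minimal sketch against the v0.x C API (ours, not part of this patch; error handling omitted):

~~~cpp
#include <mkldnn.h>

/* Describe a 4D data tensor two ways: a fixed plain layout, and a
 * placeholder layout that a primitive will later resolve to an opaque
 * blocked format such as nChw8c. */
void describe_data(void) {
    mkldnn_dims_t dims = {2, 16, 7, 7};              /* {N, C, H, W} */
    mkldnn_memory_desc_t plain_md, any_md;
    mkldnn_memory_desc_init(&plain_md, 4, dims, mkldnn_f32, mkldnn_nchw);
    mkldnn_memory_desc_init(&any_md,   4, dims, mkldnn_f32, mkldnn_any);
}
~~~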
@@ -411,6 +414,8 @@ typedef enum {
     mkldnn_view,
     /** A reorder primitive.*/
     mkldnn_reorder,
+    /** A shuffle primitive.*/
+    mkldnn_shuffle,
     /** A (out-of-place) concat primitive. */
     mkldnn_concat,
     /** A (in-place) concat primitive. */
@@ -577,7 +582,9 @@ typedef int mkldnn_dims_t[TENSOR_MAX_DIMS];
 /** A type to describe strides within a tensor. */
 typedef ptrdiff_t mkldnn_strides_t[TENSOR_MAX_DIMS];
 
-/** Generic description of blocked data layout for most memory formats. */
+/** Generic description of blocked data layout for most memory formats.
+ *
+ * @sa @ref understanding_memory_formats */
 typedef struct {
     /** Block size for each of the dimensions. */
     mkldnn_dims_t block_dims;
@@ -605,7 +612,7 @@ typedef enum {
     mkldnn_wino_wei_OBaaIBOIio
 } mkldnn_wino_memory_format_t;
 
-/** Description of tensor of weights for integer 8bit winograd convolution. */
+/** Description of tensor of weights for winograd 2x3 convolution. */
 typedef struct {
     mkldnn_wino_memory_format_t wino_format;
     int r;
@@ -616,6 +623,7 @@ typedef struct {
     int oc_block;
     int ic2_block;
     int oc2_block;
+    float adj_scale;
     size_t size;
 } mkldnn_wino_desc_t;
 
@@ -638,14 +646,20 @@ typedef struct {
     /** Number of dimensions */
     int ndims;
     /** Dimensions in the following order:
-     * - CNN data tensors:  mini-batch, channel, spatial
-     *   (<code>{N, C, [D,] H, W}</code>)
+     * - CNN data tensors: mini-batch, channel, spatial
+     *   (<code>{N, C, [[D,] H,] W}</code>)
      * - CNN weight tensors: group (optional), output channel, input channel,
-     *   spatial (<code>{[G,] O, I, [D,] H, W}<code>)
+     *   spatial (<code>{[G,] O, I, [[D,] H,] W}</code>)
      * - RNN data tensors: time, mini-batch, channels (<code>{T, N, C}</code>)
      *   or layers, directions, states, mini-batch, channels (<code>{L, D, S, N, C}</code>)
      * - RNN weight tensor: layers, directions, input channel, gates, output channels
-     *   (<code>{L, D, I, G, O}</code>). */
+     *   (<code>{L, D, I, G, O}</code>).
+     *
+     * @note
+     *    The order of dimensions does not depend on the memory format:
+     *    whether the data is laid out as #mkldnn_nchw or #mkldnn_nhwc,
+     *    the dims for a 4D CNN data tensor are <code>{N, C, H, W}</code>.
+     */
     mkldnn_dims_t dims;
     /** Data type of the tensor elements. */
     mkldnn_data_type_t data_type;
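The note above is worth illustrating: the logical dims stay the same across formats, and only the physical strides change. A sketch under the same assumptions as before (v0.x C API, f32 data; ours, not from the patch):

~~~cpp
void dims_are_format_independent(void) {
    mkldnn_dims_t dims = {2, 3, 5, 7};               /* always {N, C, H, W} */
    mkldnn_memory_desc_t md_nchw, md_nhwc;
    mkldnn_memory_desc_init(&md_nchw, 4, dims, mkldnn_f32, mkldnn_nchw);
    mkldnn_memory_desc_init(&md_nhwc, 4, dims, mkldnn_f32, mkldnn_nhwc);
    /* Only the physical strides differ:
     *   nchw: {C*H*W, H*W, W, 1} = {105, 35, 7, 1}
     *   nhwc: {H*W*C, 1, W*C, C} = {105, 1, 21, 3} */
}
~~~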
@@ -708,6 +722,23 @@ typedef struct {
 /** A descriptor of a deconvolution operation. */
 typedef mkldnn_convolution_desc_t mkldnn_deconvolution_desc_t;
 
+/** A descriptor of a shuffle operation. */
+typedef struct {
+    /** The kind of primitive. Used for self identifying the primitive
+     * descriptor. Must be #mkldnn_shuffle. */
+    mkldnn_primitive_kind_t primitive_kind;
+    /** The kind of propagation. Possible values: #mkldnn_forward_training,
+     * #mkldnn_forward_inference, and #mkldnn_backward_data. */
+    mkldnn_prop_kind_t prop_kind;
+    /** Source and destination memory descriptor,
+     *  and source and destination gradient memory descriptor. */
+    mkldnn_memory_desc_t data_desc;
+    /** Axis over which the data is shuffled. */
+    int axis;
+    /** Group size along the shuffle axis. */
+    int group_size;
+    int group_size;
+} mkldnn_shuffle_desc_t;
+
 /** A descriptor of an element-wise operation. */
 typedef struct {
     /** The kind of primitive. Used for self identifying the primitive
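For readers new to the shuffle primitive introduced above, a plain C++ reference of a channel shuffle follows. This is our illustration only: the header does not spell out whether @c group_size counts groups or elements per group, so the sketch assumes elements per group.

~~~cpp
#include <vector>

/* Reference semantics of a channel shuffle: view the C channels as a
 * (C / group_size, group_size) matrix and transpose it. */
std::vector<float> shuffle_channels(const std::vector<float> &src,
        int N, int C, int HW, int group_size) {
    std::vector<float> dst(src.size());
    const int rows = C / group_size;   /* C must be divisible by group_size */
    for (int n = 0; n < N; ++n)
        for (int c = 0; c < C; ++c) {
            const int r = c / group_size, g = c % group_size;
            const int c_dst = g * rows + r;          /* transposed position */
            for (int s = 0; s < HW; ++s)
                dst[(n * C + c_dst) * HW + s] = src[(n * C + c) * HW + s];
        }
    return dst;
}
~~~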
@@ -1164,6 +1195,7 @@ typedef struct {
  *      *_s64                        | ptrdiff_t *
  *      *_f64                        | double *
  *      *_str                        | const char **
+ *      #mkldnn_query_op_d           | const_mkldnn_op_desc_t *
  *      *_md                         | const mkldnn_memory_desc_t **
  *      *_${op}_d                    | const mkldnn_${op}_desc_t **
  *      *_pd                         | const_mkldnn_primitive_desc_t *
@@ -1196,9 +1228,11 @@ typedef enum {
 
     /* memory and op descriptor section */
     mkldnn_query_some_d = 64, /**< stub */
+    mkldnn_query_op_d, /**< op descriptor */
     mkldnn_query_memory_d, /**< memory descriptor for memory and view */
     mkldnn_query_convolution_d, /**< convolution descriptor */
     mkldnn_query_deconvolution_d, /**< deconvolution descriptor */
+    mkldnn_query_shuffle_d, /**< shuffle descriptor */
     mkldnn_query_eltwise_d, /**< eltwise descriptor */
     mkldnn_query_relu_d = mkldnn_query_eltwise_d, /**< @deprecated */
     mkldnn_query_softmax_d, /**< softmax descriptor */
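A hedged usage sketch for the new #mkldnn_query_op_d entry; the query function itself already exists in the v0.x API, and the result type follows the table above:

~~~cpp
/* Query a primitive descriptor (obtained elsewhere) for its operation
 * descriptor. */
void inspect(const_mkldnn_primitive_desc_t pd) {
    const_mkldnn_op_desc_t op_d = NULL;
    if (mkldnn_primitive_desc_query(pd, mkldnn_query_op_d, 0, &op_d)
            == mkldnn_success) {
        /* op_d now points at the underlying mkldnn_*_desc_t */
    }
}
~~~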
index 7ca7b81..48979c3 100644 (file)
@@ -18,12 +18,12 @@ rem ============================================================================
 rem req: PowerShell 3.0+
 powershell.exe -command "if ($PSVersionTable.PSVersion.Major -ge 3) {exit 1} else {Write-Host \"The script requires PowerShell 3.0 or above (current version: $($PSVersionTable.PSVersion.Major).$($PSVersionTable.PSVersion.Minor))\"}" && goto Error_load
 
-set MKLURLROOT=https://github.com/intel/mkl-dnn/releases/download/v0.16/
-set MKLVERSION=2019.0.20180710
+set MKLURLROOT=https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/
+set MKLVERSION=2019.0.1.20180928
 
-set MKLPACKAGE=mklml_win_%MKLVERSION%.zip
+set MKLPACKAGE=mklml_win_%MKLVERSION%
 
-set MKLURL=%MKLURLROOT%%MKLPACKAGE%
+set MKLURL=%MKLURLROOT%%MKLPACKAGE%.zip
 if /i "%1"=="" (
        set DST=%~dp0..\external
 ) else (
@@ -32,19 +32,24 @@ if /i "%1"=="" (
 
 if not exist %DST% mkdir %DST%
 
-powershell.exe -command "if (Get-Command Invoke-WebRequest -errorAction SilentlyContinue){[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; Invoke-WebRequest %MKLURL% -OutFile %DST%\%MKLPACKAGE%} else {exit 1}" && goto Unpack || goto Error_load
+if not exist "%DST%\%MKLPACKAGE%\license.txt" (
+       powershell.exe -command "if (Get-Command Invoke-WebRequest -errorAction SilentlyContinue){[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; Invoke-WebRequest %MKLURL% -OutFile %DST%\%MKLPACKAGE%.zip} else {exit 1}" && goto Unpack || goto Error_load
 
 :Unpack
-powershell.exe -command "if (Get-Command Add-Type -errorAction SilentlyContinue) {Add-Type -Assembly \"System.IO.Compression.FileSystem\"; try { [IO.Compression.zipfile]::ExtractToDirectory(\"%DST%\%MKLPACKAGE%\", \"%DST%\")}catch{$_.exception ; exit 1}} else {exit 1}" && goto Exit || goto Error_unpack
+       powershell.exe -command "if (Get-Command Add-Type -errorAction SilentlyContinue) {Add-Type -Assembly \"System.IO.Compression.FileSystem\"; try { [IO.Compression.zipfile]::ExtractToDirectory(\"%DST%\%MKLPACKAGE%.zip\", \"%DST%\")}catch{$_.exception ; exit 1}} else {exit 1}" && goto Exit || goto Error_unpack
 
 :Error_load
-echo prepare_mkl.bat : Error: Failed to load %MKLURL% to %DST%, try to load it manually
-exit /B 1
+       echo prepare_mkl.bat : Error: Failed to load %MKLURL% to %DST%, try to load it manually
+       exit /B 1
 
 :Error_unpack
-echo prepare_mkl.bat : Error: Failed to unpack %DST%\%MKLPACKAGE% to %DST%, try unpack the archive manually
-exit /B 1
+       echo prepare_mkl.bat : Error: Failed to unpack %DST%\%MKLPACKAGE%.zip to %DST%, try unpack the archive manually
+       exit /B 1
 
 :Exit
-echo Downloaded and unpacked Intel(R) MKL small libraries to %DST%
-exit /B 0
\ No newline at end of file
+       echo Downloaded and unpacked Intel^(R^) MKL small libraries to %DST%
+       exit /B 0
+) else (
+       echo Intel^(R^) MKL small libraries are already installed in %DST%
+       exit /B 0
+)
index a841633..27115ef 100644 (file)
 # limitations under the License.
 #===============================================================================
 
-MKLURLROOT="https://github.com/intel/mkl-dnn/releases/download/v0.16/"
-MKLVERSION="2019.0.20180710"
+MKLURLROOT="https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/"
+MKLVERSION="2019.0.1.20180928"
 
 os=`uname`
 if [ "$os" = "Linux" ]; then
-  MKLPACKAGE="mklml_lnx_${MKLVERSION}.tgz"
+  MKLPACKAGE="mklml_lnx_${MKLVERSION}"
 elif [ "$os" = "Darwin" ]; then
-  MKLPACKAGE="mklml_mac_${MKLVERSION}.tgz"
+  MKLPACKAGE="mklml_mac_${MKLVERSION}"
 else
   echo "Cannot identify operating system. Try downloading package manually."
   exit 1
 fi
 
-MKLURL=${MKLURLROOT}${MKLPACKAGE}
+MKLURL=${MKLURLROOT}${MKLPACKAGE}.tgz
 DST=`dirname $0`/../external
 mkdir -p $DST
 DST=`cd $DST;pwd`
 
-if [ -x "$(command -v curl)" ]; then
-  curl -L -o "${DST}/${MKLPACKAGE}" "$MKLURL"
-elif [ -x "$(command -v wget)" ]; then
-  wget -O "${DST}/${MKLPACKAGE}" "$MKLURL"
-else
-  echo "curl or wget not available"
-  exit 1
-fi
+if [ ! -e "${DST}/${MKLPACKAGE}/license.txt" ]; then
+  if [ -x "$(command -v curl)" ]; then
+    curl -L -o "${DST}/${MKLPACKAGE}.tgz" "$MKLURL"
+  elif [ -x "$(command -v wget)" ]; then
+    wget -O "${DST}/${MKLPACKAGE}.tgz" "$MKLURL"
+  else
+    echo "curl or wget not available"
+    exit 1
+  fi
 
-if [ \! $? ]; then
-  echo "Download from $MKLURL to $DST failed"
-  exit 1
-fi
+  if [ $? -ne 0 ]; then
+    echo "Download from $MKLURL to $DST failed"
+    exit 1
+  fi
 
-tar -xzf "$DST/${MKLPACKAGE}" -C $DST
-echo "Downloaded and unpacked Intel(R) MKL small libraries to $DST"
+  tar -xzf "$DST/${MKLPACKAGE}.tgz" -C $DST
+  echo "Downloaded and unpacked Intel(R) MKL small libraries to $DST"
+else
+  echo "Intel(R) MKL small libraries are already installed in $DST"
+fi
index b970734..83ed499 100644 (file)
@@ -33,12 +33,16 @@ include_directories(
     )
 
 # propagate SRC specific flags
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_SRC_CCXX_FLAGS}")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_SRC_CCXX_FLAGS}")
+append(CMAKE_C_FLAGS "${CMAKE_SRC_CCXX_FLAGS}")
+append(CMAKE_CXX_FLAGS "${CMAKE_SRC_CCXX_FLAGS}")
 
 # propagate no warning flags
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_CCXX_NOWARN_FLAGS}")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CCXX_NOWARN_FLAGS}")
+append(CMAKE_C_FLAGS "${CMAKE_CCXX_NOWARN_FLAGS}")
+append(CMAKE_CXX_FLAGS "${CMAKE_CCXX_NOWARN_FLAGS}")
+
+# propagate sanitizer flags
+append(CMAKE_C_FLAGS "${CMAKE_CCXX_SANITIZER_FLAGS}")
+append(CMAKE_CXX_FLAGS "${CMAKE_CCXX_SANITIZER_FLAGS}")
 
 if(NOT MKLDNN_VERBOSE)
     add_definitions(-DDISABLE_VERBOSE)
@@ -58,7 +62,7 @@ if(WIN32)
     add_definitions(-DNOMINMAX)
     # Correct 'jnl' macro/jit issue
     if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Qlong-double")
+        append(CMAKE_CXX_FLAGS "/Qlong-double")
     endif()
 endif()
 
@@ -66,14 +70,21 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
     # to make computations more stable and to align the jitted code
     # with the reference one use precise division and square root
     # by default
-    file(GLOB FILES_REQUIRED_PREC_DIV_SQRT
+    file(GLOB FILES_REQUIRED_PREC_SQRT
         ${CMAKE_CURRENT_SOURCE_DIR}/cpu/*normalization*.cpp)
+    file(GLOB FILES_REQUIRED_PREC_DIV
+        ${CMAKE_CURRENT_SOURCE_DIR}/cpu/*normalization*.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/cpu/ref_rnn.cpp)
     if(WIN32)
-        set_source_files_properties(${FILES_REQUIRED_PREC_DIV_SQRT}
-            PROPERTIES COMPILE_FLAGS "/Qprec-div /Qprec-sqrt")
+        set_source_files_properties(${FILES_REQUIRED_PREC_SQRT}
+            PROPERTIES COMPILE_FLAGS "/Qprec-sqrt")
+        set_source_files_properties(${FILES_REQUIRED_PREC_DIV}
+            PROPERTIES COMPILE_FLAGS "/Qprec-div")
     else()
-        set_source_files_properties(${FILES_REQUIRED_PREC_DIV_SQRT}
-            PROPERTIES COMPILE_FLAGS "-prec-div -prec-sqrt")
+        set_source_files_properties(${FILES_REQUIRED_PREC_SQRT}
+            PROPERTIES COMPILE_FLAGS "-prec-sqrt")
+        set_source_files_properties(${FILES_REQUIRED_PREC_DIV}
+            PROPERTIES COMPILE_FLAGS "-prec-div")
     endif()
 endif()
 
index 1d4c620..31d5df2 100644 (file)
@@ -36,7 +36,7 @@ status_t bnrm_desc_init(batch_normalization_desc_t *bnrm_desc,
         && !any_null(bnrm_desc, data_desc)
         && one_of(prop_kind, forward_training, forward_inference,
                 backward_data, backward)
-        && implication(prop_kind & backward, diff_data_desc != nullptr);
+        && IMPLICATION(prop_kind & backward, diff_data_desc != nullptr);
     if (!args_ok) return invalid_arguments;
 
     auto bd = batch_normalization_desc_t();
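The implication() helper is replaced by an IMPLICATION macro throughout these files; its presumed definition (in utils.hpp, not shown in this excerpt) keeps the short-circuit semantics:

~~~cpp
/* Presumed definition (assumption, not shown in this patch): */
#define IMPLICATION(cause, effect) (!(cause) || (effect))

/* The check above then reads: "backward propagation implies that a
 * diff data descriptor was provided":
 *   IMPLICATION(prop_kind & backward, diff_data_desc != nullptr) */
~~~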
index c0027d2..5bc02ae 100644 (file)
@@ -114,6 +114,10 @@ namespace memory_format {
     const memory_format_t blocked = mkldnn_blocked;
     const memory_format_t x = mkldnn_x;
     const memory_format_t nc = mkldnn_nc;
+    const memory_format_t ncw = mkldnn_ncw;
+    const memory_format_t nwc = mkldnn_nwc;
+    const memory_format_t nCw8c = mkldnn_nCw8c;
+    const memory_format_t nCw16c = mkldnn_nCw16c;
     const memory_format_t nchw = mkldnn_nchw;
     const memory_format_t nhwc = mkldnn_nhwc;
     const memory_format_t chwn = mkldnn_chwn;
@@ -125,9 +129,22 @@ namespace memory_format {
     const memory_format_t nCdhw16c = mkldnn_nCdhw16c;
     const memory_format_t oi = mkldnn_oi;
     const memory_format_t io = mkldnn_io;
+    const memory_format_t oiw = mkldnn_oiw;
+    const memory_format_t wio = mkldnn_wio;
+    const memory_format_t Owi8o = mkldnn_Owi8o;
+    const memory_format_t OIw8i8o = mkldnn_OIw8i8o;
+    const memory_format_t OIw8o8i = mkldnn_OIw8o8i;
+    const memory_format_t OIw16i16o = mkldnn_OIw16i16o;
+    const memory_format_t OIw16o16i = mkldnn_OIw16o16i;
+    const memory_format_t Oiw16o = mkldnn_Oiw16o;
+    const memory_format_t Owi16o = mkldnn_Owi16o;
+    const memory_format_t OIw8i16o2i = mkldnn_OIw8i16o2i;
+    const memory_format_t IOw16o16i = mkldnn_IOw16o16i;
+    const memory_format_t OIw8o16i2o = mkldnn_OIw8o16i2o;
     const memory_format_t oihw = mkldnn_oihw;
     const memory_format_t ihwo = mkldnn_ihwo;
     const memory_format_t hwio = mkldnn_hwio;
+    const memory_format_t hwio_s8s8 = mkldnn_hwio_s8s8;
     const memory_format_t dhwio = mkldnn_dhwio;
     const memory_format_t oidhw = mkldnn_oidhw;
     const memory_format_t OIdhw8i8o = mkldnn_OIdhw8i8o;
@@ -144,6 +161,7 @@ namespace memory_format {
     const memory_format_t OIhw8i8o = mkldnn_OIhw8i8o;
     const memory_format_t OIhw16i16o = mkldnn_OIhw16i16o;
     const memory_format_t OIhw4i16o4i = mkldnn_OIhw4i16o4i;
+    const memory_format_t OIhw4i16o4i_s8s8 = mkldnn_OIhw4i16o4i_s8s8;
     const memory_format_t OIhw8i16o2i = mkldnn_OIhw8i16o2i;
     const memory_format_t OIdhw8i16o2i = mkldnn_OIdhw8i16o2i;
     const memory_format_t OIhw8o16i2o = mkldnn_OIhw8o16i2o;
@@ -153,11 +171,26 @@ namespace memory_format {
     const memory_format_t Oihw16o = mkldnn_Oihw16o;
     const memory_format_t Ohwi8o = mkldnn_Ohwi8o;
     const memory_format_t Ohwi16o = mkldnn_Ohwi16o;
+    const memory_format_t OhIw8o4i = mkldnn_OhIw8o4i;
+    const memory_format_t OhIw8o4i_s8s8 = mkldnn_OhIw8o4i_s8s8;
+    const memory_format_t goiw = mkldnn_goiw;
+    const memory_format_t gOwi8o = mkldnn_gOwi8o;
+    const memory_format_t gOIw8i8o = mkldnn_gOIw8i8o;
+    const memory_format_t gOIw8o8i = mkldnn_gOIw8o8i;
+    const memory_format_t gOIw16i16o = mkldnn_gOIw16i16o;
+    const memory_format_t gOIw16o16i = mkldnn_gOIw16o16i;
+    const memory_format_t gOiw16o = mkldnn_gOiw16o;
+    const memory_format_t gOwi16o = mkldnn_gOwi16o;
+    const memory_format_t gOIw8i16o2i = mkldnn_gOIw8i16o2i;
+    const memory_format_t gIOw16o16i = mkldnn_gIOw16o16i;
+    const memory_format_t gOIw8o16i2o = mkldnn_gOIw8o16i2o;
     const memory_format_t goihw = mkldnn_goihw;
     const memory_format_t hwigo = mkldnn_hwigo;
+    const memory_format_t hwigo_s8s8 = mkldnn_hwigo_s8s8;
     const memory_format_t gOIhw8i8o = mkldnn_gOIhw8i8o;
     const memory_format_t gOIhw16i16o = mkldnn_gOIhw16i16o;
     const memory_format_t gOIhw4i16o4i = mkldnn_gOIhw4i16o4i;
+    const memory_format_t gOIhw4i16o4i_s8s8 = mkldnn_gOIhw4i16o4i_s8s8;
     const memory_format_t gOIhw8i16o2i = mkldnn_gOIhw8i16o2i;
     const memory_format_t gOIdhw8i16o2i = mkldnn_gOIdhw8i16o2i;
     const memory_format_t gOIhw8o16i2o = mkldnn_gOIhw8o16i2o;
@@ -177,6 +210,8 @@ namespace memory_format {
     const memory_format_t gOIdhw16o16i = mkldnn_gOIdhw16o16i;
     const memory_format_t gOidhw16o = mkldnn_gOidhw16o;
     const memory_format_t gOdhwi16o = mkldnn_gOdhwi16o;
+    const memory_format_t gOhIw8o4i = mkldnn_gOhIw8o4i;
+    const memory_format_t gOhIw8o4i_s8s8 = mkldnn_gOhIw8o4i_s8s8;
     const memory_format_t ntc = mkldnn_ntc;
     const memory_format_t tnc = mkldnn_tnc;
     const memory_format_t ldsnc = mkldnn_ldsnc;
@@ -210,6 +245,7 @@ namespace primitive_kind {
     const primitive_kind_t sum = mkldnn_sum;
     const primitive_kind_t convolution = mkldnn_convolution;
     const primitive_kind_t deconvolution = mkldnn_deconvolution;
+    const primitive_kind_t shuffle = mkldnn_shuffle;
     const primitive_kind_t eltwise = mkldnn_eltwise;
     const primitive_kind_t depthwise = mkldnn_depthwise;
     const primitive_kind_t softmax = mkldnn_softmax;
@@ -238,9 +274,11 @@ namespace query {
     const query_t impl_info_str = mkldnn_query_impl_info_str;
 
     const query_t some_d = mkldnn_query_some_d;
+    const query_t op_d = mkldnn_query_op_d;
     const query_t memory_d = mkldnn_query_memory_d;
     const query_t convolution_d = mkldnn_query_convolution_d;
     const query_t deconvolution_d = mkldnn_query_deconvolution_d;
+    const query_t shuffle_d = mkldnn_query_shuffle_d;
     const query_t eltwise_d = mkldnn_query_eltwise_d;
     const query_t depthwise_d = mkldnn_query_depthwise_d;
     const query_t softmax_d = mkldnn_query_softmax_d;
@@ -270,6 +308,7 @@ using wino_data_t = mkldnn_wino_desc_t;
 using memory_desc_t = mkldnn_memory_desc_t;
 using convolution_desc_t = mkldnn_convolution_desc_t;
 using deconvolution_desc_t = mkldnn_deconvolution_desc_t;
+using shuffle_desc_t = mkldnn_shuffle_desc_t;
 using pooling_desc_t = mkldnn_pooling_desc_t;
 using eltwise_desc_t = mkldnn_eltwise_desc_t;
 using softmax_desc_t = mkldnn_softmax_desc_t;
@@ -294,6 +333,7 @@ struct op_desc_t {
         memory_desc_t memory;
         convolution_desc_t convolution;
         deconvolution_desc_t deconvolution;
+        shuffle_desc_t shuffle;
         pooling_desc_t pooling;
         eltwise_desc_t eltwise;
         softmax_desc_t softmax;
@@ -301,6 +341,7 @@ struct op_desc_t {
         batch_normalization_desc_t batch_normalization;
         inner_product_desc_t inner_product;
         convolution_relu_desc_t convolution_relu;
+        rnn_desc_t rnn;
         roi_pooling_desc_t roi_pooling;
         depthwise_desc_t depthwise;
     };
@@ -316,6 +357,7 @@ struct op_desc_t {
 
     DECL_CTOR_AND_CONVERTERS(memory_desc_t, memory);
     DECL_CTOR_AND_CONVERTERS(convolution_desc_t, convolution);
+    DECL_CTOR_AND_CONVERTERS(shuffle_desc_t, shuffle);
     DECL_CTOR_AND_CONVERTERS(pooling_desc_t, pooling);
     DECL_CTOR_AND_CONVERTERS(eltwise_desc_t, eltwise);
     DECL_CTOR_AND_CONVERTERS(depthwise_desc_t, depthwise);
@@ -324,6 +366,7 @@ struct op_desc_t {
     DECL_CTOR_AND_CONVERTERS(batch_normalization_desc_t, batch_normalization);
     DECL_CTOR_AND_CONVERTERS(inner_product_desc_t, inner_product);
     DECL_CTOR_AND_CONVERTERS(convolution_relu_desc_t, convolution_relu);
+    DECL_CTOR_AND_CONVERTERS(rnn_desc_t, rnn);
     DECL_CTOR_AND_CONVERTERS(roi_pooling_desc_t, roi_pooling);
 
 #   undef DECL_CTOR_AND_CONVERTERS
@@ -351,6 +394,7 @@ struct view_pd_t;
 struct concat_pd_t;
 struct sum_pd_t;
 struct reorder_pd_t;
+struct shuffle_pd_t;
 
 }
 }
index 78dec2f..8340220 100644 (file)
@@ -89,7 +89,7 @@ status_t conv_desc_init(convolution_desc_t *conv_desc,
     bool consistency = true
         && memory_desc_wrapper(weights_desc).nelems()
         && src_desc->ndims == dst_desc->ndims
-        && utils::one_of(src_desc->ndims, 4, 5)
+        && utils::one_of(src_desc->ndims, 3, 4, 5)
         && utils::one_of(weights_desc->ndims, src_desc->ndims,
                 src_desc->ndims + 1)
         && (with_bias ? bias_desc->ndims == 1 : true)
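Relaxing the check to one_of(ndims, 3, 4, 5) is what enables 1D convolutions. A sketch of the 3D (one spatial dimension) descriptors this now accepts, using the new @c ncw / @c oiw formats; shapes are illustrative only:

~~~cpp
/* 1D convolution described with 3D tensors. */
void describe_conv1d(void) {
    mkldnn_memory_desc_t src_md, wei_md, dst_md;
    mkldnn_dims_t src_dims = {8, 16, 100};   /* {N, C, W} */
    mkldnn_dims_t wei_dims = {32, 16, 3};    /* {O, I, KW} */
    mkldnn_dims_t dst_dims = {8, 32, 98};    /* {N, C, W_out} */
    mkldnn_memory_desc_init(&src_md, 3, src_dims, mkldnn_f32, mkldnn_ncw);
    mkldnn_memory_desc_init(&wei_md, 3, wei_dims, mkldnn_f32, mkldnn_oiw);
    mkldnn_memory_desc_init(&dst_md, 3, dst_dims, mkldnn_f32, mkldnn_any);
}
~~~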
index c6e45b9..90b6629 100644 (file)
@@ -90,35 +90,38 @@ struct _convolution_fwd_pd_t: public primitive_desc_t {
     inline int G() const
     { return with_groups() ? cdesc_().weights_desc.dims[0] : 1; }
 
-    inline int ID() const { return (ndims() == 5)
-        ? input_pd()->desc()->dims[2] : 1; }
-    inline int IH() const { return input_pd()->desc()->dims[ndims()-2]; }
+    inline int ID() const { return (ndims() == 5) ? input_pd()->desc()->dims[2] : 1; }
+    inline int IH() const { return (ndims() == 3) ? 1 : input_pd()->desc()->dims[ndims()-2]; }
     inline int IW() const { return input_pd()->desc()->dims[ndims()-1]; }
-    inline int OD() const { return (ndims() == 5)
-        ? output_pd()->desc()->dims[2] : 1; }
-    inline int OH() const { return output_pd()->desc()->dims[ndims()-2]; }
+    inline int OD() const { return (ndims() == 5) ? output_pd()->desc()->dims[2] : 1; }
+    inline int OH() const { return (ndims() == 3) ? 1 : output_pd()->desc()->dims[ndims()-2]; }
     inline int OW() const { return output_pd()->desc()->dims[ndims()-1]; }
     inline int KD() const { return (ndims() == 5)
         ? cdesc_().weights_desc.dims[2 + with_groups()] : 1; }
     inline int KH() const
-    { return cdesc_().weights_desc.dims[ndims() - (2 - with_groups())]; }
+    { return (ndims() == 3)
+        ? 1 : cdesc_().weights_desc.dims[ndims() - (2 - with_groups())]; }
     inline int KW() const
     { return cdesc_().weights_desc.dims[ndims() - (1 - with_groups())]; }
 
     inline int KSD() const { return (ndims() == 5) ? cdesc_().strides[0] : 1; }
-    inline int KSH() const { return cdesc_().strides[ndims()-4]; }
+    inline int KSH() const { return (ndims() == 3)
+        ? 1 : cdesc_().strides[ndims()-4]; }
     inline int KSW() const { return cdesc_().strides[ndims()-3]; }
 
     inline int KDD() const { return (ndims() == 5) ? cdesc_().dilates[0] : 0; }
-    inline int KDH() const { return cdesc_().dilates[ndims()-4]; }
+    inline int KDH() const { return (ndims() == 3)
+        ? 0 : cdesc_().dilates[ndims()-4]; }
     inline int KDW() const { return cdesc_().dilates[ndims()-3]; }
 
     inline int padFront() const
         { return (ndims() == 5) ? cdesc_().padding[0][0] : 0; }
     inline int padBack() const
         { return (ndims() == 5) ? cdesc_().padding[1][0] : 0; }
-    inline int padT() const { return cdesc_().padding[0][ndims()-4]; }
-    inline int padB() const { return cdesc_().padding[1][ndims()-4]; }
+    inline int padT() const { return (ndims() == 3)
+        ? 0 : cdesc_().padding[0][ndims()-4]; }
+    inline int padB() const { return (ndims() == 3)
+        ? 0 : cdesc_().padding[1][ndims()-4]; }
     inline int padL() const { return cdesc_().padding[0][ndims()-3]; }
     inline int padR() const { return cdesc_().padding[1][ndims()-3]; }
 
@@ -211,35 +214,38 @@ struct convolution_bwd_data_pd_t: public primitive_desc_t {
     inline int G() const
     { return with_groups() ? desc_.weights_desc.dims[0] : 1; }
 
-    inline int ID() const { return (ndims() == 5)
-        ? output_pd()->desc()->dims[2] : 1; }
-    inline int IH() const { return output_pd()->desc()->dims[ndims()-2]; }
+    inline int ID() const { return (ndims() == 5) ? output_pd()->desc()->dims[2] : 1; }
+    inline int IH() const { return (ndims() == 3) ? 1 : output_pd()->desc()->dims[ndims()-2]; }
     inline int IW() const { return output_pd()->desc()->dims[ndims()-1]; }
-    inline int OD() const { return (ndims() == 5)
-        ? input_pd()->desc()->dims[2] : 1; }
-    inline int OH() const { return input_pd()->desc()->dims[ndims()-2]; }
+    inline int OD() const { return (ndims() == 5) ? input_pd()->desc()->dims[2] : 1; }
+    inline int OH() const { return (ndims() == 3) ? 1 : input_pd()->desc()->dims[ndims()-2]; }
     inline int OW() const { return input_pd()->desc()->dims[ndims()-1]; }
     inline int KD() const { return (ndims() == 5)
         ? desc_.weights_desc.dims[2 + with_groups()] : 1; }
     inline int KH() const
-    { return desc_.weights_desc.dims[ndims() - (2 - with_groups())]; }
+    { return (ndims() == 3)
+        ? 1 : desc_.weights_desc.dims[ndims() - (2 - with_groups())]; }
     inline int KW() const
     { return desc_.weights_desc.dims[ndims() - (1 - with_groups())]; }
 
     inline int KSD() const { return (ndims() == 5) ? desc_.strides[0] : 1; }
-    inline int KSH() const { return desc_.strides[ndims()-4]; }
+    inline int KSH() const { return (ndims() == 3)
+        ? 1 : desc_.strides[ndims()-4]; }
     inline int KSW() const { return desc_.strides[ndims()-3]; }
 
     inline int KDD() const { return (ndims() == 5) ? desc_.dilates[0] : 0; }
-    inline int KDH() const { return desc_.dilates[ndims()-4]; }
+    inline int KDH() const { return (ndims() == 3)
+        ? 0 : desc_.dilates[ndims()-4]; }
     inline int KDW() const { return desc_.dilates[ndims()-3]; }
 
     inline int padFront() const
         { return (ndims() == 5) ? desc_.padding[0][0] : 0; }
     inline int padBack() const
         { return (ndims() == 5) ? desc_.padding[1][0] : 0; }
-    inline int padT() const { return desc_.padding[0][ndims()-4]; }
-    inline int padB() const { return desc_.padding[1][ndims()-4]; }
+    inline int padT() const { return (ndims() == 3)
+        ? 0 : desc_.padding[0][ndims()-4]; }
+    inline int padB() const { return (ndims() == 3)
+        ? 0 : desc_.padding[1][ndims()-4]; }
     inline int padL() const { return desc_.padding[0][ndims()-3]; }
     inline int padR() const { return desc_.padding[1][ndims()-3]; }
 
@@ -322,33 +328,40 @@ struct convolution_bwd_weights_pd_t: public primitive_desc_t {
 
     inline int ID() const { return (ndims() == 5)
         ? desc_.src_desc.dims[2] : 1; }
-    inline int IH() const { return desc_.src_desc.dims[ndims()-2]; }
+    inline int IH() const { return (ndims() == 3)
+        ? 1 : desc_.src_desc.dims[ndims()-2]; }
     inline int IW() const { return desc_.src_desc.dims[ndims()-1]; }
     inline int OD() const { return (ndims() == 5)
         ? desc_.diff_dst_desc.dims[2] : 1; }
-    inline int OH() const { return desc_.diff_dst_desc.dims[ndims()-2]; }
+    inline int OH() const { return (ndims() == 3)
+        ? 1 : desc_.diff_dst_desc.dims[ndims()-2]; }
     inline int OW() const { return desc_.diff_dst_desc.dims[ndims()-1]; }
     inline int KD() const { return (ndims() == 5)
         ? desc_.diff_weights_desc.dims[2 + with_groups()] : 1; }
     inline int KH() const
-    { return desc_.diff_weights_desc.dims[ndims() - (2 - with_groups())]; }
+    { return (ndims() == 3)
+        ? 1 : desc_.diff_weights_desc.dims[ndims() - (2 - with_groups())]; }
     inline int KW() const
     { return desc_.diff_weights_desc.dims[ndims() - (1 - with_groups())]; }
 
     inline int KSD() const { return (ndims() == 5) ? desc_.strides[0] : 1; }
-    inline int KSH() const { return desc_.strides[ndims()-4]; }
+    inline int KSH() const { return (ndims() == 3)
+        ? 1 : desc_.strides[ndims()-4]; }
     inline int KSW() const { return desc_.strides[ndims()-3]; }
 
     inline int KDD() const { return (ndims() == 5) ? desc_.dilates[0] : 0; }
-    inline int KDH() const { return desc_.dilates[ndims()-4]; }
+    inline int KDH() const { return (ndims() == 3)
+        ? 0 : desc_.dilates[ndims()-4]; }
     inline int KDW() const { return desc_.dilates[ndims()-3]; }
 
     inline int padFront() const
         { return (ndims() == 5) ? desc_.padding[0][0] : 0; }
     inline int padBack() const
         { return (ndims() == 5) ? desc_.padding[1][0] : 0; }
-    inline int padT() const { return desc_.padding[0][ndims()-4]; }
-    inline int padB() const { return desc_.padding[1][ndims()-4]; }
+    inline int padT() const { return (ndims() == 3)
+        ? 0 : desc_.padding[0][ndims()-4]; }
+    inline int padB() const { return (ndims() == 3)
+        ? 0 : desc_.padding[1][ndims()-4]; }
     inline int padL() const { return desc_.padding[0][ndims()-3]; }
     inline int padR() const { return desc_.padding[1][ndims()-3]; }
 
index e7d6676..69acb03 100644 (file)
@@ -74,8 +74,11 @@ struct depthwise_fwd_pd_t: public primitive_desc_t {
 
     inline int MB() const { return input_pd()->desc()->ndims > 0 ? input_pd()->desc()->dims[0] : 1; }
     inline int C()  const { return input_pd()->desc()->ndims > 1 ? input_pd()->desc()->dims[1] : 1; }
-    inline int H()  const { return input_pd()->desc()->ndims > 2 ? input_pd()->desc()->dims[2] : 1; }
-    inline int W()  const { return input_pd()->desc()->ndims > 3 ? input_pd()->desc()->dims[3] : 1; }
+    inline int D()  const { return input_pd()->desc()->ndims > 4 ? input_pd()->desc()->dims[2] : 1; }
+    inline int H()  const { return input_pd()->desc()->ndims > 4 ? input_pd()->desc()->dims[3] :
+            input_pd()->desc()->ndims > 2 ? input_pd()->desc()->dims[2] : 1; }
+    inline int W()  const { return input_pd()->desc()->ndims > 4 ? input_pd()->desc()->dims[4] :
+            input_pd()->desc()->ndims > 3 ? input_pd()->desc()->dims[3] : 1; }
 
 protected:
     depthwise_desc_t desc_;
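The depthwise helpers follow a slightly different convention than the convolution ones, branching on ndims directly; the mapping implied by the new accessors is (summary, ours):

~~~cpp
/* Index mapping:
 *   ndims==5: dims = {N, C, D, H, W} -> D = dims[2], H = dims[3], W = dims[4]
 *   ndims==4: dims = {N, C, H, W}    -> D = 1,       H = dims[2], W = dims[3]
 *   ndims==3: dims = {N, C, H}       -> D = 1,       H = dims[2], W = 1 */
~~~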
index 12636d5..815d2d7 100644 (file)
@@ -40,7 +40,7 @@ status_t eltwise_desc_init(eltwise_desc_t *eltwise_desc, prop_kind_t prop_kind,
                   eltwise_square, eltwise_abs, eltwise_sqrt, eltwise_linear,
                   eltwise_bounded_relu, eltwise_soft_relu, eltwise_logistic,
                   eltwise_clamp)
-        && implication(prop_kind == backward_data, diff_data_desc != nullptr);
+        && IMPLICATION(prop_kind == backward_data, diff_data_desc != nullptr);
     if (!args_ok) return invalid_arguments;
 
     auto ed = eltwise_desc_t();
@@ -57,7 +57,7 @@ status_t eltwise_desc_init(eltwise_desc_t *eltwise_desc, prop_kind_t prop_kind,
     ed.negative_slope = ed.alpha;
 
     bool consistency = true
-        && implication(ed.prop_kind == backward_data,
+        && IMPLICATION(ed.prop_kind == backward_data,
                 array_cmp(ed.diff_data_desc.dims, ed.data_desc.dims,
                     ed.diff_data_desc.ndims));
     if (!consistency) return invalid_arguments;
diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/format_traits.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/format_traits.hpp
new file mode 100644 (file)
index 0000000..0a13a33
--- /dev/null
@@ -0,0 +1,260 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef FORMAT_TRAITS_HPP
+#define FORMAT_TRAITS_HPP
+
+#include <assert.h>
+
+#include "c_types_map.hpp"
+#include "utils.hpp"
+
+namespace mkldnn {
+namespace impl {
+
+enum class data_kind_t {
+    data,
+    wei,
+    gwei,
+    rnn,
+    other,
+};
+
+enum class block_format_t {
+    _,
+    _8c, _8g, _8i, _8o,
+    _8i8o, _8o8i, _8o4i, _8o4i_s8s8,
+    _16c, _16g, _16i, _16o,
+    _16i16o, _16o16i,
+    _8i16o2i, _8o16i2o,
+    _4i16o4i, _4i16o4i_s8s8,
+};
+
+template <block_format_t f> struct block_format_traits {
+    using bf = block_format_t;
+    static constexpr int levels = f == bf::_
+        ? 0
+        : utils::one_of(f, bf::_8i16o2i, bf::_8o16i2o,
+                           bf::_4i16o4i, bf::_4i16o4i_s8s8) ? 2 : 1;
+    static constexpr int blk_ndims = f == bf::_
+        ? 0
+        : utils::one_of(f, bf::_8c, bf::_8g, bf::_8i, bf::_8o, bf::_16c,
+                bf::_16g, bf::_16i, bf::_16o) ? 1 : 2;
+    static constexpr int blk_size = f == bf::_
+        ? 1
+        : utils::one_of(f, bf::_8c, bf::_8g, bf::_8i, bf::_8o, bf::_8i8o,
+                bf::_8o8i, bf::_8o4i, bf::_8o4i_s8s8) ? 8 : 16;
+};
+
+template <memory_format_t> struct format_traits {
+    // data_kind_t data_kind;   -- the kind of data (e.g. weights or rnn)
+    // block_format_t blk_fmt;  -- the format of blocks (e.g. 8c or 4i16o4i)
+    // int ndims;               -- # of dimensions
+    // int ndims_sp;            -- # of spatial dimensions
+    // int blk_size;            -- block size (1, 8, or 16)
+};
+
+#define DECL_TRAITS(_fmt, _data_kind, _blk_fmt, _ndims, _ndims_sp) \
+template <> struct format_traits<memory_format::_fmt> { \
+    static constexpr data_kind_t data_kind = data_kind_t::_data_kind; \
+    static constexpr block_format_t blk_fmt = block_format_t::_blk_fmt; \
+    static constexpr int ndims = _ndims; \
+    static constexpr int ndims_sp = _ndims_sp; \
+    static constexpr int blk_size = \
+        block_format_traits<block_format_t::_blk_fmt>::blk_size; \
+}
+
+DECL_TRAITS(any, other, _, 0, 0);
+DECL_TRAITS(blocked, other, _, 0, 0);
+DECL_TRAITS(x, other, _, 1, 1);
+
+/* data: 2D */
+DECL_TRAITS(nc, data, _, 2, 0);
+
+/* data: 3D */
+DECL_TRAITS(ncw, data, _, 3, 1);
+DECL_TRAITS(nwc, data, _, 3, 1);
+DECL_TRAITS(nCw8c, data, _8c, 3, 1);
+DECL_TRAITS(nCw16c, data, _16c, 3, 1);
+
+/* data: 4D */
+DECL_TRAITS(nchw, data, _, 4, 2);
+DECL_TRAITS(nhwc, data, _, 4, 2);
+DECL_TRAITS(chwn, data, _, 4, 2);
+DECL_TRAITS(nChw8c, data, _8c, 4, 2);
+DECL_TRAITS(nChw16c, data, _16c, 4, 2);
+
+/* data: 5D */
+DECL_TRAITS(ncdhw, data, _, 5, 3);
+DECL_TRAITS(ndhwc, data, _, 5, 3);
+DECL_TRAITS(nCdhw8c, data, _8c, 5, 3);
+DECL_TRAITS(nCdhw16c, data, _16c, 5, 3);
+
+/* wei: 2D */
+DECL_TRAITS(oi, wei, _, 2, 0);
+DECL_TRAITS(io, wei, _, 2, 0);
+
+/* wei: 3D */
+DECL_TRAITS(oiw, wei, _, 3, 1);
+DECL_TRAITS(wio, wei, _, 3, 1);
+DECL_TRAITS(Owi8o, wei, _8o, 3, 1);
+DECL_TRAITS(OIw8i8o, wei, _8i8o, 3, 1);
+DECL_TRAITS(OIw8o8i, wei, _8o8i, 3, 1);
+DECL_TRAITS(OIw16i16o, wei, _16i16o, 3, 1);
+DECL_TRAITS(OIw16o16i, wei, _16o16i, 3, 1);
+DECL_TRAITS(Oiw16o, wei, _16o, 3, 1);
+DECL_TRAITS(Owi16o, wei, _16o, 3, 1);
+DECL_TRAITS(OIw8i16o2i, wei, _8i16o2i, 3, 1);
+DECL_TRAITS(IOw16o16i, wei, _16o16i, 3, 1);
+DECL_TRAITS(OIw8o16i2o, wei, _8o16i2o, 3, 1);
+
+/* wei: 4D */
+DECL_TRAITS(oihw, wei, _, 4, 2);
+DECL_TRAITS(ihwo, wei, _, 4, 2);
+DECL_TRAITS(hwio, wei, _, 4, 2);
+DECL_TRAITS(hwio_s8s8, wei, _, 4, 2);
+DECL_TRAITS(oIhw8i, wei, _8i, 4, 2);
+DECL_TRAITS(oIhw16i, wei, _16i, 4, 2);
+DECL_TRAITS(OIhw8i8o, wei, _8i8o, 4, 2);
+DECL_TRAITS(OhIw8o4i, wei, _8o4i, 4, 2);
+DECL_TRAITS(OhIw8o4i_s8s8, wei, _8o4i_s8s8, 4, 2);
+DECL_TRAITS(OIhw16i16o, wei, _16i16o, 4, 2);
+DECL_TRAITS(OIhw4i16o4i, wei, _4i16o4i, 4, 2);
+DECL_TRAITS(OIhw4i16o4i_s8s8, wei, _4i16o4i_s8s8, 4, 2);
+DECL_TRAITS(OIhw8i16o2i, wei, _8i16o2i, 4, 2);
+DECL_TRAITS(OIhw8o16i2o, wei, _8o16i2o, 4, 2);
+DECL_TRAITS(OIhw8o8i, wei, _8o8i, 4, 2);
+DECL_TRAITS(OIhw16o16i, wei, _16o16i, 4, 2);
+DECL_TRAITS(IOhw16o16i, wei, _16o16i, 4, 2);
+DECL_TRAITS(Oihw16o, wei, _16o, 4, 2);
+DECL_TRAITS(Ohwi8o, wei, _8o, 4, 2);
+DECL_TRAITS(Ohwi16o, wei, _16o, 4, 2);
+
+/* wei: 5D */
+DECL_TRAITS(dhwio, wei, _, 5, 3);
+DECL_TRAITS(oidhw, wei, _, 5, 3);
+DECL_TRAITS(OIdhw8i8o, wei, _8i8o, 5, 3);
+DECL_TRAITS(OIdhw8o8i, wei, _8o8i, 5, 3);
+DECL_TRAITS(Odhwi8o, wei, _8o, 5, 3);
+DECL_TRAITS(OIdhw16i16o, wei, _16i16o, 5, 3);
+DECL_TRAITS(OIdhw16o16i, wei, _16o16i, 5, 3);
+DECL_TRAITS(Oidhw16o, wei, _16o, 5, 3);
+DECL_TRAITS(Odhwi16o, wei, _16o, 5, 3);
+DECL_TRAITS(oIdhw8i, wei, _8i, 5, 3);
+DECL_TRAITS(oIdhw16i, wei, _16i, 5, 3);
+DECL_TRAITS(OIdhw8i16o2i, wei, _8i16o2i, 5, 3);
+
+/* gwei: 4D */
+DECL_TRAITS(goiw, gwei, _, 4, 1);
+DECL_TRAITS(gOwi8o, gwei, _8o, 4, 1);
+DECL_TRAITS(gOIw8i8o, gwei, _8i8o, 4, 1);
+DECL_TRAITS(gOIw8o8i, gwei, _8o8i, 4, 1);
+DECL_TRAITS(gOIw16i16o, gwei, _16i16o, 4, 1);
+DECL_TRAITS(gOIw16o16i, gwei, _16o16i, 4, 1);
+DECL_TRAITS(gOiw16o, gwei, _16o, 4, 1);
+DECL_TRAITS(gOwi16o, gwei, _16o, 4, 1);
+DECL_TRAITS(gOIw8i16o2i, gwei, _8i16o2i, 4, 1);
+DECL_TRAITS(gIOw16o16i, gwei, _16o16i, 4, 1);
+DECL_TRAITS(gOIw8o16i2o, gwei, _8o16i2o, 4, 1);
+
+/* gwei: 5D */
+DECL_TRAITS(goihw, gwei, _, 5, 2);
+DECL_TRAITS(hwigo, gwei, _, 5, 2);
+DECL_TRAITS(hwigo_s8s8, gwei, _, 5, 2);
+DECL_TRAITS(gOIhw8i8o, gwei, _8i8o, 5, 2);
+DECL_TRAITS(gOhIw8o4i, gwei, _8o4i, 5, 2);
+DECL_TRAITS(gOhIw8o4i_s8s8, gwei, _8o4i_s8s8, 5, 2);
+DECL_TRAITS(gOIhw16i16o, gwei, _16i16o, 5, 2);
+DECL_TRAITS(gOIhw4i16o4i, gwei, _4i16o4i, 5, 2);
+DECL_TRAITS(gOIhw4i16o4i_s8s8, gwei, _4i16o4i_s8s8, 5, 2);
+DECL_TRAITS(gOIhw8i16o2i, gwei, _8i16o2i, 5, 2);
+DECL_TRAITS(gOIdhw8i16o2i, gwei, _8i16o2i, 5, 2);
+DECL_TRAITS(gOIhw8o16i2o, gwei, _8o16i2o, 5, 2);
+DECL_TRAITS(gOIhw8o8i, gwei, _8o8i, 5, 2);
+DECL_TRAITS(gOIhw16o16i, gwei, _16o16i, 5, 2);
+DECL_TRAITS(gIOhw16o16i, gwei, _16o16i, 5, 2);
+DECL_TRAITS(gOihw16o, gwei, _16o, 5, 2);
+DECL_TRAITS(gOhwi8o, gwei, _8o, 5, 2);
+DECL_TRAITS(gOhwi16o, gwei, _16o, 5, 2);
+DECL_TRAITS(Goihw8g, gwei, _8g, 5, 2);
+DECL_TRAITS(Goihw16g, gwei, _16g, 5, 2);
+
+/* gwei: 6D */
+DECL_TRAITS(goidhw, gwei, _, 6, 3);
+DECL_TRAITS(gOIdhw8i8o, gwei, _8i8o, 6, 3);
+DECL_TRAITS(gOIdhw8o8i, gwei, _8o8i, 6, 3);
+DECL_TRAITS(gOdhwi8o, gwei, _8o, 6, 3);
+DECL_TRAITS(gOIdhw16i16o, gwei, _16i16o, 6, 3);
+DECL_TRAITS(gOIdhw16o16i, gwei, _16o16i, 6, 3);
+DECL_TRAITS(gOidhw16o, gwei, _16o, 6, 3);
+DECL_TRAITS(gOdhwi16o, gwei, _16o, 6, 3);
+
+/* rnn */
+DECL_TRAITS(ntc, rnn, _, 3, 0);
+DECL_TRAITS(tnc, rnn, _, 3, 0);
+DECL_TRAITS(ldsnc, rnn, _, 5, 0);
+DECL_TRAITS(ldigo, rnn, _, 5, 0);
+DECL_TRAITS(ldgoi, rnn, _, 5, 0);
+DECL_TRAITS(ldgo, rnn, _, 4, 0);
+
+#undef DECL_TRAITS
+
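
For reference, each DECL_TRAITS line above expands to a full specialization of format_traits; for instance, the nChw8c entry is equivalent to the following hand-expanded form (shown for illustration, not part of the patch):

    // What DECL_TRAITS(nChw8c, data, _8c, 4, 2) expands to:
    template <> struct format_traits<memory_format::nChw8c> {
        static constexpr data_kind_t data_kind = data_kind_t::data;
        static constexpr block_format_t blk_fmt = block_format_t::_8c;
        static constexpr int ndims = 4;     // n, C, h, w
        static constexpr int ndims_sp = 2;  // spatial dims: h, w
        static constexpr int blk_size =
            block_format_traits<block_format_t::_8c>::blk_size; // 8
    };
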
+/** returns the offset within the block for weights blocked over oc and ic */
+template <block_format_t f>
+constexpr int OI_blk_off(int oc, int ic) {
+    using bf = block_format_t;
+    static_assert(utils::one_of(f, bf::_8i8o, bf::_8o8i, bf::_8o4i, bf::_8o4i_s8s8,
+                bf::_16i16o, bf::_16o16i, bf::_8i16o2i, bf::_8o16i2o,
+                bf::_4i16o4i, bf::_4i16o4i_s8s8),
+            "unexpected blocked format");
+#   define blksize block_format_traits<f>::blk_size
+    return f == bf::_8i16o2i
+        ? (ic / 2) * blksize * 2 + 2 * oc + ic % 2
+        : (f == bf::_4i16o4i || f == bf::_4i16o4i_s8s8)
+        ? (ic / 4) * blksize * 4 + oc * 4 + ic % 4
+        : f == bf::_8o16i2o
+        ? (oc / 2) * blksize * 2 + 2 * ic + oc % 2
+        : utils::one_of(f, bf::_8i8o, bf::_16i16o)
+        ? ic * blksize + oc
+        : (f == bf::_8o4i || f == bf::_8o4i_s8s8)
+        ? (ic / 4) * blksize * 4 + 4 * oc + ic % 4
+        : oc * blksize + ic;
+#   undef blksize // if only we programmed in C++14...
+}
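
A plain-int restatement of two of the branches above with concrete indices (blk_size fixed at 16; an illustration, not part of the source):

    #include <cassert>

    // Plain-int restatement of two OI_blk_off branches (blk_size = 16):
    //   _16i16o:  ic outer within the block     -> ic * 16 + oc
    //   _8i16o2i: ic pairs interleaved after oc -> (ic / 2) * 32 + 2 * oc + ic % 2
    int off_16i16o(int oc, int ic) { return ic * 16 + oc; }
    int off_8i16o2i(int oc, int ic) { return (ic / 2) * 32 + 2 * oc + ic % 2; }

    int main() {
        assert(off_16i16o(3, 5) == 83);   // 5 * 16 + 3
        assert(off_8i16o2i(3, 5) == 71);  // 2 * 32 + 2 * 3 + 1
        return 0;
    }
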
+
+/** computes offset for 1D, 2D, or 3D weights (w/ or w/o groups)
+ * in the same fashion: off(g, oc, ic, d, h, w) */
+template <memory_format_t fmt>
+constexpr size_t wei_blk_off_like_gwei3D(const memory_desc_wrapper &md,
+        const int g, const int o, const int i, const int d, const int h,
+        const int w) {
+    static_assert(utils::one_of(format_traits<fmt>::data_kind,
+                data_kind_t::wei, data_kind_t::gwei), "weights are expected");
+    static_assert(utils::one_of(format_traits<fmt>::ndims_sp, 1, 2, 3),
+            "incorrect number of dims");
+#   define w_grp (format_traits<fmt>::data_kind == data_kind_t::gwei)
+    return format_traits<fmt>::ndims_sp == 1
+        ? md.blk_off<!w_grp>(g, o, i, w)
+        : format_traits<fmt>::ndims_sp == 2
+            ? md.blk_off<!w_grp>(g, o, i, h, w)
+            : md.blk_off<!w_grp>(g, o, i, d, h, w);
+#   undef w_grp // if only we programmed in C++14...
+}
+
+} // namespace impl
+} // namespace mkldnn
+
+#endif
index f7e91ed..93e791f 100644 (file)
@@ -37,7 +37,7 @@ status_t lrn_desc_init(lrn_desc_t *lrn_desc,
         && !any_null(lrn_desc, data_desc)
         && one_of(alg_kind, lrn_within_channel, lrn_across_channels)
         && one_of(prop_kind, forward_training, forward_inference, backward_data)
-        && implication(prop_kind == backward_data, diff_data_desc != nullptr);
+        && IMPLICATION(prop_kind == backward_data, diff_data_desc != nullptr);
     if (!args_ok) return invalid_arguments;
 
     auto ld = lrn_desc_t();
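
This hunk (and many below) swaps utils::implication() for the IMPLICATION macro. To the best of my knowledge mkl-dnn defines it in utils.hpp as the plain logical form; a small self-contained sketch of why the macro form matters (the second operand is not evaluated when the first is false):

    #include <cassert>

    // Presumed definition, per mkl-dnn's utils.hpp:
    #define IMPLICATION(cause, effect) (!(cause) || (effect))

    int main() {
        int *p = nullptr;
        bool have_p = false;
        // Safe: `*p == 42` is never evaluated because have_p is false --
        // a function-style implication(a, b) could not guarantee that,
        // since function arguments are always evaluated.
        assert(IMPLICATION(have_p, *p == 42));
        return 0;
    }
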
index dfc5be2..0ae7093 100644 (file)
@@ -46,6 +46,16 @@ saturate(const acc_t &x) {
     return (typename utils::remove_reference<data_t>::type)v;
 }
 
+template <typename data_t>
+double saturate(const double &x) {
+    double v = x;
+    if (v < (double)nstl::numeric_limits<data_t>::lowest())
+        v = (double)nstl::numeric_limits<data_t>::lowest();
+    if (v > (double)nstl::numeric_limits<data_t>::max())
+        v = (double)nstl::numeric_limits<data_t>::max();
+    return v;
+}
+
 template <> inline int8_t saturate<int8_t, uint8_t>(const uint8_t &x) {
     return x <= 127u ? x : 127;
 }
@@ -60,6 +70,11 @@ out_round(float v, round_mode_t rmode = round_mode::nearest)
 { return (out_t)(rmode == round_mode::down ? floorf(v) : nearbyintf(v)); }
 
 template <typename out_t>
+inline typename utils::enable_if<nstl::is_integral<out_t>::value, out_t>::type
+out_round(double v, round_mode_t rmode = round_mode::nearest)
+{ return (out_t)(rmode == round_mode::down ? floor(v) : nearbyint(v)); }
+
+template <typename out_t>
 inline typename utils::enable_if<!nstl::is_integral<out_t>::value, out_t>::type
 out_round(float v, round_mode_t rmode = round_mode::nearest)
 { UNUSED(rmode); return v; }
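
The two new double overloads mirror the existing float paths: clamp to the destination type's range, then round per round_mode. A minimal standalone sketch of that behavior, using <limits> and <cmath> directly instead of the library's nstl/math wrappers:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Clamp a double to the representable range of data_t (what saturate does).
    template <typename data_t>
    double saturate_sketch(double v) {
        const double lo = (double)std::numeric_limits<data_t>::lowest();
        const double hi = (double)std::numeric_limits<data_t>::max();
        return v < lo ? lo : (v > hi ? hi : v);
    }

    int main() {
        double s = saturate_sketch<int8_t>(300.0);   // clamps to 127.0
        // out_round then picks floor() or nearbyint() depending on round_mode:
        printf("%d %d\n", (int)nearbyint(s), (int)floor(126.7)); // 127 126
        return 0;
    }
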
index 53bcaa4..3df9295 100644 (file)
@@ -112,6 +112,40 @@ status_t fill_nc(memory_desc_t &md) {
     return fill_nonblocked(md, perm);
 }
 
+status_t fill_ncw(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const int perm[3] = {0, 1, 2};
+    return fill_nonblocked(md, perm);
+}
+
+status_t fill_nwc(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const int perm[3] = {0, 2, 1};
+    return fill_nonblocked(md, perm);
+}
+
+status_t fill_nCw8c(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const dims_t block_dims = {1, 8, 1};
+    const int perm[] = {
+        0, 1, 2,
+        3, 4, 5};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_nCw16c(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const dims_t block_dims = {1, 16, 1};
+    const int perm[] = {
+        0, 1, 2,
+        3, 4, 5};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
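
In these fill_* helpers, block_dims gives the block size per logical dimension and perm lists the physical order of dimensions (for blocked formats, the first ndims entries order the outer dims and the next ndims entries order the within-block dims). A standalone sketch of how a permutation turns into dense strides for a plain 3D format such as nwc (my restatement of the presumed fill_nonblocked behavior, not the library code):

    #include <cstdio>

    // Sketch: lay dims out densely in the physical order given by perm and
    // report each logical dimension's stride.
    void dense_strides_3d(const int dims[3], const int perm[3], int strides[3]) {
        int stride = 1;
        for (int p = 2; p >= 0; --p) {   // perm[2] is the innermost physical dim
            strides[perm[p]] = stride;
            stride *= dims[perm[p]];
        }
    }

    int main() {
        const int dims[3] = {2, 3, 5};     // logical n, c, w
        const int perm_nwc[3] = {0, 2, 1}; // physical order: n, w, c
        int str[3];
        dense_strides_3d(dims, perm_nwc, str);
        printf("n:%d c:%d w:%d\n", str[0], str[1], str[2]); // n:15 c:1 w:3
        return 0;
    }
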
 status_t fill_nchw(memory_desc_t &md) {
     if (md.ndims != 4) return invalid_arguments;
 
@@ -215,6 +249,120 @@ status_t fill_io(memory_desc_t &md) {
     return fill_nonblocked(md, perm);
 }
 
+status_t fill_oiw(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const int perm[3] = {0, 1, 2};
+    return fill_nonblocked(md, perm);
+}
+
+status_t fill_wio(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const int perm[3] = {2, 1, 0};
+    return fill_nonblocked(md, perm);
+}
+
+status_t fill_Owi8o(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const dims_t block_dims = {8, 1, 1};
+    const int perm[] = {
+        0, 2, 1,
+        3, 4, 5};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_OIw8o8i(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const dims_t block_dims = {8, 8, 1};
+    const int perm[] = {
+        0, 1, 2,
+        3, 4, 5};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_OIw8i8o(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const dims_t block_dims = {8, 8, 1};
+    const int perm[] = {
+        0, 1, 2,
+        4, 3, 5};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_OIw16i16o(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const dims_t block_dims = {16, 16, 1};
+    const int perm[] = {
+        0, 1, 2,
+        4, 3, 5};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_OIw16o16i(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const dims_t block_dims = {16, 16, 1};
+    const int perm[] = {
+        0, 1, 2,
+        3, 4, 5};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_Oiw16o(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const dims_t block_dims = {16, 1, 1};
+    const int perm[] = {
+        0, 1, 2,
+        3, 4, 5};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_Owi16o(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const dims_t block_dims = {16, 1, 1};
+    const int perm[] = {
+        0, 2, 1,
+        3, 4, 5};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_OIw8i16o2i(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const dims_t block_dims = {16, 16, 1};
+    const int perm[] = {
+        0, 1, 2,
+        4, 3, 5};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_IOw16o16i(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const dims_t block_dims = {16, 16, 1};
+    const int perm[] = {
+        1, 0, 2,
+        3, 4, 5};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_OIw8o16i2o(memory_desc_t &md) {
+    if (md.ndims != 3) return invalid_arguments;
+
+    const dims_t block_dims = {16, 16, 1};
+    const int perm[] = {
+        0, 1, 2,
+        3, 4, 5};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
 status_t fill_oihw(memory_desc_t &md) {
     if (md.ndims != 4) return invalid_arguments;
 
@@ -293,6 +441,16 @@ status_t fill_OIhw4i16o4i(memory_desc_t &md) {
     return fill_contiguous_blocked(md, block_dims, perm);
 }
 
+status_t fill_OhIw8o4i(memory_desc_t &md) {
+    if (md.ndims != 4) return invalid_arguments;
+
+    const dims_t block_dims = {8, 4, 1, 1};
+    const int perm[] = {
+        0, 2, 1, 3,
+        4, 5, 6, 7};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
 status_t fill_OIhw8i16o2i(memory_desc_t &md) {
     if (md.ndims != 4) return invalid_arguments;
 
@@ -433,6 +591,113 @@ status_t fill_Odhwi8o(memory_desc_t &md) {
     return fill_contiguous_blocked(md, block_dims, perm);
 }
 
+status_t fill_goiw(memory_desc_t &md) {
+    if (md.ndims != 4) return invalid_arguments;
+
+    const int perm[4] = {0, 1, 2, 3};
+    return fill_nonblocked(md, perm);
+}
+
+status_t fill_gOwi8o(memory_desc_t &md) {
+    if (md.ndims != 4) return invalid_arguments;
+
+    const dims_t block_dims = {1, 8, 1, 1};
+    const int perm[] = {
+        0, 1, 3, 2,
+        4, 5, 6, 7};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_gOIw8o8i(memory_desc_t &md) {
+    if (md.ndims != 4) return invalid_arguments;
+
+    const dims_t block_dims = {1, 8, 8, 1};
+    const int perm[] = {
+        0, 1, 2, 3,
+        4, 5, 6, 7};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_gOIw8i8o(memory_desc_t &md) {
+    if (md.ndims != 4) return invalid_arguments;
+
+    const dims_t block_dims = {1, 8, 8, 1};
+    const int perm[] = {
+        0, 1, 2, 3,
+        4, 6, 5, 7};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_gOIw16i16o(memory_desc_t &md) {
+    if (md.ndims != 4) return invalid_arguments;
+
+    const dims_t block_dims = {1, 16, 16, 1};
+    const int perm[] = {
+        0, 1, 2, 3,
+        4, 6, 5, 7};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_gOIw16o16i(memory_desc_t &md) {
+    if (md.ndims != 4) return invalid_arguments;
+
+    const dims_t block_dims = {1, 16, 16, 1};
+    const int perm[] = {
+        0, 1, 2, 3,
+        4, 5, 6, 7};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_gOiw16o(memory_desc_t &md) {
+    if (md.ndims != 4) return invalid_arguments;
+
+    const dims_t block_dims = {1, 16, 1, 1};
+    const int perm[] = {
+        0, 1, 2, 3,
+        4, 5, 6, 7};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_gOwi16o(memory_desc_t &md) {
+    if (md.ndims != 4) return invalid_arguments;
+
+    const dims_t block_dims = {1, 16, 1, 1};
+    const int perm[] = {
+        0, 1, 3, 2,
+        4, 5, 6, 7};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_gOIw8i16o2i(memory_desc_t &md) {
+    if (md.ndims != 4) return invalid_arguments;
+
+    const dims_t block_dims = {1, 16, 16, 1};
+    const int perm[] = {
+        0, 1, 2, 3,
+        4, 6, 5, 7};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_gOIw8o16i2o(memory_desc_t &md) {
+    if (md.ndims != 4) return invalid_arguments;
+
+    const dims_t block_dims = {1, 16, 16, 1};
+    const int perm[] = {
+        0, 1, 2, 3,
+        4, 5, 6, 7};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
+status_t fill_gIOw16o16i(memory_desc_t &md) {
+    if (md.ndims != 4) return invalid_arguments;
+
+    const dims_t block_dims = {1, 16, 16, 1};
+    const int perm[] = {
+        0, 2, 1, 3,
+        4, 5, 6, 7};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
 status_t fill_goihw(memory_desc_t &md) {
     if (md.ndims != 5) return invalid_arguments;
 
@@ -557,6 +822,16 @@ status_t fill_gOIhw4i16o4i(memory_desc_t &md) {
     return fill_contiguous_blocked(md, block_dims, perm);
 }
 
+status_t fill_gOhIw8o4i(memory_desc_t &md) {
+    if (md.ndims != 5) return invalid_arguments;
+
+    const dims_t block_dims = {1, 8, 4, 1, 1};
+    const int perm[] = {
+        0, 1, 3, 2, 4,
+        5, 6, 7, 8, 9};
+    return fill_contiguous_blocked(md, block_dims, perm);
+}
+
 status_t fill_Goihw8g(memory_desc_t &md) {
     if (md.ndims != 5) return invalid_arguments;
 
@@ -706,20 +981,40 @@ status_t memory_desc_wrapper::compute_blocking(memory_desc_t &memory_desc)
     switch (memory_desc.format) {
     case x: return fill_x(memory_desc);
     case nc: return fill_nc(memory_desc);
+    case ncw: return fill_ncw(memory_desc);
+    case nwc: return fill_nwc(memory_desc);
+    case nCw8c: return fill_nCw8c(memory_desc);
+    case nCw16c: return fill_nCw16c(memory_desc);
     case nchw: return fill_nchw(memory_desc);
     case nhwc: return fill_nhwc(memory_desc);
     case chwn: return fill_chwn(memory_desc);
-    case nChw8c: return fill_nChw8c(memory_desc);
-    case nChw16c: return fill_nChw16c(memory_desc);
+    case nChw8c: case oIhw8i: return fill_nChw8c(memory_desc);
+    case nChw16c: case oIhw16i: return fill_nChw16c(memory_desc);
     case oi: return fill_oi(memory_desc);
     case io: return fill_io(memory_desc);
+    case oiw: return fill_oiw(memory_desc);
+    case wio: return fill_wio(memory_desc);
+    case Owi8o: return fill_Owi8o(memory_desc);
+    case OIw8o8i: return fill_OIw8o8i(memory_desc);
+    case OIw8i8o: return fill_OIw8i8o(memory_desc);
+    case OIw16i16o: return fill_OIw16i16o(memory_desc);
+    case OIw16o16i: return fill_OIw16o16i(memory_desc);
+    case Oiw16o: return fill_Oiw16o(memory_desc);
+    case Owi16o: return fill_Owi16o(memory_desc);
+    case OIw8i16o2i: return fill_OIw8i16o2i(memory_desc);
+    case OIw8o16i2o: return fill_OIw8o16i2o(memory_desc);
+    case IOw16o16i: return fill_IOw16o16i(memory_desc);
     case oihw: return fill_oihw(memory_desc);
     case ihwo: return fill_ihwo(memory_desc);
     case hwio: return fill_hwio(memory_desc);
+    case hwio_s8s8: return fill_hwio(memory_desc);
     case dhwio: return fill_dhwio(memory_desc);
     case OIhw8i8o: return fill_OIhw8i8o(memory_desc);
     case OIhw16i16o: return fill_OIhw16i16o(memory_desc);
     case OIhw4i16o4i: return fill_OIhw4i16o4i(memory_desc);
+    case OhIw8o4i: return fill_OhIw8o4i(memory_desc);
+    case OhIw8o4i_s8s8: return fill_OhIw8o4i(memory_desc);
+    case OIhw4i16o4i_s8s8: return fill_OIhw4i16o4i(memory_desc);
     case OIhw8i16o2i: return fill_OIhw8i16o2i(memory_desc);
     case OIdhw8i16o2i: return fill_OIdhw8i16o2i(memory_desc);
     case OIhw8o16i2o: return fill_OIhw8o16i2o(memory_desc);
@@ -729,11 +1024,26 @@ status_t memory_desc_wrapper::compute_blocking(memory_desc_t &memory_desc)
     case Oihw16o: return fill_Oihw16o(memory_desc);
     case Ohwi8o: return fill_Ohwi8o(memory_desc);
     case Ohwi16o: return fill_Ohwi16o(memory_desc);
+    case goiw: return fill_goiw(memory_desc);
+    case gOwi8o: return fill_gOwi8o(memory_desc);
+    case gOIw8o8i: return fill_gOIw8o8i(memory_desc);
+    case gOIw8i8o: return fill_gOIw8i8o(memory_desc);
+    case gOIw16i16o: return fill_gOIw16i16o(memory_desc);
+    case gOIw16o16i: return fill_gOIw16o16i(memory_desc);
+    case gOiw16o: return fill_gOiw16o(memory_desc);
+    case gOwi16o: return fill_gOwi16o(memory_desc);
+    case gOIw8i16o2i: return fill_gOIw8i16o2i(memory_desc);
+    case gOIw8o16i2o: return fill_gOIw8o16i2o(memory_desc);
+    case gIOw16o16i: return fill_gIOw16o16i(memory_desc);
     case goihw: return fill_goihw(memory_desc);
     case hwigo: return fill_hwigo(memory_desc);
+    case hwigo_s8s8: return fill_hwigo(memory_desc);
     case gOIhw8i8o: return fill_gOIhw8i8o(memory_desc);
     case gOIhw16i16o: return fill_gOIhw16i16o(memory_desc);
     case gOIhw4i16o4i: return fill_gOIhw4i16o4i(memory_desc);
+    case gOhIw8o4i: return fill_gOhIw8o4i(memory_desc);
+    case gOhIw8o4i_s8s8: return fill_gOhIw8o4i(memory_desc);
+    case gOIhw4i16o4i_s8s8: return fill_gOIhw4i16o4i(memory_desc);
     case gOIhw8i16o2i: return fill_gOIhw8i16o2i(memory_desc);
     case gOIdhw8i16o2i: return fill_gOIdhw8i16o2i(memory_desc);
     case gOIhw8o16i2o: return fill_gOIhw8o16i2o(memory_desc);
@@ -749,8 +1059,8 @@ status_t memory_desc_wrapper::compute_blocking(memory_desc_t &memory_desc)
     case ndhwc: return fill_ndhwc(memory_desc);
     case oidhw: return fill_oidhw(memory_desc);
     case goidhw: return fill_goidhw(memory_desc);
-    case nCdhw8c: return fill_nCdhw8c(memory_desc);
-    case nCdhw16c: return fill_nCdhw16c(memory_desc);
+    case nCdhw8c: case oIdhw8i: return fill_nCdhw8c(memory_desc);
+    case nCdhw16c: case oIdhw16i: return fill_nCdhw16c(memory_desc);
     case OIdhw16i16o: return fill_OIdhw16i16o(memory_desc);
     case gOIdhw16i16o: return fill_gOIdhw16i16o(memory_desc);
     case OIdhw8i8o: return fill_OIdhw8i8o(memory_desc);
index 8b8d17b..91e18cf 100644 (file)
@@ -67,8 +67,8 @@ struct memory_desc_wrapper: public c_compatible {
      * is true, and the number of data elements otherwise */
     size_t nelems(bool with_padding = false) const {
         if (is_zero()) return 0;
-    return utils::array_product<int, size_t>(with_padding
-                ? blocking_desc().padding_dims : dims(), ndims());
+        return (utils::array_product<int, size_t>(with_padding
+                ? blocking_desc().padding_dims : dims(), ndims()));
     }
 
     /** returns true if memory descriptor is zero */
@@ -81,6 +81,41 @@ struct memory_desc_wrapper: public c_compatible {
     size_t data_type_size() const
     { return types::data_type_size(data_type()); }
 
+    /** returns the size of the data type of the additional buffer */
+    size_t additional_buffer_data_size() const {
+        using namespace mkldnn::impl::memory_format;
+        return (utils::one_of(format(), hwio_s8s8, hwigo_s8s8,
+                    gOIhw4i16o4i_s8s8, OIhw4i16o4i_s8s8, OhIw8o4i_s8s8, gOhIw8o4i_s8s8))
+            ? sizeof(int32_t) : 0;
+    }
+
+    /** returns true if the memory format has an additional buffer */
+    bool is_additional_buffer() const {
+        using namespace mkldnn::impl::memory_format;
+        return utils::one_of(format(), hwio_s8s8, hwigo_s8s8,
+                    gOIhw4i16o4i_s8s8, OIhw4i16o4i_s8s8, OhIw8o4i_s8s8,
+                    gOhIw8o4i_s8s8);
+    }
+
+    /** returns the size of the additional buffer */
+    size_t additional_buffer_size() const {
+        using namespace mkldnn::impl::memory_format;
+        const auto &padding_dims = blocking_desc().padding_dims;
+        switch (format()) {
+            case hwigo_s8s8:
+            case gOIhw4i16o4i_s8s8:
+            case gOhIw8o4i_s8s8:
+                return size_t(padding_dims[0]) * size_t(padding_dims[1])
+                    * additional_buffer_data_size();
+            case hwio_s8s8:
+            case OIhw4i16o4i_s8s8:
+            case OhIw8o4i_s8s8:
+                return size_t(padding_dims[0]) * additional_buffer_data_size();
+            default:
+                return 0;
+        }
+    }
+
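
In other words, the *_s8s8 weight formats reserve one int32 compensation value per output channel (times groups for the grouped formats), appended after the weights themselves. A quick numeric sketch with made-up shapes:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // hwigo_s8s8: additional_buffer_size() = padding_dims[0] *
        // padding_dims[1] * sizeof(int32_t), i.e. one int32 per
        // (group, output channel) pair.
        const size_t groups = 2, oc = 16;                     // made-up shapes
        printf("%zu bytes\n", groups * oc * sizeof(int32_t)); // 128 bytes
        return 0;
    }
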
     /** returns the size required to store described memory
      * note: if offset_padding != 0 returns 0 (need to specify the behavior) */
     size_t size() const {
@@ -112,7 +147,7 @@ struct memory_desc_wrapper: public c_compatible {
                     max_size = nstl::max(max_size,
                             size_t(block * strides[1][d]));
             }
-            return max_size * data_type_size();
+            return max_size * data_type_size() + additional_buffer_size();
         }
     }
 
@@ -132,6 +167,13 @@ struct memory_desc_wrapper: public c_compatible {
         return true;
     }
 
+    /** returns true if memory desc has blocked layout and block dims are 1s */
+    bool is_plain() const {
+        if (!is_blocking_desc()) return false;
+        return utils::array_product(blocking_desc().block_dims,
+                ndims()) == 1;
+    }
+
     /* comparison section */
 
     inline bool operator==(const memory_desc_wrapper &rhs) const;
@@ -180,13 +222,22 @@ struct memory_desc_wrapper: public c_compatible {
             phys_offset += pos_block * blk.strides[0][d];
             phys_offset += pos_within_block * blk.strides[1][d];
         }
-        if (format() == gOIhw4i16o4i || format() == OIhw4i16o4i) {
+        if (utils::one_of(format(), gOIhw4i16o4i, OIhw4i16o4i, gOIhw4i16o4i_s8s8,
+                            OIhw4i16o4i_s8s8)) {
             // TODO: Fix temporary workaround for formats with double blocking
-            const bool with_groups = format() == gOIhw4i16o4i;
+            const bool with_groups = (format() == gOIhw4i16o4i
+                                      || format() == gOIhw4i16o4i_s8s8);
             const int oc_16 = pos[with_groups + 0] % 16;
             const int ic_4  = pos[with_groups + 1] % 4;
             phys_offset += 4 * oc_16 + ic_4 - (oc_16 + 16 * ic_4);
         }
+        if (format() == gOIw8i16o2i || format() == OIw8i16o2i) {
+            // TODO: Fix temporary workaround for formats with double blocking
+            const bool with_groups = format() == gOIw8i16o2i;
+            const int oc_16 = pos[with_groups + 0] % 16;
+            const int ic_2  = pos[with_groups + 1] % 2;
+            phys_offset += -16 * ic_2 + oc_16 + ic_2;
+        }
         if (format() == gOIhw8i16o2i || format() == OIhw8i16o2i) {
             // TODO: Fix temporary workaround for formats with double blocking
             const bool with_groups = format() == gOIhw8i16o2i;
@@ -208,6 +259,13 @@ struct memory_desc_wrapper: public c_compatible {
             const int oc_2  = pos[with_groups + 0] % 2;
             phys_offset += -16 * oc_2 + ic_16 + oc_2;
         }
+        if (format() == gOIw8o16i2o || format() == OIw8o16i2o) {
+            // TODO: Fix temporary workaround for formats with double blocking
+            const bool with_groups = format() == gOIw8o16i2o;
+            const int ic_16 = pos[with_groups + 1] % 16;
+            const int oc_2  = pos[with_groups + 0] % 2;
+            phys_offset += -16 * oc_2 + ic_16 + oc_2;
+        }
         return phys_offset;
     }
 
@@ -330,11 +388,11 @@ inline bool memory_desc_wrapper::similar_to(const memory_desc_wrapper &rhs,
         && dim_start <= ndims() /* guard */
         && array_cmp(dims() + ds, rhs.dims() + ds, ndims() - ds)
         && format_normalize(format()) == format_normalize(rhs.format())
-        && implication(with_data_type, data_type() == rhs.data_type())
+        && IMPLICATION(with_data_type, data_type() == rhs.data_type())
         && array_cmp(blk.block_dims + ds, r_blk.block_dims + ds, ndims() - ds)
         && array_cmp(blk.strides[0] + ds, r_blk.strides[0] + ds, ndims() - ds)
         && array_cmp(blk.strides[1] + ds, r_blk.strides[1] + ds, ndims() - ds)
-        && implication(with_padding,
+        && IMPLICATION(with_padding,
                 array_cmp(blk.padding_dims + ds, r_blk.padding_dims + ds,
                     ndims() - ds)
                 && array_cmp(blk.offset_padding_to_data + ds,
index dce7aeb..b54848f 100644 (file)
@@ -59,35 +59,57 @@ const char *mkldnn_fmt2str(mkldnn_memory_format_t v) {
     if (v == mkldnn_blocked) return "blocked";
     if (v == mkldnn_x) return "x";
     if (v == mkldnn_nc) return "nc";
+    if (v == mkldnn_ncw) return "ncw";
+    if (v == mkldnn_nwc) return "nwc";
     if (v == mkldnn_nchw) return "nchw";
     if (v == mkldnn_nhwc) return "nhwc";
     if (v == mkldnn_chwn) return "chwn";
-    if (v == mkldnn_nChw8c) return "nChw8c";
-    if (v == mkldnn_nChw16c) return "nChw16c";
     if (v == mkldnn_ncdhw) return "ncdhw";
     if (v == mkldnn_ndhwc) return "ndhwc";
-    if (v == mkldnn_nCdhw8c) return "nCdhw8c";
-    if (v == mkldnn_nCdhw16c) return "nCdhw16c";
     if (v == mkldnn_oi) return "oi";
     if (v == mkldnn_io) return "io";
+    if (v == mkldnn_oiw) return "oiw";
+    if (v == mkldnn_wio) return "wio";
     if (v == mkldnn_oihw) return "oihw";
-    if (v == mkldnn_ihwo) return "ihwo";
     if (v == mkldnn_hwio) return "hwio";
-    if (v == mkldnn_dhwio) return "dhwio";
+    if (v == mkldnn_hwio_s8s8) return "hwio_s8s8";
+    if (v == mkldnn_ihwo) return "ihwo";
     if (v == mkldnn_oidhw) return "oidhw";
-    if (v == mkldnn_oIdhw8i) return "oIdhw8i";
-    if (v == mkldnn_oIdhw16i) return "oIdhw16i";
-    if (v == mkldnn_OIdhw8i8o) return "OIdhw8i8o";
-    if (v == mkldnn_OIdhw8o8i) return "OIdhw8o8i";
-    if (v == mkldnn_OIdhw16i16o) return "OIdhw16i16o";
-    if (v == mkldnn_OIdhw16o16i) return "OIdhw16o16i";
-    if (v == mkldnn_Oidhw16o) return "Oidhw16o";
-    if (v == mkldnn_Odhwi16o) return "Odhwi16o";
+    if (v == mkldnn_dhwio) return "dhwio";
+    if (v == mkldnn_goiw) return "goiw";
+    if (v == mkldnn_goihw) return "goihw";
+    if (v == mkldnn_hwigo) return "hwigo";
+    if (v == mkldnn_hwigo_s8s8) return "hwigo_s8s8";
+    if (v == mkldnn_goidhw) return "goidhw";
+    if (v == mkldnn_ntc) return "ntc";
+    if (v == mkldnn_tnc) return "tnc";
+    if (v == mkldnn_ldsnc) return "ldsnc";
+    if (v == mkldnn_ldigo) return "ldigo";
+    if (v == mkldnn_ldgoi) return "ldgoi";
+    if (v == mkldnn_ldgo) return "ldgo";
+    if (v == mkldnn_nCw8c) return "nCw8c";
+    if (v == mkldnn_nCw16c) return "nCw16c";
+    if (v == mkldnn_nChw8c) return "nChw8c";
+    if (v == mkldnn_nChw16c) return "nChw16c";
+    if (v == mkldnn_nCdhw8c) return "nCdhw8c";
+    if (v == mkldnn_nCdhw16c) return "nCdhw16c";
+    if (v == mkldnn_Owi8o) return "Owi8o";
+    if (v == mkldnn_OIw8i8o) return "OIw8i8o";
+    if (v == mkldnn_OIw8o8i) return "OIw8o8i";
+    if (v == mkldnn_OIw16i16o) return "OIw16i16o";
+    if (v == mkldnn_OIw16o16i) return "OIw16o16i";
+    if (v == mkldnn_Oiw16o) return "Oiw16o";
+    if (v == mkldnn_Owi16o) return "Owi16o";
+    if (v == mkldnn_OIw8i16o2i) return "OIw8i16o2i";
+    if (v == mkldnn_OIw8o16i2o) return "OIw8o16i2o";
+    if (v == mkldnn_IOw16o16i) return "IOw16o16i";
+    if (v == mkldnn_oIhw8i) return "oIhw8i";
+    if (v == mkldnn_oIhw16i) return "oIhw16i";
     if (v == mkldnn_OIhw8i8o) return "OIhw8i8o";
     if (v == mkldnn_OIhw16i16o) return "OIhw16i16o";
     if (v == mkldnn_OIhw4i16o4i) return "OIhw4i16o4i";
+    if (v == mkldnn_OIhw4i16o4i_s8s8) return "OIhw4i16o4i_s8s8";
     if (v == mkldnn_OIhw8i16o2i) return "OIhw8i16o2i";
-    if (v == mkldnn_OIdhw8i16o2i) return "OIdhw8i16o2i";
     if (v == mkldnn_OIhw8o16i2o) return "OIhw8o16i2o";
     if (v == mkldnn_OIhw8o8i) return "OIhw8o8i";
     if (v == mkldnn_OIhw16o16i) return "OIhw16o16i";
@@ -97,13 +119,33 @@ const char *mkldnn_fmt2str(mkldnn_memory_format_t v) {
     if (v == mkldnn_Ohwi8o) return "Ohwi8o";
     if (v == mkldnn_Ohwi16o) return "Ohwi16o";
     if (v == mkldnn_OhIw16o4i) return "OhIw16o4i";
-    if (v == mkldnn_goihw) return "goihw";
-    if (v == mkldnn_hwigo) return "hwigo";
+    if (v == mkldnn_OhIw8o4i) return "OhIw8o4i";
+    if (v == mkldnn_OhIw8o4i_s8s8) return "OhIw8o4i_s8s8";
+    if (v == mkldnn_oIdhw8i) return "oIdhw8i";
+    if (v == mkldnn_oIdhw16i) return "oIdhw16i";
+    if (v == mkldnn_OIdhw8i8o) return "OIdhw8i8o";
+    if (v == mkldnn_OIdhw8o8i) return "OIdhw8o8i";
+    if (v == mkldnn_Odhwi8o) return "Odhwi8o";
+    if (v == mkldnn_OIdhw16i16o) return "OIdhw16i16o";
+    if (v == mkldnn_OIdhw16o16i) return "OIdhw16o16i";
+    if (v == mkldnn_Oidhw16o) return "Oidhw16o";
+    if (v == mkldnn_Odhwi16o) return "Odhwi16o";
+    if (v == mkldnn_OIdhw8i16o2i) return "OIdhw8i16o2i";
+    if (v == mkldnn_gOwi8o) return "gOwi8o";
+    if (v == mkldnn_gOIw8o8i) return "gOIw8o8i";
+    if (v == mkldnn_gOIw8i8o) return "gOIw8i8o";
+    if (v == mkldnn_gOIw16i16o) return "gOIw16i16o";
+    if (v == mkldnn_gOIw16o16i) return "gOIw16o16i";
+    if (v == mkldnn_gOiw16o) return "gOiw16o";
+    if (v == mkldnn_gOwi16o) return "gOwi16o";
+    if (v == mkldnn_gOIw8i16o2i) return "gOIw8i16o2i";
+    if (v == mkldnn_gOIw8o16i2o) return "gOIw8o16i2o";
+    if (v == mkldnn_gIOw16o16i) return "gIOw16o16i";
     if (v == mkldnn_gOIhw8i8o) return "gOIhw8i8o";
     if (v == mkldnn_gOIhw16i16o) return "gOIhw16i16o";
     if (v == mkldnn_gOIhw4i16o4i) return "gOIhw4i16o4i";
+    if (v == mkldnn_gOIhw4i16o4i_s8s8) return "gOIhw4i16o4i_s8s8";
     if (v == mkldnn_gOIhw8i16o2i) return "gOIhw8i16o2i";
-    if (v == mkldnn_gOIdhw8i16o2i) return "gOIdhw8i16o2i";
     if (v == mkldnn_gOIhw8o16i2o) return "gOIhw8o16i2o";
     if (v == mkldnn_gOIhw8o8i) return "gOIhw8o8i";
     if (v == mkldnn_gOIhw16o16i) return "gOIhw16o16i";
@@ -115,27 +157,20 @@ const char *mkldnn_fmt2str(mkldnn_memory_format_t v) {
     if (v == mkldnn_Goihw8g) return "Goihw8g";
     if (v == mkldnn_Goihw16g) return "Goihw16g";
     if (v == mkldnn_gOhIw16o4i) return "gOhIw16o4i";
-    if (v == mkldnn_goidhw) return "goidhw";
+    if (v == mkldnn_gOhIw8o4i) return "gOhIw8o4i";
+    if (v == mkldnn_gOhIw8o4i_s8s8) return "gOhIw8o4i_s8s8";
     if (v == mkldnn_gOIdhw8i8o) return "gOIdhw8i8o";
     if (v == mkldnn_gOIdhw8o8i) return "gOIdhw8o8i";
+    if (v == mkldnn_gOdhwi8o) return "gOdhwi8o";
+    if (v == mkldnn_gOIdhw8i16o2i) return "gOIdhw8i16o2i";
     if (v == mkldnn_gOIdhw16i16o) return "gOIdhw16i16o";
     if (v == mkldnn_gOIdhw16o16i) return "gOIdhw16o16i";
     if (v == mkldnn_gOidhw16o) return "gOidhw16o";
     if (v == mkldnn_gOdhwi16o) return "gOdhwi16o";
-    if (v == mkldnn_Odhwi8o) return "Odhwi8o";
-    if (v == mkldnn_gOdhwi8o) return "gOdhwi8o";
-    if (v == mkldnn_ntc) return "ntc";
-    if (v == mkldnn_tnc) return "tnc";
-    if (v == mkldnn_ldsnc) return "ldsnc";
-    if (v == mkldnn_ldigo) return "ldigo";
+    if (v == mkldnn_wino_fmt) return "wino_fmt";
     if (v == mkldnn_ldigo_p) return "ldigo_p";
-    if (v == mkldnn_ldgoi) return "ldgoi";
     if (v == mkldnn_ldgoi_p) return "ldgoi_p";
-    if (v == mkldnn_ldgo) return "ldgo";
-    if (v == mkldnn_wino_fmt) return "wino_fmt";
     if (v == mkldnn_format_last) return "format_last";
-    if (v == mkldnn_oIhw8i) return "oIhw8i";
-    if (v == mkldnn_oIhw16i) return "oIhw16i";
     assert(!"unknown fmt");
     return "unknown fmt";
 }
@@ -159,6 +194,7 @@ const char *mkldnn_prim_kind2str(mkldnn_primitive_kind_t v) {
     if (v == mkldnn_memory) return "memory";
     if (v == mkldnn_view) return "view";
     if (v == mkldnn_reorder) return "reorder";
+    if (v == mkldnn_shuffle) return "shuffle";
     if (v == mkldnn_concat) return "concat";
     if (v == mkldnn_concat_inplace) return "concat_inplace";
     if (v == mkldnn_sum) return "sum";
index 6759294..9741c21 100644 (file)
 #define MKLDNN_THR_OMP 1
 #define MKLDNN_THR_TBB 2
 
+/* Ideally the condition below should never trigger (if the library is built
+ * using the regular cmake). For 3rd-party projects that build the library
+ * from the sources on their own, try to guess the right threading... */
 #if !defined(MKLDNN_THR)
-#define MKLDNN_THR MKLDNN_THR_SEQ
+#   if defined(_OPENMP)
+#       define MKLDNN_THR MKLDNN_THR_OMP
+#   else
+#       define MKLDNN_THR MKLDNN_THR_SEQ
+#   endif
 #endif
 
 #if MKLDNN_THR == MKLDNN_THR_SEQ
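
A compilable restatement of the new fallback, handy for checking what a given compiler invocation will pick (MKLDNN_THR_SEQ is assumed to be 0, matching the defines at the top of this header):

    #include <cstdio>

    #define MKLDNN_THR_SEQ 0   // assumed; defined earlier in the real header
    #define MKLDNN_THR_OMP 1
    #define MKLDNN_THR_TBB 2

    // Mirror of the fallback: prefer OpenMP when the compiler enables it
    // (-fopenmp defines _OPENMP), otherwise build sequential.
    #if !defined(MKLDNN_THR)
    #   if defined(_OPENMP)
    #       define MKLDNN_THR MKLDNN_THR_OMP
    #   else
    #       define MKLDNN_THR MKLDNN_THR_SEQ
    #   endif
    #endif

    int main() { printf("MKLDNN_THR = %d\n", MKLDNN_THR); return 0; }
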
@@ -49,8 +56,8 @@ inline void mkldnn_thr_barrier() {
 }
 
 #elif MKLDNN_THR == MKLDNN_THR_TBB
-#include "tbb/parallel_for.h"
 #include "tbb/task_arena.h"
+#include "tbb/parallel_for.h"
 #define MKLDNN_THR_SYNC 0
 
 inline int mkldnn_get_max_threads()
index 6dfbc8a..f5512b8 100644 (file)
@@ -31,6 +31,7 @@ namespace impl {
 
 template <data_type_t> struct prec_traits {}; /* ::type -> float */
 template <typename> struct data_traits {}; /* ::data_type -> f32 */
+template <int> struct typesize_traits {}; /* byte size -> ::type */
 template <primitive_kind_t> struct pkind_traits {}; /* ::desc_type, ::query_d */
 
 template <> struct prec_traits<data_type::f32> { typedef float type; };
@@ -50,6 +51,10 @@ template <> struct data_traits<int8_t>
 template <> struct data_traits<uint8_t>
 { static constexpr data_type_t data_type = data_type::u8; };
 
+template <> struct typesize_traits<4> { typedef float type; };
+template <> struct typesize_traits<2> { typedef int16_t type; };
+template <> struct typesize_traits<1> { typedef uint8_t type; };
+
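
A quick compile-time check of what the new typesize_traits provides, assuming the header above is included (illustrative only):

    // Compile-time sanity check of the byte-size-to-type mapping:
    static_assert(sizeof(typesize_traits<4>::type) == 4, "4 bytes -> float");
    static_assert(sizeof(typesize_traits<2>::type) == 2, "2 bytes -> int16_t");
    static_assert(sizeof(typesize_traits<1>::type) == 1, "1 byte  -> uint8_t");
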
 #define PKIND_TRAITS_INST(op) \
 template <> struct pkind_traits<primitive_kind::op> { \
     typedef CONCAT2(op, _desc_t) desc_type; \
@@ -58,6 +63,7 @@ template <> struct pkind_traits<primitive_kind::op> { \
 PKIND_TRAITS_INST(memory);
 PKIND_TRAITS_INST(convolution);
 PKIND_TRAITS_INST(deconvolution);
+PKIND_TRAITS_INST(shuffle);
 PKIND_TRAITS_INST(eltwise);
 PKIND_TRAITS_INST(depthwise);
 PKIND_TRAITS_INST(softmax);
index a08e459..e44baef 100644 (file)
@@ -42,8 +42,8 @@ status_t mkldnn_primitive_create(primitive_t **primitive,
         const auto i_oi = (int)inputs[i].output_index;
         const bool ok = true
             && i_p != nullptr
-            && utils::implication(i_p->kind() == memory, i_oi == 0)
-            && utils::implication(i_p->kind() != memory,
+            && IMPLICATION(i_p->kind() == memory, i_oi == 0)
+            && IMPLICATION(i_p->kind() != memory,
                     i_oi < i_p->pd()->n_outputs());
         if (!ok)
             return invalid_arguments;
index 4cc40ae..866c934 100644 (file)
@@ -49,6 +49,13 @@ status_t scales_t::set(int count, int mask, const float *scales) {
     return status::success;
 }
 
+mkldnn::impl::status_t scales_t::scale(float factor) {
+    int cnt = (count_ == 1) ? scales_buf_size : count_;
+    for (int c = 0; c < cnt; ++c)
+        scales_[c] *= factor;
+    return status::success;
+}
+
 }
 }
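
A standalone mock of the set()/scale() semantics: when count_ == 1 the single scale is replicated across the whole internal buffer, so scale() must touch scales_buf_size entries to keep the broadcast copies consistent. Names and sizes here are illustrative, not the library's:

    #include <cstdio>

    // Mock of scales_t::set(float)/scale(float); the real type lives in
    // primitive_attr.hpp and returns status codes. Illustration only.
    struct scales_mock {
        static const int buf_size = 16;  // stands in for scales_buf_size
        int count_ = 1;
        float scales_[buf_size];
        void set(float s) {
            count_ = 1;
            for (int i = 0; i < buf_size; ++i) scales_[i] = s;
        }
        void scale(float f) {
            const int cnt = (count_ == 1) ? buf_size : count_;
            for (int c = 0; c < cnt; ++c) scales_[c] *= f;
        }
    };

    int main() {
        scales_mock s;
        s.set(0.5f);                  // single scale, replicated across buffer
        s.scale(2.0f);                // fold an extra factor in place
        printf("%g\n", s.scales_[0]); // 1
        return 0;
    }
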
 
index e9e5bdd..3f56d99 100644 (file)
@@ -54,6 +54,7 @@ struct scales_t: public c_compatible {
 
     status_t set(int count, int mask, const float *scales);
     status_t set(float single_scale) { return this->set(1, 0, &single_scale); }
+    status_t scale(float factor);
 
     int count_;
     int mask_;
@@ -106,19 +107,19 @@ struct mkldnn_post_ops: public mkldnn::impl::c_compatible {
                 bool require_nslope_zero = true) const {
             using namespace mkldnn::impl;
             return kind == primitive_kind::eltwise
-                && utils::implication(require_scale_one, eltwise.scale == 1.f)
+                && IMPLICATION(require_scale_one, eltwise.scale == 1.f)
                 && eltwise.alg == alg_kind::eltwise_relu
-                && utils::implication(require_nslope_zero, eltwise.alpha == 0.f);
+                && IMPLICATION(require_nslope_zero, eltwise.alpha == 0.f);
         }
         bool is_sum(bool require_scale_one = true) const {
             using namespace mkldnn::impl;
             return kind == primitive_kind::sum
-                && utils::implication(require_scale_one, sum.scale == 1.f);
+                && IMPLICATION(require_scale_one, sum.scale == 1.f);
         }
         bool is_eltwise(bool require_scale_one = true) const {
             using namespace mkldnn::impl;
             return kind == primitive_kind::eltwise
-                   && utils::implication(require_scale_one, eltwise.scale == 1.f);
+                   && IMPLICATION(require_scale_one, eltwise.scale == 1.f);
         }
         bool is_depthwise() const {
             using namespace mkldnn::impl;
index 82720c7..c88aaeb 100644 (file)
@@ -35,6 +35,11 @@ status_t primitive_desc_t::query(query_t what, int idx, void *result) const {
         case query::engine: *(engine_t**)result = engine(); break;
         case query::primitive_kind: *(primitive_kind_t*)result = kind(); break;
 
+        case query::op_d:
+            if (idx != 0 || op_desc() == nullptr) return invalid_arguments;
+            *(const_c_op_desc_t *)result
+                = static_cast<const_c_op_desc_t>(op_desc()); break;
+
         case query::input_pd: return safe_ret_pd(input_pd(idx));
         case query::output_pd: return safe_ret_pd(output_pd(idx));
         case query::src_pd: return safe_ret_pd(src_pd(idx));
index 691bd40..75da085 100644 (file)
@@ -40,7 +40,7 @@ status_t mkldnn_reorder_primitive_desc_create_v2(
 
     auto i_ek = input->engine()->kind();
     auto o_ek = output->engine()->kind();
-    if (!implication(i_ek != o_ek, one_of(engine_kind::cpu, i_ek, o_ek)))
+    if (!IMPLICATION(i_ek != o_ek, one_of(engine_kind::cpu, i_ek, o_ek)))
         return invalid_arguments;
 
     auto r_pd = reinterpret_cast<reorder_pd_t **>(
index 87daf96..432763b 100644 (file)
@@ -60,8 +60,8 @@ status_t mkldnn_rnn_cell_desc_init(rnn_cell_desc_t *rnn_cell_desc,
     bool args_ok = true
             && one_of(cell_kind, vanilla_rnn, vanilla_lstm, vanilla_gru,
                     gru_linear_before_reset)
-            && implication(cell_kind == vanilla_rnn,
-                       one_of(act_f, eltwise_relu, eltwise_tanh));
+            && IMPLICATION(cell_kind == vanilla_rnn,
+                    one_of(act_f, eltwise_relu, eltwise_tanh, eltwise_logistic));
     if (!args_ok)
         return status::invalid_arguments;
 
@@ -137,13 +137,13 @@ status_t MKLDNN_API mkldnn_rnn_forward_desc_init(mkldnn_rnn_desc_t *rnn_desc,
             && DIC == weights_layer_desc->dims[4]
             && DIC == weights_iter_desc->dims[4]
             && DLC == dst_layer_desc->dims[2] && L == weights_iter_desc->dims[0]
-            && implication(!is_zero_md(dst_iter_desc), true
+            && IMPLICATION(!is_zero_md(dst_iter_desc), true
                                && DIC == dst_iter_desc->dims[4]
                                && L == dst_iter_desc->dims[0])
-            && implication(!is_zero_md(bias_desc), L == bias_desc->dims[0])
-            && implication(
+            && IMPLICATION(!is_zero_md(bias_desc), L == bias_desc->dims[0])
+            && IMPLICATION(
                        !is_zero_md(src_iter_desc), L == src_iter_desc->dims[0])
-            && implication(rnn_cell_desc->cell_kind == alg_kind::vanilla_gru,
+            && IMPLICATION(rnn_cell_desc->cell_kind == alg_kind::vanilla_gru,
                        DIC == weights_iter_desc->dims[2]);
     if (!args_ok)
         return invalid_arguments;
@@ -222,13 +222,13 @@ status_t MKLDNN_API mkldnn_rnn_backward_desc_init(mkldnn_rnn_desc_t *rnn_desc,
             && DIC == weights_layer_desc->dims[4]
             && DIC == weights_iter_desc->dims[4]
             && DLC == dst_layer_desc->dims[2] && L == weights_iter_desc->dims[0]
-            && implication(!is_zero_md(dst_iter_desc), true
+            && IMPLICATION(!is_zero_md(dst_iter_desc), true
                                && DIC == dst_iter_desc->dims[4]
                                && L == dst_iter_desc->dims[0])
-            && implication(!is_zero_md(bias_desc), L == bias_desc->dims[0])
-            && implication(
+            && IMPLICATION(!is_zero_md(bias_desc), L == bias_desc->dims[0])
+            && IMPLICATION(
                        !is_zero_md(src_iter_desc), L == src_iter_desc->dims[0])
-            && implication(rnn_cell_desc->cell_kind == alg_kind::vanilla_gru,
+            && IMPLICATION(rnn_cell_desc->cell_kind == alg_kind::vanilla_gru,
                        DIC == weights_iter_desc->dims[2]);
     if (!args_ok)
         return invalid_arguments;
index 93a677d..5b11d5a 100644 (file)
@@ -57,86 +57,156 @@ struct rnn_pd_t : public primitive_desc_t {
                 prop_kind::backward);
     }
 
+    inline bool is_fwd() const {
+        return utils::one_of(desc_.prop_kind, prop_kind::forward_training,
+                prop_kind::forward_inference);
+    }
+
     inline size_t ws_states_size() {
-        int wic = nstl::max(SLC(), nstl::max(SIC(), DIC()));
-        return (size_t)(L() + 1) * D() * (T() + 1) * S() * MB() * wic;
+        return (size_t)(L() + 1) * D() * (T() + 1) * S() * MB() * S_GLD();
     }
 
     inline size_t ws_diff_states_size() {
-        int wic = nstl::max(SLC(), nstl::max(SIC(), DIC()));
-        return (size_t)(L() + 1) * D() * (T() + 1) * (S() + 1) * MB() * wic;
+        return (size_t)(L() + 1) * D() * (T() + 1) * (S() + 1) * MB() * S_GLD();
     }
 
-    inline size_t ws_gates_size() {
-        int n_layer = L();
-        int n_direction = D();
-        int n_iter = T();
-        int n_gates = G();
-        int batch = MB();
-        int s_size = DIC();
+    inline size_t ws_weights_layer_size() {
+        size_t ld = is_fwd() ? G_GLD() : S_GLD();
+        size_t not_ld = is_fwd() ? SLC() : G() * DIC();
+        return (size_t)(L() * D() * ld * not_ld);
+    }
+
+    inline size_t ws_weights_iter_size() {
+        size_t ld = is_fwd() ? G_GLD() : S_GLD();
+        size_t not_ld = is_fwd() ? SIC() : G() * DIC();
+        return (size_t)(L() * D() * ld * not_ld);
+    }
+
+    inline size_t ws_diff_weights_layer_size() {
+        return (size_t)(L() * D() * SLC() * GC());
+    }
 
-        return (size_t)n_layer * n_direction * n_iter * batch * n_gates
-                * s_size;
+    inline size_t ws_diff_weights_iter_size() {
+        return (size_t)(L() * D() * SIC() * GC());
+    }
+
+    inline size_t ws_gates_size() {
+        return (size_t) L() * D() * T() * MB() * GC();
     }
 
     inline size_t ws_cell_comp_size() {
-        int n_gates = G();
-        int batch = MB();
-        int s_size = DIC();
-        return (size_t)is_lbr() * n_gates * batch * s_size;
+        return (size_t)is_lbr() * MB() * GC();
     }
 
     inline size_t ws_grid_comp_size() {
-        int n_layer = L();
-        int n_direction = D();
-        int n_iter = T();
-        int batch = MB();
-        int s_size = DIC();
-        return (size_t)is_lbr() * is_training() * n_layer * n_direction * n_iter
-                * batch * s_size;
+        return (size_t)is_lbr() * is_training() * L() * D() * T() * MB() * DIC();
     }
 
     inline int ws_per_cell() {
-        int batch = MB();
-        int s_size = DIC();
-        return is_lbr() * is_training() * batch * s_size;
+        return is_lbr() * MB() * DIC();
     }
 
-    inline void set_offsets(size_t &ws_gates_offset, size_t &ws_states_offset,
-            size_t &ws_diff_states_offset, size_t &ws_grid_comp_offset,
-            size_t &ws_cell_comp_offset) {
+    // Returns the scratchpad size if use_workspace is true;
+    // returns the workspace size if use_workspace is false
+    // and all the scratchpad booleans are false.
+    inline size_t set_offsets(bool use_workspace,
+        size_t &ws_gates_offset, size_t &ws_states_offset,
+        size_t &ws_diff_states_offset, size_t &ws_grid_comp_offset,
+        bool use_ws_cell_comp, size_t &ws_cell_comp_offset,
+        bool copy_weights_layer_, size_t &ws_weights_layer_offset,
+        bool copy_weights_iter_, size_t &ws_weights_iter_offset,
+        bool copy_diff_weights_layer, size_t &ws_diff_weights_layer_offset,
+        bool copy_diff_weights_iter, size_t &ws_diff_weights_iter_offset) {
         const size_t page_size = 4096; // 2097152;
-        ws_gates_offset
-                = 0; // assumes the workspace base pointer is page aligned
-        ws_states_offset = utils::rnd_up(ws_gates_size(), page_size);
-        ws_diff_states_offset
-                = utils::rnd_up(ws_states_offset + ws_states_size(), page_size);
-        ws_grid_comp_offset = utils::rnd_up(ws_diff_states_offset
-                + ws_diff_states_size(), page_size);
+        size_t current_offset;
+
+        /* Mandatory workspaces: go to workspace if use_workspace, scratchpad otherwise */
+        current_offset = 0;  // assumes the workspace base pointer is page aligned
+        ws_gates_offset = current_offset;
+        current_offset += ws_gates_size();
+
+        current_offset = utils::rnd_up(current_offset, page_size);
+        ws_states_offset = current_offset;
+        current_offset += ws_states_size();
+
+        current_offset = utils::rnd_up(current_offset, page_size);
+        ws_diff_states_offset = current_offset;
+        current_offset += ws_diff_states_size();
+
+        current_offset = utils::rnd_up(current_offset, page_size);
+        ws_grid_comp_offset = current_offset;
+        current_offset += ws_grid_comp_size();
+
+        // ws_cell_comp is optional
+        if (use_ws_cell_comp) {
+            current_offset = utils::rnd_up(current_offset, page_size);
+            ws_cell_comp_offset = current_offset;
+            current_offset += ws_cell_comp_size();
+        }
+
+        /* Optional scratchpads */
+        // Assumes the scratchpad base pointer is page aligned.
+        // If use_workspace, the buffers below go to a separate scratchpad
+        // (offsets restart at 0); otherwise all stays in one allocation.
+        current_offset = use_workspace ? 0 : current_offset;
+
+        if (copy_weights_layer_) {
+            current_offset = utils::rnd_up(current_offset, page_size);
+            ws_weights_layer_offset = current_offset;
+            current_offset += ws_weights_layer_size();
+        }
+
+        if (copy_weights_iter_) {
+            current_offset = utils::rnd_up(current_offset, page_size);
+            ws_weights_iter_offset = current_offset;
+            current_offset += ws_weights_iter_size();
+        }
+
+        if (copy_diff_weights_layer) {
+            current_offset = utils::rnd_up(current_offset, page_size);
+            ws_diff_weights_layer_offset = current_offset;
+            current_offset += ws_diff_weights_layer_size();
+        }
+
+        if (copy_diff_weights_iter) {
+            current_offset = utils::rnd_up(current_offset, page_size);
+            ws_diff_weights_iter_offset = current_offset;
+            current_offset += ws_diff_weights_iter_size();
+        }
 
-        ws_cell_comp_offset = utils::rnd_up(ws_grid_comp_offset
-                + ws_grid_comp_size(), page_size);
+        return current_offset;
     }
 
     inline size_t get_ws_size() {
-        size_t ws_gates_offset, ws_states_offset, ws_diff_states_offset,
-            ws_grid_comp_offset, ws_cell_comp_offset;
-        set_offsets(
-                ws_gates_offset, ws_states_offset, ws_diff_states_offset,
-                ws_grid_comp_offset, ws_cell_comp_offset);
-        return ws_grid_comp_offset + ws_grid_comp_size();
-    }
-
-    inline size_t get_scratchpad_size() {
-        size_t ws_gates_offset, ws_states_offset, ws_diff_states_offset,
-            ws_grid_comp_offset, ws_cell_comp_offset;
-        set_offsets(
-                ws_gates_offset, ws_states_offset, ws_diff_states_offset,
-                ws_grid_comp_offset, ws_cell_comp_offset);
-        if (desc_.prop_kind == prop_kind::forward_inference)
-            return ws_cell_comp_offset + ws_cell_comp_size();
-        else
-            return ws_cell_comp_size();
+        size_t ws_gates_offset, ws_states_offset,
+            ws_diff_states_offset, ws_grid_comp_offset,
+            ws_cell_comp_offset, ws_weights_layer_offset,
+            ws_weights_iter_offset, ws_diff_weights_layer_offset,
+            ws_diff_weights_iter_offset;
+        return set_offsets(false,
+                     ws_gates_offset, ws_states_offset,
+                     ws_diff_states_offset, ws_grid_comp_offset,
+                     is_lbr(), ws_cell_comp_offset,
+                     false, ws_weights_layer_offset,
+                     false, ws_weights_iter_offset,
+                     false, ws_diff_weights_layer_offset,
+                     false, ws_diff_weights_iter_offset);
+    }
+
+    inline size_t get_scratchpad_size(bool use_workspace) {
+        size_t ws_gates_offset, ws_states_offset,
+            ws_diff_states_offset, ws_grid_comp_offset,
+            ws_cell_comp_offset, ws_weights_layer_offset,
+            ws_weights_iter_offset, ws_diff_weights_layer_offset,
+            ws_diff_weights_iter_offset;
+        return set_offsets(use_workspace,
+                     ws_gates_offset, ws_states_offset,
+                     ws_diff_states_offset, ws_grid_comp_offset,
+                     false, ws_cell_comp_offset,
+                     false, ws_weights_layer_offset,
+                     false, ws_weights_iter_offset,
+                     false, ws_diff_weights_layer_offset,
+                     false, ws_diff_weights_iter_offset);
     }
 
     int T() const { return desc_.src_layer_desc.dims[0]; }
@@ -153,6 +223,108 @@ struct rnn_pd_t : public primitive_desc_t {
 
     int DLC() const { return desc_.dst_layer_desc.dims[2]; }
 
+    int get_good_ld(int dim) {
+        // we want matrices' leading dimensions to be 64-byte aligned
+        // and not divisible by 256, to avoid 4K aliasing effects
+        int ld = utils::rnd_up(dim, (int)(64/sizeof(float)));
+        return (ld % 256 == 0) ? ld + 64/sizeof(float) : ld;
+    }
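
A standalone mirror of get_good_ld() with a few sample values (the comment above ties multiples of 256 elements to 4K-aliasing effects between successive matrix rows):

    #include <cstdio>

    // Align the leading dimension up to 64 bytes (16 floats) and avoid
    // multiples of 256 elements -- the same rules as get_good_ld().
    int get_good_ld_sketch(int dim) {
        const int step = 64 / sizeof(float);  // 16
        int ld = (dim + step - 1) / step * step;
        return (ld % 256 == 0) ? ld + step : ld;
    }

    int main() {
        printf("%d %d %d\n",
               get_good_ld_sketch(100),   // 112
               get_good_ld_sketch(256),   // 272
               get_good_ld_sketch(512));  // 528
        return 0;
    }
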
+
+    int WIC() {
+        // wic will be the leading dimension of our B matrices
+        return get_good_ld(nstl::max(SLC(), nstl::max(SIC(), DIC())));
+    }
+
+    int GC() {
+        // gc will be the leading dimension of our C matrices
+        return get_good_ld(G() * DIC());
+    }
+
+    /* replacement functions for meaningless WIC and GC:
+       - LD stands for leading dimension
+       - GLD stands for good leading dimension
+       - NLD stands for not leading dimension (so the other dim)
+    */
+    int G_GLD() {
+        // good leading dimension for the gates
+        // C matrices for fwd, B matrices for bwd
+        return get_good_ld(G() * DIC());
+    }
+
+    int S_GLD() {
+        // good leading dimension for the states
+        // B matrices for fwd, B matrices for bwd_w, C matrices for bwd_d
+        return get_good_ld(nstl::max(SLC(), nstl::max(SIC(), DIC())));
+    }
+
+    int W_GLD() {
+        // good leading dimension for the weights
+        return is_fwd() ? G_GLD() : S_GLD();
+    }
+
+    int DW_GLD() {
+        // good leading dimension for the diff weights
+        return weights_copy_enabled() ? G_GLD() : G() * DIC();
+    }
+
+    int weights_copy_enabled() { return (T() > 1); }
+
+    int get_weights_ld(int feature_dim) {
+        return is_fwd() ? G() * DIC() : feature_dim;
+    }
+
+    int get_weights_nld(int feature_dim) {
+        return !(is_fwd()) ? G() * DIC() : feature_dim;
+    }
+
+    int WL_LD() {
+        return get_weights_ld(SLC());
+    }
+
+    int WL_GLD() {
+        return weights_copy_enabled() ? get_good_ld(WL_LD()) : WL_LD();
+    }
+
+    int WI_LD() {
+        return get_weights_ld(SIC());
+    }
+
+    int WI_GLD() {
+        return weights_copy_enabled() ? get_good_ld(WI_LD()) : WI_LD();
+    }
+
+    int DWL_LD() {
+        return G() * DIC();
+    }
+
+    int DWL_GLD() {
+        return weights_copy_enabled() ? get_good_ld(DWL_LD()) : DWL_LD();
+    }
+
+    int DWI_LD() {
+        return G() * DIC();
+    }
+
+    int DWI_GLD() {
+        return weights_copy_enabled() ? get_good_ld(DWI_LD()) : DWI_LD();
+    }
+
+    int WL_NLD() {
+        return get_weights_nld(SLC());
+    }
+
+    int WI_NLD() {
+        return get_weights_nld(SIC());
+    }
+
+    int DWL_NLD() {
+        return SLC();
+    }
+
+    int DWI_NLD() {
+        return SIC();
+    }
+
     int S() const { return mkldnn_rnn_cell_get_states_count(&desc_.cell_desc); }
 
     bool with_bias() const {
diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/shuffle.cpp b/inference-engine/thirdparty/mkl-dnn/src/common/shuffle.cpp
new file mode 100644 (file)
index 0000000..59d8b24
--- /dev/null
@@ -0,0 +1,72 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <assert.h>
+#include "mkldnn.h"
+
+#include "c_types_map.hpp"
+#include "type_helpers.hpp"
+#include "utils.hpp"
+
+using namespace mkldnn::impl;
+using namespace mkldnn::impl::utils;
+using namespace mkldnn::impl::status;
+using namespace mkldnn::impl::prop_kind;
+using namespace mkldnn::impl::types;
+
+namespace {
+status_t shuffle_desc_init(shuffle_desc_t *shuffle_desc, prop_kind_t prop_kind,
+        const memory_desc_t *data_desc, int axis, int group_size) {
+    bool args_ok = true
+        && !any_null(shuffle_desc, data_desc)
+        && one_of(prop_kind, forward_training, forward_inference,
+                  backward, backward_data)
+        && axis >= 0 && axis < data_desc->ndims
+        && group_size > 0 && group_size <= data_desc->dims[axis];
+    if (!args_ok) return invalid_arguments;
+
+    auto sd = shuffle_desc_t();
+    sd.primitive_kind = primitive_kind::shuffle;
+    sd.prop_kind = prop_kind;
+    sd.data_desc = *data_desc;
+    sd.axis = axis;
+    sd.group_size = group_size;
+
+    bool consistency = true
+        && sd.data_desc.dims[axis] % sd.group_size == 0;
+    if (!consistency) return invalid_arguments;
+
+    *shuffle_desc = sd;
+    return success;
+}
+}
+
+status_t mkldnn_shuffle_forward_desc_init(shuffle_desc_t *shuffle_desc,
+        prop_kind_t prop_kind, const memory_desc_t *data_desc, int axis,
+        int group_size) {
+    if (!one_of(prop_kind, forward_training, forward_inference))
+        return invalid_arguments;
+    return shuffle_desc_init(shuffle_desc, prop_kind, data_desc, axis,
+        group_size);
+}
+
+status_t mkldnn_shuffle_backward_desc_init(shuffle_desc_t *shuffle_desc,
+        const memory_desc_t *diff_data_desc, int axis, int group_size) {
+    return shuffle_desc_init(shuffle_desc, backward_data, diff_data_desc, axis,
+        group_size);
+}
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
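
shuffle_desc_init above only validates the descriptor (axis in range, group_size dividing the axis size). The operation itself is the familiar reshape-transpose-flatten channel shuffle; a standalone sketch of that permutation along one axis (my restatement for illustration, not the library's kernel, which also handles strides and blocking):

    #include <cstdio>

    // Channel shuffle along an axis of size C with group_size g (C % g == 0):
    // view the axis as a (C/g, g) matrix, transpose it, flatten back.
    // Returns the source index for output position i.
    int shuffle_src(int i, int C, int g) {
        int rows = C / g;
        return (i % rows) * g + i / rows;
    }

    int main() {
        const int C = 6, g = 2;
        for (int i = 0; i < C; ++i)
            printf("%d ", shuffle_src(i, C, g)); // 0 2 4 1 3 5
        printf("\n");
        return 0;
    }
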
diff --git a/inference-engine/thirdparty/mkl-dnn/src/common/shuffle_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/common/shuffle_pd.hpp
new file mode 100644 (file)
index 0000000..ab7ddc1
--- /dev/null
@@ -0,0 +1,94 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef SHUFFLE_PD_HPP
+#define SHUFFLE_PD_HPP
+
+#include "mkldnn.h"
+
+#include "c_types_map.hpp"
+#include "primitive_desc.hpp"
+#include "memory_pd.hpp"
+
+namespace mkldnn {
+namespace impl {
+
+struct shuffle_pd_t: public primitive_desc_t {
+    static constexpr auto base_pkind = primitive_kind::shuffle;
+
+    typedef shuffle_pd_t base_class;
+    typedef shuffle_pd_t hint_class;
+
+    shuffle_pd_t(mkldnn::impl::engine_t *engine,
+            const shuffle_desc_t *adesc,
+            const primitive_attr_t *attr,
+            const shuffle_pd_t *hint_fwd_pd)
+        : primitive_desc_t(engine, attr, primitive_kind::shuffle)
+        , desc_(*adesc)
+        , hint_fwd_pd_(hint_fwd_pd) {}
+    virtual ~shuffle_pd_t() {}
+
+    const shuffle_desc_t *desc() const { return &desc_; }
+    virtual const op_desc_t *op_desc() const override
+    { return reinterpret_cast<const op_desc_t *>(this->desc()); }
+    virtual void init_info() override { init_info_shuffle(this, this->info_); }
+
+    virtual const memory_pd_t *input_pd(int index = 0) const override
+    { return index == 0 ? (is_fwd() ? src_pd() : diff_dst_pd()) : nullptr; }
+    virtual const memory_pd_t *output_pd(int index = 0) const override
+    { return index == 0 ? (is_fwd() ? dst_pd() : diff_src_pd()) : nullptr; }
+
+    virtual int n_inputs() const override { return 1; }
+    virtual int n_outputs() const override { return 1; }
+
+    virtual status_t query(query_t what, int idx, void *result) const override
+    {
+        switch (what) {
+        case query::shuffle_d:
+            *(const shuffle_desc_t**)result = desc(); break;
+        default: return primitive_desc_t::query(what, idx, result);
+        }
+        return status::success;
+    }
+
+    /* shuffle aux functions */
+    inline bool is_fwd() const {
+        return utils::one_of(desc_.prop_kind, prop_kind::forward_training,
+                prop_kind::forward_inference);
+    }
+    inline int ndims() const { return desc_.data_desc.ndims; }
+    inline int MB() const { return desc_.data_desc.dims[0]; }
+    inline int C() const { return ndims() >= 2 ? desc_.data_desc.dims[1] : 1; }
+    inline int D() const { return ndims() == 5 ? desc_.data_desc.dims[2] : 1; }
+    inline int H() const { return ndims() >= 4 ?
+                                  desc_.data_desc.dims[ndims() - 2] : 1; }
+    inline int W() const { return ndims() >= 3 ?
+                                  desc_.data_desc.dims[ndims() - 1] : 1; }
+    inline int axis() const { return desc_.axis; }
+    inline int axis_size() const { return desc_.data_desc.dims[axis()]; }
+    inline int group_size() const { return desc_.group_size; }
+
+protected:
+    shuffle_desc_t desc_;
+    const shuffle_pd_t *hint_fwd_pd_;
+};
+
+}
+}
+
+#endif
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
index 8103695..a7cf1a1 100644 (file)
@@ -74,6 +74,7 @@ inline memory_format_t flat_memory_format(int ndims) {
     switch (ndims) {
     case 1: return memory_format::x;
     case 2: return memory_format::nc;
+    case 3: return memory_format::ncw;
     case 4: return memory_format::nchw;
     case 5: return memory_format::ncdhw;
     default: return memory_format::undef;
@@ -91,6 +92,10 @@ inline memory_format_t format_normalize(const memory_format_t fmt) {
     const bool is_blocked = utils::one_of(fmt, blocked,
             x,
             nc,
+            ncw,
+            nwc,
+            nCw8c,
+            nCw16c,
             nchw,
             nhwc,
             chwn,
@@ -102,9 +107,22 @@ inline memory_format_t format_normalize(const memory_format_t fmt) {
             nCdhw16c,
             oi,
             io,
+            oiw,
+            wio,
+            Owi8o,
+            OIw8i8o,
+            OIw8o8i,
+            OIw16i16o,
+            OIw16o16i,
+            Oiw16o,
+            Owi16o,
+            OIw8i16o2i,
+            OIw8o16i2o,
+            IOw16o16i,
             oihw,
             ihwo,
             hwio,
+            hwio_s8s8,
             dhwio,
             oidhw,
             OIdhw8i8o,
@@ -121,24 +139,42 @@ inline memory_format_t format_normalize(const memory_format_t fmt) {
             OIhw8i8o,
             OIhw16i16o,
             OIhw4i16o4i,
+            OIhw4i16o4i_s8s8,
             OIhw8i16o2i,
             OIdhw8i16o2i,
             OIhw8o16i2o,
             OIhw8o8i,
+            OhIw8o4i,
+            OhIw8o4i_s8s8,
             OIhw16o16i,
             IOhw16o16i,
             Oihw16o,
             Ohwi8o,
             Ohwi16o,
+            goiw,
+            gOwi8o,
+            gOIw8i8o,
+            gOIw8o8i,
+            gOIw16i16o,
+            gOIw16o16i,
+            gOiw16o,
+            gOwi16o,
+            gOIw8i16o2i,
+            gOIw8o16i2o,
+            gIOw16o16i,
             goihw,
             hwigo,
+            hwigo_s8s8,
             gOIhw8i8o,
             gOIhw16i16o,
             gOIhw4i16o4i,
+            gOIhw4i16o4i_s8s8,
             gOIhw8i16o2i,
             gOIdhw8i16o2i,
             gOIhw8o16i2o,
             gOIhw8o8i,
+            gOhIw8o4i,
+            gOhIw8o4i_s8s8,
             gOIhw16o16i,
             gIOhw16o16i,
             gOihw16o,
@@ -165,8 +201,10 @@ inline memory_format_t format_normalize(const memory_format_t fmt) {
 
 inline bool is_format_double_blocked(memory_format_t fmt) {
     using namespace memory_format;
-    return utils::one_of(OIhw8i16o2i, OIdhw8i16o2i, OIhw8o16i2o, OIhw4i16o4i,
-            gOIhw8i16o2i, gOIdhw8i16o2i, gOIhw8o16i2o,gOIhw4i16o4i);
+    return utils::one_of(OIw8o16i2o, OIw8i16o2i, OIhw8i16o2i, OIdhw8i16o2i,
+            OIhw8o16i2o, OIhw4i16o4i, OIhw4i16o4i_s8s8, gOIw8o16i2o, gOIw8i16o2i,
+            gOIhw8i16o2i, gOIdhw8i16o2i, gOIhw8o16i2o, gOIhw4i16o4i,
+            gOIhw4i16o4i_s8s8);
 }
 
 inline bool blocking_desc_is_equal(const blocking_desc_t &lhs,
@@ -257,7 +295,8 @@ inline data_type_t default_accum_data_type(data_type_t src_dt,
     if (one_of(prop_kind, forward_training, forward_inference)) {
         if (src_dt == s16 && wei_dt == s16 && dst_dt == s32)
             return s32;
-        if (src_dt == u8 && wei_dt == s8 && one_of(dst_dt, f32, s32, s8, u8))
+        if ((src_dt == u8 || src_dt == s8)
+            && wei_dt == s8 && one_of(dst_dt, f32, s32, s8, u8))
             return s32;
     } else if (prop_kind == backward_data) {
         if (src_dt == s32 && wei_dt == s16 && dst_dt == s16)
index 0b0f787..01fa467 100644 (file)
 #include <stdlib.h>
 #include <assert.h>
 
+#define MSAN_ENABLED 0
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#undef MSAN_ENABLED
+#define MSAN_ENABLED 1
+#include <sanitizer/msan_interface.h>
+#endif
+#endif
+
 #include "c_types_map.hpp"
 
 namespace mkldnn {
 namespace impl {
 
+// Sanity check for 64 bits
+static_assert(sizeof(void*) == 8, "Intel(R) MKL-DNN supports 64 bit only");
+
 #define UNUSED(x) ((void)x)
 #define MAYBE_UNUSED(x) UNUSED(x)
 
@@ -36,6 +48,8 @@ namespace impl {
     return status; \
 } while (0)
 
+#define IMPLICATION(cause, effect) (!(cause) || !!(effect))
+
 #ifdef _WIN32
 #define __PRETTY_FUNCTION__ __FUNCSIG__
 #endif
@@ -105,17 +119,15 @@ inline bool everyone_is(T val, P item, Args... item_others) {
 }
 
 template <typename T, typename P>
-inline bool one_of(T val, P item) { return val == item; }
+constexpr bool one_of(T val, P item) { return val == item; }
 template <typename T, typename P, typename... Args>
-inline bool one_of(T val, P item, Args... item_others) {
+constexpr bool one_of(T val, P item, Args... item_others) {
     return val == item || one_of(val, item_others...);
 }
 
 template <typename... Args>
 inline bool any_null(Args... ptrs) { return one_of(nullptr, ptrs...); }
 
-inline bool implication(bool cause, bool effect) { return !cause || effect; }
-
 template<typename T>
 inline void array_copy(T *dst, const T *src, size_t size) {
     for (size_t i = 0; i < size; ++i) dst[i] = src[i];
@@ -226,6 +238,13 @@ inline bool nd_iterator_jump(U &cur, const U end, W &x, const Y &X,
     return false;
 }
 
+template <typename T>
+inline T pick(size_t i, const T &x0) { return x0; }
+template <typename T, typename ...Args>
+inline T pick(size_t i, const T &x0, Args &&... args) {
+    return i == 0 ? x0 : pick(i - 1, utils::forward<Args>(args)...);
+}
+
 template <typename Telem, size_t Tdims>
 struct array_offset_calculator {
     template <typename... Targs>
@@ -291,6 +310,13 @@ FILE *mkldnn_fopen(const char *filename, const char *mode);
 void set_rnd_mode(round_mode_t rnd_mode);
 void restore_rnd_mode();
 
+constexpr int msan_enabled = MSAN_ENABLED;
+inline void msan_unpoison(void *ptr, size_t size) {
+#if MSAN_ENABLED
+    __msan_unpoison(ptr, size);
+#endif
+}
+
 unsigned int get_cache_size(int level, bool per_core);
 
 }
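
Two of the utility changes above deserve a note. The inline implication()
becomes the macro IMPLICATION(cause, effect), i.e. (!(cause) || !!(effect)),
which reads as "cause implies effect"; for example,
IMPLICATION(with_bias, bias != nullptr) only fails when a bias was requested
but not supplied. The new pick(i, x0, x1, ...) helper returns the i-th of its
trailing arguments; the convolution headers further down use it to select a
memory format by tensor rank, as in this sketch (ndims is hypothetical and
memory_format's enumerators are assumed in scope):

    /* ndims == 3 -> ncw, ndims == 4 -> nchw, ndims == 5 -> ncdhw */
    memory_format_t fmt = utils::pick(ndims - 3, ncw, nchw, ncdhw);
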
index e784371..e48e94a 100644 (file)
@@ -128,6 +128,29 @@ template <typename pd_t> static void init_info_conv(pd_t *s, char *buffer) {
             aux_str, prb_str);
 }
 
+template <typename pd_t> static void init_info_shuffle(pd_t *s, char *buffer) {
+    DECL_DAT_AUX_PRB_STRS();
+
+    const auto md = (s->desc()->prop_kind == prop_kind::backward_data
+            ? s->diff_dst_pd() : s->src_pd())->desc();
+
+    snprintf(dat_str, MKLDNN_VERBOSE_DAT_LEN, "dt:%s fmt:%s",
+            mkldnn_dt2str(md->data_type), mkldnn_fmt2str(md->format));
+
+    snprintf(aux_str, MKLDNN_VERBOSE_AUX_LEN, "axis:%d group_size:%d",
+            s->axis(), s->group_size());
+
+    int l = 0;
+    for (int d = 0; d < md->ndims - 1; ++d)
+        l += snprintf(prb_str + l, MKLDNN_VERBOSE_PRB_LEN - l,
+                "%dx", md->dims[d]);
+    snprintf(prb_str + l, MKLDNN_VERBOSE_PRB_LEN - l,
+                "%d", md->dims[md->ndims - 1]);
+
+    verbose_templ(buffer, s->kind(), s->name(), s->desc()->prop_kind, dat_str,
+            aux_str, prb_str);
+}
+
 template <typename pd_t> static void init_info_eltwise(pd_t *s, char *buffer) {
     DECL_DAT_AUX_PRB_STRS();
 
index 8fd4fcf..477566b 100644 (file)
@@ -99,7 +99,7 @@ protected:
 
         for (int i = 0; i < n_; ++i) {
             const memory_desc_wrapper i_d(&src_pds_[i]);
-            if (i_d.is_wino_desc())
+            if (i_d.is_wino_desc() || i_d.is_additional_buffer())
                 return unimplemented;
         }
 
index 176f7fb..1db3f4a 100644 (file)
@@ -70,14 +70,14 @@ protected:
     inline memory_format_t src_format()
     {
         using namespace memory_format;
-        return (this->cdesc_().src_desc.ndims == 4) ? nchw : ncdhw;
+        return utils::pick(this->cdesc_().src_desc.ndims - 3, ncw, nchw, ncdhw);
     }
     inline memory_format_t wei_format()
     {
         using namespace memory_format;
-        return (this->cdesc_().src_desc.ndims == 4)
-            ? this->with_groups() ? goihw : oihw
-            : this->with_groups() ? goidhw : oidhw;
+        return this->with_groups()
+            ? utils::pick(this->cdesc_().src_desc.ndims - 3, goiw, goihw, goidhw)
+            : utils::pick(this->cdesc_().src_desc.ndims - 3, oiw, oihw, oidhw);
     }
 
     virtual status_t set_default_params() {
@@ -128,14 +128,14 @@ protected:
     inline memory_format_t src_format()
     {
         using namespace memory_format;
-        return (this->desc_.diff_src_desc.ndims == 4) ? nchw : ncdhw;
+        return utils::pick(this->desc_.diff_src_desc.ndims - 3, ncw, nchw, ncdhw);
     }
     inline memory_format_t wei_format()
     {
         using namespace memory_format;
-        return (this->desc_.diff_src_desc.ndims == 4)
-            ? this->with_groups() ? goihw : oihw
-            : this->with_groups() ? goidhw : oidhw;
+        return this->with_groups()
+            ? utils::pick(this->desc_.diff_src_desc.ndims - 3, goiw, goihw, goidhw)
+            : utils::pick(this->desc_.diff_src_desc.ndims - 3, oiw, oihw, oidhw);
     }
 
     virtual status_t set_default_params() {
@@ -192,14 +192,14 @@ protected:
     inline memory_format_t src_format()
     {
         using namespace memory_format;
-        return (this->desc_.src_desc.ndims == 4) ? nchw : ncdhw;
+        return utils::pick(this->desc_.src_desc.ndims - 3, ncw, nchw, ncdhw);
     }
     inline memory_format_t wei_format()
     {
         using namespace memory_format;
-        return (this->desc_.src_desc.ndims == 4)
-            ? this->with_groups() ? goihw : oihw
-            : this->with_groups() ? goidhw : oidhw;
+        return this->with_groups()
+            ? utils::pick(this->desc_.src_desc.ndims - 3, goiw, goihw, goidhw)
+            : utils::pick(this->desc_.src_desc.ndims - 3, oiw, oihw, oidhw);
     }
 
     virtual status_t set_default_params() {
index 63615bc..104ce88 100644 (file)
 
 #include "cpu/ref_rnn.hpp"
 
-#include "cpu/jit_avx512_core_u8s8s32x_1x1_convolution.hpp"
+#include "cpu/jit_avx512_core_x8s8s32x_1x1_convolution.hpp"
 #include "cpu/jit_avx512_common_1x1_convolution.hpp"
 #include "cpu/jit_avx512_core_fp32_wino_conv_4x3.hpp"
 #include "cpu/jit_avx512_common_convolution_winograd.hpp"
-#include "cpu/jit_avx512_core_u8s8s32x_convolution.hpp"
+#include "cpu/jit_avx512_core_x8s8s32x_convolution.hpp"
 #include "cpu/jit_avx512_common_convolution.hpp"
 #include "cpu/jit_avx2_1x1_convolution.hpp"
 #include "cpu/jit_sse42_1x1_convolution.hpp"
 #include "cpu/jit_avx2_convolution.hpp"
 #include "cpu/jit_sse42_convolution.hpp"
 #include "cpu/gemm_convolution.hpp"
-#include "cpu/gemm_u8s8s32x_convolution.hpp"
+#include "cpu/gemm_x8s8s32x_convolution.hpp"
 #include "cpu/ref_convolution.hpp"
+#include "cpu/jit_avx512_core_u8s8s32x_deconvolution.hpp"
 #include "cpu/ref_deconvolution.hpp"
+#include "cpu/ref_shuffle.hpp"
 #include "cpu/jit_uni_eltwise.hpp"
 #include "cpu/ref_eltwise.hpp"
 #include "cpu/ref_softmax.hpp"
 #include "cpu/ref_roi_pooling.hpp"
 #include "cpu/jit_uni_depthwise.hpp"
 #include "cpu/ref_depthwise.hpp"
+#include "cpu/jit_uni_x8s8s32x_convolution.hpp"
+#include "cpu/jit_uni_x8s8s32x_1x1_convolution.hpp"
+#include "cpu/jit_uni_x8s8s32x_dw_convolution.hpp"
+#include "cpu/jit_uni_i8i8_pooling.hpp"
 
 namespace mkldnn {
 namespace impl {
@@ -104,6 +110,7 @@ static const pd_create_f cpu_impl_list[] = {
     /* conv */
     INSTANCE(jit_avx512_common_dw_convolution_fwd_t),
     INSTANCE(jit_avx512_common_dw_convolution_bwd_data_t),
+    INSTANCE(jit_avx512_common_dw_convolution_bwd_weights_t),
     INSTANCE(jit_avx512_common_1x1_convolution_fwd_f32_t),
     INSTANCE(jit_avx512_common_1x1_convolution_bwd_data_f32_t),
     INSTANCE(jit_avx512_common_1x1_convolution_bwd_weights_t),
@@ -121,11 +128,13 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(jit_avx512_common_convolution_bwd_weights_t<f32>),
     INSTANCE(jit_avx2_dw_convolution_fwd_t),
     INSTANCE(jit_avx2_dw_convolution_bwd_data_t),
+    INSTANCE(jit_avx2_dw_convolution_bwd_weights_t),
     INSTANCE(jit_avx2_1x1_convolution_fwd_t),
     INSTANCE(jit_avx2_1x1_convolution_bwd_data_t),
     INSTANCE(jit_avx2_1x1_convolution_bwd_weights_t),
     INSTANCE(jit_sse42_dw_convolution_fwd_t),
     INSTANCE(jit_sse42_dw_convolution_bwd_data_t),
+    INSTANCE(jit_sse42_dw_convolution_bwd_weights_t),
     INSTANCE(jit_sse42_1x1_convolution_fwd_t),
     INSTANCE(jit_avx2_convolution_fwd_t),
     INSTANCE(jit_avx2_convolution_bwd_data_t),
@@ -143,20 +152,56 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(jit_avx512_core_u8s8s32x_wino_convolution_fwd_t<s8>),
     INSTANCE(jit_avx512_core_u8s8s32x_wino_convolution_fwd_t<u8>),
     INSTANCE(jit_avx512_common_convolution_fwd_t<s16, s16, s32>),
-    INSTANCE(jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<f32>),
-    INSTANCE(jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<s32>),
-    INSTANCE(jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<u8>),
-    INSTANCE(jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<s8>),
-    INSTANCE(jit_avx512_core_u8s8s32x_convolution_fwd_t<f32>),
-    INSTANCE(jit_avx512_core_u8s8s32x_convolution_fwd_t<s32>),
-    INSTANCE(jit_avx512_core_u8s8s32x_convolution_fwd_t<u8>),
-    INSTANCE(jit_avx512_core_u8s8s32x_convolution_fwd_t<s8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<u8,f32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<u8,s32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<u8,u8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<u8,s8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<s8,f32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<s8,s32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<s8,u8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<s8,s8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_fwd_t<u8,f32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_fwd_t<u8,s32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_fwd_t<u8,u8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_fwd_t<u8,s8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_fwd_t<s8,f32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_fwd_t<s8,s32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_fwd_t<s8,u8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_fwd_t<s8,s8>),
     INSTANCE(jit_avx512_common_convolution_bwd_data_t<s16, s16, s32>),
     INSTANCE(jit_avx512_common_convolution_bwd_weights_t<s16, s16, s32>),
-    INSTANCE(_gemm_u8s8s32x_convolution_fwd_t<false, s32>),
-    INSTANCE(_gemm_u8s8s32x_convolution_fwd_t<false, u8>),
-    INSTANCE(_gemm_u8s8s32x_convolution_fwd_t<false, s8>),
-    INSTANCE(_gemm_u8s8s32x_convolution_fwd_t<false, f32>),
+    INSTANCE(jit_avx2_x8s8s32x_dw_convolution_fwd_t<u8,f32>),
+    INSTANCE(jit_avx2_x8s8s32x_dw_convolution_fwd_t<u8,s32>),
+    INSTANCE(jit_avx2_x8s8s32x_dw_convolution_fwd_t<u8,u8>),
+    INSTANCE(jit_avx2_x8s8s32x_dw_convolution_fwd_t<u8,s8>),
+    INSTANCE(jit_sse42_x8s8s32x_dw_convolution_fwd_t<u8,f32>),
+    INSTANCE(jit_sse42_x8s8s32x_dw_convolution_fwd_t<u8,s32>),
+    INSTANCE(jit_sse42_x8s8s32x_dw_convolution_fwd_t<u8,u8>),
+    INSTANCE(jit_sse42_x8s8s32x_dw_convolution_fwd_t<u8,s8>),
+    INSTANCE(jit_avx2_x8s8s32x_convolution_fwd_t<u8,f32>),
+    INSTANCE(jit_avx2_x8s8s32x_convolution_fwd_t<u8,s32>),
+    INSTANCE(jit_avx2_x8s8s32x_convolution_fwd_t<u8,u8>),
+    INSTANCE(jit_avx2_x8s8s32x_convolution_fwd_t<u8,s8>),
+    INSTANCE(jit_avx2_x8s8s32x_convolution_fwd_t<s8,f32>),
+    INSTANCE(jit_avx2_x8s8s32x_convolution_fwd_t<s8,s32>),
+    INSTANCE(jit_avx2_x8s8s32x_convolution_fwd_t<s8,u8>),
+    INSTANCE(jit_avx2_x8s8s32x_convolution_fwd_t<s8,s8>),
+    INSTANCE(jit_sse42_x8s8s32x_convolution_fwd_t<u8,f32>),
+    INSTANCE(jit_sse42_x8s8s32x_convolution_fwd_t<u8,s32>),
+    INSTANCE(jit_sse42_x8s8s32x_convolution_fwd_t<u8,u8>),
+    INSTANCE(jit_sse42_x8s8s32x_convolution_fwd_t<u8,s8>),
+    INSTANCE(jit_sse42_x8s8s32x_convolution_fwd_t<s8,f32>),
+    INSTANCE(jit_sse42_x8s8s32x_convolution_fwd_t<s8,s32>),
+    INSTANCE(jit_sse42_x8s8s32x_convolution_fwd_t<s8,u8>),
+    INSTANCE(jit_sse42_x8s8s32x_convolution_fwd_t<s8,s8>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<false, u8, s32>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<false, u8, u8>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<false, u8, s8>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<false, u8, f32>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<false, s8, s32>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<false, s8, u8>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<false, s8, s8>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<false, s8, f32>),
     INSTANCE(_gemm_u8s8s32x_convolution_bwd_data_t<s32>),
     INSTANCE(_gemm_u8s8s32x_convolution_bwd_data_t<u8>),
     INSTANCE(_gemm_u8s8s32x_convolution_bwd_data_t<s8>),
@@ -173,9 +218,16 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(ref_convolution_bwd_data_t<u8, s8, u8, s32>),
     INSTANCE(ref_convolution_bwd_weights_t<s16, s32, s16, s32>),
     /* deconv */
+    INSTANCE(_jit_avx512_core_u8s8s32x_deconvolution_fwd_t<s32>),
+    INSTANCE(_jit_avx512_core_u8s8s32x_deconvolution_fwd_t<u8>),
+    INSTANCE(_jit_avx512_core_u8s8s32x_deconvolution_fwd_t<s8>),
+    INSTANCE(_jit_avx512_core_u8s8s32x_deconvolution_fwd_t<f32>),
     INSTANCE(ref_deconvolution_bwd_weights_t),
     INSTANCE(ref_deconvolution_bwd_data_t),
     INSTANCE(ref_deconvolution_fwd_t),
+    /* shuffle */
+    INSTANCE(ref_shuffle_t<4>), /* f32 or s32 */
+    INSTANCE(ref_shuffle_t<1>), /* s8 or u8 */
     /* eltwise */
     INSTANCE(jit_uni_eltwise_fwd_t<avx512_common>),
     INSTANCE(jit_uni_eltwise_bwd_t<avx512_common>),
@@ -218,6 +270,8 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(ref_pooling_bwd_t<f32>),
     /* pool (int) */
     INSTANCE(jit_avx512_core_i8i8_pooling_fwd_t),
+    INSTANCE(jit_uni_i8i8_pooling_fwd_t<avx2>),
+    INSTANCE(jit_uni_i8i8_pooling_fwd_t<sse42>),
     INSTANCE(ref_pooling_fwd_t<s32>),
     INSTANCE(ref_pooling_fwd_t<s16, s32>),
     INSTANCE(ref_pooling_fwd_t<s8, s32>),
@@ -283,18 +337,30 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(jit_avx512_core_u8s8s32x_wino_convolution_relu_t<u8>),
     INSTANCE(jit_avx512_common_1x1_convolution_relu_s16s16s32_t),
     INSTANCE(jit_avx512_common_convolution_relu_t<s16, s16, s32>),
-    INSTANCE(jit_avx512_core_u8s8s32x_1x1_convolution_relu_t<f32>),
-    INSTANCE(jit_avx512_core_u8s8s32x_1x1_convolution_relu_t<s32>),
-    INSTANCE(jit_avx512_core_u8s8s32x_1x1_convolution_relu_t<s8>),
-    INSTANCE(jit_avx512_core_u8s8s32x_1x1_convolution_relu_t<u8>),
-    INSTANCE(jit_avx512_core_u8s8s32x_convolution_relu_t<f32>),
-    INSTANCE(jit_avx512_core_u8s8s32x_convolution_relu_t<s32>),
-    INSTANCE(jit_avx512_core_u8s8s32x_convolution_relu_t<u8>),
-    INSTANCE(jit_avx512_core_u8s8s32x_convolution_relu_t<s8>),
-    INSTANCE(_gemm_u8s8s32x_convolution_fwd_t<true, s32>),
-    INSTANCE(_gemm_u8s8s32x_convolution_fwd_t<true, u8>),
-    INSTANCE(_gemm_u8s8s32x_convolution_fwd_t<true, s8>),
-    INSTANCE(_gemm_u8s8s32x_convolution_fwd_t<true, f32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t<u8,f32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t<u8,s32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t<u8,s8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t<u8,u8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t<s8,f32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t<s8,s32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t<s8,s8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_1x1_convolution_relu_t<s8,u8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t<u8,f32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t<u8,s32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t<u8,u8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t<u8,s8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t<s8,f32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t<s8,s32>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t<s8,u8>),
+    INSTANCE(jit_avx512_core_x8s8s32x_convolution_relu_t<s8,s8>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<true, u8, s32>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<true, u8, u8>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<true, u8, s8>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<true, u8, f32>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<true, s8, s32>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<true, s8, u8>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<true, s8, s8>),
+    INSTANCE(_gemm_x8s8s32x_convolution_fwd_t<true, s8, f32>),
     INSTANCE(ref_convolution_relu_t<s16, s16, s32, s32>),
     INSTANCE(ref_convolution_relu_t<u8, s8, s32, s32>),
     INSTANCE(ref_convolution_relu_t<u8, s8, s8, s32>),
@@ -316,6 +382,23 @@ const pd_create_f* cpu_engine_t::get_implementation_list() const {
 
 cpu_engine_factory_t engine_factory;
 
+namespace {
+// XXX: this is a huge hammer. This disables all and any msan checks on
+// primitives outputs.
+//
+// A proper approach would be an implementation-specific unpoisoning.
+void unpoison_outputs(primitive_t *p)
+{
+    for(auto o: p->outputs()) {
+        assert(o->kind() == primitive_kind::memory);
+        void *p;
+        o->get_data_handle(&p);
+        size_t s = ((memory_pd_t *)o->pd())->get_size();
+        msan_unpoison(p, s);
+    }
+}
+}
+
 status_t cpu_engine_t::submit(primitive_t *p, event_t *e,
         event_vector &prerequisites) {
     /* FIXME: this should live in primitive execute function... */
@@ -328,6 +411,8 @@ status_t cpu_engine_t::submit(primitive_t *p, event_t *e,
     } else {
         p->execute(e);
     }
+    if (msan_enabled)
+        unpoison_outputs(p);
     return success;
 }
 
index f989eba..cf23837 100644 (file)
@@ -37,15 +37,15 @@ inline bool dense_gemm_consitency_check(const memory_desc_wrapper &src_d,
     using namespace memory_format;
     using namespace utils;
     return true
-        && implication(src_d.format() == nChw8c, wei_d.format() == oIhw8i)
-        && implication(src_d.format() == nChw16c, wei_d.format() == oIhw16i)
-        && implication(src_d.format() == nCdhw8c, wei_d.format() == oIdhw8i)
-        && implication(src_d.format() == nCdhw16c, wei_d.format() == oIdhw16i)
-        && implication(src_d.format() == nchw, wei_d.format() == oihw)
-        && implication(src_d.format() == ncdhw, wei_d.format() == oidhw)
-        && implication(src_d.format() == nhwc, wei_d.format() == hwio)
-        && implication(src_d.format() == ndhwc, wei_d.format() == dhwio)
-        && implication(src_d.format() == nc, one_of(wei_d.format(), oi, io))
+        && IMPLICATION(src_d.format() == nChw8c, wei_d.format() == oIhw8i)
+        && IMPLICATION(src_d.format() == nChw16c, wei_d.format() == oIhw16i)
+        && IMPLICATION(src_d.format() == nCdhw8c, wei_d.format() == oIdhw8i)
+        && IMPLICATION(src_d.format() == nCdhw16c, wei_d.format() == oIdhw16i)
+        && IMPLICATION(src_d.format() == nchw, wei_d.format() == oihw)
+        && IMPLICATION(src_d.format() == ncdhw, wei_d.format() == oidhw)
+        && IMPLICATION(src_d.format() == nhwc, wei_d.format() == hwio)
+        && IMPLICATION(src_d.format() == ndhwc, wei_d.format() == dhwio)
+        && IMPLICATION(src_d.format() == nc, one_of(wei_d.format(), oi, io))
         && dst_d.format() == nc
         && src_d.only_padded_dim(1)
         && wei_d.only_padded_dim(1)
index db8794d..4bbff22 100644 (file)
@@ -22,6 +22,8 @@
 #include "type_helpers.hpp"
 #include "utils.hpp"
 
+#include "format_traits.hpp"
+
 #include "cpu_memory.hpp"
 
 namespace mkldnn {
@@ -33,11 +35,14 @@ using namespace mkldnn::impl::data_type;
 using namespace mkldnn::impl::status;
 using namespace mkldnn::impl::memory_format;
 
+using dk = data_kind_t;
+using bf = block_format_t;
+
 template <data_type_t dt, memory_format_t fmt>
-typename utils::enable_if<fmt == nChw8c || fmt == nChw16c || fmt == nCdhw8c
-    || fmt == nCdhw16c>::type typed_zero_pad_data(
+typename utils::enable_if<format_traits<fmt>::data_kind == dk::data>::type
+typed_zero_pad_data(
     const memory_desc_wrapper &m_d, typename prec_traits<dt>::type *data) {
-    constexpr int blksize = (fmt == nChw8c || fmt == nCdhw8c) ? 8 : 16;
+    constexpr int blksize = format_traits<fmt>::blk_size;
 
     const auto &dims = m_d.dims();
     const auto &pdims = m_d.blocking_desc().padding_dims;
@@ -58,21 +63,14 @@ typename utils::enable_if<fmt == nChw8c || fmt == nChw16c || fmt == nCdhw8c
 
 template <data_type_t dt, memory_format_t fmt>
 typename utils::enable_if<false
-|| fmt == Ohwi8o || fmt == Oihw16o || fmt == Ohwi16o || fmt == Oidhw16o
-|| fmt == Odhwi16o|| fmt == Odhwi8o || fmt == gOhwi8o || fmt == gOihw16o
-|| fmt == gOhwi16o || fmt == gOidhw16o || fmt == gOdhwi16o || fmt == gOdhwi8o
+|| format_traits<fmt>::blk_fmt == bf::_8o
+|| format_traits<fmt>::blk_fmt == bf::_16o
 >::type typed_zero_pad_weights(const memory_desc_wrapper &m_d,
         typename prec_traits<dt>::type *data) {
-    static constexpr int w_groups = false
-        || fmt == gOhwi8o || fmt == gOihw16o || fmt == gOhwi16o
-        || fmt == gOidhw16o || fmt == gOdhwi16o || fmt == gOdhwi8o;
-
-    constexpr int is_3d = false
-        || fmt == Oidhw16o || fmt == Odhwi16o || fmt == Odhwi8o
-        || fmt == gOidhw16o || fmt == gOdhwi16o || fmt == gOdhwi8o;
-
-    constexpr int blksize = fmt == Ohwi8o || fmt == gOhwi8o
-        || fmt == Odhwi8o || fmt == gOdhwi8o ? 8 : 16;
+    static constexpr int w_groups = format_traits<fmt>::data_kind == dk::gwei;
+    constexpr int is_1d = format_traits<fmt>::ndims_sp == 1;
+    constexpr int is_3d = format_traits<fmt>::ndims_sp == 3;
+    constexpr int blksize = format_traits<fmt>::blk_size;
 
     const auto &dims = m_d.dims();
     const auto &pdims = m_d.blocking_desc().padding_dims;
@@ -81,30 +79,30 @@ typename utils::enable_if<false
     const int NB_OC = pdims[w_groups + 0] / blksize;
     const int IC = dims[w_groups + 1];
     const int D = is_3d ? dims[w_groups + 2] : 1;
-    const int H = dims[w_groups + 2 + is_3d];
-    const int W = dims[w_groups + 3 + is_3d];
+    const int H = is_1d ? 1 : dims[w_groups + 2 + is_3d];
+    const int W = dims[w_groups + 3 - is_1d + is_3d];
 
     const int oc_tail = pdims[w_groups + 0] - dims[w_groups + 0];
 
     parallel_nd(G, IC, D, H, W,
         [&](int g, int ic, int d, int h, int w) {
-        auto x = &data[is_3d
-            ? m_d.blk_off<!w_groups>(g, NB_OC - 1, ic, d, h, w)
-            : m_d.blk_off<!w_groups>(g, NB_OC - 1, ic, h, w) ];
+        auto x = &data[wei_blk_off_like_gwei3D<fmt>(m_d,
+                g, NB_OC - 1, ic, d, h, w)];
         for (int oc = blksize - oc_tail; oc < blksize; ++oc)
             x[oc] = 0;
     });
 }
 
 template <data_type_t dt, memory_format_t fmt>
-typename utils::enable_if<fmt == oIhw8i || fmt == oIhw16i
-    || fmt == oIdhw8i || fmt == oIdhw16i>::type
-typed_zero_pad_weights(const memory_desc_wrapper &m_d,
+typename utils::enable_if<false
+|| format_traits<fmt>::blk_fmt == bf::_8i
+|| format_traits<fmt>::blk_fmt == bf::_16i
+>::type typed_zero_pad_weights(const memory_desc_wrapper &m_d,
         typename prec_traits<dt>::type *data) {
-    constexpr int blksize = fmt == oIhw8i || fmt == oIdhw8i ? 8 : 16;
-
-    static constexpr int w_groups = 0;
-    constexpr int is_3d = fmt == oIdhw8i || fmt == oIdhw16i;
+    static constexpr int w_groups = format_traits<fmt>::data_kind == dk::gwei;
+    constexpr int is_1d = format_traits<fmt>::ndims_sp == 1;
+    constexpr int is_3d = format_traits<fmt>::ndims_sp == 3;
+    constexpr int blksize = format_traits<fmt>::blk_size;
 
     const auto &dims = m_d.dims();
     const auto &pdims = m_d.blocking_desc().padding_dims;
@@ -113,54 +111,30 @@ typed_zero_pad_weights(const memory_desc_wrapper &m_d,
     const int OC = dims[w_groups + 0];
     const int NB_IC = pdims[w_groups + 1] / blksize;
     const int D = is_3d ? dims[w_groups + 2] : 1;
-    const int H = dims[w_groups + 2 + is_3d];
+    const int H = is_1d ? 1 : dims[w_groups + 2 + is_3d];
     const int W = dims[w_groups + 3 + is_3d];
 
     const int ic_tail = pdims[w_groups + 1] - dims[w_groups + 1];
 
     parallel_nd(G, OC, D, H, W,
         [&](int g, int oc, int d, int h, int w) {
-        auto x = &data[is_3d
-            ? m_d.blk_off<!w_groups>(g, oc, NB_IC - 1, d, h, w)
-            : m_d.blk_off<!w_groups>(g, oc, NB_IC - 1, h, w) ];
+        auto x = &data[wei_blk_off_like_gwei3D<fmt>(m_d,
+                g, oc, NB_IC - 1, d, h, w)];
         for (int ic = blksize - ic_tail; ic < blksize; ++ic)
             x[ic] = 0;
     });
 }
 
 template <data_type_t dt, memory_format_t fmt>
-typename utils::enable_if<false
-|| fmt == IOhw16o16i || fmt == gIOhw16o16i
-|| fmt == OIdhw16i16o || fmt == OIdhw16o16i || fmt == OIhw8i8o
-|| fmt == OIhw16i16o || fmt == OIhw4i16o4i || fmt == OIhw8i16o2i
-|| fmt == OIdhw8i16o2i || fmt == OIhw8o16i2o || fmt == OIhw8o8i
-|| fmt == OIhw16o16i || fmt == OIdhw8i8o || fmt == OIdhw8o8i
-|| fmt == gOIhw8i8o
-|| fmt == gOIhw16i16o || fmt == gOIhw4i16o4i || fmt == gOIhw8i16o2i
-|| fmt == gOIdhw8i16o2i || fmt == gOIhw8o16i2o || fmt == gOIhw8o8i
-|| fmt == gOIhw16o16i || fmt == gOIdhw16i16o || fmt == gOIdhw16o16i
-|| fmt == gOIdhw8i8o || fmt == gOIdhw8o8i
->::type typed_zero_pad_weights(const memory_desc_wrapper &m_d,
+typename utils::enable_if<
+block_format_traits<format_traits<fmt>::blk_fmt>::blk_ndims == 2>::type
+typed_zero_pad_weights(const memory_desc_wrapper &m_d,
         typename prec_traits<dt>::type *data) {
     using data_t = typename prec_traits<dt>::type;
-    static constexpr int w_groups = false
-        || fmt == gOIhw8i8o || fmt == gOIhw16i16o || fmt == gOIhw4i16o4i
-        || fmt == gOIhw8i16o2i || fmt == gOIdhw8i16o2i || fmt == gOIhw8o16i2o
-        || fmt == gOIhw8o8i || fmt == gOIhw16o16i || fmt == gIOhw16o16i
-        || fmt == gOIdhw16i16o || fmt == gOIdhw16o16i || fmt == gOIdhw8i8o
-        || fmt == gOIdhw8o8i;
-
-    constexpr int is_3d = false
-        || fmt == OIdhw16i16o || fmt == OIdhw16o16i || fmt == OIdhw8i16o2i
-        || fmt == gOIdhw8i16o2i || fmt == gOIdhw16i16o || fmt == gOIdhw16o16i
-        || fmt == OIdhw8i8o || fmt == OIdhw8o8i || fmt == gOIdhw8i8o
-        || fmt == gOIdhw8o8i;
-
-    constexpr int blksize = (fmt == OIhw8i8o || fmt == OIhw8o8i
-        || fmt == gOIhw8i8o || fmt == gOIhw8o8i || fmt == OIdhw8i8o
-        || fmt == OIdhw8o8i || fmt == gOIdhw8i8o || fmt == gOIdhw8o8i)
-        ? 8 : 16;
-
+    static constexpr int w_groups = format_traits<fmt>::data_kind == dk::gwei;
+    constexpr int is_1d = format_traits<fmt>::ndims_sp == 1;
+    constexpr int is_3d = format_traits<fmt>::ndims_sp == 3;
+    constexpr int blksize = format_traits<fmt>::blk_size;
     const auto &dims = m_d.dims();
     const auto &pdims = m_d.blocking_desc().padding_dims;
 
@@ -168,35 +142,20 @@ typename utils::enable_if<false
     const int NB_OC = pdims[w_groups + 0] / blksize;
     const int NB_IC = pdims[w_groups + 1] / blksize;
     const int D = is_3d ? dims[w_groups + 2] : 1;
-    const int H = dims[w_groups + 2 + is_3d];
-    const int W = dims[w_groups + 3 + is_3d];
-
-    auto index = [&](const int ic, const int oc) {
-        if (utils::one_of(fmt,
-                    OIhw8i16o2i, gOIhw8i16o2i,
-                    OIdhw8i16o2i, gOIdhw8i16o2i))
-            return ((ic / 2) * blksize * 2 + 2 * oc + ic % 2);
-        else if (utils::one_of(fmt, OIhw4i16o4i, gOIhw4i16o4i))
-            return ((ic / 4) * blksize * 4 + oc * 4 + ic % 4);
-        else if (utils::one_of(fmt, OIhw8o16i2o, gOIhw8o16i2o))
-            return ((oc / 2) * blksize * 2 + 2 * ic + oc % 2);
-        else if (utils::one_of(fmt,
-                    OIhw16i16o, gOIhw16i16o, OIhw8i8o, gOIhw8i8o,
-                    OIdhw16i16o, gOIdhw16i16o, OIdhw8i8o, gOIdhw8i8o))
-            return (ic * blksize + oc);
-        else
-            return (oc * blksize + ic);
-    };
+    const int H = is_1d ? 1 : dims[w_groups + 2 + is_3d];
+    const int W = dims[w_groups + 3 - is_1d + is_3d];
 
     auto ker = [&](data_t *d, const int oc_tail, const int ic_tail) {
+#       define blk_off OI_blk_off<format_traits<fmt>::blk_fmt>
         int oc = 0;
         for (; oc < blksize - oc_tail; ++oc) {
             for (int ic = blksize - ic_tail; ic < blksize; ++ic)
-                d[index(ic, oc)] = 0;
+                d[blk_off(oc, ic)] = 0;
         }
         for (; oc < blksize; ++oc)
             for (int ic = 0; ic < blksize; ++ic)
-                d[index(ic, oc)] = 0;
+                d[blk_off(oc, ic)] = 0;
+#       undef blk_off
     };
 
     const int oc_tail = pdims[w_groups + 0] - dims[w_groups + 0];
@@ -205,9 +164,8 @@ typename utils::enable_if<false
     if (ic_tail) {
         parallel_nd(G, NB_OC, D, H, W,
             [&](int g, int nb_oc, int d, int h, int w) {
-            auto x = &data[is_3d
-                ? m_d.blk_off<!w_groups>(g, nb_oc, NB_IC - 1, d, h, w)
-                : m_d.blk_off<!w_groups>(g, nb_oc, NB_IC - 1, h, w) ];
+            auto x = &data[wei_blk_off_like_gwei3D<fmt>(m_d,
+                    g, nb_oc, NB_IC - 1, d, h, w)];
             ker(x, 0, ic_tail);
         });
     }
@@ -215,19 +173,20 @@ typename utils::enable_if<false
     if (oc_tail) {
         parallel_nd(G, NB_IC, D, H, W,
             [&](int g, int nb_ic, int d, int h, int w) {
-            auto x = &data[is_3d
-                ? m_d.blk_off<!w_groups>(g, NB_OC - 1, nb_ic, d, h, w)
-                : m_d.blk_off<!w_groups>(g, NB_OC - 1, nb_ic, h, w) ];
+            auto x = &data[wei_blk_off_like_gwei3D<fmt>(m_d,
+                    g, NB_OC - 1, nb_ic, d, h, w)];
             ker(x, oc_tail, 0);
         });
     }
 }
 
 template <data_type_t dt, memory_format_t fmt>
-typename utils::enable_if<fmt == Goihw8g || fmt == Goihw16g>::type
-typed_zero_pad_weights(const memory_desc_wrapper &m_d,
+typename utils::enable_if<false
+|| format_traits<fmt>::blk_fmt == bf::_8g
+|| format_traits<fmt>::blk_fmt == bf::_16g
+>::type typed_zero_pad_weights(const memory_desc_wrapper &m_d,
         typename prec_traits<dt>::type *data) {
-    constexpr int blksize = fmt == Goihw8g ? 8 : 16;
+    constexpr int blksize = format_traits<fmt>::blk_size;
 
     const auto &dims = m_d.dims();
     const auto &pdims = m_d.blocking_desc().padding_dims;
@@ -308,6 +267,8 @@ status_t cpu_memory_t::typed_zero_pad() {
     /* data */
 #   define MAYBE_DATA(f) if (fmt == f) \
     { typed_zero_pad_data<dt, f>(mpd, data); return success; }
+    MAYBE_DATA(nCw8c);
+    MAYBE_DATA(nCw16c);
     MAYBE_DATA(nChw8c);
     MAYBE_DATA(nCdhw8c);
     MAYBE_DATA(nChw16c);
@@ -330,6 +291,17 @@ status_t cpu_memory_t::typed_zero_pad() {
     MAYBE_WEIGHTS(OIhw8i8o);
     MAYBE_WEIGHTS(OIhw16i16o);
     MAYBE_WEIGHTS(OIhw4i16o4i);
+    MAYBE_WEIGHTS(OIhw4i16o4i_s8s8);
+    MAYBE_WEIGHTS(Owi8o);
+    MAYBE_WEIGHTS(OIw8i8o);
+    MAYBE_WEIGHTS(OIw8o8i);
+    MAYBE_WEIGHTS(OIw16i16o);
+    MAYBE_WEIGHTS(OIw16o16i);
+    MAYBE_WEIGHTS(Oiw16o);
+    MAYBE_WEIGHTS(Owi16o);
+    MAYBE_WEIGHTS(OIw8i16o2i);
+    MAYBE_WEIGHTS(OIw8o16i2o);
+    MAYBE_WEIGHTS(IOw16o16i);
     MAYBE_WEIGHTS(OIhw8i16o2i);
     MAYBE_WEIGHTS(OIdhw8i16o2i);
     MAYBE_WEIGHTS(OIhw8o16i2o);
@@ -342,6 +314,17 @@ status_t cpu_memory_t::typed_zero_pad() {
     MAYBE_WEIGHTS(gOIhw8i8o);
     MAYBE_WEIGHTS(gOIhw16i16o);
     MAYBE_WEIGHTS(gOIhw4i16o4i);
+    MAYBE_WEIGHTS(gOIhw4i16o4i_s8s8);
+    MAYBE_WEIGHTS(gOwi8o);
+    MAYBE_WEIGHTS(gOIw8i8o);
+    MAYBE_WEIGHTS(gOIw8o8i);
+    MAYBE_WEIGHTS(gOIw16i16o);
+    MAYBE_WEIGHTS(gOIw16o16i);
+    MAYBE_WEIGHTS(gOiw16o);
+    MAYBE_WEIGHTS(gOwi16o);
+    MAYBE_WEIGHTS(gOIw8i16o2i);
+    MAYBE_WEIGHTS(gOIw8o16i2o);
+    MAYBE_WEIGHTS(gIOw16o16i);
     MAYBE_WEIGHTS(gOIhw8i16o2i);
     MAYBE_WEIGHTS(gOIdhw8i16o2i);
     MAYBE_WEIGHTS(gOIhw8o16i2o);
index 3ab7dda..116c4a8 100644 (file)
@@ -77,7 +77,7 @@ void reduce_balancer_t::balance() {
     assert(ngroups * nthr_per_group <= nthr_);
     assert((size_t)njobs_per_group_ub * job_size_ * nthr_ <= max_buffer_size_
             || nthr_per_group == 1); /* no reduction buffer overflow */
-    assert(implication(!syncable_, nthr_per_group == 1));
+    assert(IMPLICATION(!syncable_, nthr_per_group == 1));
 
     ngroups_ = ngroups;
     nthr_per_group_ = nthr_per_group;
index ad791e6..eee668b 100644 (file)
@@ -73,69 +73,157 @@ static const rpd_create_f cpu_reorder_impl_list[] = {
     /* jit */
     jit_uni_reorder_create,
 
-    /* fp32: flat <-> blocked with tail */
+    /* fp32: flat <-> blocked with tail */
+    REG_SR_BIDIR(f32, any, f32, nCw8c),
+    REG_SR_BIDIR(f32, any, f32, OIw8i8o),
+    REG_SR_BIDIR(f32, any, f32, OIw8o8i),
+    REG_SR_BIDIR(f32, any, f32, gOIw8i8o),
+    REG_SR_BIDIR(f32, any, f32, gOIw8o8i),
+
+    REG_SR_BIDIR(f32, any, f32, nCw16c),
+    REG_SR_BIDIR(f32, any, f32, OIw16o16i),
+    REG_SR_BIDIR(f32, any, f32, OIw16i16o),
+    REG_SR_BIDIR(f32, any, f32, IOw16o16i),
+    REG_SR_BIDIR(f32, any, f32, gOIw16o16i),
+    REG_SR_BIDIR(f32, any, f32, gOIw16i16o),
+    REG_SR_BIDIR(f32, any, f32, gIOw16o16i),
+
     REG_SR_BIDIR(f32, any, f32, nChw8c),
-    REG_SR_BIDIR(f32, any, f32, nChw16c),
-    REG_SR_BIDIR(f32, any, f32, nCdhw16c),
-    REG_SR_BIDIR(f32, nChw8c, f32, nChw16c),
+    REG_SR_BIDIR(f32, any, f32, Ohwi8o),
+    REG_SR_BIDIR(f32, any, f32, OIhw8i8o),
+    REG_SR_BIDIR(f32, any, f32, OIhw8o8i),
+    REG_SR_BIDIR(f32, any, f32, gOhwi8o),
+    REG_SR_BIDIR(f32, any, f32, gOIhw8i8o),
+    REG_SR_BIDIR(f32, any, f32, gOIhw8o8i),
 
+    REG_SR_BIDIR(f32, any, f32, nChw16c),
     REG_SR_BIDIR(f32, any, f32, Oihw16o),
     REG_SR_BIDIR(f32, any, f32, Ohwi16o),
-    REG_SR_BIDIR(f32, any, f32, Oidhw16o),
-    REG_SR_BIDIR(f32, any, f32, Odhwi16o),
     REG_SR_BIDIR(f32, any, f32, OIhw16o16i),
     REG_SR_BIDIR(f32, any, f32, OIhw16i16o),
-    REG_SR_BIDIR(f32, any, f32, OIdhw16o16i),
-    REG_SR_BIDIR(f32, any, f32, OIdhw16i16o),
     REG_SR_BIDIR(f32, any, f32, IOhw16o16i),
     REG_SR_BIDIR(f32, any, f32, gOihw16o),
     REG_SR_BIDIR(f32, any, f32, gOhwi16o),
-    REG_SR_BIDIR(f32, any, f32, gOidhw16o),
-    REG_SR_BIDIR(f32, any, f32, gOdhwi16o),
     REG_SR_BIDIR(f32, any, f32, gOIhw16o16i),
     REG_SR_BIDIR(f32, any, f32, gOIhw16i16o),
+    REG_SR_BIDIR(f32, any, f32, gIOhw16o16i),
+
+    REG_SR_BIDIR(f32, any, f32, nCdhw8c),
+    REG_SR_BIDIR(f32, any, f32, Odhwi8o),
+    REG_SR_BIDIR(f32, any, f32, OIdhw8i8o),
+    REG_SR_BIDIR(f32, any, f32, OIdhw8o8i),
+    REG_SR_BIDIR(f32, any, f32, gOdhwi8o),
+    REG_SR_BIDIR(f32, any, f32, gOIdhw8i8o),
+    REG_SR_BIDIR(f32, any, f32, gOIdhw8o8i),
+
+    REG_SR_BIDIR(f32, any, f32, nCdhw16c),
+    REG_SR_BIDIR(f32, any, f32, Oidhw16o),
+    REG_SR_BIDIR(f32, any, f32, Odhwi16o),
+    REG_SR_BIDIR(f32, any, f32, OIdhw16o16i),
+    REG_SR_BIDIR(f32, any, f32, OIdhw16i16o),
+    REG_SR_BIDIR(f32, any, f32, gOidhw16o),
+    REG_SR_BIDIR(f32, any, f32, gOdhwi16o),
     REG_SR_BIDIR(f32, any, f32, gOIdhw16o16i),
     REG_SR_BIDIR(f32, any, f32, gOIdhw16i16o),
-    REG_SR_BIDIR(f32, any, f32, gIOhw16o16i),
+
+    REG_SR_BIDIR(f32, nChw8c, f32, nChw16c),
+
+    /* WA to prevent fallback on reference implementations */
+    REG_SR_DIRECT_COPY(u8, f32),
+    REG_SR_DIRECT_COPY(u8, s8),
+    REG_SR_DIRECT_COPY(s8, u8),
+    REG_SR_DIRECT_COPY(u8, u8),
+    REG_SR_DIRECT_COPY(s8, s8),
 
     /* int: flat <-> blocked with tail */
-    REG_SR_BIDIR(f32, nhwc, s32, nChw16c),
-    REG_SR_BIDIR(f32, nhwc, s8, nChw16c),
-    REG_SR_BIDIR(f32, nhwc, u8, nChw16c),
-    REG_SR_BIDIR(s32, nhwc, f32, nChw16c),
-    REG_SR_BIDIR(s32, nhwc, s32, nChw16c),
-    REG_SR_BIDIR(s32, nhwc, s8, nChw16c),
-    REG_SR_BIDIR(s32, nhwc, u8, nChw16c),
-    REG_SR_BIDIR(s8, nhwc, f32, nChw16c),
-    REG_SR_BIDIR(s8, nhwc, s32, nChw16c),
-    REG_SR_BIDIR(s8, nhwc, s8, nChw16c),
-    REG_SR_BIDIR(s8, nhwc, u8, nChw16c),
-    REG_SR_BIDIR(u8, nhwc, f32, nChw16c),
-    REG_SR_BIDIR(u8, nhwc, s32, nChw16c),
-    REG_SR_BIDIR(u8, nhwc, s8, nChw16c),
-    REG_SR_BIDIR(u8, nhwc, u8, nChw16c),
-
-    REG_SR_BIDIR(f32, oihw, f32, OIhw4i16o4i),
-    REG_SR_BIDIR(f32, oihw, s8, OIhw4i16o4i),
-    REG_SR_BIDIR(s8, oihw, f32, OIhw4i16o4i),
-    REG_SR_BIDIR(s8, oihw, s8, OIhw4i16o4i),
-    REG_SR_BIDIR(f32, goihw, s8, gOIhw4i16o4i),
-    REG_SR_BIDIR(s8, goihw, f32, gOIhw4i16o4i),
-    REG_SR_BIDIR(f32, goihw, f32, gOIhw4i16o4i),
-    REG_SR_BIDIR(s8, goihw, s8, gOIhw4i16o4i),
+    REG_SR(f32, nChw8c, u8, nhwc, fmt_order::keep),
+    REG_SR(f32, nChw8c, s8, nhwc, fmt_order::keep),
+    REG_SR(u8, nhwc, f32, nChw8c, fmt_order::keep),
+    REG_SR(s8, nhwc, f32, nChw8c, fmt_order::keep),
+    REG_SR(f32, nhwc, u8, nhwc, fmt_order::keep),
+    REG_SR(f32, nhwc, s8, nhwc, fmt_order::keep),
+    REG_SR(u8, nhwc, f32, nhwc, fmt_order::keep),
+    REG_SR(s8, nhwc, f32, nhwc, fmt_order::keep),
+    REG_SR(s8, nhwc, u8, nhwc, fmt_order::keep),
+    REG_SR(u8, nhwc, s8, nhwc, fmt_order::keep),
+    REG_SR(f32, nchw, u8, nhwc, fmt_order::keep),
+    REG_SR(f32, nchw, s8, nhwc, fmt_order::keep),
+    REG_SR(u8, nchw, u8, nhwc, fmt_order::keep),
+    REG_SR(s8, nchw, s8, nhwc, fmt_order::keep),
+    REG_SR(u8, nhwc, f32, nchw, fmt_order::keep),
+
+    REG_SR_BIDIR(f32, any, s32, nChw8c),
+    REG_SR_BIDIR(f32, any, s8, nChw8c),
+    REG_SR_BIDIR(f32, any, u8, nChw8c),
+    REG_SR_BIDIR(s32, any, f32, nChw8c),
+    REG_SR_BIDIR(s32, any, s32, nChw8c),
+    REG_SR_BIDIR(s32, any, s8, nChw8c),
+    REG_SR_BIDIR(s32, any, u8, nChw8c),
+    REG_SR_BIDIR(s8, any, f32, nChw8c),
+    REG_SR_BIDIR(s8, any, s32, nChw8c),
+    REG_SR_BIDIR(s8, any, s8, nChw8c),
+    REG_SR_BIDIR(s8, any, u8, nChw8c),
+    REG_SR_BIDIR(u8, any, f32, nChw8c),
+    REG_SR_BIDIR(u8, any, s32, nChw8c),
+    REG_SR_BIDIR(u8, any, s8, nChw8c),
+    REG_SR_BIDIR(u8, any, u8, nChw8c),
+
+    REG_SR_BIDIR(f32, any, s32, nChw16c),
+    REG_SR_BIDIR(f32, any, s8, nChw16c),
+    REG_SR_BIDIR(f32, any, u8, nChw16c),
+    REG_SR_BIDIR(s32, any, f32, nChw16c),
+    REG_SR_BIDIR(s32, any, s32, nChw16c),
+    REG_SR_BIDIR(s32, any, s8, nChw16c),
+    REG_SR_BIDIR(s32, any, u8, nChw16c),
+    REG_SR_BIDIR(s8, any, f32, nChw16c),
+    REG_SR_BIDIR(s8, any, s32, nChw16c),
+    REG_SR_BIDIR(s8, any, s8, nChw16c),
+    REG_SR_BIDIR(s8, any, u8, nChw16c),
+    REG_SR_BIDIR(u8, any, f32, nChw16c),
+    REG_SR_BIDIR(u8, any, s32, nChw16c),
+    REG_SR_BIDIR(u8, any, s8, nChw16c),
+    REG_SR_BIDIR(u8, any, u8, nChw16c),
+
+    REG_SR_BIDIR(f32, any, f32, OIhw4i16o4i),
+    REG_SR_BIDIR(f32, any, s8, OIhw4i16o4i),
+    REG_SR_BIDIR(s8, any, f32, OIhw4i16o4i),
+    REG_SR_BIDIR(s8, any, s8, OIhw4i16o4i),
+    REG_SR_BIDIR(f32, any, s8, gOIhw4i16o4i),
+    REG_SR_BIDIR(s8, any, f32, gOIhw4i16o4i),
+    REG_SR_BIDIR(f32, any, f32, gOIhw4i16o4i),
+    REG_SR_BIDIR(s8, any, s8, gOIhw4i16o4i),
+
+    REG_SR(f32, any, f32, OhIw8o4i, fmt_order::keep),
+    REG_SR(f32, any, s8, OhIw8o4i, fmt_order::keep),
+    REG_SR(s8, any, f32, OhIw8o4i, fmt_order::keep),
+    REG_SR(s8, any, s8, OhIw8o4i, fmt_order::keep),
+    REG_SR(f32, any, s8, gOhIw8o4i, fmt_order::keep),
+    REG_SR(s8, any, f32, gOhIw8o4i, fmt_order::keep),
+    REG_SR(f32, any, f32, gOhIw8o4i, fmt_order::keep),
+    REG_SR(s8, any, s8, gOhIw8o4i, fmt_order::keep),
+    REG_SR(f32, oihw, s8, OhIw8o4i_s8s8, fmt_order::keep),
+    REG_SR(s8, oihw, s8, OhIw8o4i_s8s8, fmt_order::keep),
+    REG_SR(f32, goihw, s8, gOhIw8o4i_s8s8, fmt_order::keep),
+    REG_SR(s8, goihw, s8, gOhIw8o4i_s8s8, fmt_order::keep),
+
+    REG_SR(f32, any, s8, hwio_s8s8, fmt_order::keep),
+    REG_SR(s8, any, s8, hwio_s8s8, fmt_order::keep),
+    REG_SR(f32, any, s8, hwigo_s8s8, fmt_order::keep),
+    REG_SR(s8, any, s8, hwigo_s8s8, fmt_order::keep),
+    REG_SR(f32, oihw, s8, OIhw4i16o4i_s8s8, fmt_order::keep),
+    REG_SR(s8, oihw, s8, OIhw4i16o4i_s8s8, fmt_order::keep),
+    REG_SR(f32, goihw, s8, gOIhw4i16o4i_s8s8, fmt_order::keep),
+    REG_SR(s8, goihw, s8, gOIhw4i16o4i_s8s8, fmt_order::keep),
 
     /* s16 <-> s16 */
     REG_SR_DIRECT_COPY(s16, s16),
-    REG_SR_BIDIR(s16, oihw, s16, OIhw8i16o2i),
-    REG_SR_BIDIR(s16, goihw, s16, gOIhw8i16o2i),
+
+    REG_SR_BIDIR(s16, any, s16, OIhw8i16o2i),
+    REG_SR_BIDIR(s16, any, s16, gOIhw8i16o2i),
     REG_SR_BIDIR(s16, OIhw8i16o2i, s16, OIhw8o16i2o),
     REG_SR_BIDIR(s16, gOIhw8i16o2i, s16, gOIhw8o16i2o),
 
-    /* WA to prevent fallback on reference implementations */
-    REG_SR_DIRECT_COPY(u8, f32),
-    REG_SR_BIDIR(u8, nchw, f32, nChw8c),
-    REG_SR_BIDIR(u8, nchw, f32, nChw16c),
-
     /* reference: the last line of defence */
     REG_SR(f32, any, f32, any, fmt_order::any, spec::reference),
     REG_SR(f32, any, s32, any, fmt_order::any, spec::reference),
index ddc3721..f929a9e 100644 (file)
@@ -43,7 +43,7 @@ struct cpu_reorder_pd_t: public reorder_pd_t {
     virtual status_t init() const {
         const auto &post_ops = attr()->post_ops_;
         bool args_ok = true
-            && utils::implication(post_ops.len_ != 0,
+            && IMPLICATION(post_ops.len_ != 0,
                     post_ops.len_ == 1
                     && post_ops.entry_[0].kind == primitive_kind::sum);
         return args_ok ? success : unimplemented;
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_shuffle_pd.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_shuffle_pd.hpp
new file mode 100644 (file)
index 0000000..d6df0ef
--- /dev/null
@@ -0,0 +1,65 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_SHUFFLE_PD_HPP
+#define CPU_SHUFFLE_PD_HPP
+
+#include <assert.h>
+
+#include "c_types_map.hpp"
+#include "shuffle_pd.hpp"
+#include "cpu_engine.hpp"
+#include "cpu_memory.hpp"
+#include "cpu_primitive.hpp"
+#include "type_helpers.hpp"
+#include "utils.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+struct cpu_shuffle_pd_t: public shuffle_pd_t {
+    using cpu_memory_pd_t = cpu_memory_t::pd_t;
+
+    cpu_shuffle_pd_t(engine_t *engine, const shuffle_desc_t *adesc,
+            const primitive_attr_t *attr, const shuffle_pd_t *hint_fwd_pd)
+        : shuffle_pd_t(engine, adesc, attr, hint_fwd_pd)
+        , data_pd_(engine_, &desc_.data_desc) {}
+    virtual ~cpu_shuffle_pd_t() {}
+
+    virtual const cpu_memory_pd_t *src_pd(int index = 0) const override
+    { return index == 0 && is_fwd() ? &data_pd_ : nullptr; }
+    virtual const cpu_memory_pd_t *dst_pd(int index = 0) const override
+    { return index == 0 && is_fwd() ? &data_pd_ : nullptr; }
+    virtual const cpu_memory_pd_t *diff_dst_pd(int index = 0) const override
+    { return index == 0 && !is_fwd() ? &data_pd_ : nullptr; }
+    virtual const cpu_memory_pd_t *diff_src_pd(int index = 0) const override
+    { return index == 0 && !is_fwd() ? &data_pd_ : nullptr; }
+    const cpu_memory_pd_t *data_pd(int index = 0) const
+    { return index == 0 ? &data_pd_ : nullptr; }
+
+protected:
+    cpu_memory_pd_t data_pd_;
+    virtual status_t init() = 0;
+};
+
+}
+}
+}
+
+#endif
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
index 6ce027e..00769ad 100644 (file)
@@ -83,6 +83,11 @@ protected:
     cpu_memory_t::pd_t dst_pd_;
 
     virtual status_t init() {
+        for (int i = 0; i < n_; ++i) {
+            const memory_desc_wrapper src_pd(&src_pds_[i]);
+            if (!src_pd.is_blocking_desc())
+                return unimplemented;
+        }
         bool ok = true
             && set_default_params() == success
             && attr()->has_default_values();
@@ -95,7 +100,11 @@ protected:
             /* the stupidest ever heuristics */
             for (int i = 0; i < n_; ++i)
                 dst_fmt = nstl::max(dst_fmt, src_pds_[i].desc()->format);
-            CHECK(dst_pd_.set_format(dst_fmt));
+
+            if (dst_fmt == memory_format::blocked)
+                dst_pd_ = src_pds_[0];
+            else
+                CHECK(dst_pd_.set_format(dst_fmt));
         }
 
         return success;
index fe84121..146e688 100644 (file)
@@ -25,6 +25,8 @@
 #include "../jit_generator.hpp"
 #include "nstl.hpp"
 #include "os_blas.hpp"
+#include "math_utils.hpp"
+#include "mkldnn_traits.hpp"
 
 /* USE_MKL      USE_CBLAS       effect
  * -------      ---------       ------
@@ -52,6 +54,7 @@ mkldnn_status_t check_gemm_input(const char *transa, const char *transb,
         && *M >= 0
         && *N >= 0
         && *K >= 0;
+
     if (!consistency) return invalid_arguments;
     bool isTransA = utils::one_of(*transa, 'T', 't');
     bool isTransB = utils::one_of(*transb, 'T', 't');
@@ -66,6 +69,19 @@ mkldnn_status_t check_gemm_input(const char *transa, const char *transb,
     return success;
 }
 
+mkldnn_status_t check_gemm_x8x8x32_input(const char *offsetc,
+        const char *transa, const char *transb, const int *M, const int *N,
+        const int *K, const int *lda, const int *ldb, const int *ldc,
+        const float *alpha, const float *beta, const bool with_bias) {
+
+    if (offsetc == nullptr) return invalid_arguments;
+    if (!utils::one_of(*offsetc, 'F', 'f', 'C', 'c', 'R', 'r'))
+        return invalid_arguments;
+
+    return check_gemm_input(transa, transb, M, N, K, lda, ldb, ldc, alpha,
+        beta, with_bias);
+}
+
 struct gemm_impl_t {
     gemm_impl_t(char transa, char transb, bool zero_beta, bool with_bias) {
         //jit kernel has three codepaths: beta is 0, 1 or arbitrary
@@ -132,7 +148,7 @@ mkldnn_status_t extended_sgemm(const char *transa, const char *transb,
         const int *M, const int *N, const int *K, const float *alpha,
         const float *A, const int *lda, const float *B, const int *ldb,
         const float *beta, float *C, const int *ldc,
-        const float *bias) {
+        const float *bias, const bool force_jit_gemm) {
     //Check input
     mkldnn_status_t status = check_gemm_input(transa, transb, M, N, K,
             lda, ldb, ldc, alpha, beta, bias != nullptr);
@@ -143,20 +159,22 @@ mkldnn_status_t extended_sgemm(const char *transa, const char *transb,
     int trA = *transa == 't' || *transa == 'T';
     int trB = *transb == 't' || *transb == 'T';
 #ifdef USE_CBLAS
-    //Call cblas
-    CBLAS_TRANSPOSE Cblas_trA = trA ? CblasTrans : CblasNoTrans;
-    CBLAS_TRANSPOSE Cblas_trB = trB ? CblasTrans : CblasNoTrans;
-    cblas_sgemm(CblasColMajor, Cblas_trA, Cblas_trB,
-            *M, *N, *K, *alpha, A, *lda, B, *ldb, *beta, C, *ldc);
-    //Add bias if necessary (bias is applied to columns of C)
-    if (bias) {
-        cblas_int incx = 1, incy = 1;
-        parallel_nd(*N, [&](int n) {
-            cblas_saxpy(*M, 1.0, bias, incx, C + n*(*ldc), incy);
-        });
+    if (!force_jit_gemm) {
+        //Call cblas
+        CBLAS_TRANSPOSE Cblas_trA = trA ? CblasTrans : CblasNoTrans;
+        CBLAS_TRANSPOSE Cblas_trB = trB ? CblasTrans : CblasNoTrans;
+        cblas_sgemm(CblasColMajor, Cblas_trA, Cblas_trB,
+                *M, *N, *K, *alpha, A, *lda, B, *ldb, *beta, C, *ldc);
+        //Add bias if necessary (bias is applied to columns of C)
+        if (bias) {
+            cblas_int incx = 1, incy = 1;
+            parallel_nd(*N, [&](int n) {
+                cblas_saxpy(*M, 1.0, bias, incx, C + n*(*ldc), incy);
+            });
+        }
+        return mkldnn_success;
     }
-    return mkldnn_success;
-#else
+#endif
     //Generate jit kernel and call sgemm with bias
     volatile static int initialized = 0;
     if (!initialized) {
@@ -176,9 +194,98 @@ mkldnn_status_t extended_sgemm(const char *transa, const char *transb,
                 transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
 
     return mkldnn_success;
-#endif
 }
 
+template <typename b_dt>
+mkldnn_status_t gemm_s8x8s32(const char *transa, const char *transb,
+        const char *offsetc, const int *M, const int *N, const int *K,
+        const float *alpha, const int8_t *A, const int *LDA, const int8_t *ao,
+        const b_dt *B, const int *LDB, const int8_t *bo, const float *beta,
+        int32_t *C, const int *LDC, const int32_t *co) {
+
+    mkldnn_status_t status = check_gemm_x8x8x32_input(offsetc, transa, transb,
+        M, N, K, LDA, LDB, LDC, alpha, beta, false);
+
+    if (status != mkldnn_success)
+        return status;
+
+    if (*M == 0 || *N == 0 || *K == 0)
+        return mkldnn_success;
+
+    bool OCisR = (*offsetc == 'R' || *offsetc == 'r');
+    bool OCisC = (*offsetc == 'C' || *offsetc == 'c');
+    bool AisN = (*transa == 'N' || *transa == 'n');
+    bool BisN = (*transb == 'N' || *transb == 'n');
+
+#if defined(USE_MKL) && defined(USE_CBLAS)
+    if (data_traits<b_dt>::data_type == data_type::u8) {
+        CBLAS_TRANSPOSE Cblas_trA = AisN ? CblasNoTrans : CblasTrans;
+        CBLAS_TRANSPOSE Cblas_trB = BisN ? CblasNoTrans : CblasTrans;
+        CBLAS_OFFSET Cblas_offsetc =
+            OCisR
+            ? CblasRowOffset
+            : OCisC
+            ? CblasColOffset
+            : CblasFixOffset;
+        cblas_gemm_s8u8s32(CblasColMajor, Cblas_trA, Cblas_trB, Cblas_offsetc,
+            *M, *N, *K, *alpha, A, *LDA, *ao, (b_dt*)B, *LDB, *bo, *beta, C, *LDC, co);
+        return mkldnn_success;
+    }
+#endif
+    int m = *M, n = *N, k = *K, lda = *LDA, ldb = *LDB, ldc = *LDC;
+    size_t sizeA = AisN ? lda * k : lda * m;
+    size_t sizeB = BisN ? ldb * n : ldb * k;
+    size_t sizeC = ldc * n;
+
+    double *dA = (double *)malloc(sizeA * sizeof(double), PAGE_4K);
+    double *dB = (double *)malloc(sizeB * sizeof(double), PAGE_4K);
+    double *dC = (double *)malloc(sizeC * sizeof(double), PAGE_4K);
+
+    if (utils::any_null(dA, dB, dC)) {
+        free(dA);
+        free(dB);
+        free(dC);
+        return mkldnn_out_of_memory;
+    }
+
+    auto da_setter = [=] (int i, int j, double v) { dA[j * lda + i] = v; };
+    auto db_setter = [=] (int i, int j, double v) { dB[j * ldb + i] = v; };
+
+    auto ia_accessor = [=] (int i, int j) { return A[j * lda + i]; };
+    auto ib_accessor = [=] (int i, int j) { return B[j * ldb + i]; };
+
+    const int a_rows = AisN ? m : k;
+    const int a_cols = AisN ? k : m;
+    mkldnn::impl::parallel_nd(a_cols, a_rows, [&](int j, int i) {
+        da_setter(i, j,
+            static_cast<double>(ia_accessor(i, j)) + static_cast<double>(ao[0]));
+    });
+
+    const int b_rows = BisN ? k : n;
+    const int b_cols = BisN ? n : k;
+    mkldnn::impl::parallel_nd(b_cols, b_rows, [&](int j, int i) {
+        db_setter(i, j,
+            static_cast<double>(ib_accessor(i, j)) + static_cast<double>(bo[0]));
+    });
+    double one = 1.0, zero = 0.0;
+    ref_gemm<double>(transa, transb, M, N, K, &one, dA, LDA, dB, LDB, &zero,
+        dC, LDC, nullptr);
+
+    auto i2d = [=] (int32_t v) { return static_cast<double>(v); };
+    auto f2d = [=] (float v)   { return static_cast<double>(v); };
+
+    mkldnn::impl::parallel_nd(n, m, [&] (int j, int i) {
+        double coffset = OCisR ? i2d(co[j]) : OCisC ? i2d(co[i]) : i2d(co[0]);
+        double val = ((*beta == 0.0f) ? 0.0 : f2d(*beta) * i2d(C[i + j * ldc]))
+            + f2d(*alpha) * dC[i + j * ldc] + coffset;
+        C[i + j * ldc] = math::out_round<int32_t>(math::saturate<int32_t>(val));
+    });
+
+    free(dA);
+    free(dB);
+    free(dC);
+    return mkldnn_success;
+}
 }
 }
 }
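
Without MKL's cblas_gemm_s8u8s32, the function falls back to an emulation: A and B are widened to double with the ao/bo offsets folded in, multiplied by ref_gemm<double>, and the result is scaled by alpha, combined with beta * C and the co offset, then rounded and saturated back to int32. A standalone sketch of that final conversion, assuming round-to-nearest (the code above uses the library's own math::out_round/math::saturate):

    #include <cmath>
    #include <cstdint>
    #include <limits>

    // Hypothetical standalone equivalent of
    // math::out_round<int32_t>(math::saturate<int32_t>(val)):
    static int32_t saturate_round_i32(double v) {
        const double lo = std::numeric_limits<int32_t>::min();
        const double hi = std::numeric_limits<int32_t>::max();
        v = v < lo ? lo : v > hi ? hi : v;               // clamp to int32 range
        return static_cast<int32_t>(std::nearbyint(v));  // round (current mode)
    }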
@@ -193,3 +300,23 @@ mkldnn_status_t mkldnn_sgemm(const char *transa, const char *transb,
     return extended_sgemm(
             transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
 }
+
+mkldnn_status_t mkldnn_gemm_s8u8s32(const char *transa, const char *transb,
+        const char *offsetc, const int *M, const int *N, const int *K,
+        const float *alpha, const int8_t *A, const int *lda, const int8_t *ao,
+        const uint8_t *B, const int *ldb, const int8_t *bo, const float *beta,
+        int32_t *c, const int *ldc, const int32_t *co) {
+    return gemm_s8x8s32(
+        transa, transb, offsetc, M, N, K, alpha, A, lda, ao, B, ldb, bo,
+        beta, c, ldc, co);
+}
+
+mkldnn_status_t mkldnn_gemm_s8s8s32(const char *transa, const char *transb,
+        const char *offsetc, const int *M, const int *N, const int *K,
+        const float *alpha, const int8_t *A, const int *lda, const int8_t *ao,
+        const int8_t *B, const int *ldb, const int8_t *bo, const float *beta,
+        int32_t *c, const int *ldc, const int32_t *co) {
+    return gemm_s8x8s32(
+        transa, transb, offsetc, M, N, K, alpha, A, lda, ao, B, ldb, bo,
+        beta, c, ldc, co);
+}
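
The two wrappers differ only in the type of B (uint8_t vs int8_t); both compute C = alpha * (op(A) + ao) * (op(B) + bo) + beta * C plus the co offset. A small usage sketch (assuming this revision's mkldnn.h declares the new entry points, as the definitions above suggest):

    #include <cstdint>
    #include <cstdio>
    #include "mkldnn.h"

    int main() {
        const int M = 2, N = 2, K = 2;
        const float alpha = 1.f, beta = 0.f;
        const int8_t  A[] = {1, 2, 3, 4};   // column-major 2x2, lda = 2
        const uint8_t B[] = {5, 6, 7, 8};   // column-major 2x2, ldb = 2
        int32_t C[4] = {0};
        const int8_t ao = 0, bo = 0;
        const int32_t co[1] = {0};          // offsetc = "F": one fixed offset
        mkldnn_status_t s = mkldnn_gemm_s8u8s32("N", "N", "F", &M, &N, &K,
                &alpha, A, &M, &ao, B, &M, &bo, &beta, C, &M, co);
        if (s == mkldnn_success)
            printf("C[0] = %d\n", C[0]);    // 1*5 + 3*6 = 23
        return 0;
    }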
index 8917de1..3f33a37 100644 (file)
@@ -22,11 +22,20 @@ mkldnn_status_t extended_sgemm(const char *transa, const char *transb,
         const int *M, const int *N, const int *K, const float *alpha,
         const float *A, const int *lda, const float *B, const int *ldb,
         const float *beta, float *C, const int *ldc,
-        const float *bias = nullptr);
+        const float *bias = nullptr, bool force_jit_gemm = false);
+
+template <typename b_dt>
+mkldnn_status_t gemm_s8x8s32(const char *transa, const char *transb,
+        const char *offsetc, const int *M, const int *N, const int *K,
+        const float *alpha, const int8_t *A, const int *lda, const int8_t *ao,
+        const b_dt *B, const int *ldb, const int8_t *bo, const float *beta,
+        int32_t *c, const int *ldc, const int32_t *co);
+
+template <typename data_t>
 void ref_gemm(const char *transa, const char *transb, const int *M,
-        const int *N, const int *K, const float *alpha, const float *A,
-        const int *lda, const float *B, const int *ldb, const float *beta,
-        float *C, const int *ldc, const float *bias);
+        const int *N, const int *K, const data_t *alpha, const data_t *A,
+        const int *lda, const data_t *B, const int *ldb, const data_t *beta,
+        data_t *C, const int *ldc, const data_t *bias);
 #ifdef USE_CBLAS
 #define GEMM_IMPL_STR "gemm:blas"
 #else
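
ref_gemm is now a template over the element type, so the int8 emulation above can run it in double precision while the float path is unchanged. A sketch exercising both instantiations (assuming these declarations are in scope inside mkldnn::impl::cpu):

    void ref_gemm_demo() {
        const int M = 2, N = 2, K = 2, ld = 2;
        const float  f1 = 1.f, f0 = 0.f;
        const double d1 = 1.0, d0 = 0.0;
        float  Af[] = {1, 2, 3, 4}, Bf[] = {1, 0, 0, 1}, Cf[4] = {0};
        double Ad[] = {1, 2, 3, 4}, Bd[] = {1, 0, 0, 1}, Cd[4] = {0};
        ref_gemm<float>("N", "N", &M, &N, &K, &f1, Af, &ld, Bf, &ld, &f0,
                Cf, &ld, nullptr);
        ref_gemm<double>("N", "N", &M, &N, &K, &d1, Ad, &ld, Bd, &ld, &d0,
                Cd, &ld, nullptr);
    }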
index 934ba81..e3b6cff 100644 (file)
@@ -343,8 +343,9 @@ void partition_unit_diff(
 
 // Sum the m*n values from p_src into p_dst, assuming the two-dimensional
 // arrays have leading dimensions ld_src and ld_dst, respectively
+template<typename data_t>
 void sum_two_matrices(
-        int m, int n, float *p_src, int ld_src, float *p_dst, int ld_dst)
+        int m, int n, data_t *p_src, int ld_src, data_t *p_dst, int ld_dst)
 {
     int i, j;
     for (j = 0; j < n; j++) {
@@ -353,6 +354,12 @@ void sum_two_matrices(
         }
     }
 }
+
+template void sum_two_matrices<float>(
+        int m, int n, float *p_src, int ld_src, float *p_dst, int ld_dst);
+
+template void sum_two_matrices<double>(
+        int m, int n, double *p_src, int ld_src, double *p_dst, int ld_dst);
 }
 }
 }
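
The explicit instantiations keep the template's definition in this translation unit while the header only declares it; float and double are the only element types callers can link against. The idiom in isolation (illustrative names):

    // header:
    template <typename T> void scale(int n, T *x, T f);

    // source file: definition plus the only instantiations that get emitted
    template <typename T> void scale(int n, T *x, T f) {
        for (int i = 0; i < n; i++)
            x[i] *= f;
    }
    template void scale<float>(int, float *, float);
    template void scale<double>(int, double *, double);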
index 7a8f7fc..0888787 100644 (file)
@@ -22,8 +22,34 @@ namespace impl {
 namespace cpu {
 
 namespace gemm_utils {
+
+template <typename T, bool isTransA, bool isTransB>
+struct gemm_traits {};
+
+template <bool isTransA, bool isTransB>
+struct gemm_traits<double, isTransA, isTransB> {
+    static constexpr int m = 8;
+    static constexpr int n = 6;
+    static constexpr int BM = 4032;
+    static constexpr int BN = isTransA ? 96 : 192;
+    static constexpr int BK = isTransB ? 96 : 512;
+};
+
+template <bool isTransA, bool isTransB>
+struct gemm_traits<float, isTransA, isTransB> {
+    static constexpr int m = 16;
+    static constexpr int n = 6;
+    static constexpr int BM = 4032;
+    static constexpr int BN = isTransA ? 96 : 48;
+    static constexpr int BK = isTransB ? 96 : 256;
+};
+
+template <typename T>
+using unroll_factor = gemm_traits<T, false, false>;
+
+template <typename data_type>
 void sum_two_matrices(
-        int m, int n, float *p_src, int ld_src, float *p_dst, int ld_dst);
+        int m, int n, data_type *p_src, int ld_src, data_type *p_dst, int ld_dst);
 
 void calc_nthr_nocopy_avx512_common(int m,
         int n, int k, int nthrs, int *nthrs_m, int *nthrs_n, int *nthrs_k,
@@ -35,6 +61,8 @@ void calc_nthr_nocopy_avx(int m, int n, int k,
 
 void partition_unit_diff(
         int ithr, int nthr, int n, int *t_offset, int *t_block);
+
+inline double saturate(double value, double min, double max);
 };
 
 }
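
gemm_traits gathers the blocking constants in one place: m x n is the register tile (halved in m for double, presumably because a vector register holds half as many doubles as floats), while BM/BN/BK are the cache-blocking panel sizes, tuned per transpose layout. The values can be checked at compile time; a sketch (assuming the header above is included):

    #include "gemm_utils.hpp"

    using namespace mkldnn::impl::cpu::gemm_utils;

    static_assert(unroll_factor<float>::m == 16 && unroll_factor<float>::n == 6,
            "float register tile is 16x6");
    static_assert(unroll_factor<double>::m == 8 && unroll_factor<double>::n == 6,
            "double register tile is 8x6");
    static_assert(gemm_traits<float, true, false>::BN == 96,
            "transposed-A float path uses a 96-column B panel");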
index 08ba7b4..8aee85f 100644 (file)
@@ -22,7 +22,7 @@
 #include "gemm_utils.hpp"
 #include "jit_avx512_common_gemm_f32.hpp"
 
-#define CACHE_LINE_SIZE 16
+#define CACHE_LINE_SIZE 64
 
 namespace mkldnn {
 namespace impl {
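
CACHE_LINE_SIZE is now expressed in bytes: the old code strided an unsigned int array by 16 elements (16 x 4 = 64 bytes), while the status array below becomes unsigned char, so the stride itself must be 64. A standalone sketch of the padding pattern this supports (C++17 aligned_alloc; my own example, not patch code):

    #include <cstdlib>
    #include <cstring>

    constexpr int CACHE_LINE_SIZE = 64;  // bytes

    int main() {
        const int nthr = 8;
        // One status byte per thread, padded to a full line so concurrent
        // writers never dirty each other's cache line (false sharing):
        auto *status = static_cast<unsigned char *>(
                std::aligned_alloc(CACHE_LINE_SIZE, nthr * CACHE_LINE_SIZE));
        std::memset(status, 0, nthr * CACHE_LINE_SIZE);
        status[3 * CACHE_LINE_SIZE] = 1;  // thread 3 flags completion
        std::free(status);
        return 0;
    }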
@@ -128,15 +128,16 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
 
         // Function for packing if needed
         auto do_pack = [&](int unroll_m) {
-            inLocalLabel();
+            Label pack2, pack3, pack4, pack10;
+
             mov(BO1, A);
             lea(AO1, ptr[rsp + 128 + OFFSET * SIZE]);
             mov(LL, K);
             sar(LL, 2);
-            jle(".pack3", T_NEAR);
+            jle(pack3, T_NEAR);
             align(16);
 
-            L(".pack2");
+            L(pack2);
             if (!isTransA) {
                 for (int i = 0; i < 4; i++) {
                     vmovups(zmm0 | k1, ptr[BO1 + (0 * 16 - OFFSET) * SIZE]);
@@ -216,16 +217,16 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
             add(AO1, unroll_m * 4 * SIZE);
 
             sub(LL, 1);
-            jg(".pack2", T_NEAR);
+            jg(pack2, T_NEAR);
             align(16);
 
-            L(".pack3");
+            L(pack3);
             mov(LL, K);
             and_(LL, 3);
-            jle(".pack10", T_NEAR);
+            jle(pack10, T_NEAR);
             align(16);
 
-            L(".pack4");
+            L(pack4);
             if (!isTransA) {
                 vmovups(zmm0 | k1, ptr[BO1 + (0 * 16 - OFFSET) * SIZE]);
                 if (unroll_m > 16)
@@ -279,11 +280,10 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
 
             add(AO1, unroll_m * SIZE);
             sub(LL, 1);
-            jg(".pack4", T_NEAR);
+            jg(pack4, T_NEAR);
             align(16);
 
-            L(".pack10");
-            outLocalLabel();
+            L(pack10);
         };
 
         // Function to update C, covering masking and other considerations
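
This is the recurring change throughout both JIT files: Xbyak string labels such as ".pack2" share one namespace and needed inLocalLabel()/outLocalLabel() around every lambda expansion, whereas Xbyak::Label objects are unique by identity, so a lambda emitted several times cannot collide with itself and the scoping calls disappear. A minimal standalone sketch of the idiom (not patch code):

    #include "xbyak/xbyak.h"

    struct loop_demo : Xbyak::CodeGenerator {
        loop_demo() {
            // before: inLocalLabel(); L(".loop"); ... jg(".loop"); outLocalLabel();
            Xbyak::Label loop;     // unique per constructor invocation
            mov(rcx, 8);
            L(loop);
            dec(rcx);
            jg(loop, T_NEAR);
            ret();
        }
    };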
@@ -617,8 +617,6 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
         // Innerkernel; called by kernel
         auto innerkernel = [&](int unroll_m, int unroll_n, bool isDirect,
                 bool isCopy, bool doCPrefetch, bool isUnmasked = true) {
-            inLocalLabel();
-
             for (int i = 0; i < 8; i++) {
                 if (!isDirect) {
                     prefetcht0(ptr[AO1
@@ -960,7 +958,6 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
             }
 
             sub(LL, 1);
-            outLocalLabel();
         };
 
         // Main kernel; does prefetching and calls innerkernel
@@ -968,7 +965,6 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
         // calling update
         auto kernel = [&](int unroll_m, int unroll_n, bool isDirect,
                 bool isCopy, bool isUnmasked = true) {
-            inLocalLabel();
             if (!isDirect) {
                 lea(AO1, ptr[rsp + 128 + OFFSET * SIZE]);
             } else {
@@ -1020,36 +1016,38 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
                 }
             }
 
+            Label kernel12, kernel13, kernel14, kernel15, kernel16, kernel18;
+
             mov(LL, K);
             sar(LL, 3);
             sub(LL, SECOND_FETCH);
-            jle(".kernel13", T_NEAR);
+            jle(kernel13, T_NEAR);
             align(16);
 
-            L(".kernel12");
+            L(kernel12);
             innerkernel(
                     unroll_m, unroll_n, isDirect, isCopy, false, isUnmasked);
-            jg(".kernel12", T_NEAR);
+            jg(kernel12, T_NEAR);
             align(16);
 
-            L(".kernel13");
+            L(kernel13);
             lea(CO2, ptr[CO1 + (16 - 1) * SIZE]);
             add(LL, unroll_n);
-            jle(".kernel15", T_NEAR);
+            jle(kernel15, T_NEAR);
             align(16);
 
-            L(".kernel14");
+            L(kernel14);
             innerkernel(unroll_m, unroll_n, isDirect, isCopy, true, isUnmasked);
-            jg(".kernel14", T_NEAR);
+            jg(kernel14, T_NEAR);
             align(16);
 
-            L(".kernel15");
+            L(kernel15);
             mov(LL, K);
             and_(LL, 7);
-            jle(".kernel18", T_NEAR);
+            jle(kernel18, T_NEAR);
             align(16);
 
-            L(".kernel16");
+            L(kernel16);
             if (isDirect) {
                 if (isUnmasked || unroll_m > 16) {
                     vmovups(zmm0, ptr[AO1 + (0 * 16 - OFFSET) * SIZE]);
@@ -1204,10 +1202,10 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
             }
 
             sub(LL, 1);
-            jg(".kernel16", T_NEAR);
+            jg(kernel16, T_NEAR);
             align(16);
 
-            L(".kernel18");
+            L(kernel18);
             vbroadcastss(VALPHA, ALPHA);
 
             if (isBetaN) {
@@ -1329,8 +1327,6 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
                 sub(BO1, rax);
                 add(BO1, unroll_n * SIZE);
             }
-
-            outLocalLabel();
         };
 
         // High-level subroutine; does packing if needed, then splits C matrix.
@@ -1338,11 +1334,16 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
         // cases appropriately by doing 32 or 16 rows, and/or with masking,
         // and/or fewer columns).
         auto subloop = [&](int unroll_m) {
-            inLocalLabel();
-
             Label l_subloop_20x[8], l_subloop_mask_20x[8];
             Label l_subloop_30x[8], l_subloop_mask_30x[8];
 
+            Label subloop11, subloop11mask;
+            Label subloop30, subloop30mask;
+            Label subloop31, subloop31mask;
+            Label subloop96;
+            Label subloop98, subloop98mask;
+            Label subloop99;
+
             // Create mask
             mov(BO1, rcx);
             mov(rcx, M);
@@ -1370,7 +1371,7 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
 
             and_(rax, 0xffff);
             cmp(rax, 0xffff);
-            jne(".subloop96", T_NEAR);
+            jne(subloop96, T_NEAR);
 
             if (isTransA) {
                 do_pack(unroll_m);
@@ -1387,11 +1388,11 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
             if (!isTransA) {
                 lea(AA, ptr[A + (unroll_m + 16 - 1 - OFFSET) * SIZE]);
                 cmp(M, UNROLL_M);
-                jg(".subloop98", T_NEAR);
+                jg(subloop98, T_NEAR);
 
                 mov(AA, ORIG_A);
                 lea(AA, ptr[AA + (16 - 1 - OFFSET) * SIZE]);
-                L(".subloop98");
+                L(subloop98);
             }
 
             mov(LL, N);
@@ -1399,11 +1400,11 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
             if (!isTransA) {
                 // If N is too small, skip copy operation
                 cmp(LL, UNROLL_N * 3);
-                jle(".subloop30", T_NEAR);
+                jle(subloop30, T_NEAR);
 
                 // If A is not aligned to cache line
                 cmp(FLAG, 0);
-                je(".subloop30", T_NEAR);
+                je(subloop30, T_NEAR);
             } else {
                 cmp(LL, UNROLL_N);
                 jl(l_subloop_20x[1], T_NEAR);
@@ -1421,11 +1422,11 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
             jl(l_subloop_20x[1], T_NEAR);
             align(16);
 
-            L(".subloop11");
+            L(subloop11);
             kernel(unroll_m, UNROLL_N, false, false);
             sub(I, UNROLL_N);
             cmp(I, UNROLL_N);
-            jge(".subloop11", T_NEAR);
+            jge(subloop11, T_NEAR);
             align(16);
 
             for (int i = 1; i <= 7; i++) {
@@ -1434,24 +1435,24 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
                 if (i < 7) {
                     jne(l_subloop_20x[i + 1], T_NEAR);
                 } else {
-                    jne(".subloop99", T_NEAR);
+                    jne(subloop99, T_NEAR);
                 }
                 kernel(unroll_m, i, false, false);
-                jmp(".subloop99", T_NEAR);
+                jmp(subloop99, T_NEAR);
                 align(16);
             }
 
             if (!isTransA) {
-                L(".subloop30");
+                L(subloop30);
                 cmp(I, UNROLL_N);
                 jl(l_subloop_30x[1], T_NEAR);
                 align(16);
 
-                L(".subloop31");
+                L(subloop31);
                 kernel(unroll_m, UNROLL_N, true, false);
                 sub(I, UNROLL_N);
                 cmp(I, UNROLL_N);
-                jge(".subloop31", T_NEAR);
+                jge(subloop31, T_NEAR);
                 align(16);
 
                 for (int i = 1; i <= 7; i++) {
@@ -1460,18 +1461,18 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
                     if (i < 7) {
                         jne(l_subloop_30x[i + 1], T_NEAR);
                     } else {
-                        jne(".subloop99", T_NEAR);
+                        jne(subloop99, T_NEAR);
                     }
                     kernel(unroll_m, i, true, false);
                     if (i < 7)
-                        jmp(".subloop99", T_NEAR);
+                        jmp(subloop99, T_NEAR);
                     align(16);
                 }
             }
-            jmp(".subloop99", T_NEAR);
+            jmp(subloop99, T_NEAR);
             align(16);
 
-            L(".subloop96");
+            L(subloop96);
             if (isTransA) {
                 do_pack(unroll_m);
             }
@@ -1486,10 +1487,10 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
             if (!isTransA) {
                 lea(AA, ptr[A + (unroll_m + 16 - 1 - OFFSET) * SIZE]);
                 cmp(M, UNROLL_M);
-                jg(".subloop98mask", T_NEAR);
+                jg(subloop98mask, T_NEAR);
                 mov(AA, ORIG_A);
                 lea(AA, ptr[AA + (16 - 1 - OFFSET) * SIZE]);
-                L(".subloop98mask");
+                L(subloop98mask);
             }
 
             mov(LL, N);
@@ -1497,11 +1498,11 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
             if (!isTransA) {
                 // If N is too small, skip copy operation
                 cmp(LL, UNROLL_N * 3);
-                jle(".subloop30mask", T_NEAR);
+                jle(subloop30mask, T_NEAR);
 
                 // If A is not aligned to cache line
                 cmp(FLAG, 0);
-                je(".subloop30mask", T_NEAR);
+                je(subloop30mask, T_NEAR);
             } else {
                 cmp(LL, UNROLL_N);
                 jl(l_subloop_mask_20x[1], T_NEAR);
@@ -1519,11 +1520,11 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
             jl(l_subloop_mask_20x[1], T_NEAR);
             align(16);
 
-            L(".subloop11mask");
+            L(subloop11mask);
             kernel(unroll_m, UNROLL_N, false, false, false);
             sub(I, UNROLL_N);
             cmp(I, UNROLL_N);
-            jge(".subloop11mask", T_NEAR);
+            jge(subloop11mask, T_NEAR);
             align(16);
 
             for (int i = 1; i <= 7; i++) {
@@ -1532,24 +1533,24 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
                 if (i < 7) {
                     jne(l_subloop_mask_20x[i + 1], T_NEAR);
                 } else {
-                    jne(".subloop99", T_NEAR);
+                    jne(subloop99, T_NEAR);
                 }
                 kernel(unroll_m, i, false, false, false);
-                jmp(".subloop99", T_NEAR);
+                jmp(subloop99, T_NEAR);
                 align(16);
             }
 
             if (!isTransA) {
-                L(".subloop30mask");
+                L(subloop30mask);
                 cmp(I, UNROLL_N);
                 jl(l_subloop_mask_30x[1], T_NEAR);
                 align(16);
 
-                L(".subloop31mask");
+                L(subloop31mask);
                 kernel(unroll_m, UNROLL_N, true, false, false);
                 sub(I, UNROLL_N);
                 cmp(I, UNROLL_N);
-                jge(".subloop31mask", T_NEAR);
+                jge(subloop31mask, T_NEAR);
                 align(16);
 
                 for (int i = 1; i <= 7; i++) {
@@ -1558,16 +1559,16 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
                     if (i < 7) {
                         jne(l_subloop_mask_30x[i + 1], T_NEAR);
                     } else {
-                        jne(".subloop99", T_NEAR);
+                        jne(subloop99, T_NEAR);
                     }
                     kernel(unroll_m, i, true, false, false);
                     if (i < 7)
-                        jmp(".subloop99", T_NEAR);
+                        jmp(subloop99, T_NEAR);
                     align(16);
                 }
             }
 
-            L(".subloop99");
+            L(subloop99);
             // Compute address for A
             if (!isTransA) {
                 add(A, unroll_m * SIZE);
@@ -1581,14 +1582,12 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
             if (hasBias) {
                 add(BIAS, unroll_m * SIZE);
             }
-
-            outLocalLabel();
         };
 
-        inLocalLabel();
-
         preamble();
 
+        Label buffer_in_ws, buffer_allocated;
+
         // Get the registers
         mov(B, ARG_B);
         mov(LDB, ARG_LDB);
@@ -1608,7 +1607,7 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
 #endif
 
         cmp(K, STACK_K_CAPACITY);
-        jg(".buffer_in_ws", T_NEAR);
+        jg(buffer_in_ws, T_NEAR);
 
         // Create buffer and align to 4kB page
         lea(rax, ptr[K * SIZE]);
@@ -1616,12 +1615,12 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
         add(rax, 256);
         sub(rsp, rax);
         and_(rsp, -PAGE_4K);
-        jmp(".buffer_allocated", T_NEAR);
+        jmp(buffer_allocated, T_NEAR);
 
-        L(".buffer_in_ws");
+        L(buffer_in_ws);
         mov(rsp, ARG_WS);
 
-        L(".buffer_allocated");
+        L(buffer_allocated);
 
         mov(ORIG_SP, rbp);
         mov(M, ARG_M);
@@ -1665,40 +1664,40 @@ struct jit_avx512_common_gemm_f32::xbyak_gemm : public jit_generator {
             }
         }
 
+        Label main0, main1, main2, main999;
+
         cmp(M, 32);
-        jle(".main0", T_NEAR);
+        jle(main0, T_NEAR);
         align(16);
 
-        L(".main1");
+        L(main1);
         subloop(48);
         sub(M, UNROLL_M);
         cmp(M, 32);
-        jg(".main1", T_NEAR);
+        jg(main1, T_NEAR);
         align(16);
 
-        L(".main0");
+        L(main0);
         cmp(M, 16);
-        jle(".main2", T_NEAR);
+        jle(main2, T_NEAR);
 
         subloop(32);
-        jmp(".main999", T_NEAR);
+        jmp(main999, T_NEAR);
         align(16);
 
-        L(".main2");
+        L(main2);
         cmp(M, 0);
-        jle(".main999", T_NEAR);
+        jle(main999, T_NEAR);
         subloop(16);
         align(16);
 
-        L(".main999");
+        L(main999);
         // Restore original stack
         mov(rsp, ORIG_SP);
 
         vzeroupper();
         postamble();
 
-        outLocalLabel();
-
         ker_ = reinterpret_cast<decltype(ker_)>(
                 const_cast<uint8_t *>(this->getCode()));
     }
@@ -1763,7 +1762,7 @@ void jit_avx512_common_gemm_f32::sgemm_nocopy_driver(const char *transa,
         if (!isTransA && !isTransB)
             BK = 128;
     }
-    const float *curA, *curB, *curBias = NULL;
+    const float *curA, *curB, *curBias = nullptr;
     float *curC;
 
     for (Bk = 0; Bk < k; Bk += sizeK) {
@@ -1804,15 +1803,15 @@ void jit_avx512_common_gemm_f32::sgemm_nocopy_driver(const char *transa,
                     curB = b + Bn + (size_t)Bk * ldb;
                 }
                 curC = c + Bm + (size_t)Bn * ldc;
-                if (bias != NULL) {
+                if (bias != nullptr) {
                     if (Bk == 0) {
                         curBias = bias + Bm;
                     } else {
-                        curBias = NULL;
+                        curBias = nullptr;
                     }
                 }
                 if (Bk == 0) {
-                    if (*beta == 0.0 && bias == NULL)
+                    if (*beta == 0.0 && bias == nullptr)
                         (*ker_b0_)((long long int)sizeM, (long long int)sizeN,
                                 (long long int)sizeK, alpha, curA,
                                 (long long int)lda, curB, (long long int)ldb,
@@ -1860,7 +1859,7 @@ void jit_avx512_common_gemm_f32::sgemm(const char *transa, const char *transb,
     // Determine threading partitioning
     gemm_utils::calc_nthr_nocopy_avx512_common(
             m, n, k, nthr, &nthr_m, &nthr_n, &nthr_k, &MB, &NB, &KB);
-    assert(utils::implication(!mkldnn_thr_syncable(), nthr_k == 1));
+    assert(IMPLICATION(!mkldnn_thr_syncable(), nthr_k == 1));
 
     // May not happen, but just in case
     if (nthr < nthr_m * nthr_n * nthr_k)
@@ -1868,13 +1867,18 @@ void jit_avx512_common_gemm_f32::sgemm(const char *transa, const char *transb,
 
     nthr_mn = nthr_m * nthr_n;
 
-    unsigned int volatile *ompstatus = (unsigned int volatile *)ompstatus_;
-    if (!ompstatus) return;
+    unsigned char * ompstatus_ = nullptr;
+    unsigned char volatile *ompstatus = nullptr;
 
-    float *c_buffers = NULL;
-    float *ws_buffers = NULL;
+    float *c_buffers = nullptr;
+    float *ws_buffers = nullptr;
 
     if (nthr_k > 1) {
+        ompstatus_ = (unsigned char *) malloc(
+                nthr * CACHE_LINE_SIZE,
+                CACHE_LINE_SIZE);
+        ompstatus = (unsigned char volatile *) ompstatus_;
+        assert(ompstatus);
         for (int i = 0; i < nthr; i++)
             ompstatus[i * CACHE_LINE_SIZE] = 0;
 
@@ -1895,7 +1899,7 @@ void jit_avx512_common_gemm_f32::sgemm(const char *transa, const char *transb,
         int n_from, n_to, myN;
         int k_from, k_to, myK;
         int cbase, ibase;
-        const float *myA, *myB, *myBias = NULL;
+        const float *myA, *myB, *myBias = nullptr;
         float *myC = C, myBeta;
         float *ws = ws_buffers ?
                 ws_buffers + ithr * ws_size_per_thr / sizeof(float) : 0;
@@ -1957,7 +1961,7 @@ void jit_avx512_common_gemm_f32::sgemm(const char *transa, const char *transb,
                     myC = c_buffers + MB * NB * (cbase + ithr_k - 1);
                     myBeta = 0.0;
                     ld = MB;
-                    myBias = NULL;
+                    myBias = nullptr;
                 }
 
                 sgemm_nocopy_driver(transa, transb, myM, myN, myK, p_alpha, myA,
@@ -2004,8 +2008,8 @@ void jit_avx512_common_gemm_f32::sgemm(const char *transa, const char *transb,
         }
     });
 
-    if (nthr_k > 1)
-        free(c_buffers);
+    free(c_buffers);
+    free(ompstatus_);
     free(ws_buffers);
 }
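
Besides deferring the allocation to the only branch that needs it (nthr_k > 1; it was previously paid in the constructor on every instance), this fixes the old size expression, which multiplied by sizeof(unsigned int *) rather than the element size. The now-unconditional frees are safe since freeing a null pointer is a no-op. A quick check of the arithmetic (LP64 assumed):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t nthr = 8;
        // old: sizeof(unsigned int *) * nthrs_ * CACHE_LINE_SIZE, with the
        // constant still 16 -> 8 * 8 * 16 = 1024 bytes, always allocated
        size_t old_bytes = sizeof(unsigned int *) * nthr * 16;
        // new: one 64-byte line of bytes per thread -> 8 * 64 = 512 bytes
        size_t new_bytes = nthr * 64;
        printf("%zu -> %zu bytes\n", old_bytes, new_bytes);
        return 0;
    }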
 
@@ -2032,10 +2036,6 @@ jit_avx512_common_gemm_f32::jit_avx512_common_gemm_f32(
     }
 
     nthrs_ = mkldnn_get_max_threads();
-    ompstatus_ = (unsigned int *)malloc(
-        sizeof(unsigned int *) * nthrs_ * CACHE_LINE_SIZE, 64);
-    assert(ompstatus_);
-
 }
 
 jit_avx512_common_gemm_f32::~jit_avx512_common_gemm_f32()
@@ -2045,7 +2045,6 @@ jit_avx512_common_gemm_f32::~jit_avx512_common_gemm_f32()
         delete ker_b1_;
     if (beta_ != 0.0 || (beta_ == 0.0 && hasBias_))
         delete ker_b0_;
-    free(ompstatus_);
 }
 }
 }
index ede1cf9..c057335 100644 (file)
@@ -49,7 +49,6 @@ private:
     bool hasBias_;
     struct xbyak_gemm;
     xbyak_gemm *ker_bn_, *ker_b1_, *ker_b0_;
-    unsigned int *ompstatus_;
     int nthrs_;
 };
 }
index 9766a46..354fa0b 100644 (file)
@@ -21,7 +21,7 @@
 #include "gemm_utils.hpp"
 #include "jit_avx_gemm_f32.hpp"
 
-#define CACHE_LINE_SIZE 16
+#define CACHE_LINE_SIZE 64
 
 namespace mkldnn {
 namespace impl {
@@ -51,7 +51,7 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
         : jit_generator(code_ptr, code_size)
     {
         const bool is_avx2 = mayiuse(avx2);
-        assert(implication(!is_avx2, mayiuse(avx)));
+        assert(IMPLICATION(!is_avx2, mayiuse(avx)));
 
         const int UNROLL_M = is_avx2 ? 16 : 8;
         const int UNROLL_N = 6;
@@ -128,10 +128,10 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
         // Function for packing if needed
         auto do_pack = [&](
                 int unroll_m, bool isLoad1Unmasked, bool isLoad2Unmasked) {
+            Label pack2, pack3, pack4, pack10;
 
             int regIdx;
             Reg64 reg;
-            inLocalLabel();
 
             mov(BO1, A);
             lea(AO1, ptr[rsp + 256 + OFFSET * SIZE]);
@@ -144,10 +144,10 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
 
             mov(LL, K);
             sar(LL, 2);
-            jle(".pack3", T_NEAR);
+            jle(pack3, T_NEAR);
             align(16);
 
-            L(".pack2");
+            L(pack2);
             if (!isTransA) {
                 for (int i = 0; i < 4; i++) {
                     regIdx = (i % 2 == 0) ? 4 : 6;
@@ -396,16 +396,16 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
 
             add(AO1, unroll_m * 4 * SIZE);
             sub(LL, 1);
-            jg(".pack2", T_NEAR);
+            jg(pack2, T_NEAR);
             align(16);
 
-            L(".pack3");
+            L(pack3);
             mov(LL, K);
             and_(LL, 3);
-            jle(".pack10", T_NEAR);
+            jle(pack10, T_NEAR);
             align(16);
 
-            L(".pack4");
+            L(pack4);
             if (!isTransA) {
                 if (isLoad1Unmasked) {
                     vmovups(ymm4, ptr[BO1 + (0 * 8 - OFFSET) * SIZE]);
@@ -542,12 +542,10 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
 
             add(AO1, unroll_m * SIZE);
             sub(LL, 1);
-            jg(".pack4", T_NEAR);
+            jg(pack4, T_NEAR);
             align(16);
 
-            L(".pack10");
-
-            outLocalLabel();
+            L(pack10);
         };
 
         // Fused multiply add; may become one or two instructions
@@ -1382,8 +1380,6 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                 Ymm reg15 = Ymm(7), Ymm reg16 = Ymm(8), Ymm reg17 = Ymm(9),
                 Ymm reg18 = Ymm(10), Ymm reg19 = Ymm(11), Ymm reg20 = Ymm(12),
                 Ymm reg21 = Ymm(13), Ymm reg22 = Ymm(14), Ymm reg23 = Ymm(15)) {
-            inLocalLabel();
-
             if (!isDirect) {
                 lea(AO1, ptr[rsp + 256 + OFFSET * SIZE]);
             } else {
@@ -1431,20 +1427,23 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
             mov(LL, K);
             sar(LL, 3);
 
+            Label kernel12, kernel13, kernel14, kernel15;
+            Label kernel16, kernel17, kernel18;
+
             sub(LL, SECOND_FETCH);
-            jle(".kernel13", T_NEAR);
+            jle(kernel13, T_NEAR);
             align(16);
 
-            L(".kernel12");
+            L(kernel12);
             innerkernel8(unroll_m, unroll_n, isLoad1Unmasked, isLoad2Unmasked,
                     isDirect, isCopy, useFma, reg00, reg01, reg02, reg03, reg04,
                     reg05, reg06, reg07, reg08, reg09, reg10, reg11, reg12,
                     reg13, reg14, reg15, reg16, reg17, reg18, reg19, reg20,
                     reg21, reg22, reg23);
-            jg(".kernel12", T_NEAR);
+            jg(kernel12, T_NEAR);
             align(16);
 
-            L(".kernel13");
+            L(kernel13);
             prefetcht0(ptr[CO1 + (unroll_m - 1) * SIZE]);
             if (unroll_n >= 2)
                 prefetcht0(ptr[CO1 + LDC + (unroll_m - 1) * SIZE]);
@@ -1458,30 +1457,30 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                 prefetcht0(ptr[CO2 + LDC * 2 + (unroll_m - 1) * SIZE]);
 
             add(LL, SECOND_FETCH);
-            jle(".kernel15", T_NEAR);
+            jle(kernel15, T_NEAR);
             align(16);
 
-            L(".kernel14");
+            L(kernel14);
             innerkernel8(unroll_m, unroll_n, isLoad1Unmasked, isLoad2Unmasked,
                     isDirect, isCopy, useFma, reg00, reg01, reg02, reg03, reg04,
                     reg05, reg06, reg07, reg08, reg09, reg10, reg11, reg12,
                     reg13, reg14, reg15, reg16, reg17, reg18, reg19, reg20,
                     reg21, reg22, reg23);
-            jg(".kernel14", T_NEAR);
+            jg(kernel14, T_NEAR);
             align(16);
 
-            L(".kernel15");
+            L(kernel15);
             test(K, 4);
-            jle(".kernel16", T_NEAR);
+            jle(kernel16, T_NEAR);
             innerkernel4(unroll_m, unroll_n, isLoad1Unmasked, isLoad2Unmasked,
                     isDirect, isCopy, useFma, reg00, reg01, reg02, reg03, reg04,
                     reg05, reg06, reg07, reg08, reg09, reg10, reg11, reg12,
                     reg13, reg14, reg15, reg16, reg17, reg18, reg19, reg20,
                     reg21, reg22, reg23);
 
-            L(".kernel16");
+            L(kernel16);
             test(K, 2);
-            jle(".kernel17", T_NEAR);
+            jle(kernel17, T_NEAR);
             innerkernel2(unroll_m, unroll_n, isLoad1Unmasked, isLoad2Unmasked,
                     isDirect, isCopy, useFma, reg00, reg01, reg02, reg03, reg04,
                     reg05, reg06, reg07, reg08, reg09, reg10, reg11, reg12,
@@ -1489,7 +1488,7 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                     reg21, reg22, reg23);
             align(16);
 
-            L(".kernel17");
+            L(kernel17);
             if (unroll_m == 16) {
                 if (unroll_n <= 3) {
                     vaddps(reg00, reg00, reg12);
@@ -1511,13 +1510,13 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
             }
 
             test(K, 1);
-            jle(".kernel18", T_NEAR);
+            jle(kernel18, T_NEAR);
             innerkernel1(unroll_m, unroll_n, isLoad1Unmasked, isLoad2Unmasked,
                     isDirect, isCopy, useFma, reg00, reg01, reg02, reg03, reg04,
                     reg05, reg06, reg07, reg08, reg09, reg10, reg11);
             align(16);
 
-            L(".kernel18");
+            L(kernel18);
             vbroadcastss(VALPHA, ALPHA);
 
             if (isBetaN) {
@@ -1804,8 +1803,6 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                 sub(BO1, rax);
                 add(BO1, unroll_n * SIZE);
             }
-
-            outLocalLabel();
         };
 
         auto kernel_16x6 = [&](int unroll_m, int unroll_n, bool isLoad1Unmasked,
@@ -1898,12 +1895,18 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
         // Masking is used for tail cases where M is not divisible by 8.
         auto subloop = [&](
                 int unroll_m, bool isLoad1Unmasked, bool isLoad2Unmasked) {
-            inLocalLabel();
-
             if (isTransA) {
                 do_pack(unroll_m, isLoad1Unmasked, isLoad2Unmasked);
             }
 
+            Label subloop11, subloop11mask;
+            Label subloop20, subloop21, subloop22, subloop23;
+            Label subloop24, subloop25;
+            Label subloop30, subloop31, subloop32, subloop33;
+            Label subloop34, subloop35;
+            Label subloop98, subloop98mask;
+            Label subloop99, subloop99mask;
+
             mov(CO1, C);
             lea(CO2, ptr[CO1 + LDC * 2]);
             add(CO2, LDC);
@@ -1916,11 +1919,11 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
             if (!isTransA) {
                 lea(AA, ptr[A + (unroll_m * 2 - 1 - OFFSET) * SIZE]);
                 cmp(M, UNROLL_M);
-                jg(".subloop98", T_NEAR);
+                jg(subloop98, T_NEAR);
 
                 mov(AA, ORIG_A);
                 lea(AA, ptr[AA + (unroll_m - 1 - OFFSET) * SIZE]);
-                L(".subloop98");
+                L(subloop98);
             }
 
             mov(LL, N);
@@ -1928,14 +1931,14 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
             if (!isTransA) {
                 // If N is too small, skip copy operation
                 cmp(LL, UNROLL_N * 3);
-                jle(".subloop30", T_NEAR);
+                jle(subloop30, T_NEAR);
 
                 // If A is not aligned to cache line
                 cmp(FLAG, 0);
-                je(".subloop30", T_NEAR);
+                je(subloop30, T_NEAR);
             } else {
                 cmp(LL, UNROLL_N);
-                jl(".subloop20", T_NEAR);
+                jl(subloop20, T_NEAR);
             }
             align(16);
 
@@ -1959,10 +1962,10 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
 
             sub(I, UNROLL_N);
             cmp(I, UNROLL_N);
-            jl(".subloop20", T_NEAR);
+            jl(subloop20, T_NEAR);
             align(16);
 
-            L(".subloop11");
+            L(subloop11);
             if (unroll_m == 16) {
                 kernel_16x6(unroll_m, UNROLL_N, isLoad1Unmasked,
                         isLoad2Unmasked, false, false);
@@ -1972,12 +1975,12 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
             }
             sub(I, UNROLL_N);
             cmp(I, UNROLL_N);
-            jge(".subloop11", T_NEAR);
+            jge(subloop11, T_NEAR);
             align(16);
 
-            L(".subloop20");
+            L(subloop20);
             cmp(I, 1);
-            jne(".subloop21", T_NEAR);
+            jne(subloop21, T_NEAR);
             if (unroll_m == 16) {
                 kernel_16x1(unroll_m, 1, isLoad1Unmasked, isLoad2Unmasked,
                         false, false);
@@ -1985,12 +1988,12 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                 kernel_8x1(unroll_m, 1, isLoad1Unmasked, isLoad2Unmasked, false,
                         false);
             }
-            jmp(".subloop99", T_NEAR);
+            jmp(subloop99, T_NEAR);
             align(16);
 
-            L(".subloop21");
+            L(subloop21);
             cmp(I, 2);
-            jne(".subloop22", T_NEAR);
+            jne(subloop22, T_NEAR);
             if (unroll_m == 16) {
                 kernel_16x2(unroll_m, 2, isLoad1Unmasked, isLoad2Unmasked,
                         false, false);
@@ -1998,12 +2001,12 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                 kernel_8x2(unroll_m, 2, isLoad1Unmasked, isLoad2Unmasked, false,
                         false);
             }
-            jmp(".subloop99", T_NEAR);
+            jmp(subloop99, T_NEAR);
             align(16);
 
-            L(".subloop22");
+            L(subloop22);
             cmp(I, 3);
-            jne(".subloop23", T_NEAR);
+            jne(subloop23, T_NEAR);
             if (unroll_m == 16) {
                 kernel_16x3(unroll_m, 3, isLoad1Unmasked, isLoad2Unmasked,
                         false, false);
@@ -2011,12 +2014,12 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                 kernel_8x3(unroll_m, 3, isLoad1Unmasked, isLoad2Unmasked, false,
                         false);
             }
-            jmp(".subloop99", T_NEAR);
+            jmp(subloop99, T_NEAR);
             align(16);
 
-            L(".subloop23");
+            L(subloop23);
             cmp(I, 4);
-            jne(".subloop24", T_NEAR);
+            jne(subloop24, T_NEAR);
             if (unroll_m == 16) {
                 kernel_16x4(unroll_m, 4, isLoad1Unmasked, isLoad2Unmasked,
                         false, false);
@@ -2024,12 +2027,12 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                 kernel_8x4(unroll_m, 4, isLoad1Unmasked, isLoad2Unmasked, false,
                         false);
             }
-            jmp(".subloop99", T_NEAR);
+            jmp(subloop99, T_NEAR);
             align(16);
 
-            L(".subloop24");
+            L(subloop24);
             cmp(I, 5);
-            jne(".subloop99", T_NEAR);
+            jne(subloop99, T_NEAR);
             if (unroll_m == 16) {
                 kernel_16x5(unroll_m, 5, isLoad1Unmasked, isLoad2Unmasked,
                         false, false);
@@ -2037,16 +2040,16 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                 kernel_8x5(unroll_m, 5, isLoad1Unmasked, isLoad2Unmasked, false,
                         false);
             }
-            jmp(".subloop99", T_NEAR);
+            jmp(subloop99, T_NEAR);
             align(16);
 
             if (!isTransA) {
-                L(".subloop30");
+                L(subloop30);
                 cmp(I, UNROLL_N);
-                jl(".subloop25", T_NEAR);
+                jl(subloop25, T_NEAR);
                 align(16);
 
-                L(".subloop31");
+                L(subloop31);
                 if (unroll_m == 16) {
                     kernel_16x6(unroll_m, UNROLL_N, isLoad1Unmasked,
                             isLoad2Unmasked, true, false);
@@ -2056,12 +2059,12 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                 }
                 sub(I, UNROLL_N);
                 cmp(I, UNROLL_N);
-                jge(".subloop31", T_NEAR);
+                jge(subloop31, T_NEAR);
                 align(16);
 
-                L(".subloop25");
+                L(subloop25);
                 cmp(I, 1);
-                jne(".subloop32", T_NEAR);
+                jne(subloop32, T_NEAR);
                 if (unroll_m == 16) {
                     kernel_16x1(unroll_m, 1, isLoad1Unmasked, isLoad2Unmasked,
                             true, false);
@@ -2069,12 +2072,12 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                     kernel_8x1(unroll_m, 1, isLoad1Unmasked, isLoad2Unmasked,
                             true, false);
                 }
-                jmp(".subloop99", T_NEAR);
+                jmp(subloop99, T_NEAR);
                 align(16);
 
-                L(".subloop32");
+                L(subloop32);
                 cmp(I, 2);
-                jne(".subloop33", T_NEAR);
+                jne(subloop33, T_NEAR);
                 if (unroll_m == 16) {
                     kernel_16x2(unroll_m, 2, isLoad1Unmasked, isLoad2Unmasked,
                             true, false);
@@ -2082,12 +2085,12 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                     kernel_8x2(unroll_m, 2, isLoad1Unmasked, isLoad2Unmasked,
                             true, false);
                 }
-                jmp(".subloop99", T_NEAR);
+                jmp(subloop99, T_NEAR);
                 align(16);
 
-                L(".subloop33");
+                L(subloop33);
                 cmp(I, 3);
-                jne(".subloop34", T_NEAR);
+                jne(subloop34, T_NEAR);
                 if (unroll_m == 16) {
                     kernel_16x3(unroll_m, 3, isLoad1Unmasked, isLoad2Unmasked,
                             true, false);
@@ -2095,12 +2098,12 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                     kernel_8x3(unroll_m, 3, isLoad1Unmasked, isLoad2Unmasked,
                             true, false);
                 }
-                jmp(".subloop99", T_NEAR);
+                jmp(subloop99, T_NEAR);
                 align(16);
 
-                L(".subloop34");
+                L(subloop34);
                 cmp(I, 4);
-                jne(".subloop35", T_NEAR);
+                jne(subloop35, T_NEAR);
                 if (unroll_m == 16) {
                     kernel_16x4(unroll_m, 4, isLoad1Unmasked, isLoad2Unmasked,
                             true, false);
@@ -2108,12 +2111,12 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                     kernel_8x4(unroll_m, 4, isLoad1Unmasked, isLoad2Unmasked,
                             true, false);
                 }
-                jmp(".subloop99", T_NEAR);
+                jmp(subloop99, T_NEAR);
                 align(16);
 
-                L(".subloop35");
+                L(subloop35);
                 cmp(I, 5);
-                jne(".subloop99", T_NEAR);
+                jne(subloop99, T_NEAR);
                 if (unroll_m == 16) {
                     kernel_16x5(unroll_m, 5, isLoad1Unmasked, isLoad2Unmasked,
                             true, false);
@@ -2124,7 +2127,7 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
                 align(16);
             }
 
-            L(".subloop99");
+            L(subloop99);
             // Compute address for A
             if (!isTransA) {
                 add(A, unroll_m * SIZE);
@@ -2138,14 +2141,12 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
             if (hasBias) {
                 add(BIAS, unroll_m * SIZE);
             }
-
-            outLocalLabel();
         };
 
-        inLocalLabel();
-
         preamble();
 
+        Label buffer_in_ws, buffer_allocated;
+
         // Get the registers
         mov(B, ARG_B);
         mov(LDB, ARG_LDB);
@@ -2165,7 +2166,7 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
 #endif
 
         cmp(K, STACK_K_CAPACITY);
-        jg(".buffer_in_ws", T_NEAR);
+        jg(buffer_in_ws, T_NEAR);
 
         // Create buffer and align to 4kB page
         lea(rax, ptr[K * SIZE]);
@@ -2173,12 +2174,12 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
         add(rax, 256);
         sub(rsp, rax);
         and_(rsp, -PAGE_4K);
-        jmp(".buffer_allocated", T_NEAR);
+        jmp(buffer_allocated, T_NEAR);
 
-        L(".buffer_in_ws");
+        L(buffer_in_ws);
         mov(rsp, ARG_WS);
 
-        L(".buffer_allocated");
+        L(buffer_allocated);
 
         mov(ORIG_SP, rbp);
         mov(M, ARG_M);
@@ -2218,43 +2219,45 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
         and_(rax, 0x1f);
         mov(FLAG, rax);
 
+        Label main0, main1, main2, main3, main999;
+
         cmp(M, UNROLL_M);
-        jl(".main0", T_NEAR);
+        jl(main0, T_NEAR);
         align(16);
 
-        L(".main1");
+        L(main1);
         subloop(UNROLL_M, true, true);
         sub(M, UNROLL_M);
         cmp(M, UNROLL_M);
-        jge(".main1", T_NEAR);
+        jge(main1, T_NEAR);
         align(16);
 
-        L(".main0");
+        L(main0);
         cmp(M, 0);
-        jle(".main999", T_NEAR);
+        jle(main999, T_NEAR);
 
         if (UNROLL_M > 8) {
             cmp(M, 8);
-            jle(".main2", T_NEAR);
+            jle(main2, T_NEAR);
 
             sub(M, 8);
             vbroadcastss(VMASK, M);
             vpcmpgtd(VMASK, VMASK, MASK);
 
             subloop(16, true, false);
-            jmp(".main999", T_NEAR);
+            jmp(main999, T_NEAR);
             align(16);
 
-            L(".main2");
+            L(main2);
             cmp(M, 8);
-            jne(".main3", T_NEAR);
+            jne(main3, T_NEAR);
             subloop(8, true, true);
-            jmp(".main999", T_NEAR);
+            jmp(main999, T_NEAR);
         }
 
         align(16);
 
-        L(".main3");
+        L(main3);
         vbroadcastss(VMASK, M);
         if (is_avx2) {
             vpcmpgtd(VMASK, VMASK, MASK);
@@ -2270,7 +2273,7 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
         subloop(8, false, false);
         align(16);
 
-        L(".main999");
+        L(main999);
         // Restore original stack
         mov(rax, ORIG_SP);
         mov(rsp, rax);
@@ -2278,8 +2281,6 @@ struct jit_avx_gemm_f32::xbyak_gemm : public jit_generator {
         vzeroupper();
         postamble();
 
-        outLocalLabel();
-
         ker_ = reinterpret_cast<decltype(ker_)>(
                 const_cast<uint8_t *>(this->getCode()));
     }
@@ -2335,7 +2336,7 @@ void jit_avx_gemm_f32::sgemm_nocopy_driver(const char *transa,
     int BM = 4032;
     int BN = isTransA ? 96 : 48;
     int BK = isTransB ? 96 : 256;
-    const float *curA, *curB, *curBias = NULL;
+    const float *curA, *curB, *curBias = nullptr;
     float *curC;
 
     for (Bk = 0; Bk < k; Bk += sizeK) {
@@ -2376,15 +2377,15 @@ void jit_avx_gemm_f32::sgemm_nocopy_driver(const char *transa,
                     curB = b + Bn + (size_t)Bk * ldb;
                 }
                 curC = c + Bm + (size_t)Bn * ldc;
-                if (bias != NULL) {
+                if (bias != nullptr) {
                     if (Bk == 0) {
                         curBias = bias + Bm;
                     } else {
-                        curBias = NULL;
+                        curBias = nullptr;
                     }
                 }
                 if (Bk == 0) {
-                    if (*beta == 0.0 && bias == NULL)
+                    if (*beta == 0.0 && bias == nullptr)
                         (*ker_b0_)((long long int)sizeM, (long long int)sizeN,
                                 (long long int)sizeK, alpha, curA,
                                 (long long int)lda, curB, (long long int)ldb,
@@ -2431,7 +2432,7 @@ void jit_avx_gemm_f32::sgemm(const char *transa, const char *transb,
     // Determine threading partitioning
     gemm_utils::calc_nthr_nocopy_avx(
             m, n, k, nthr, &nthr_m, &nthr_n, &nthr_k, &MB, &NB, &KB);
-    assert(utils::implication(!mkldnn_thr_syncable(), nthr_k == 1));
+    assert(IMPLICATION(!mkldnn_thr_syncable(), nthr_k == 1));
 
     // May not happen, but just in case
     if (nthr < nthr_m * nthr_n * nthr_k)
@@ -2439,13 +2440,19 @@ void jit_avx_gemm_f32::sgemm(const char *transa, const char *transb,
 
     nthr_mn = nthr_m * nthr_n;
 
-    unsigned int volatile *ompstatus = (unsigned int volatile *)ompstatus_;
-    if (!ompstatus) return;
+    unsigned char * ompstatus_ = nullptr;
+    unsigned char volatile *ompstatus = nullptr;
 
-    float *c_buffers = NULL;
-    float *ws_buffers = NULL;
+    float *c_buffers = nullptr;
+    float *ws_buffers = nullptr;
 
     if (nthr_k > 1) {
+        ompstatus_ = (unsigned char *) malloc(
+                nthr * CACHE_LINE_SIZE,
+                CACHE_LINE_SIZE);
+        ompstatus = (unsigned char volatile *) ompstatus_;
+        assert(ompstatus);
+
         for (int i = 0; i < nthr; i++)
             ompstatus[i * CACHE_LINE_SIZE] = 0;
 
@@ -2466,7 +2473,7 @@ void jit_avx_gemm_f32::sgemm(const char *transa, const char *transb,
         int n_from, n_to, myN;
         int k_from, k_to, myK;
         int cbase, ibase;
-        const float *myA, *myB, *myBias = NULL;
+        const float *myA, *myB, *myBias = nullptr;
         float *myC = C, myBeta;
         float *ws = ws_buffers ?
                 ws_buffers + ithr * ws_size_per_thr / sizeof(float) : 0;
@@ -2528,7 +2535,7 @@ void jit_avx_gemm_f32::sgemm(const char *transa, const char *transb,
                     myC = c_buffers + MB * NB * (cbase + ithr_k - 1);
                     myBeta = 0.0;
                     ld = MB;
-                    myBias = NULL;
+                    myBias = nullptr;
                 }
 
                 sgemm_nocopy_driver(transa, transb, myM, myN, myK, p_alpha, myA,
@@ -2575,8 +2582,8 @@ void jit_avx_gemm_f32::sgemm(const char *transa, const char *transb,
         }
     });
 
-    if (nthr_k > 1)
-        free(c_buffers);
+    free(c_buffers);
+    free(ompstatus_);
     free(ws_buffers);
 }
 
@@ -2602,9 +2609,6 @@ jit_avx_gemm_f32::jit_avx_gemm_f32(
         ker_b0_ = ker_bn_;
     }
     nthrs_ = mkldnn_get_max_threads();
-    ompstatus_ = (unsigned int *)malloc(
-        sizeof(unsigned int *) * nthrs_ * CACHE_LINE_SIZE, 64);
-    assert(ompstatus_);
 }
 
 jit_avx_gemm_f32::~jit_avx_gemm_f32()
@@ -2614,7 +2618,6 @@ jit_avx_gemm_f32::~jit_avx_gemm_f32()
         delete ker_b1_;
     if (beta_ != 0.0 || (beta_ == 0.0 && hasBias_))
         delete ker_b0_;
-    free(ompstatus_);
 }
 
 }
index 0f0cc46..dd34e09 100644 (file)
@@ -49,7 +49,6 @@ private:
     bool hasBias_;
     struct xbyak_gemm;
     xbyak_gemm *ker_bn_, *ker_b1_, *ker_b0_;
-    unsigned int *ompstatus_;
     int nthrs_;
 };
 }
index 3310bf5..e0331e0 100644 (file)
@@ -27,63 +27,68 @@ namespace impl {
 namespace cpu {
 
 using namespace mkldnn::impl::utils;
+using namespace gemm_utils;
 
-constexpr int unroll_m = 16;
-constexpr int unroll_n = 6;
+
+template <typename data_t>
 static void copy_A(
-        bool isTransA, int K, const float *A, const int lda, float *ws) {
+        bool isTransA, int K, const data_t *A, const int lda, data_t *ws) {
     for (int k = 0; k < K; k++) {
         PRAGMA_OMP_SIMD()
-        for (int i = 0; i < unroll_m; i++) {
+        for (int i = 0; i < gemm_utils::unroll_factor<data_t>::m; i++) {
             ws[i] = isTransA ? A[i * lda + k] : A[i + k * lda];
         }
-        ws += unroll_m;
+        ws += unroll_factor<data_t>::m;
     }
 }
 
-template <bool isTransA, bool isTransB>
-static void kernel_mxn(int K, const float *A, const int lda,
-        const float *B, const int ldb, float *C, const int ldc,
-        const float alpha, const float beta) {
-    float c[unroll_m * unroll_n] = { 0. };
+template <typename data_t, bool isTransA, bool isTransB>
+static void kernel_mxn(int K, const data_t *A, const int lda,
+        const data_t *B, const int ldb, data_t *C, const int ldc,
+        const data_t alpha, const data_t beta) {
+    data_t c[unroll_factor<data_t>::m * unroll_factor<data_t>::n] =
+        { static_cast<data_t>(0.) };
     for (int k = 0; k < K; k++) {
-        for (int j = 0; j < unroll_n; j++) {
-            float b = isTransB ? B[j + k * ldb] : B[k + j * ldb];
+        for (int j = 0; j < unroll_factor<data_t>::n; j++) {
+            data_t b = isTransB ? B[j + k * ldb] : B[k + j * ldb];
             PRAGMA_OMP_SIMD()
-            for (int i = 0; i < unroll_m; i++) {
-                float a = isTransA ? A[i * lda + k] : A[i + lda * k];
-                c[i + unroll_m * j] += a * b;
+            for (int i = 0; i < unroll_factor<data_t>::m; i++) {
+                data_t a = isTransA ? A[i * lda + k] : A[i + lda * k];
+                c[i + unroll_factor<data_t>::m * j] += a * b;
             }
         }
     }
-    for (int j = 0; j < unroll_n; j++) {
+    for (int j = 0; j < unroll_factor<data_t>::n; j++) {
         PRAGMA_OMP_SIMD()
-        for (int i = 0; i < unroll_m; i++) {
-            C[i + j * ldc] = (beta == 0.0f)
-            ? alpha * c[i + unroll_m * j]
-            : alpha * c[i + unroll_m * j] + beta * C[i + j * ldc];
+        for (int i = 0; i < unroll_factor<data_t>::m; i++) {
+            C[i + j * ldc] = (beta == static_cast<data_t>(0.))
+            ? alpha * c[i + unroll_factor<data_t>::m * j]
+            : alpha * c[i + unroll_factor<data_t>::m * j]
+                + beta * C[i + j * ldc];
         }
     }
 }
 
-template <bool isTransA, bool isTransB>
+template <typename data_t, bool isTransA, bool isTransB>
 static void block_ker(const int M, const int N, const int K,
-        const float *A, const int lda, const float *B, const int ldb, float *C,
-        const int ldc, const float alpha, const float beta, float *ws,
-        bool do_copy) {
-    int Nu = rnd_dn(N, unroll_n), Mu = rnd_dn(M, unroll_m);
-    for (int i = 0; i < Mu; i += unroll_m) {
-        for (int j = 0; j < Nu; j += unroll_n) {
-            const float *b = isTransB ? &B[j] : &B[j * ldb];
-            const float *a = isTransA ? &A[i * lda] : &A[i];
+        const data_t *A, const int lda, const data_t *B, const int ldb,
+        data_t *C, const int ldc, const data_t alpha, const data_t beta,
+        data_t *ws, bool do_copy) {
+    int Nu = rnd_dn(N, unroll_factor<data_t>::n);
+    int Mu = rnd_dn(M, unroll_factor<data_t>::m);
+    for (int i = 0; i < Mu; i += unroll_factor<data_t>::m) {
+        for (int j = 0; j < Nu; j += unroll_factor<data_t>::n) {
+            const data_t *b = isTransB ? &B[j] : &B[j * ldb];
+            const data_t *a = isTransA ? &A[i * lda] : &A[i];
             if (do_copy) {
                 if (j == 0) {
-                    copy_A(isTransA, K, a, lda, ws);
+                    copy_A<data_t>(isTransA, K, a, lda, ws);
                 }
-                kernel_mxn<false, isTransB>(
-                        K, ws, unroll_m, b, ldb, &C[i + j * ldc], ldc, alpha, beta);
+                kernel_mxn<data_t, false, isTransB>(
+                        K, ws, unroll_factor<data_t>::m, b, ldb,
+                        &C[i + j * ldc], ldc, alpha, beta);
             } else {
-                kernel_mxn<isTransA, isTransB>(
+                kernel_mxn<data_t, isTransA, isTransB>(
                         K, a, lda, b, ldb, &C[i + j * ldc], ldc, alpha, beta);
             }
         }
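
Note that the post-copy dispatch hard-codes isTransA = false: copy_A always writes the packed panel ws in untransposed (i + m * k) layout whatever the source layout was, so the inner kernel indexes it uniformly. A standalone sketch of that invariant (illustrative sizes):

    #include <cstdio>

    int main() {
        const int m = 4, K = 3, lda = 5;
        float A[lda * m];                // transposed source: (i, k) at A[i*lda + k]
        for (int i = 0; i < m; i++)
            for (int k = 0; k < K; k++)
                A[i * lda + k] = i + 10.f * k;
        float ws[m * K];                 // packed panel: always no-trans layout
        for (int k = 0; k < K; k++)      // what copy_A(isTransA = true) does
            for (int i = 0; i < m; i++)
                ws[i + m * k] = A[i * lda + k];
        printf("%g %g\n", ws[1 + m * 2], A[1 * lda + 2]);  // both print 21
        return 0;
    }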
@@ -91,10 +96,12 @@ static void block_ker(const int M, const int N, const int K,
     // tail processing
     for (int i = 0; i < M; i++) {
         for (int j = Nu; j < N; j++) {
-            float c = beta == 0.0f ? 0.0f : beta * C[i + j * ldc];
+            data_t c = beta == static_cast<data_t>(0.)
+                ? static_cast<data_t>(0.)
+                : beta * C[i + j * ldc];
             for (int p = 0; p < K; p++) {
-                float b = isTransB ? B[j + p * ldb] : B[p + j * ldb];
-                float a = isTransA ? A[p + i * lda] : A[i + p * lda];
+                data_t b = isTransB ? B[j + p * ldb] : B[p + j * ldb];
+                data_t a = isTransA ? A[p + i * lda] : A[i + p * lda];
                 c += alpha * a * b;
             }
             C[i + j * ldc] = c;
@@ -102,10 +109,12 @@ static void block_ker(const int M, const int N, const int K,
     }
     for (int i = Mu; i < M; i++) {
         for (int j = 0; j < Nu; j++) {
-            float c = beta == 0.0f ? 0.0f : beta * C[i + j * ldc];
+            data_t c = beta == static_cast<data_t>(0.)
+                ? static_cast<data_t>(0.)
+                : beta * C[i + j * ldc];
             for (int p = 0; p < K; p++) {
-                float b = isTransB ? B[j + p * ldb] : B[p + j * ldb];
-                float a = isTransA ? A[p + i * lda] : A[i + p * lda];
+                data_t b = isTransB ? B[j + p * ldb] : B[p + j * ldb];
+                data_t a = isTransA ? A[p + i * lda] : A[i + p * lda];
                 c += alpha * a * b;
             }
             C[i + j * ldc] = c;
@@ -113,25 +122,28 @@ static void block_ker(const int M, const int N, const int K,
     }
 }
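Everything below Mu and Nu goes through the unrolled path; the scalar tail loops above mop up the remainder. As a reminder, rnd_dn is plain integer round-down (sketch, assuming the usual definition):

    // Round a down to the nearest multiple of b, e.g. rnd_dn(37, 8) == 32.
    static inline int rnd_dn(int a, int b) { return a / b * b; }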
 
-template <bool isTransA, bool isTransB>
-void gemm_ithr(const int M, const int N, const int K, const float alpha,
-        const float *A, const int lda, const float *B, const int ldb,
-        const float beta, float *C, const int ldc, bool do_copy, float *ws) {
-    int BM = 4032;
-    int BN = isTransA ? 96 : 48;
-    int BK = isTransB ? 96 : 256;
-    const float *curA, *curB;
-    float *curC;
+template <typename data_t, bool isTransA, bool isTransB>
+void gemm_ithr(const int M, const int N, const int K, const data_t alpha,
+        const data_t *A, const int lda, const data_t *B, const int ldb,
+        const data_t beta, data_t *C, const int ldc, bool do_copy, data_t *ws) {
+    constexpr int BM = gemm_traits<data_t, isTransA, isTransB>::BM;
+    constexpr int BN = gemm_traits<data_t, isTransA, isTransB>::BN;
+    constexpr int BK = gemm_traits<data_t, isTransA, isTransB>::BK;
+
+    const data_t *curA;
+    const data_t *curB;
+    data_t *curC;
 
     if ((M <= 0) || (N <= 0))
         return;
 
-    if ((K <= 0) || (alpha == 0.0f)) {
-        if (beta == 0.0f) {
-            for (int j = 0; j < N * M; j++)
-                C[j] = 0.0f;
-        } else if (beta != 1.0f) {
-            for (int j = 0; j < N * M; j++)
+    if ((K <= 0) || (alpha == static_cast<data_t>(0))) {
+        ptrdiff_t MN = (ptrdiff_t)N * M;
+        if (beta == static_cast<data_t>(0.)) {
+            for (ptrdiff_t j = 0; j < MN; j++)
+                C[j] = static_cast<data_t>(0.);
+        } else if (beta != static_cast<data_t>(1.)) {
+            for (ptrdiff_t j = 0; j < MN; j++)
                 C[j] *= beta;
         }
         return;
@@ -147,25 +159,27 @@ void gemm_ithr(const int M, const int N, const int K, const float alpha,
                 curB = isTransB ? B + Bn + Bk * ldb : B + Bk + Bn * ldb;
                 curC = C + Bm + Bn * ldc;
                 if (Bk == 0) {
-                    block_ker<isTransA, isTransB>(mb, nb, kb, curA, lda, curB,
-                            ldb, curC, ldc, alpha, beta, ws, do_copy);
+                    block_ker<data_t, isTransA, isTransB>(mb, nb, kb, curA, lda,
+                        curB, ldb, curC, ldc, alpha, beta, ws, do_copy);
                 } else {
-                    block_ker<isTransA, isTransB>(mb, nb, kb, curA, lda, curB,
-                            ldb, curC, ldc, alpha, 1.0f, ws, do_copy);
+                    block_ker<data_t, isTransA, isTransB>(mb, nb, kb, curA, lda,
+                        curB, ldb, curC, ldc, alpha, static_cast<data_t>(1.0),
+                        ws, do_copy);
                 }
             }
         }
     }
 }
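The hard-coded float blocking sizes (BM = 4032, BN = 96/48, BK = 96/256) move into a gemm_traits class keyed on data type and transposition. A hedged sketch of such a traits class, reusing the old float numbers and assuming a halved M-block for double (the real tuned values live in the library headers):

    template <typename data_t, bool isTransA, bool isTransB>
    struct gemm_traits {};

    template <bool isTransA, bool isTransB>
    struct gemm_traits<float, isTransA, isTransB> {
        static constexpr int BM = 4032;
        static constexpr int BN = isTransA ? 96 : 48;
        static constexpr int BK = isTransB ? 96 : 256;
    };

    template <bool isTransA, bool isTransB>
    struct gemm_traits<double, isTransA, isTransB> {
        static constexpr int BM = 2016; // assumption: half the float M-block
        static constexpr int BN = isTransA ? 96 : 48;
        static constexpr int BK = isTransB ? 96 : 256;
    };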
 
+template <typename data_t>
 void ref_gemm(const char *transa_, const char *transb_, const int *M_,
-        const int *N_, const int *K_, const float *alpha_, const float *A,
-        const int *lda_, const float *B, const int *ldb_, const float *beta_,
-        float *C, const int *ldc_, const float *bias) {
+        const int *N_, const int *K_, const data_t *alpha_, const data_t *A,
+        const int *lda_, const data_t *B, const int *ldb_, const data_t *beta_,
+        data_t *C, const int *ldc_, const data_t *bias) {
     bool isTransA = (*transa_ == 'T' || *transa_ == 't');
     bool isTransB = (*transb_ == 'T' || *transb_ == 't');
     const int M = *M_, N = *N_, K = *K_, lda = *lda_, ldb = *ldb_, ldc = *ldc_;
-    const float alpha = *alpha_, beta = *beta_;
+    const data_t alpha = *alpha_, beta = *beta_;
 
     int max_nthr = mkldnn_in_parallel() ? 1 : mkldnn_get_max_threads();
     int nthr_m, nthr_n, nthr_k;
@@ -173,26 +187,27 @@ void ref_gemm(const char *transa_, const char *transb_, const int *M_,
     // thread balancing over M, N, K & size of blocking dimensions
     gemm_utils::calc_nthr_nocopy_avx(
             M, N, K, max_nthr, &nthr_m, &nthr_n, &nthr_k, &MB, &NB, &KB);
-    assert(utils::implication(!mkldnn_thr_syncable(), nthr_k == 1));
+    assert(IMPLICATION(!mkldnn_thr_syncable(), nthr_k == 1));
 
-    float *c_buffers = nullptr, *ws_buffers = nullptr;
+    data_t *c_buffers = nullptr;
+    data_t *ws_buffers = nullptr;
     if (nthr_k > 1) {
-        c_buffers = (float *)malloc(nthr_m * nthr_n * (nthr_k - 1) * MB * NB
-                * sizeof(float), PAGE_4K);
+        c_buffers = (data_t *)malloc(nthr_m * nthr_n * (nthr_k - 1) * MB * NB
+                * sizeof(data_t), PAGE_4K);
         if (!c_buffers) {
             nthr_k = 1;
             KB = K;
         }
     }
 
-    bool do_copy = (NB / unroll_n > 3);
+    bool do_copy = (NB / unroll_factor<data_t>::n > 3);
     const int nthr_mn = nthr_m * nthr_n;
     const int nthr = nthr_mn * nthr_k;
-    const size_t ws_elems_per_thr = K * unroll_m;
+    const size_t ws_elems_per_thr = K * unroll_factor<data_t>::m;
     const size_t ws_size_per_thr
-            = utils::rnd_up(ws_elems_per_thr * sizeof(float), PAGE_4K);
+            = utils::rnd_up(ws_elems_per_thr * sizeof(data_t), PAGE_4K);
     if (do_copy) {
-        ws_buffers = (float *)malloc(nthr * ws_size_per_thr, PAGE_4K);
+        ws_buffers = (data_t *)malloc(nthr * ws_size_per_thr, PAGE_4K);
         if (!ws_buffers)
             do_copy = false;
     }
@@ -205,8 +220,8 @@ void ref_gemm(const char *transa_, const char *transb_, const int *M_,
 
         int cbase = (ithr_m + nthr_m * ithr_n) * (nthr_k - 1);
 
-        float *ws = do_copy
-                ? ws_buffers + ithr * ws_size_per_thr / sizeof(float)
+        data_t *ws = do_copy
+                ? ws_buffers + ithr * ws_size_per_thr / sizeof(data_t)
                 : nullptr;
 
         int m_from = 0, m_to = 0, myM = 0, n_from = 0, n_to = 0, myN = 0,
@@ -224,7 +239,7 @@ void ref_gemm(const char *transa_, const char *transb_, const int *M_,
         get_thr_block(k_from, k_to, myK, KB, K, ithr_k);
 
         if (myM > 0 && myN > 0) {
-            float myBeta, *myC;
+            data_t myBeta, *myC;
             int ld;
             if (ithr_k == 0) {
                 myC = &(C[m_from + n_from * ldc]);
@@ -235,28 +250,28 @@ void ref_gemm(const char *transa_, const char *transb_, const int *M_,
                 myBeta = 0.0f;
                 ld = MB;
             }
-            const float *myA = isTransA
+            const data_t *myA = isTransA
                     ? &(A[k_from + m_from * lda])
                     : &(A[m_from + k_from * lda]);
-            const float *myB = isTransB
+            const data_t *myB = isTransB
                     ? &(B[n_from + k_from * ldb])
                     : &(B[k_from + n_from * ldb]);
 
             if (!isTransA) {
                 if (!isTransB) {
-                    gemm_ithr<false, false>(myM, myN, myK, alpha, myA, lda, myB,
-                            ldb, myBeta, myC, ld, do_copy, ws);
+                    gemm_ithr<data_t, false, false>(myM, myN, myK, alpha, myA,
+                        lda, myB, ldb, myBeta, myC, ld, do_copy, ws);
                 } else {
-                    gemm_ithr<false, true>(myM, myN, myK, alpha, myA, lda, myB,
-                            ldb, myBeta, myC, ld, do_copy, ws);
+                    gemm_ithr<data_t, false, true>(myM, myN, myK, alpha, myA,
+                        lda, myB, ldb, myBeta, myC, ld, do_copy, ws);
                 }
             } else {
                 if (!isTransB) {
-                    gemm_ithr<true, false>(myM, myN, myK, alpha, myA, lda, myB,
-                            ldb, myBeta, myC, ld, do_copy, ws);
+                    gemm_ithr<data_t, true, false>(myM, myN, myK, alpha, myA,
+                        lda, myB, ldb, myBeta, myC, ld, do_copy, ws);
                 } else {
-                    gemm_ithr<true, true>(myM, myN, myK, alpha, myA, lda, myB,
-                            ldb, myBeta, myC, ld, do_copy, ws);
+                    gemm_ithr<data_t, true, true>(myM, myN, myK, alpha, myA,
+                        lda, myB, ldb, myBeta, myC, ld, do_copy, ws);
                 }
             }
         }
@@ -270,7 +285,8 @@ void ref_gemm(const char *transa_, const char *transb_, const int *M_,
             gemm_utils::partition_unit_diff(ithr_k, nthr_k, myN, &offset,
                     &block);
             for (int ik = 1; ik < nthr_k; ++ik) {
-                float *myC = c_buffers + MB * (NB * (cbase + ik - 1) + offset);
+                data_t *myC = c_buffers + MB * (NB * (cbase + ik - 1) + offset);
+
                 gemm_utils::sum_two_matrices(myM, block, myC, MB,
                         &C[m_from + (n_from + offset) * ldc], ldc);
             }
@@ -286,6 +302,16 @@ void ref_gemm(const char *transa_, const char *transb_, const int *M_,
     free(ws_buffers);
     free(c_buffers);
 }
+
+template void ref_gemm<float>(const char *transa_, const char *transb_,
+        const int *M_, const int *N_, const int *K_, const float *alpha_,
+        const float *A, const int *lda_, const float *B, const int *ldb_,
+        const float *beta_, float *C, const int *ldc_, const float *bias);
+
+template void ref_gemm<double>(const char *transa_, const char *transb_,
+        const int *M_, const int *N_, const int *K_, const double *alpha_,
+        const double *A, const int *lda_, const double *B, const int *ldb_,
+        const double *beta_, double *C, const int *ldc_, const double *bias);
 }
 }
 }
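With the two explicit instantiations above, callers can drive the reference GEMM in either precision through the same BLAS-style pointer interface. Usage sketch (assumes <vector> and the ref_gemm declaration are in scope):

    // C = alpha * A * B + beta * C, column-major, arguments passed by pointer.
    const int M = 64, N = 64, K = 64;
    const double alpha = 1.0, beta = 0.0;
    std::vector<double> A(M * K), B(K * N), C(M * N);
    ref_gemm<double>("N", "N", &M, &N, &K, &alpha, A.data(), &M,
            B.data(), &K, &beta, C.data(), &M, /*bias=*/nullptr);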
index 591f46d..c403e45 100644
@@ -185,9 +185,6 @@ void gemm_convolution_bwd_data_t::execute_backward_data() {
     const int LDC = jcp.im2col_sz ? m : M;
     data_t *col = jcp.im2col_sz ? (data_t *)this->scratchpad_->get() : nullptr;
 
-    parallel_nd(jcp.im2col_sz * jcp.nthr,
-            [&](ptrdiff_t i) { col[i] = (data_t)0; });
-
     const size_t work_amount = (size_t)jcp.ngroups * MB;
 
     if (jcp.id > 1) {
@@ -266,14 +263,14 @@ void gemm_convolution_bwd_weights_t::execute_backward_weights() {
         jit_gemm_convolution_utils::bwd_weights_balance(ithr, nthr, jcp.ngroups,
                 mb_for_balance, ithr_g, nthr_g, ithr_mb, nthr_mb);
 
-        assert(utils::implication(!jcp.need_wei_reduction, nthr_mb == 1));
+        assert(IMPLICATION(!jcp.need_wei_reduction, nthr_mb == 1));
         const int need_reduction = nthr_mb != 1;
 
         if (ithr_g != -1 && ithr_mb != -1) {
             balance211((size_t)jcp.ngroups, nthr_g, ithr_g, g_start, g_end);
             balance211((size_t)jcp.mb, nthr_mb, ithr_mb, mb_start, mb_end);
 
-            assert(implication((g_end - g_start) > 1, need_reduction == 0));
+            assert(IMPLICATION((g_end - g_start) > 1, need_reduction == 0));
 
             data_t *_col = col + (ptrdiff_t)ithr * jcp.im2col_sz;
             data_t *weights_reduce_base = wei_reduction
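Throughout the patch, utils::implication(a, b) calls become the IMPLICATION(a, b) macro. Both encode "if a holds then b must hold"; a sketch of the expected macro definition (assumed, following standard propositional logic):

    // Logical implication: false only when cause holds and effect does not.
    #define IMPLICATION(cause, effect) (!(cause) || (effect))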
index 31c3947..d0d65c1 100644
@@ -46,14 +46,17 @@ struct _gemm_convolution_fwd_t: public cpu_primitive_t {
         inline memory_format_t src_format()
         {
             using namespace memory_format;
-            return (this->cdesc_().src_desc.ndims == 4) ? nchw : ncdhw;
+            return (utils::pick(this->cdesc_().src_desc.ndims - 3,
+                ncw, nchw, ncdhw));
         }
         inline memory_format_t wei_format()
         {
             using namespace memory_format;
-            return (this->cdesc_().src_desc.ndims == 4)
-                ? this->with_groups() ? goihw : oihw
-                : this->with_groups() ? goidhw : oidhw;
+            return (this->with_groups()
+                ? utils::pick(this->cdesc_().src_desc.ndims - 3,
+                    goiw, goihw, goidhw)
+                : utils::pick(this->cdesc_().src_desc.ndims - 3,
+                    oiw, oihw, oidhw));
         }
 
         virtual status_t init() override {
@@ -72,7 +75,7 @@ struct _gemm_convolution_fwd_t: public cpu_primitive_t {
                            this->cdesc_().src_desc.data_type,
                            this->cdesc_().weights_desc.data_type,
                            this->cdesc_().dst_desc.data_type)
-                && utils::implication(this->with_bias(), data_type::f32
+                && IMPLICATION(this->with_bias(), data_type::f32
                                    == this->cdesc_().bias_desc.data_type)
                 && this->src_pd_.desc()->format == src_format()
                 && this->dst_pd_.desc()->format == src_format()
@@ -229,14 +232,17 @@ struct gemm_convolution_bwd_data_t: public cpu_primitive_t {
         inline memory_format_t src_format()
         {
             using namespace memory_format;
-            return (this->desc()->diff_src_desc.ndims == 4) ? nchw : ncdhw;
+            return (utils::pick(this->desc()->diff_src_desc.ndims - 3,
+                ncw, nchw, ncdhw));
         }
         inline memory_format_t wei_format()
         {
             using namespace memory_format;
-            return (this->desc()->diff_src_desc.ndims == 4)
-                ? this->with_groups() ? goihw : oihw
-                : this->with_groups() ? goidhw : oidhw;
+            return (this->with_groups()
+                ? utils::pick(this->desc()->diff_src_desc.ndims - 3,
+                    goiw, goihw, goidhw)
+                : utils::pick(this->desc()->diff_src_desc.ndims - 3,
+                    oiw, oihw, oidhw));
         }
 
         virtual status_t init() override {
@@ -329,14 +335,17 @@ struct gemm_convolution_bwd_weights_t: public cpu_primitive_t {
         inline memory_format_t src_format()
         {
             using namespace memory_format;
-            return (this->desc()->src_desc.ndims == 4) ? nchw : ncdhw;
+            return (utils::pick(this->desc()->src_desc.ndims - 3,
+                ncw, nchw, ncdhw));
         }
         inline memory_format_t wei_format()
         {
             using namespace memory_format;
-            return (this->desc()->src_desc.ndims == 4)
-                ? this->with_groups() ? goihw : oihw
-                : this->with_groups() ? goidhw : oidhw;
+            return (this->with_groups()
+                ? utils::pick(this->desc()->src_desc.ndims - 3,
+                    goiw, goihw, goidhw)
+                : utils::pick(this->desc()->src_desc.ndims - 3,
+                    oiw, oihw, oidhw));
         }
 
         virtual status_t init() override {
@@ -354,7 +363,7 @@ struct gemm_convolution_bwd_weights_t: public cpu_primitive_t {
                     this->desc()->src_desc.data_type,
                     this->desc()->diff_weights_desc.data_type,
                     this->desc()->diff_dst_desc.data_type)
-            && utils::implication(this->with_bias(),
+            && IMPLICATION(this->with_bias(),
                     data_type::f32 == this->desc()->diff_bias_desc.data_type)
             && this->src_pd_.desc()->format == src_format()
             && this->diff_dst_pd_.desc()->format == src_format()
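Format selection switches from ternaries on ndims == 4 to utils::pick, which indexes a candidate list; ndims - 3 maps 1D/2D/3D convolutions to positions 0/1/2. A minimal variadic sketch of a pick-style helper (the library's own signature may differ):

    #include <cstddef>

    // pick(1, a, b, c) returns b; i must be within range.
    template <typename T>
    T pick(std::size_t i, T x) { (void)i; return x; }

    template <typename T, typename... Rest>
    T pick(std::size_t i, T x, Rest... rest) {
        return i == 0 ? x : pick(i - 1, rest...);
    }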
index 6c8a4da..80dfe9f 100644
@@ -20,6 +20,7 @@
 #include "type_helpers.hpp"
 #include "mkldnn_thread.hpp"
 #include "utils.hpp"
+#include "cpu_isa_traits.hpp"
 
 #include "gemm_convolution_utils.hpp"
 
@@ -163,7 +164,8 @@ void im2col(jit_gemm_conv_conf_t &jcp, const float *im, float *col) {
 }
 
 /* col[oh][ow][kh][kw][ic] <-- im2col_u8(im[ih][iw][ic]) */
-void im2col_u8(jit_gemm_conv_conf_t &jcp, const uint8_t *im, uint8_t *col) {
+template <typename T>
+void im2col_u8(jit_gemm_conv_conf_t &jcp, const T *im, uint8_t *col) {
     parallel_nd(jcp.oh, jcp.ow, [&](int oh, int ow) {
             for (int kh = 0; kh < jcp.kh; ++kh) {
                 const int ih = oh * jcp.stride_h
@@ -181,13 +183,19 @@ void im2col_u8(jit_gemm_conv_conf_t &jcp, const uint8_t *im, uint8_t *col) {
                         = (ih * jcp.iw + iw) * jcp.ngroups * jcp.ic;
                     PRAGMA_OMP_SIMD()
                     for (int ic = 0; ic < jcp.ic; ++ic) {
-                        col[col_idx + ic] = im[im_idx + ic];
+                        col[col_idx + ic] = jcp.signed_input
+                                ? im[im_idx + ic] + 128
+                                : im[im_idx + ic];
                     }
                 }
             }
         }
     );
 }
+template void im2col_u8<int8_t>(
+        jit_gemm_conv_conf_t &jcp, const int8_t *im, uint8_t *col);
+template void im2col_u8<uint8_t>(
+        jit_gemm_conv_conf_t &jcp, const uint8_t *im, uint8_t *col);
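This templatization is what enables signed s8 activations: when jcp.signed_input is set, each value is shifted by +128 so it fits the unsigned u8 column buffer consumed by the s8u8s32 GEMM. In isolation the shift is just (sketch):

    #include <cstdint>

    // Map a signed activation into the unsigned domain expected by
    // cblas_gemm_s8u8s32: [-128, 127] -> [0, 255].
    inline uint8_t to_unsigned(int8_t v) {
        return static_cast<uint8_t>(static_cast<int>(v) + 128);
    }

The +128 bias this injects into every accumulator is cancelled later through the per-output-channel compensation term (see the CblasColOffset path in the convolution driver below).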
 
 /* im[ih][iw][ic] <-- col2im_s32(col[oh][ow][kh][kw][ic]) */
 void col2im_s32(jit_gemm_conv_conf_t &jcp, const int32_t *col, int32_t *im) {
@@ -323,34 +331,35 @@ void init_conf(
     const bool with_groups = weights_d.ndims() == src_d.ndims() + 1;
     jcp.prop_kind = cd.prop_kind;
     const int ndims = src_d.ndims();
+    const int is_1d = ndims == 3;
+    const int is_3d = ndims == 5;
 
     jcp.ngroups = with_groups ? weights_d.dims()[0] : 1;
     jcp.mb = src_d.dims()[0];
 
     jcp.oc = dst_d.dims()[1] / jcp.ngroups;
     jcp.ic = src_d.dims()[1] / jcp.ngroups;
-
-    jcp.id = (ndims == 4) ? 1 : src_d.dims()[2];
-    jcp.ih = src_d.dims()[ndims - 2];
+    jcp.id = is_3d ? src_d.dims()[2] : 1;
+    jcp.ih = is_1d ? 1 : src_d.dims()[ndims - 2];
     jcp.iw = src_d.dims()[ndims - 1];
-    jcp.od = (ndims == 4) ? 1 : dst_d.dims()[2];
-    jcp.oh = dst_d.dims()[ndims - 2];
+    jcp.od = is_3d ? dst_d.dims()[2] : 1;
+    jcp.oh = is_1d ? 1 : dst_d.dims()[ndims - 2];
     jcp.ow = dst_d.dims()[ndims - 1];
 
-    jcp.kd = (ndims == 4) ? 1 : weights_d.dims()[with_groups + 2];
-    jcp.kh = weights_d.dims()[with_groups + ndims - 2];
+    jcp.kd = is_3d ? weights_d.dims()[with_groups + 2] : 1;
+    jcp.kh = is_1d ? 1 : weights_d.dims()[with_groups + ndims - 2];
     jcp.kw = weights_d.dims()[with_groups + ndims - 1];
 
-    jcp.f_pad = (ndims == 4) ? 0 : cd.padding[0][0];
-    jcp.t_pad = cd.padding[0][ndims - 4];
+    jcp.f_pad = is_3d ? cd.padding[0][0] : 0;
+    jcp.t_pad = is_1d ? 0 : cd.padding[0][ndims - 4];
     jcp.l_pad = cd.padding[0][ndims - 3];
 
-    jcp.stride_d = (ndims == 4) ? 1 : cd.strides[0];
-    jcp.stride_h = cd.strides[ndims - 4];
+    jcp.stride_d = is_3d ? cd.strides[0] : 1;
+    jcp.stride_h = is_1d ? 1 : cd.strides[ndims - 4];
     jcp.stride_w = cd.strides[ndims - 3];
 
-    jcp.dilate_d = (ndims == 4) ? 0 : cd.dilates[0];
-    jcp.dilate_h = cd.dilates[ndims - 4];
+    jcp.dilate_d = is_3d ? cd.dilates[0] : 0;
+    jcp.dilate_h = is_1d ? 0 : cd.dilates[ndims - 4];
     jcp.dilate_w = cd.dilates[ndims - 3];
 
     jcp.src_fmt = src_d.format();
@@ -363,14 +372,22 @@ void init_conf(
     jcp.is = jcp.ih * jcp.iw;
     jcp.os = jcp.oh * jcp.ow;
     jcp.ks = jcp.kh * jcp.kw * jcp.kd;
-    jcp.im2col_sz = !(jcp.oh == jcp.ih && jcp.ow == jcp.iw
-                            && jcp.od == jcp.id && jcp.ks == 1)
+
+    jcp.signed_input = (src_d.data_type() == data_type::s8);
+    jcp.wei_adj_scale = (!jcp.signed_input || mayiuse(avx512_core_vnni))
+            ? 1.0f
+            : (1.0f / 2.0f);
+    jcp.im2col_sz = !everyone_is(true,
+            jcp.ow == jcp.iw, jcp.oh == jcp.ih, jcp.od == jcp.id,
+            jcp.stride_w == 1, jcp.stride_h == 1, jcp.stride_d == 1,
+            jcp.ks == 1, !jcp.signed_input)
         ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os
         : 0;
 
     bool do_outer_threading = false;
-    bool is_int8_conv = (cd.src_desc.data_type == u8
-            && cd.weights_desc.data_type == s8);
-    bool is_int8_conv
+    bool is_int8_conv
+            = (utils::one_of(cd.src_desc.data_type, u8, s8)
+                    && cd.weights_desc.data_type == s8);
     if (is_int8_conv) {
         bool is_depthwise =
                 utils::everyone_is(1, jcp.ic, jcp.oc) && jcp.ngroups != 1;
@@ -379,7 +396,7 @@ void init_conf(
     } else {
         if (utils::one_of(jcp.prop_kind, forward_training, forward_inference))
             do_outer_threading = jcp.os / max_threads < 512
-                && utils::implication(jcp.od == 1, (jcp.mb != 1 || jcp.ngroups > 2));
+                && IMPLICATION(jcp.od == 1, (jcp.mb != 1 || jcp.ngroups > 2));
         else if (jcp.prop_kind == backward_data)
             do_outer_threading = (jcp.mb != 1 || jcp.ngroups > 2);
         else //(jcp.prop_kind == backward_weights)
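The wei_adj_scale of 1/2 on pre-VNNI hardware guards against intermediate saturation: without the VNNI instructions, u8*s8 products are accumulated pairwise in 16-bit temporaries, and two full-range products overflow int16. Halving the weights at reorder time keeps the pairwise sum in range; the f32 result is rescaled afterwards. Hedged arithmetic sketch:

    // Worst case unscaled: 2 * 255 * 127 = 64770 > 32767 (int16 max).
    // With weights halved:  2 * 255 * 64 = 32640 <= 32767, so the 16-bit
    // intermediate cannot saturate; dividing by wei_adj_scale (= 0.5)
    // afterwards restores the original magnitude.
    float dequantize(int32_t acc, float scale, float wei_adj_scale) {
        return scale * static_cast<float>(acc) / wei_adj_scale;
    }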
index 538ff18..c2ebc45 100644
@@ -33,7 +33,9 @@ namespace jit_gemm_convolution_utils {
     void im2col_3d(jit_gemm_conv_conf_t &jcp, const float *im, float *col,
         int od);
     void im2col(jit_gemm_conv_conf_t &jcp, const float *im, float *col);
-    void im2col_u8(jit_gemm_conv_conf_t &jcp, const uint8_t *im, uint8_t *col);
+    template <typename T>
+    void im2col_u8(jit_gemm_conv_conf_t &jcp, const T *im, uint8_t *col);
+
     void col2im_s32(jit_gemm_conv_conf_t &jcp, const int32_t *col, int32_t *im);
     void col2im_3d(jit_gemm_conv_conf_t &jcp, const float *col, float *im,
         int od);
index 45f0015..6e7806e 100644
@@ -52,11 +52,11 @@ struct gemm_inner_product_fwd_t: public cpu_primitive_t {
                 && everyone_is(data_type, desc()->src_desc.data_type,
                         desc()->weights_desc.data_type,
                         desc()->dst_desc.data_type)
-                && implication(this->with_bias(),
+                && IMPLICATION(this->with_bias(),
                         data_type == desc()->bias_desc.data_type)
                 && attr()->output_scales_.has_default_values()
                 && attr()->post_ops_.len_ <= 1
-                && utils::implication(attr()->post_ops_.len_ == 1,
+                && IMPLICATION(attr()->post_ops_.len_ == 1,
                         attr()->post_ops_.entry_[0].is_relu(true, false))
                 && dense_gemm_consitency_check(src_pd(), weights_pd(),
                         dst_pd());
index b5584b2..a4163fe 100644
@@ -59,11 +59,11 @@ struct gemm_u8s8s32x_inner_product_fwd_t: public cpu_primitive_t {
                 && this->desc()->src_desc.data_type == u8
                 && this->desc()->dst_desc.data_type == dst_type
                 && this->desc()->weights_desc.data_type == s8
-                && utils::implication(this->with_bias(), utils::one_of(
+                && IMPLICATION(this->with_bias(), utils::one_of(
                             this->desc()->bias_desc.data_type, f32, s32, s8,
                             u8))
                 && attr()->post_ops_.len_ <= 1
-                && utils::implication(attr()->post_ops_.len_,
+                && IMPLICATION(attr()->post_ops_.len_,
                         attr()->post_ops_.entry_[0].is_relu(true, false))
                 && dense_gemm_consitency_check(src_pd(), weights_pd(),
                         dst_pd());
@@ -24,7 +24,7 @@
 
 #include "simple_q10n.hpp"
 
-#include "gemm_u8s8s32x_convolution.hpp"
+#include "gemm_x8s8s32x_convolution.hpp"
 
 namespace mkldnn {
 namespace impl {
@@ -33,8 +33,9 @@ namespace cpu {
 using namespace mkldnn::impl::utils;
 using namespace mkldnn::impl::math;
 
-template <bool with_relu, data_type_t dst_type>
-void _gemm_u8s8s32x_convolution_fwd_t<with_relu, dst_type>::execute_forward() {
+template <bool with_relu, data_type_t src_type, data_type_t dst_type>
+void _gemm_x8s8s32x_convolution_fwd_t<with_relu, src_type,
+        dst_type>::execute_forward() {
     auto src_base = reinterpret_cast<const src_data_t *>(this->input_memory(0));
     auto wei_base = reinterpret_cast<const wei_data_t *>(this->input_memory(1));
     auto bia_base = reinterpret_cast<const char *>(this->input_memory(2));
@@ -43,9 +44,10 @@ void _gemm_u8s8s32x_convolution_fwd_t<with_relu, dst_type>::execute_forward() {
     jit_gemm_conv_conf_t &jcp = this->conf_.jcp_;
 
     char *scratchpad = (char *)this->scratchpad_->get();
-    src_data_t *col = (src_data_t *)scratchpad;
-    parallel_nd(jcp.im2col_sz * jcp.nthr,
-            [&](ptrdiff_t i) { col[i] = (src_data_t)0; });
+    uint8_t *col = (uint8_t *)scratchpad;
+    parallel_nd(jcp.im2col_sz * jcp.nthr, [&](ptrdiff_t i) {
+        col[i] = jcp.signed_input ? (uint8_t)128 : (uint8_t)0;
+    });
 
     parallel(jcp.nthr, [&](const int ithr, const int nthr) {
         execute_forward_thr(ithr, nthr, src_base, wei_base, bia_base,
@@ -53,9 +55,9 @@ void _gemm_u8s8s32x_convolution_fwd_t<with_relu, dst_type>::execute_forward() {
     });
 }
 
-template <bool with_relu, data_type_t dst_type>
-void _gemm_u8s8s32x_convolution_fwd_t<with_relu, dst_type>
-::execute_forward_thr(const int ithr, const int nthr,
+template <bool with_relu, data_type_t src_type, data_type_t dst_type>
+void _gemm_x8s8s32x_convolution_fwd_t<with_relu, src_type,
+        dst_type>::execute_forward_thr(const int ithr, const int nthr,
         const src_data_t *src_base, const wei_data_t *wei_base,
         const char *bia_base, dst_data_t *dst_base, char *scratchpad) {
 #if USE_MKL_IGEMM
@@ -97,7 +99,7 @@ void _gemm_u8s8s32x_convolution_fwd_t<with_relu, dst_type>
         && scale_idx_mult == 0
         && jcp.ngroups == 1
         && !jcp.with_bias;
-    const float fast_path_alpha = scales[0];
+    const float fast_path_alpha = scales[0] / jcp.wei_adj_scale;
 
     const auto &post_ops = conf_.attr()->post_ops_;
     const bool do_sum = post_ops.contain(primitive_kind::sum, 0);
@@ -115,14 +117,16 @@ void _gemm_u8s8s32x_convolution_fwd_t<with_relu, dst_type>
     }
     const bool do_relu = jcp.with_relu || (entry_idx >= 0);
 
-    src_data_t *_col = (src_data_t *)scratchpad;
-    ptrdiff_t offset = (ptrdiff_t)jcp.im2col_sz
-                                   * sizeof(src_data_t) * jcp.nthr;
+    uint8_t *_col = (uint8_t *)scratchpad;
+    ptrdiff_t offset = (ptrdiff_t)jcp.im2col_sz * sizeof(uint8_t) * jcp.nthr;
     acc_data_t *_acc = (acc_data_t *)(scratchpad + offset);
 
-    src_data_t *col = _col + (ptrdiff_t)ithr * jcp.im2col_sz;
+    uint8_t *col = _col + (ptrdiff_t)ithr * jcp.im2col_sz;
     acc_data_t *acc = _acc + (ptrdiff_t)ithr * jcp.os * jcp.oc;
 
+    offset = (ptrdiff_t)jcp.ngroups * jcp.ks * jcp.ic * jcp.oc;
+    const int32_t *_wei_comp = (const int32_t *)(wei_base + offset);
+
     int n{0}, g{0};
     size_t start = 0, end = 0;
 
@@ -135,19 +139,23 @@ void _gemm_u8s8s32x_convolution_fwd_t<with_relu, dst_type>
             + g * src_g_stride;
         const wei_data_t *wei = wei_base + g * wei_g_stride;
         dst_data_t *dst = dst_base + n * dst_mb_stride + g * dst_g_stride;
+        const int32_t *wei_comp = _wei_comp + g * jcp.oc;
 
         if (jcp.im2col_sz)
-            jit_gemm_convolution_utils::im2col_u8(jcp, src, col);
+            jit_gemm_convolution_utils::im2col_u8<src_data_t>(jcp, src, col);
 
         const int M = jcp.oc;
         const int K = jcp.ks * jcp.ic;
         const int N = jcp.os;
+        const CBLAS_OFFSET offsetc
+                = jcp.signed_input ? CblasColOffset : CblasFixOffset;
         const int8_t off_a = 0, off_b = 0;
         const int32_t off_c = 0;
 
-        cblas_gemm_s8u8s32(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                CblasFixOffset, M, N, K, 1., wei, M * jcp.ngroups, off_a,
-                jcp.im2col_sz ? col : src, K, off_b, 0., acc, M, &off_c);
+        cblas_gemm_s8u8s32(CblasColMajor, CblasNoTrans, CblasNoTrans, offsetc,
+                M, N, K, 1.0f, wei, M * jcp.ngroups, off_a,
+                jcp.im2col_sz ? col : (uint8_t *)src, K, off_b, 0.0f, acc, M,
+                jcp.signed_input ? wei_comp : &off_c);
 
         if (use_fast_path) {
             auto body = [&](int o) {
@@ -156,7 +164,7 @@ void _gemm_u8s8s32x_convolution_fwd_t<with_relu, dst_type>
                 dst[o] = qz_a1b0<float, dst_data_t>()(d, rmode);
             };
 
-#           if _OPENMP >= 201307
+#           if MKLDNN_THR == MKLDNN_THR_OMP && _OPENMP >= 201307
 #           pragma omp parallel for simd
             for (int o = 0; o < jcp.os * jcp.oc; ++o) body(o);
 #           else
@@ -166,6 +174,8 @@ void _gemm_u8s8s32x_convolution_fwd_t<with_relu, dst_type>
             parallel_nd(jcp.os, jcp.oc, [&](const int os, const int oc) {
                 const size_t acc_off = os * jcp.oc + oc;
                 float d = (float)acc[acc_off];
+                if (jcp.signed_input)
+                    d /= jcp.wei_adj_scale;
 
                 if (jcp.with_bias)
                     d += get_bias(g * jcp.oc + oc);
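When activations are shifted by +128, the spurious contribution to each accumulator is 128 * sum_k(w[k][oc]), constant per output channel. The reorder precomputes its negation into the trailing _wei_comp block, and CblasColOffset tells the GEMM to add that per-column (per-oc) offset, cancelling the shift. Sketch of the compensation such a reorder could compute (hypothetical helper, assuming the dense column-major weight layout used by the GEMM call above):

    #include <cstdint>

    // comp[oc] = -128 * sum_k w[oc + k * OC]; adding comp[oc] to every
    // accumulator in column oc cancels the +128 activation shift.
    void compute_wei_compensation(const int8_t *wei, int K, int OC,
            int32_t *comp) {
        for (int oc = 0; oc < OC; ++oc) {
            int32_t sum = 0;
            for (int k = 0; k < K; ++k)
                sum += wei[oc + k * OC];
            comp[oc] = -128 * sum;
        }
    }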
@@ -292,14 +302,23 @@ void _gemm_u8s8s32x_convolution_bwd_data_t<dst_type>
 
 using namespace data_type;
 
-template struct _gemm_u8s8s32x_convolution_fwd_t<true, f32>;
-template struct _gemm_u8s8s32x_convolution_fwd_t<true, s32>;
-template struct _gemm_u8s8s32x_convolution_fwd_t<true, s8>;
-template struct _gemm_u8s8s32x_convolution_fwd_t<true, u8>;
-template struct _gemm_u8s8s32x_convolution_fwd_t<false, f32>;
-template struct _gemm_u8s8s32x_convolution_fwd_t<false, s32>;
-template struct _gemm_u8s8s32x_convolution_fwd_t<false, s8>;
-template struct _gemm_u8s8s32x_convolution_fwd_t<false, u8>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<true, u8, f32>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<true, u8, s32>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<true, u8, s8>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<true, u8, u8>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<false, u8, f32>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<false, u8, s32>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<false, u8, s8>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<false, u8, u8>;
+
+template struct _gemm_x8s8s32x_convolution_fwd_t<true, s8, f32>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<true, s8, s32>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<true, s8, s8>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<true, s8, u8>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<false, s8, f32>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<false, s8, s32>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<false, s8, s8>;
+template struct _gemm_x8s8s32x_convolution_fwd_t<false, s8, u8>;
 
 template struct _gemm_u8s8s32x_convolution_bwd_data_t<f32>;
 template struct _gemm_u8s8s32x_convolution_bwd_data_t<s32>;
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef GEMM_U8S8S32X_CONVOLUTION_HPP
-#define GEMM_U8S8S32X_CONVOLUTION_HPP
+#ifndef GEMM_X8S8S32X_CONVOLUTION_HPP
+#define GEMM_X8S8S32X_CONVOLUTION_HPP
 
 #include "c_types_map.hpp"
 #include "cpu_convolution_pd.hpp"
@@ -29,8 +29,8 @@ namespace mkldnn {
 namespace impl {
 namespace cpu {
 
-template <bool with_relu, data_type_t dst_type>
-struct _gemm_u8s8s32x_convolution_fwd_t: public cpu_primitive_t {
+template <bool with_relu, data_type_t src_type, data_type_t dst_type>
+struct _gemm_x8s8s32x_convolution_fwd_t: public cpu_primitive_t {
     struct pd_t: public _cpu_convolution_fwd_pd_t<with_relu> {
         pd_t(engine_t *engine, const typename pd_t::base_desc_t *adesc,
                 const primitive_attr_t *attr,
@@ -39,7 +39,7 @@ struct _gemm_u8s8s32x_convolution_fwd_t: public cpu_primitive_t {
                     hint_fwd_pd), jcp_() {}
 
         DECLARE_COMMON_PD_T("gemm:blas",
-                _gemm_u8s8s32x_convolution_fwd_t<with_relu, dst_type>);
+                _gemm_x8s8s32x_convolution_fwd_t<with_relu, src_type, dst_type>);
 
         virtual status_t init() override {
             using namespace data_type;
@@ -57,17 +57,18 @@ struct _gemm_u8s8s32x_convolution_fwd_t: public cpu_primitive_t {
                         prop_kind::forward_inference)
                 && this->cdesc_().alg_kind == alg_kind::convolution_direct
                 && !this->has_zero_dim_memory()
-                && this->cdesc_().src_desc.data_type == u8
+                && this->cdesc_().src_desc.data_type == src_type
                 && this->cdesc_().dst_desc.data_type == dst_type
                 && this->cdesc_().weights_desc.data_type == s8
-                && utils::implication(this->with_bias(), utils::one_of(
+                && IMPLICATION(this->with_bias(), utils::one_of(
                             this->cdesc_().bias_desc.data_type, f32, s32, s8,
                             u8))
                 && this->cdesc_().accum_data_type == data_type::s32
                 && utils::everyone_is(nhwc, this->src_pd_.desc()->format,
                         this->dst_pd_.desc()->format)
                 && this->weights_pd_.desc()->format == (this->with_groups()
-                        ? hwigo : hwio)
+                        ? ((src_type == data_type::s8) ? hwigo_s8s8 : hwigo)
+                        : ((src_type == data_type::s8) ? hwio_s8s8 : hwio))
                 && this->is_gemm_conv_format();
 
             return ok ? status::success : status::unimplemented;
@@ -78,13 +79,16 @@ struct _gemm_u8s8s32x_convolution_fwd_t: public cpu_primitive_t {
     protected:
         virtual status_t set_default_params() override {
             using namespace memory_format;
+            bool is_sign_input =
+                    (this->cdesc_().src_desc.data_type == data_type::s8);
             if (this->src_pd_.desc()->format == any)
                 CHECK(this->src_pd_.set_format(nhwc));
             if (this->dst_pd_.desc()->format == any)
                 CHECK(this->dst_pd_.set_format(nhwc));
             if (this->weights_pd_.desc()->format == any)
                 CHECK(this->weights_pd_.set_format(this->with_groups()
-                            ? hwigo : hwio));
+                            ? ((is_sign_input) ? hwigo_s8s8 : hwigo)
+                            : ((is_sign_input) ? hwio_s8s8 : hwio)));
             if (this->bias_pd_.desc()->format == any)
                 CHECK(this->bias_pd_.set_format(x));
             return status::success;
@@ -108,7 +112,7 @@ struct _gemm_u8s8s32x_convolution_fwd_t: public cpu_primitive_t {
         }
     };
 
-    _gemm_u8s8s32x_convolution_fwd_t(const pd_t *pd, const input_vector &inputs,
+    _gemm_x8s8s32x_convolution_fwd_t(const pd_t *pd, const input_vector &inputs,
            const output_vector &outputs)
         : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd)
         , scratchpad_(nullptr)
@@ -126,11 +130,11 @@ struct _gemm_u8s8s32x_convolution_fwd_t: public cpu_primitive_t {
                 &this->scratchpad_, size, this->conf_.jcp_.nthr);
     }
 
-    ~_gemm_u8s8s32x_convolution_fwd_t() {
+    ~_gemm_x8s8s32x_convolution_fwd_t() {
         delete this->scratchpad_;
     };
 
-    typedef typename prec_traits<data_type::u8>::type src_data_t;
+    typedef typename prec_traits<src_type>::type src_data_t;
     typedef typename prec_traits<data_type::s8>::type wei_data_t;
     typedef typename prec_traits<dst_type>::type dst_data_t;
     typedef typename prec_traits<data_type::s32>::type acc_data_t;
@@ -182,7 +186,7 @@ struct _gemm_u8s8s32x_convolution_bwd_data_t: public cpu_primitive_t {
                 && this->desc()->diff_src_desc.data_type == dst_type
                 && this->desc()->diff_dst_desc.data_type == u8
                 && this->desc()->weights_desc.data_type == s8
-                && utils::implication(this->with_bias(), utils::one_of(
+                && IMPLICATION(this->with_bias(), utils::one_of(
                             this->desc()->bias_desc.data_type, f32, s32, s8,
                             u8))
                 && this->desc()->accum_data_type == data_type::s32
index 2284023..9ef2558 100644
@@ -35,15 +35,13 @@ using namespace mkldnn::impl::utils;
 
 using namespace Xbyak;
 
-void jit_avx2_1x1_conv_kernel_f32::bcast_loop(int load_loop_blk,
-        char load_loop_tag)
+void jit_avx2_1x1_conv_kernel_f32::generate_bcast_loop(int load_loop_blk)
 {
     mov(aux1_reg_bcast_data, reg_bcast_data);
     mov(aux_reg_output_data, reg_output_data);
     mov(bcast_loop_iter, reg_bcast_loop_work);
 
-    jit_tagged_label bcast_loop("bcast_loop", load_loop_tag);
-    jit_tagged_label bcast_loop_tail("bcast_loop_tail", load_loop_tag);
+    Label bcast_loop, bcast_loop_tail;
 
     cmp(bcast_loop_iter, jcp.ur);
     jl(bcast_loop_tail, T_NEAR);
@@ -53,7 +51,7 @@ void jit_avx2_1x1_conv_kernel_f32::bcast_loop(int load_loop_blk,
         int num_substeps = jcp.bcast_block / jcp.ur;
         assert(num_substeps > 0 && num_substeps < 10);
         for (int i = 0; i < num_substeps; i++) {
-            reduce_loop(load_loop_blk, jcp.ur, load_loop_tag, '0' + i);
+            generate_reduce_loop(load_loop_blk, jcp.ur);
             if (i < num_substeps - 1) {
                 add(aux1_reg_bcast_data, jcp.bcast_loop_bcast_substep);
                 add(aux_reg_output_data, jcp.bcast_loop_output_substep);
@@ -71,17 +69,16 @@ void jit_avx2_1x1_conv_kernel_f32::bcast_loop(int load_loop_blk,
 
     L(bcast_loop_tail);
     if (jcp.ur_tail) {
-        jit_tagged_label bcast_loop_tail_out(
-                "bcast_loop_tail_out", load_loop_tag);
+        Label bcast_loop_tail_out;
         cmp(bcast_loop_iter, 0);
         jz(bcast_loop_tail_out, T_NEAR);
-        reduce_loop(load_loop_blk, jcp.ur_tail, load_loop_tag, '1');
+        generate_reduce_loop(load_loop_blk, jcp.ur_tail);
         L(bcast_loop_tail_out);
     }
 }
 
-void jit_avx2_1x1_conv_kernel_f32::reduce_loop(int load_loop_blk, int ur,
-        char load_loop_tag, char bcast_loop_tag)
+void jit_avx2_1x1_conv_kernel_f32::generate_reduce_loop(
+        int load_loop_blk, int ur)
 {
     auto vreg_load = [=](int i) {
         return Ymm(ur * load_loop_blk + i);
@@ -152,8 +149,7 @@ void jit_avx2_1x1_conv_kernel_f32::reduce_loop(int load_loop_blk, int ur,
     };
 
     auto init = [=]() {
-        jit_tagged_label init_done("init_done", load_loop_tag, bcast_loop_tag);
-        jit_tagged_label init_zero("init_zero", load_loop_tag, bcast_loop_tag);
+        Label init_done, init_zero;
 
         if (jcp.with_bias && one_of(jcp.prop_kind, forward_training,
                     forward_inference)) {
@@ -180,10 +176,7 @@ void jit_avx2_1x1_conv_kernel_f32::reduce_loop(int load_loop_blk, int ur,
     };
 
     auto store = [=]() {
-        jit_tagged_label store_done(
-                "store_done", load_loop_tag, bcast_loop_tag);
-        jit_tagged_label store_noadd(
-                "store_noadd", load_loop_tag, bcast_loop_tag);
+        Label store_done, store_noadd;
 
         if (!jcp.with_sum) {
             test(reg_reduce_pos_flag, FLAG_REDUCE_FIRST);
@@ -198,15 +191,13 @@ void jit_avx2_1x1_conv_kernel_f32::reduce_loop(int load_loop_blk, int ur,
 
         L(store_noadd);
 
-        jit_tagged_label store_norelu(
-                "store_norelu", load_loop_tag, bcast_loop_tag);
+        Label store_norelu;
         test(reg_reduce_pos_flag, FLAG_REDUCE_LAST);
         jz(store_norelu, T_NEAR);
 
         int eltwise_inj_idx = 0;
         int depthwise_inj_idx = 0;
         const auto &p = attr_.post_ops_;
-
         if (p.len_ == 0 && eltwise_injectors.size() == 1) {
             eltwise_injectors[0]->compute_vector_range(0, ur * load_loop_blk);
         }
@@ -255,7 +246,7 @@ void jit_avx2_1x1_conv_kernel_f32::reduce_loop(int load_loop_blk, int ur,
                 for (int i = 0; i < load_loop_blk; ++i) {
                     if (mayiuse(avx2))
                         vfmadd231ps(vreg_accum(i, j), vreg_load(i), vreg_bcast);
-                    else { // AVX support
+                    else { // Intel(R) Advanced Vector Extensions (Intel(R) AVX) support
                         auto tmp = vmask;
                         vmulps(tmp, vreg_bcast, vreg_load(i));
                         vaddps(vreg_accum(i, j), vreg_accum(i, j), tmp);
@@ -272,9 +263,7 @@ void jit_avx2_1x1_conv_kernel_f32::reduce_loop(int load_loop_blk, int ur,
         }
     };
 
-    jit_tagged_label reduce_loop("reduce_loop", load_loop_tag, bcast_loop_tag);
-    jit_tagged_label reduce_loop_tail(
-            "reduce_loop_tail", load_loop_tag, bcast_loop_tag);
+    Label reduce_loop, reduce_loop_tail;
 
     mov(aux_reg_load_data, reg_load_data);
     mov(aux_reg_bcast_data, aux1_reg_bcast_data);
@@ -299,16 +288,13 @@ void jit_avx2_1x1_conv_kernel_f32::reduce_loop(int load_loop_blk, int ur,
     store();
 }
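The JIT kernels drop the string-based jit_tagged_label scheme in favor of plain Xbyak::Label objects, which are unique by identity and need no manual tag characters to avoid collisions between loop instances. Minimal standalone sketch of the pattern (a toy generator, not library code):

    #include <xbyak/xbyak.h>

    // Each Label is a distinct jump target; two instances of this generator
    // can never collide the way two identically named string labels could.
    struct toy_loop_gen : Xbyak::CodeGenerator {
        toy_loop_gen() {
            Xbyak::Label loop, done;
            xor_(rax, rax);     // result = 0
            test(rdi, rdi);     // first integer argument (SysV ABI)
            jz(done, T_NEAR);
            L(loop);
            inc(rax);           // copy the argument by counting it down
            dec(rdi);
            jnz(loop, T_NEAR);
            L(done);
            ret();
        }
    };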
 
-void jit_avx2_1x1_conv_kernel_f32::diff_bias_loop(int load_loop_blk,
-        char load_loop_tag)
+void jit_avx2_1x1_conv_kernel_f32::generate_diff_bias_loop(int load_loop_blk)
 {
     if (!jcp.with_bias || jcp.prop_kind != backward_weights)
         return;
 
-    jit_tagged_label diff_bias_loop("diff_bias_loop", load_loop_tag);
-    jit_tagged_label diff_bias_loop_out("diff_bias_loop_out", load_loop_tag);
-    jit_tagged_label diff_bias_init_out("diff_bias_init_out", load_loop_tag);
-    jit_tagged_label diff_bias_load("diff_bias_load", load_loop_tag);
+    Label diff_bias_loop, diff_bias_loop_out, diff_bias_init_out;
+    Label diff_bias_load;
 
     auto diff_bias_ptr = [=](int i) {
         return ptr[reg_diff_bias_data + i * jcp.oc_block * sizeof(float)];
@@ -408,8 +394,8 @@ void jit_avx2_1x1_conv_kernel_f32::generate()
         mov(reg_output_stride, ptr[param1 + GET_OFF(output_stride)]);
     mov(reg_oc_off, ptr[param1 + GET_OFF(oc_off)]);
 
-    auto load_loop_body = [=] (int load_loop_blk, char bcast_loop_tag) {
-        bcast_loop(load_loop_blk, bcast_loop_tag);
+    auto generate_load_loop_body = [=] (int load_loop_blk) {
+        generate_bcast_loop(load_loop_blk);
         add(reg_load_data, load_loop_blk * jcp.load_loop_load_step);
         switch (jcp.prop_kind) {
         case forward_training:
@@ -437,10 +423,10 @@ void jit_avx2_1x1_conv_kernel_f32::generate()
         add(reg_oc_off, load_loop_blk * jcp.oc_block * sizeof(float));
     };
 
-    const char *load_loop_blk_8 = "load_loop_blk_8";
-    const char *load_loop_blk_16 = "load_loop_blk_16";
-    const char *load_loop_blk_24 = "load_loop_blk_24";
-    const char *load_loop_blk_end = "load_loop_blk_end";
+    Label load_loop_blk_8;
+    Label load_loop_blk_16;
+    Label load_loop_blk_24;
+    Label load_loop_blk_end;
 
     cmp(reg_load_loop_work, 8);
     jle(load_loop_blk_8, T_NEAR);
@@ -452,8 +438,8 @@ void jit_avx2_1x1_conv_kernel_f32::generate()
     jle(load_loop_blk_16, T_NEAR);
 
     L(load_loop_blk_24); {
-        diff_bias_loop(3, '3');
-        load_loop_body(3, '3');
+        generate_diff_bias_loop(3);
+        generate_load_loop_body(3);
         cmp(reg_load_loop_work, 32);
         je(load_loop_blk_16);
         cmp(reg_load_loop_work, 24);
@@ -464,8 +450,8 @@ void jit_avx2_1x1_conv_kernel_f32::generate()
     jle(load_loop_blk_8, T_NEAR);
 
     L(load_loop_blk_16); {
-        diff_bias_loop(2, '2');
-        load_loop_body(2, '2');
+        generate_diff_bias_loop(2);
+        generate_load_loop_body(2);
         cmp(reg_load_loop_work, 16);
         jge(load_loop_blk_16);
     }
@@ -473,8 +459,8 @@ void jit_avx2_1x1_conv_kernel_f32::generate()
     L(load_loop_blk_8); {
         cmp(reg_load_loop_work, 0);
         je(load_loop_blk_end, T_NEAR);
-        diff_bias_loop(1, '1');
-        load_loop_body(1, '1');
+        generate_diff_bias_loop(1);
+        generate_load_loop_body(1);
     }
 
     L(load_loop_blk_end);
@@ -533,6 +519,7 @@ status_t jit_avx2_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp,
     // TODO (Roma): this code is duplicated from the generic kernel; maybe the
     // configuration struct could do some stuff below
     const bool with_groups = weights_d.ndims() == src_d.ndims() + 1;
+    const int ndims = src_d.ndims();
 
     jcp.prop_kind = cd.prop_kind;
 
@@ -543,19 +530,19 @@ status_t jit_avx2_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp,
     jcp.oc_without_padding = jcp.oc;
     jcp.ic = src_d.dims()[1] / jcp.ngroups;
 
-    jcp.ih = src_d.dims()[2];
-    jcp.iw = src_d.dims()[3];
-    jcp.oh = dst_d.dims()[2];
-    jcp.ow = dst_d.dims()[3];
+    jcp.ih = (ndims == 3) ? 1 : src_d.dims()[2];
+    jcp.iw = src_d.dims()[ndims - 1];
+    jcp.oh = (ndims == 3) ? 1 : dst_d.dims()[2];
+    jcp.ow = dst_d.dims()[ndims - 1];
 
-    jcp.kh = weights_d.dims()[with_groups + 2];
-    jcp.kw = weights_d.dims()[with_groups + 3];
+    jcp.kh = (ndims == 3) ? 1 : weights_d.dims()[with_groups + 2];
+    jcp.kw = weights_d.dims()[with_groups + ndims - 1];
 
-    jcp.t_pad = cd.padding[0][0];
-    jcp.l_pad = cd.padding[0][1];
+    jcp.t_pad = (ndims == 3) ? 0 : cd.padding[0][0];
+    jcp.l_pad = cd.padding[0][ndims - 3];
 
-    jcp.stride_h = cd.strides[0];
-    jcp.stride_w = cd.strides[1];
+    jcp.stride_h = (ndims == 3) ? 1 : cd.strides[0];
+    jcp.stride_w = cd.strides[ndims - 3];
 
     jcp.src_fmt = src_d.format();
     jcp.with_bias = cd.bias_desc.format != memory_format::undef;
@@ -609,12 +596,12 @@ status_t jit_avx2_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp,
     jcp.os = jcp.oh * jcp.ow;
     jcp.is = jcp.ih * jcp.iw;
 
-    constexpr memory_format_t weights_formats[2][2] = {
-        { OIhw8i8o, OIhw8o8i },
-        { gOIhw8i8o, gOIhw8o8i }
-    };
-    memory_format_t weights_format
-        = weights_formats[with_groups][jcp.prop_kind == backward_data];
+    const int is_bwd_d = jcp.prop_kind == backward_data;
+    memory_format_t weights_format = with_groups
+        ? utils::pick(2 * ndims - 6 + is_bwd_d, gOIw8i8o, gOIw8o8i, gOIhw8i8o,
+            gOIhw8o8i)
+        : utils::pick(2 * ndims - 6 + is_bwd_d, OIw8i8o, OIw8o8i, OIhw8i8o,
+            OIhw8o8i);
 
     const int simd_w = 8;
 
@@ -623,10 +610,10 @@ status_t jit_avx2_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp,
 
     bool args_ok = true
         && jcp.ngroups == 1
-        && src_d.format() == nChw8c
+        && one_of(src_d.format(), nCw8c, nChw8c)
         && weights_d.format() == weights_format
         && one_of(cd.bias_desc.format, memory_format::undef, any, x)
-        && dst_d.format() == nChw8c;
+        && one_of(dst_d.format(), nCw8c, nChw8c);
     if (!args_ok) return status::unimplemented;
 
     args_ok = true
@@ -636,9 +623,14 @@ status_t jit_avx2_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp,
         && jcp.kh == 1 && jcp.kw == 1;
     if (!args_ok) return status::unimplemented;
 
+    // TODO: remove this restriction
+    // optimized 1x1 bwd_w does not support Intel AVX
+    if (jcp.prop_kind == backward_weights && !mayiuse(avx2))
+        return status::unimplemented;
+
     jcp.ic_block = jcp.oc_block = simd_w;
 
-    jcp.ur = mayiuse(avx2) ? 4 : 3; // AVX support
+    jcp.ur = mayiuse(avx2) ? 4 : 3; // Intel AVX support
 
     int load_blocking{ 0 };
     int load_blocking_max{ 0 };
index 2214db5..2c10b85 100644
@@ -20,6 +20,7 @@
 #include "c_types_map.hpp"
 #include "jit_generator.hpp"
 #include "jit_primitive_conf.hpp"
+#include "cpu_memory.hpp"
 #include "jit_uni_eltwise.hpp"
 #include "jit_uni_depthwise.hpp"
 
@@ -105,14 +106,13 @@ private:
     ymm_t vreg_bcast = ymm_t(15);
     Xbyak::Ymm vmask = Xbyak::Ymm(14);
 
+    void generate_bcast_loop(int load_loop_blk);
+    void generate_reduce_loop(int load_loop_blk, int ur);
+    void generate_diff_bias_loop(int load_loop_blk);
+
     nstl::vector<jit_uni_eltwise_injector_f32<avx2>*> eltwise_injectors;
     nstl::vector<jit_uni_depthwise_injector_f32<avx2>*> depthwise_injectors;
 
-    void bcast_loop(int load_loop_blk, char load_loop_tag);
-    void reduce_loop(int load_loop_blk, int ur, char load_loop_tag,
-            char bcast_loop_tag);
-    void diff_bias_loop(int load_loop_blk, char load_loop_tag);
-
     void generate();
 };
 
index 60af420..7a6e17c 100644
@@ -35,6 +35,11 @@ using namespace mkldnn::impl::status;
 using namespace mkldnn::impl::memory_format;
 using namespace mkldnn::impl::utils;
 
+#define data_blk_off(f, n, c, h, w) \
+    ((ndims == 3) \
+    ? (f).blk_off(n, c, w) \
+    : (f).blk_off(n, c, h, w))
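The data_blk_off macro lets the 1x1 drivers address 3-dim (nCw8c) and 4-dim (nChw8c) tensors from a single call site by dropping the height coordinate when ndims == 3. Usage sketch:

    // Expands to src_d.blk_off(n, _icb, iw) for (N, C, W) tensors and to
    // src_d.blk_off(n, _icb, ih, iw) for (N, C, H, W) tensors.
    const size_t off = data_blk_off(src_d, n, _icb, ih, iw);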
+
 /* convolution forward */
 
 template <bool with_relu>
@@ -52,11 +57,12 @@ void _jit_avx2_1x1_convolution_fwd_t<with_relu>::execute_forward() {
     const int MB = conf_.MB();
 
     const int work_amount = MB * jcp.ngroups * jcp.nb_bcast;
+    const int ndims = dst_d.ndims();
 
-    const int stride_h = conf_.cdesc()->strides[0];
-    const int stride_w = conf_.cdesc()->strides[1];
-    const int pad_t = conf_.cdesc()->padding[0][0];
-    const int pad_l = conf_.cdesc()->padding[0][1];
+    const int stride_h = (ndims == 3) ? 1 : conf_.cdesc()->strides[0];
+    const int stride_w = conf_.cdesc()->strides[ndims - 3];
+    const int pad_t = (ndims == 3) ? 0 : conf_.cdesc()->padding[0][0];
+    const int pad_l = conf_.cdesc()->padding[0][ndims - 3];
 
     auto step = [](int default_step, int remaining, int tail_step) {
         assert(default_step <= tail_step);
@@ -107,8 +113,8 @@ void _jit_avx2_1x1_convolution_fwd_t<with_relu>::execute_forward() {
                 const int _ocb = g * nb_oc + ocb;
                 p.load_dim = this_block_size(ocb * jcp.oc_block, jcp.oc,
                         load_step * jcp.oc_block);
+                const size_t dst_off = data_blk_off(dst_d, n, _ocb, oh, ow);
 
-                const size_t dst_off = dst_d.blk_off(n, _ocb, oh, ow);
                 p.output_data = &dst[dst_off];
 
                 p.bias_data = &bias[_ocb * jcp.oc_block];
@@ -133,13 +139,13 @@ void _jit_avx2_1x1_convolution_fwd_t<with_relu>::execute_forward() {
                             + _icb * jcp.is * jcp.ic_block;
 
                         if (ocb == 0) {
-                            rp.src = src + src_d.blk_off(n, _icb, ih, iw);
+                            rp.src = src + data_blk_off(src_d, n, _icb, ih, iw);
                             rtus_driver_->ker_(&rp);
                         }
 
                         p.bcast_data = rp.ws;
                     } else
-                        p.bcast_data = src + src_d.blk_off(n, _icb, ih, iw);
+                        p.bcast_data = src + data_blk_off(src_d, n, _icb, ih, iw);
 
                     p.oc_off = _ocb * jcp.oc_block * sizeof(float);
 
@@ -170,7 +176,6 @@ void _jit_avx2_1x1_convolution_fwd_t<with_relu>::execute_forward_fusing() {
     auto dst = reinterpret_cast<data_t *>(this->memory());
 
     const memory_desc_wrapper src_d(conf_.src_pd());
-    const memory_desc_wrapper dst_d(conf_.dst_pd());
     const memory_desc_wrapper weights_d(conf_.weights_pd(0));
 
     const auto &jcp = kernel_->jcp;
@@ -354,11 +359,12 @@ void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() {
 
     // TODO (Roma): remove this restriction
     assert(jcp.stride_w == 1 && jcp.stride_h == 1);
+    const int ndims = diff_dst_d.ndims();
 
-    const int stride_h = conf_.desc()->strides[0];
-    const int stride_w = conf_.desc()->strides[1];
-    const int pad_t = conf_.desc()->padding[0][0];
-    const int pad_l = conf_.desc()->padding[0][1];
+    const int stride_h = (ndims == 3) ? 1 : conf_.desc()->strides[0];
+    const int stride_w = conf_.desc()->strides[ndims - 3];
+    const int pad_t = (ndims == 3) ? 0 : conf_.desc()->padding[0][0];
+    const int pad_l = conf_.desc()->padding[0][ndims - 3];
 
     const int nb_ic = jcp.nb_load;
     const int nb_oc = jcp.nb_reduce;
@@ -410,8 +416,7 @@ void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() {
                 rp.iw_start = iw;
 
                 const int _icb = g * nb_ic + icb;
-                rp.src = diff_src + diff_src_d.blk_off(n, _icb, ih, iw);
-
+                rp.src = diff_src + data_blk_off(diff_src_d, n, _icb, ih, iw);
                 if (conf_.rtus_.reduce_src_) {
                     rp.ws = scratch_ + ithr * ws_per_thread_;
                     p.output_data = rp.ws;
@@ -421,7 +426,8 @@ void jit_avx2_1x1_convolution_bwd_data_t::execute_backward_data() {
                 for (int ocb = 0; ocb < jcp.nb_reduce;
                         ocb += jcp.nb_reduce_blocking) {
                     const int _ocb = g * nb_oc + ocb;
-                    size_t diff_dst_off = diff_dst_d.blk_off(n, _ocb, oh, ow);
+                    size_t diff_dst_off = data_blk_off(diff_dst_d, n, _ocb, oh,
+                        ow);
                     p.bcast_data = &diff_dst[diff_dst_off];
 
                     p.load_data = &weights[conf_.with_groups()
@@ -507,6 +513,7 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() {
 
     const auto &jcp = kernel_->jcp;
 
+    const int ndims = diff_dst_d.ndims();
     // TODO (Roma): remove this restriction
     assert(jcp.stride_w == 1 && jcp.stride_h == 1);
 
@@ -521,10 +528,10 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() {
     const int sp_dim = jcp.reduce_dim;
     const int mb_sp_work = jcp.mb * sp_dim;
 
-    const int stride_h = conf_.desc()->strides[0];
-    const int stride_w = conf_.desc()->strides[1];
-    const int pad_t = conf_.desc()->padding[0][0];
-    const int pad_l = conf_.desc()->padding[0][1];
+    const int stride_h = (ndims == 3) ? 1 : conf_.desc()->strides[0];
+    const int stride_w = conf_.desc()->strides[ndims - 3];
+    const int pad_t = (ndims == 3) ? 0 : conf_.desc()->padding[0][0];
+    const int pad_l = conf_.desc()->padding[0][ndims - 3];
 
     auto step = [](int default_step, int remaining, int tail_step) {
         assert(default_step <= tail_step);
@@ -577,9 +584,13 @@ void jit_avx2_1x1_convolution_bwd_weights_t::execute_backward_weights() {
 
                         rp.ws = scratch_ + ithr * ws_per_thread_
                             + (ic_b * jcp.is + sp) * jcp.ic_block;
-                        rp.src = src
-                            + ih * src_d.blocking_desc().strides[0][2]
-                            + iw * src_d.blocking_desc().strides[0][3];
+                        if (ndims == 3)
+                            rp.src = src
+                                + iw * src_d.blocking_desc().strides[0][2];
+                        else
+                            rp.src = src
+                                + ih * src_d.blocking_desc().strides[0][2]
+                                + iw * src_d.blocking_desc().strides[0][3];
 
                         if (oc_b == 0)
                             rtus_driver_->ker_(&rp);
index 30e5fbb..7846252 100644
@@ -62,7 +62,7 @@ struct _jit_avx2_1x1_convolution_fwd_t: public cpu_primitive_t {
                         this->cdesc_().src_desc.data_type,
                         this->cdesc_().weights_desc.data_type,
                         this->cdesc_().dst_desc.data_type)
-                && utils::implication(this->with_bias(),
+                && IMPLICATION(this->with_bias(),
                         data_type::f32 == this->cdesc_().bias_desc.data_type);
             if (!ok) return status::unimplemented;
 
@@ -103,12 +103,15 @@ struct _jit_avx2_1x1_convolution_fwd_t: public cpu_primitive_t {
         virtual status_t set_default_params() override {
             using namespace memory_format;
             if (this->src_pd_.desc()->format == any)
-                CHECK(this->src_pd_.set_format(nChw8c));
+                CHECK(this->src_pd_.set_format(utils::pick(this->ndims() - 3,
+                    nCw8c, nChw8c)));
             if (this->dst_pd_.desc()->format == any)
-                CHECK(this->dst_pd_.set_format(nChw8c));
+                CHECK(this->dst_pd_.set_format(utils::pick(this->ndims() - 3,
+                    nCw8c, nChw8c)));
             if (this->weights_pd_.desc()->format == any)
                 CHECK(this->weights_pd_.set_format(this->with_groups()
-                            ? gOIhw8i8o : OIhw8i8o));
+                    ? utils::pick(this->ndims() - 3, gOIw8i8o, gOIhw8i8o)
+                    : utils::pick(this->ndims() - 3, OIw8i8o, OIhw8i8o)));
             if (this->bias_pd_.desc()->format == any)
                 CHECK(this->bias_pd_.set_format(x));
             return status::success;
@@ -250,12 +253,15 @@ struct jit_avx2_1x1_convolution_bwd_data_t: public cpu_primitive_t {
             using namespace memory_format;
 
             if (this->diff_src_pd_.desc()->format == any)
-                CHECK(this->diff_src_pd_.set_format(nChw8c));
+                CHECK(this->diff_src_pd_.set_format(utils::pick(
+                    this->ndims() - 3, nCw8c, nChw8c)));
             if (this->diff_dst_pd_.desc()->format == any)
-                CHECK(this->diff_dst_pd_.set_format(nChw8c));
+                CHECK(this->diff_dst_pd_.set_format(utils::pick(
+                    this->ndims() - 3, nCw8c, nChw8c)));
             if (this->weights_pd_.desc()->format == any)
                 CHECK(this->weights_pd_.set_format(this->with_groups()
-                            ? gOIhw8o8i : OIhw8o8i));
+                    ? utils::pick(this->ndims() - 3, gOIw8o8i, gOIhw8o8i)
+                    : utils::pick(this->ndims() - 3, OIw8o8i, OIhw8o8i)));
             return status::success;
         }
     };
@@ -327,7 +333,7 @@ struct jit_avx2_1x1_convolution_bwd_weights_t: public cpu_primitive_t {
                         this->desc()->src_desc.data_type,
                         this->desc()->diff_weights_desc.data_type,
                         this->desc()->diff_dst_desc.data_type)
-                && utils::implication(this->with_bias(),
+                && IMPLICATION(this->with_bias(),
                         data_type::f32 == desc()->diff_bias_desc.data_type);
             if (!ok) return status::unimplemented;
 
@@ -353,12 +359,15 @@ struct jit_avx2_1x1_convolution_bwd_weights_t: public cpu_primitive_t {
             using namespace memory_format;
 
             if (this->src_pd_.desc()->format == any)
-                CHECK(this->src_pd_.set_format(nChw8c));
+                CHECK(this->src_pd_.set_format(utils::pick(this->ndims() - 3,
+                    nCw8c, nChw8c)));
             if (this->diff_dst_pd_.desc()->format == any)
-                CHECK(this->diff_dst_pd_.set_format(nChw8c));
+                CHECK(this->diff_dst_pd_.set_format(utils::pick(
+                    this->ndims() - 3, nCw8c, nChw8c)));
             if (this->diff_weights_pd_.desc()->format == any)
                 CHECK(this->diff_weights_pd_.set_format(this->with_groups()
-                            ? gOIhw8i8o : OIhw8i8o));
+                    ? utils::pick(this->ndims() - 3, gOIw8i8o, gOIhw8i8o)
+                    : utils::pick(this->ndims() - 3, OIw8i8o, OIhw8i8o)));
             if (this->diff_bias_pd_.desc()->format == any)
                 CHECK(this->diff_bias_pd_.set_format(x));
             return status::success;
index 2c56e95..392622a 100644 (file)
@@ -58,7 +58,7 @@ void jit_avx2_conv_fwd_kernel_f32::oh_step_unroll_kw(int ur_w,
         for (int ifm2 = 0; ifm2 < ic_blk; ifm2++) {
             for (int jj = jj_start; jj < jj_end; jj++) {
                 size_t inp_off;
-                if (one_of(jcp.src_fmt, nchw, ncdhw))
+                if (one_of(jcp.src_fmt, ncw, nchw, ncdhw))
                     inp_off = sizeof(float)*((size_t)ifm2*id*ih*iw
                         + (ki*dilate_w + jj*stride_w - pad_l));
                 else
@@ -76,7 +76,7 @@ void jit_avx2_conv_fwd_kernel_f32::oh_step_unroll_kw(int ur_w,
                     if (mayiuse(avx2))
                         vfmadd231ps(Ymm(ur_w * ii + jj),
                                 Ymm(oc_blocks * ur_w + jj), ymm15);
-                    else { // AVX support
+                    else { // Intel(R) Advanced Vector Extensions (Intel(R) AVX) support
                         Ymm tmp = ymask;
                         vmulps(tmp, ymm15, Ymm(oc_blocks * ur_w + jj));
                         vaddps(Ymm(ur_w * ii + jj), Ymm(ur_w * ii + jj), tmp);
@@ -90,7 +90,7 @@ void jit_avx2_conv_fwd_kernel_f32::oh_step_nopad(int ur_w,
         int pad_l, int pad_r, char pad_tag,
         int oc_blocks, char oc_blocks_tag)
 {
-    jit_tagged_label kw_label("kw", pad_tag, oc_blocks_tag);
+    Label kw_loop;
 
     int iw = jcp.iw;
     int ih = jcp.ih;
@@ -105,14 +105,14 @@ void jit_avx2_conv_fwd_kernel_f32::oh_step_nopad(int ur_w,
     int oc_blk = jcp.oc_block;
 
     xor_(ki_iter, ki_iter);
-    L(kw_label);
+    L(kw_loop);
     {
         int jj_start = 0;
         int jj_end = ur_w;
         for (int ifm2 = 0; ifm2 < ic_blk; ifm2++) {
             for (int jj = jj_start; jj < jj_end; jj++) {
                 size_t inp_off;
-                if (one_of(jcp.src_fmt, nchw, ncdhw))
+                if (one_of(jcp.src_fmt, ncw, nchw, ncdhw))
                     inp_off = sizeof(float)*((size_t)ifm2 * id * ih * iw
                             + (jj * stride_w - pad_l));
                 else
@@ -130,7 +130,7 @@ void jit_avx2_conv_fwd_kernel_f32::oh_step_nopad(int ur_w,
                     if (mayiuse(avx2))
                         vfmadd231ps(Ymm(ur_w * ii + jj),
                                 Ymm(oc_blocks * ur_w + jj), ymm15);
-                    else { // AVX support
+                    else { // Intel AVX support
                         Ymm tmp = ymask;
                         vmulps(tmp, ymm15, Ymm(oc_blocks * ur_w + jj));
                         vaddps(Ymm(ur_w * ii + jj), Ymm(ur_w * ii + jj), tmp);
@@ -138,12 +138,12 @@ void jit_avx2_conv_fwd_kernel_f32::oh_step_nopad(int ur_w,
             }
         }
         add(aux_reg_kernel, sizeof(float) * oc_blk * ic_blk);
-        add(aux_reg_input, sizeof(float) * (one_of(jcp.src_fmt, nchw, ncdhw)
+        add(aux_reg_input, sizeof(float) * (one_of(jcp.src_fmt, ncw, nchw, ncdhw)
                 ? dilate_w : ic_blk * dilate_w));
 
         inc(ki_iter);
         cmp(ki_iter, kw);
-        jl(kw_label, T_NEAR);
+        jl(kw_loop, T_NEAR);
     }
 }
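The recurring substitution in these hunks replaces string-based jit_tagged_label names with Xbyak::Label objects, which are unique per instance and so cannot collide across repeated emissions of the same loop, removing the need for per-call-site tag characters. A minimal sketch of the pattern, assuming the Xbyak CodeGenerator API used by MKL-DNN's jit_generator:

    #include <xbyak/xbyak.h>

    // Minimal sketch: a counted loop emitted with a local Xbyak::Label,
    // mirroring the kw_loop pattern above. The loop body is omitted.
    struct kw_loop_demo_t : public Xbyak::CodeGenerator {
        kw_loop_demo_t(int kw) {
            Xbyak::Label kw_loop;      // unique object; no name clashes
            xor_(rcx, rcx);            // ki_iter = 0
            L(kw_loop);
            /* ... emit one kw iteration here ... */
            inc(rcx);
            cmp(rcx, kw);
            jl(kw_loop, T_NEAR);       // near jump back to the label
            ret();
        }
    };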
 
@@ -160,17 +160,16 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w,
     int dilate_w = jcp.dilate_w + 1;
     int ic_blk = jcp.ic_block;
     int oc_blk = jcp.oc_block;
-    const int inp_mult = one_of(jcp.src_fmt, nchw, ncdhw)
-        ? dilate_h : ic_blk * dilate_h;
-    const int inp_off = one_of(jcp.src_fmt, nchw, ncdhw)
+    const int inp_mult = one_of(jcp.src_fmt, ncw, nchw, ncdhw)
+        ? 1 : ic_blk;
+    const int inp_off = one_of(jcp.src_fmt, ncw, nchw, ncdhw)
         ? dilate_w : ic_blk * dilate_w;
 
-    jit_tagged_label init_done_label("init", pad_tag, oc_blocks_tag);
-    jit_tagged_label init_first_label("first", pad_tag, oc_blocks_tag);
+    Label init_done, init_first;
 
     if (!jcp.with_sum) {
         test(reg_ci_flag, FLAG_IC_FIRST);
-        jne(init_first_label, T_NEAR);
+        jne(init_first, T_NEAR);
     }
 
     for (int ii = 0; ii < oc_blocks; ii++) {
@@ -187,7 +186,7 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w,
 
     if (jcp.with_sum && jcp.with_bias) {
         test(reg_ci_flag, FLAG_IC_FIRST);
-        je(init_done_label, T_NEAR);
+        je(init_done, T_NEAR);
 
         for (int ii = 0; ii < oc_blocks; ii++)
             for (int jj = 0; jj < ur_w; jj++)
@@ -195,9 +194,9 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w,
                     yword[reg_bias + sizeof(float) * ii * oc_blk]);
     }
 
-    jmp(init_done_label);
+    jmp(init_done);
 
-    L(init_first_label);
+    L(init_first);
     if (this->jcp.with_bias) {
         for (int ii = 0; ii < oc_blocks; ii++)
             for (int jj = 0; jj < ur_w; jj++)
@@ -209,14 +208,14 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w,
                 uni_vpxor(Ymm(ur_w * ii + jj), Ymm(ur_w * ii + jj), Ymm(ur_w * ii + jj));
     }
 
-    L(init_done_label);
+    L(init_done);
 
-    if (jcp.ndims == 4) {
+    if (one_of(jcp.ndims, 3, 4)) {
         mov(aux_reg_input, reg_input);
         mov(aux_reg_kernel, reg_kernel);
     }
 
-    Label skip_kh_loop, skip_kd_loop, kd_label;
+    Label skip_kh_loop, skip_kd_loop, kd_loop;
     if (jcp.ndims == 5) {
         push(reg_output);
         push(oi_iter);
@@ -229,7 +228,7 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w,
             cmp(reg_ki, 0);
             je(skip_kd_loop, T_NEAR);
         }
-        L(kd_label);
+        L(kd_loop);
         mov(kj, ptr[param1 + GET_OFF(kh_padding)]);
     } else {
         mov(kj, reg_kh);
@@ -244,23 +243,23 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w,
         cmp(kj, 0);
         je(skip_kh_loop, T_NEAR);
     }
-    jit_tagged_label kh_label("kh", pad_tag, oc_blocks_tag);
-    L(kh_label);
+    Label kh_loop;
+    L(kh_loop);
     {
         if (jcp.kw >= 5 && pad_l == 0 && pad_r == 0) {
             oh_step_nopad(ur_w, pad_l, pad_r, pad_tag, oc_blocks,
                     oc_blocks_tag);
             sub(aux_reg_input, sizeof(float) * kw * inp_off);
-            add(aux_reg_input, sizeof(float) * iw * inp_mult);
+            add(aux_reg_input, sizeof(float) * iw * dilate_h * inp_mult);
         } else {
             oh_step_unroll_kw(ur_w, pad_l, pad_r, oc_blocks);
             add(aux_reg_kernel, sizeof(float) * kw * oc_blk * ic_blk);
-            add(aux_reg_input, sizeof(float) * iw * inp_mult);
+            add(aux_reg_input, sizeof(float) * iw * dilate_h * inp_mult);
         }
 
         dec(kj);
         cmp(kj, 0);
-        jg(kh_label, T_NEAR);
+        jg(kh_loop, T_NEAR);
     }
 
     L(skip_kh_loop);
@@ -273,7 +272,7 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w,
 
         dec(reg_ki);
         cmp(reg_ki, 0);
-        jg(kd_label, T_NEAR);
+        jg(kd_loop, T_NEAR);
         L(skip_kd_loop);
 
         pop(oi_iter);
@@ -281,11 +280,10 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w,
     }
 
 
-    jit_tagged_label done_label("done", pad_tag, oc_blocks_tag);
-    jit_tagged_label regular_store_label("store", pad_tag, oc_blocks_tag);
+    Label done, regular_store;
 
     test(reg_ci_flag, FLAG_IC_LAST);
-    je(regular_store_label, T_NEAR);
+    je(regular_store, T_NEAR);
 
     int eltwise_inj_idx = 0;
     int depthwise_inj_idx = 0;
@@ -320,7 +318,7 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w,
         }
     }
 
-    L(regular_store_label);
+    L(regular_store);
 
     for (int ii = 0; ii < oc_blocks; ii++) {
         for (int jj = 0; jj < ur_w; jj++) {
@@ -333,7 +331,7 @@ void jit_avx2_conv_fwd_kernel_f32::width_blk_step(int ur_w,
             vmovups(make_safe_addr(reg_output, o_off, reg_long_offt), reg_out);
         }
     }
-    L(done_label);
+    L(done);
 }
 
 inline void jit_avx2_conv_fwd_kernel_f32::solve_common(
@@ -348,7 +346,7 @@ inline void jit_avx2_conv_fwd_kernel_f32::solve_common(
     int oc_blk = jcp.oc_block;
     int dilate_w = jcp.dilate_w + 1;
     int str_w = jcp.stride_w;
-    const int inp_mult = one_of(jcp.src_fmt, nchw, ncdhw) ? 1 : ic_blk;
+    const int inp_mult = one_of(jcp.src_fmt, ncw, nchw, ncdhw) ? 1 : ic_blk;
 
     int l_pad = jcp.l_pad;
     int r_pad = nstl::max(0, (int(jcp.ow) - 1) * str_w + (kw - 1) * dilate_w
@@ -369,11 +367,11 @@ inline void jit_avx2_conv_fwd_kernel_f32::solve_common(
         add(reg_output, sizeof(float) * ur_w * oc_blk);
     }
 
-    jit_tagged_label ow_loop_label("ow", oc_blocks_tag);
+    Label ow_loop;
     xor_(oi_iter, oi_iter);
 
     if (n_oi > 0) {
-        L(ow_loop_label);
+        L(ow_loop);
 
         width_blk_step(ur_w, 0, 0,
                 'm', oc_blocks, oc_blocks_tag); // "middle"
@@ -382,7 +380,7 @@ inline void jit_avx2_conv_fwd_kernel_f32::solve_common(
 
         inc(oi_iter);
         cmp(oi_iter, n_oi);
-        jl(ow_loop_label, T_NEAR);
+        jl(ow_loop, T_NEAR);
     }
 
     if (r_pad1 > 0 && n_oi >= 0) {
@@ -436,24 +434,23 @@ void jit_avx2_conv_fwd_kernel_f32::generate()
     mov(reg_oc_blocks, ptr[this->param1 + GET_OFF(oc_blocks)]);
 
     int nb_oc_tail = jcp.nb_oc % jcp.nb_oc_blocking;
-    const char *tail_label = ".tail";
-    const char *exit_label = ".exit";
+    Label tail, exit;
 
     if (jcp.nb_oc > jcp.nb_oc_blocking) {
         cmp(reg_oc_blocks, jcp.nb_oc_blocking);
-        jne(nb_oc_tail ? tail_label : exit_label, T_NEAR);
+        jne(nb_oc_tail ? tail : exit, T_NEAR);
 
         solve_common(jcp.nb_oc_blocking, '0' + jcp.nb_oc_blocking);
-        jmp(exit_label, T_NEAR);
+        jmp(exit, T_NEAR);
 
         if (nb_oc_tail) {
-            L(tail_label);
+            L(tail);
             cmp(reg_oc_blocks, nb_oc_tail);
-            jne(exit_label, T_NEAR);
+            jne(exit, T_NEAR);
             solve_common(nb_oc_tail, '0' + nb_oc_tail);
         }
 
-        L(exit_label);
+        L(exit);
     } else if (jcp.nb_oc == jcp.nb_oc_blocking) {
         solve_common(jcp.nb_oc_blocking, '0' + jcp.nb_oc_blocking);
     } else {
@@ -522,24 +519,24 @@ status_t jit_avx2_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp,
     jcp.ic = src_d.dims()[1] / jcp.ngroups;
 
     jcp.id = (ndims == 5) ? src_d.dims()[2] : 1;
-    jcp.ih = src_d.dims()[ndims-2];
+    jcp.ih = (ndims == 3) ? 1 : src_d.dims()[ndims-2];
     jcp.iw = src_d.dims()[ndims-1];
     jcp.od = (ndims == 5) ? dst_d.dims()[2] : 1;
-    jcp.oh = dst_d.dims()[ndims-2];
+    jcp.oh = (ndims == 3) ? 1 : dst_d.dims()[ndims-2];
     jcp.ow = dst_d.dims()[ndims-1];
     jcp.kd = (ndims == 5) ? weights_d.dims()[with_groups + 2] : 1;
-    jcp.kh = weights_d.dims()[with_groups + ndims-2];
+    jcp.kh = (ndims == 3) ? 1 : weights_d.dims()[with_groups + ndims-2];
     jcp.kw = weights_d.dims()[with_groups + ndims-1];
 
     jcp.f_pad = (ndims == 5) ? cd.padding[0][0] : 0;
-    jcp.t_pad = cd.padding[0][ndims-4];
+    jcp.t_pad = (ndims == 3) ? 0 : cd.padding[0][ndims-4];
     jcp.l_pad = cd.padding[0][ndims-3];
     jcp.stride_d = (ndims == 5) ? cd.strides[0] : 1;
-    jcp.stride_h = cd.strides[ndims-4];
+    jcp.stride_h = (ndims == 3) ? 1 : cd.strides[ndims-4];
     jcp.stride_w = cd.strides[ndims-3];
 
     jcp.dilate_d = (ndims == 5) ? cd.dilates[0] : 0;
-    jcp.dilate_h = cd.dilates[ndims-4];
+    jcp.dilate_h = (ndims == 3) ? 0 : cd.dilates[ndims-4];
     jcp.dilate_w = cd.dilates[ndims-3];
 
     jcp.b_pad = (jcp.oh - 1) * jcp.stride_h + (jcp.kh - 1) * (jcp.dilate_h + 1)
@@ -609,13 +606,15 @@ status_t jit_avx2_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp,
     }
 
     bool args_ok = true
-        && implication(flat, one_of(src_d.format(), nchw, nhwc, ncdhw, ndhwc)
-            && one_of(weights_d.format(), Ohwi8o, gOhwi8o, Odhwi8o, gOdhwi8o))
-        && implication(mimo, one_of(src_d.format(), nChw8c, nCdhw8c)
-            && one_of(weights_d.format(), OIhw8i8o, gOIhw8i8o, OIdhw8i8o,
-                gOIdhw8i8o))
+        && IMPLICATION(flat, one_of(src_d.format(), ncw, nwc, nchw, nhwc,
+            ncdhw, ndhwc)
+            && one_of(weights_d.format(), Owi8o, gOwi8o, Ohwi8o, gOhwi8o,
+                Odhwi8o, gOdhwi8o))
+        && IMPLICATION(mimo, one_of(src_d.format(), nCw8c, nChw8c, nCdhw8c)
+            && one_of(weights_d.format(), OIw8i8o, gOIw8i8o, OIhw8i8o,
+                gOIhw8i8o, OIdhw8i8o, gOIdhw8i8o))
         && one_of(cd.bias_desc.format, memory_format::undef, any, x)
-        && one_of(dst_d.format(), nChw8c, nCdhw8c);
+        && one_of(dst_d.format(), nCw8c, nChw8c, nCdhw8c);
     if (!args_ok) return status::unimplemented;
 
     jcp.ur_h = 1; /* no code-unrolling by h so far */
@@ -626,10 +625,12 @@ status_t jit_avx2_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp,
 
     jcp.nb_oc_blocking = 4; /* the optimal value for the kernel */
 
+    // Intel AVX and Intel AVX2 kernels need 2 and 1 temporary YMMs, respectively
+    // Thus, we can only assign 14 or 15 YMMs for data storage
+    const int num_avail_regs = mayiuse(avx2) ? 15 : 14;
     if (!mayiuse(avx2)) {
-        // AVX kernel needs 2 temporary YMMs -- can assign only 14 YMMs
-        if ((jcp.nb_oc_blocking + 1) * jcp.ur_w >= 15) {
-            // current register assignment requires >= 15 YMMs
+        if ((jcp.nb_oc_blocking + 1) * jcp.ur_w > num_avail_regs) {
+            // current register assignment requires more YMMs than available
             // adjust one of nb_oc_block, ur_w while preserving ur_w >= l_pad
             if (jcp.ur_w > jcp.l_pad && jcp.ur_w > 1)
                 jcp.ur_w -= 1;
@@ -648,26 +649,28 @@ status_t jit_avx2_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp,
     args_ok = true
         && jcp.oc % simd_w == 0
         && jcp.l_pad <= jcp.ur_w
-        && implication(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0)
+        && IMPLICATION(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0)
                 || (jcp.stride_w == 1 && jcp.stride_h == 1))
-        && implication(mimo, jcp.ic % simd_w == 0);
+        && IMPLICATION(mimo, jcp.ic % simd_w == 0);
     if (!args_ok) return status::unimplemented;
 
     int r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w
         + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1));
 
-    if (r_pad_no_tail > jcp.ur_w) {
+    if (r_pad_no_tail > jcp.ur_w * jcp.stride_w && jcp.ow / jcp.ur_w > 1) {
         /* recalculate ur_w, nb_oc_blocking and ur_w_tail */
-        jcp.ur_w = r_pad_no_tail + 1;
-        jcp.nb_oc_blocking = ((16 - 1)-jcp.ur_w)/jcp.ur_w;
+        jcp.ur_w = nstl::min(r_pad_no_tail / jcp.stride_w + jcp.ur_w_tail,
+                nstl::min(jcp.ow, num_avail_regs / 2));
+        jcp.nb_oc_blocking = (num_avail_regs - jcp.ur_w) / jcp.ur_w;
         jcp.ur_w_tail = jcp.ow % jcp.ur_w;
         /* check again ... */
         r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w
             + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1));
-        if ((r_pad_no_tail > jcp.ur_w) || (jcp.ow < jcp.ur_w))
+        if (jcp.ur_w < nstl::max(jcp.l_pad, r_pad_no_tail))
             return status::unimplemented;
     }
-    if (jcp.l_pad > jcp.ur_w) return status::unimplemented;
+    assert(jcp.nb_oc_blocking > 0);
+    assert(jcp.ur_w * (jcp.nb_oc_blocking + 1) <= num_avail_regs);
 
     jcp.ic_block = (jcp.ic % simd_w != 0) ? jcp.ic : simd_w;
     jcp.nb_ic = jcp.ic / jcp.ic_block;
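The num_avail_regs logic above amounts to a budget check: (nb_oc_blocking + 1) * ur_w YMM registers hold accumulators plus broadcast values, and the kernel may spend at most 14 (Intel AVX, two scratch YMMs) or 15 (Intel AVX2, one scratch YMM) of the 16 architectural YMMs. A standalone sketch of the shrink loop (simplified; the real code also recomputes r_pad_no_tail and bails out via status::unimplemented):

    #include <cstdio>

    // Sketch of the YMM budget adjustment: shrink ur_w, then nb_oc_blocking,
    // until (nb_oc_blocking + 1) * ur_w fits into the available registers.
    int main() {
        const bool has_avx2 = true;
        const int num_avail_regs = has_avx2 ? 15 : 14; // 16 YMMs minus scratch

        int ur_w = 4, nb_oc_blocking = 4;
        const int l_pad = 1;
        while ((nb_oc_blocking + 1) * ur_w > num_avail_regs) {
            if (ur_w > l_pad && ur_w > 1)
                ur_w -= 1;                 // first try narrowing the row tile
            else
                nb_oc_blocking -= 1;       // then reduce oc blocking
        }
        printf("ur_w=%d nb_oc_blocking=%d regs=%d\n",
               ur_w, nb_oc_blocking, (nb_oc_blocking + 1) * ur_w);
        return 0;
    }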
@@ -684,7 +687,7 @@ status_t jit_avx2_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp,
 }
 
 void jit_avx2_conv_bwd_data_kernel_f32::hsw_iter(int ur_w, int l_overflow,
-        int r_overflow, int start_off, char hsw_iter_tag, char start_off_tag)
+        int r_overflow, int start_off)
 {
     int kw = jcp.kw;
     int kh = jcp.kh;
@@ -700,7 +703,7 @@ void jit_avx2_conv_bwd_data_kernel_f32::hsw_iter(int ur_w, int l_overflow,
     int oc_block = jcp.oc_block;
     int nb_ic_block = jcp.nb_ic_blocking;
 
-    Label kd_label, skip_kd_loop;
+    Label kd_loop, skip_kd_loop;
 
     for (int ii = 0; ii < nb_ic_block; ii++)
         for (int jj = 0; jj < ur_w; jj++) {
@@ -710,7 +713,7 @@ void jit_avx2_conv_bwd_data_kernel_f32::hsw_iter(int ur_w, int l_overflow,
                     make_safe_addr(reg_dsrc, offt, reg_long_offt));
         }
 
-    if (jcp.ndims == 4) {
+    if (one_of(jcp.ndims, 3, 4)) {
         mov(aux_reg_ddst, reg_ddst);
         mov(aux_reg_kernel, reg_kernel);
     }
@@ -722,7 +725,7 @@ void jit_avx2_conv_bwd_data_kernel_f32::hsw_iter(int ur_w, int l_overflow,
         mov(aux_reg_dst_d, reg_ddst);
         mov(aux_reg_ker_d, ptr[this->param1 + GET_OFF(filt)]);
 
-        L(kd_label);
+        L(kd_loop);
         mov(kj, ptr[this->param1 + GET_OFF(kh_padding)]);
     } else {
         mov(kj, reg_kh);
@@ -735,7 +738,7 @@ void jit_avx2_conv_bwd_data_kernel_f32::hsw_iter(int ur_w, int l_overflow,
 
     mov(kj, reg_kh);
 
-    jit_tagged_label kh_label(".kh_loop", hsw_iter_tag, start_off_tag);
+    Label kh_label;
 
     L(kh_label); {
         for (int ki = 0; ki < kw; ki++) {
@@ -778,7 +781,7 @@ void jit_avx2_conv_bwd_data_kernel_f32::hsw_iter(int ur_w, int l_overflow,
 
         dec(reg_ki);
         cmp(reg_ki, 0);
-        jg(kd_label, T_NEAR);
+        jg(kd_loop, T_NEAR);
         L(skip_kd_loop);
 
         pop(oi_iter);
@@ -796,15 +799,15 @@ void jit_avx2_conv_bwd_data_kernel_f32::hsw_iter(int ur_w, int l_overflow,
 void jit_avx2_conv_bwd_data_kernel_f32::generate() {
     preamble();
 
-    auto hsw_iter_body = [=] (int ur_w, int l_overflow, int r_overflow, char hsw_iter_tag) {
+    auto hsw_iter_body = [=] (int ur_w, int l_overflow, int r_overflow) {
         if (jcp.stride_w == 1) {
-            hsw_iter(ur_w, l_overflow, r_overflow, 0, hsw_iter_tag, '0');
+            hsw_iter(ur_w, l_overflow, r_overflow, 0);
             add(reg_dsrc, sizeof(float) * jcp.ur_w * jcp.ic_block);
             add(reg_ddst, sizeof(float) * jcp.ur_w * jcp.oc_block);
         } else {
-            jit_tagged_label hsw_iter_off_0(".hsw_iter_off_0", hsw_iter_tag);
-            jit_tagged_label hsw_iter_off_1(".hsw_iter_off_1", hsw_iter_tag);
-            jit_tagged_label hsw_iter_exit(".hsw_iter_exit",  hsw_iter_tag);
+            Label hsw_iter_off_0;
+            Label hsw_iter_off_1;
+            Label hsw_iter_exit;
 
             int dst_off = jcp.ur_w / jcp.stride_w;
 
@@ -814,7 +817,7 @@ void jit_avx2_conv_bwd_data_kernel_f32::generate() {
                 cmp(start_off_reg, 0);
                 jg(hsw_iter_off_1, T_NEAR);
 
-                hsw_iter(ur_w, l_overflow, r_overflow, 0, hsw_iter_tag, '0');
+                hsw_iter(ur_w, l_overflow, r_overflow, 0);
                 add(reg_dsrc, sizeof(float) * jcp.ur_w * jcp.ic_block);
                 add(reg_ddst, sizeof(float) * dst_off * jcp.oc_block);
 
@@ -822,7 +825,7 @@ void jit_avx2_conv_bwd_data_kernel_f32::generate() {
             }
 
             L(hsw_iter_off_1); {
-                hsw_iter(ur_w, l_overflow, r_overflow, 1, hsw_iter_tag, '1');
+                hsw_iter(ur_w, l_overflow, r_overflow, 1);
                 add(reg_dsrc, sizeof(float) * jcp.ur_w * jcp.ic_block);
                 add(reg_ddst, sizeof(float) * (dst_off + 1) * jcp.oc_block);
             }
@@ -843,7 +846,7 @@ void jit_avx2_conv_bwd_data_kernel_f32::generate() {
 
     int l_overflow = nstl::max(0, jcp.kw - 1 - jcp.l_pad);
     if (l_overflow > 0) {
-        hsw_iter_body(jcp.ur_w, l_overflow, 0, 'l');
+        hsw_iter_body(jcp.ur_w, l_overflow, 0);
         inc(oi_iter);
     }
 
@@ -855,20 +858,20 @@ void jit_avx2_conv_bwd_data_kernel_f32::generate() {
         n_oi--;
 
     if ((l_overflow <= 0 && n_oi > 0) || (l_overflow >  0 && n_oi > 1)) {
-        L(".ow_loop"); {
-            hsw_iter_body(jcp.ur_w, 0, 0, 'm');
+        Label ow_loop;
+        L(ow_loop); {
+            hsw_iter_body(jcp.ur_w, 0, 0);
             inc(oi_iter);
             cmp(oi_iter, n_oi);
-            jl(".ow_loop", T_NEAR);
+            jl(ow_loop, T_NEAR);
         }
     }
 
-    if (r_overflow1 > 0 ) {
-        hsw_iter_body(jcp.ur_w, 0, r_overflow1, 'r');
-    }
+    if (r_overflow1 > 0)
+        hsw_iter_body(jcp.ur_w, 0, r_overflow1);
 
     if (jcp.ur_w_tail != 0)
-        hsw_iter_body(jcp.ur_w_tail, 0, r_overflow, 't');
+        hsw_iter_body(jcp.ur_w_tail, 0, r_overflow);
 
     this->postamble();
 }
@@ -893,26 +896,26 @@ status_t jit_avx2_conv_bwd_data_kernel_f32::init_conf(jit_conv_conf_t &jcp,
     jcp.ic = diff_src_d.dims()[1] / jcp.ngroups;
 
     jcp.id = (ndims == 5) ? diff_src_d.dims()[2] : 1;
-    jcp.ih = diff_src_d.dims()[ndims-2];
+    jcp.ih = (ndims == 3) ? 1 : diff_src_d.dims()[ndims-2];
     jcp.iw = diff_src_d.dims()[ndims-1];
     jcp.od = (ndims == 5) ? diff_dst_d.dims()[2] : 1;
-    jcp.oh = diff_dst_d.dims()[ndims-2];
+    jcp.oh = (ndims == 3) ? 1 : diff_dst_d.dims()[ndims-2];
     jcp.ow = diff_dst_d.dims()[ndims-1];
 
     jcp.kd = (ndims == 5) ? weights_d.dims()[with_groups + 2] : 1;
-    jcp.kh = weights_d.dims()[with_groups + ndims - 2];
+    jcp.kh = (ndims == 3) ? 1 : weights_d.dims()[with_groups + ndims - 2];
     jcp.kw = weights_d.dims()[with_groups + ndims - 1];
 
     jcp.f_pad = (ndims == 5) ? cd.padding[0][0] : 0;
-    jcp.t_pad = cd.padding[0][ndims-4];
+    jcp.t_pad = (ndims == 3) ? 0 : cd.padding[0][ndims-4];
     jcp.l_pad = cd.padding[0][ndims-3];
 
     jcp.stride_d = (ndims == 5) ? cd.strides[0] : 1;
-    jcp.stride_h = cd.strides[ndims-4];
+    jcp.stride_h = (ndims == 3) ? 1 : cd.strides[ndims-4];
     jcp.stride_w = cd.strides[ndims-3];
 
     jcp.dilate_d = (ndims == 5) ? cd.dilates[0] : 0;
-    jcp.dilate_h = cd.dilates[ndims-4];
+    jcp.dilate_h = (ndims == 3) ? 0 : cd.dilates[ndims-4];
     jcp.dilate_w = cd.dilates[ndims-3];
 
     const int simd_w = 8;
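The pattern repeated in every init_conf hunk treats a 1D convolution (ndims == 3, tensors n, c, w) as a degenerate 2D one: height-related fields collapse to ih = oh = kh = 1, t_pad = 0, stride_h = 1, dilate_h = 0, while width fields are always read from the trailing dimension. A compact sketch of that extraction, using a hypothetical dims array:

    #include <cstdio>

    // Sketch of the ndims-aware shape extraction: 1D (ndims == 3) inputs
    // reuse the 2D code path with a unit-height image. Dims are made up.
    int main() {
        const int ndims = 3;                 // n, c, w
        const int src_dims[] = {1, 16, 28};  // hypothetical source shape

        const int ih = (ndims == 3) ? 1 : src_dims[ndims - 2];
        const int iw = src_dims[ndims - 1];

        printf("ih=%d iw=%d\n", ih, iw);     // prints ih=1 iw=28
        return 0;
    }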
@@ -947,10 +950,10 @@ status_t jit_avx2_conv_bwd_data_kernel_f32::init_conf(jit_conv_conf_t &jcp,
     jcp.with_eltwise = false;
 
     bool args_ok = true
-        && one_of(diff_src_d.format(), nChw8c, nCdhw8c)
-        && one_of(weights_d.format(), gOIhw8o8i, OIhw8o8i,
+        && one_of(diff_src_d.format(), nCw8c, nChw8c, nCdhw8c)
+        && one_of(weights_d.format(), gOIw8o8i, OIw8o8i, gOIhw8o8i, OIhw8o8i,
                 gOIdhw8o8i, OIdhw8o8i)
-        && one_of(diff_dst_d.format(), nChw8c, nCdhw8c)
+        && one_of(diff_dst_d.format(), nCw8c, nChw8c, nCdhw8c)
         && (jcp.stride_w == 1 || jcp.stride_w == 2)
         && jcp.stride_d == 1
         && jcp.dilate_d == 0
@@ -1018,26 +1021,26 @@ status_t jit_avx2_conv_bwd_weights_kernel_f32::init_conf(jit_conv_conf_t &jcp,
     jcp.ic = src_d.dims()[1] / jcp.ngroups;
 
     jcp.id = (ndims == 5) ? src_d.dims()[2] : 1;
-    jcp.ih = src_d.dims()[ndims-2];
+    jcp.ih = (ndims == 3) ? 1 : src_d.dims()[ndims-2];
     jcp.iw = src_d.dims()[ndims-1];
     jcp.od = (ndims == 5) ? diff_dst_d.dims()[2] : 1;
-    jcp.oh = diff_dst_d.dims()[ndims-2];
+    jcp.oh = (ndims == 3) ? 1 : diff_dst_d.dims()[ndims-2];
     jcp.ow = diff_dst_d.dims()[ndims-1];
 
     jcp.kd = (ndims == 5) ? diff_weights_d.dims()[with_groups + 2] : 1;
-    jcp.kh = diff_weights_d.dims()[with_groups + ndims-2];
+    jcp.kh = (ndims == 3) ? 1 : diff_weights_d.dims()[with_groups + ndims-2];
     jcp.kw = diff_weights_d.dims()[with_groups + ndims-1];
 
     jcp.f_pad = (ndims == 5) ? cd.padding[0][0] : 0;
-    jcp.t_pad = cd.padding[0][ndims-4];
+    jcp.t_pad = (ndims == 3) ? 0 : cd.padding[0][ndims-4];
     jcp.l_pad = cd.padding[0][ndims-3];
 
     jcp.stride_d = (ndims == 5) ? cd.strides[0] : 1;
-    jcp.stride_h = cd.strides[ndims-4];
+    jcp.stride_h = (ndims == 3) ? 1 : cd.strides[ndims-4];
     jcp.stride_w = cd.strides[ndims-3];
 
     jcp.dilate_d = (ndims == 5) ? cd.dilates[0] : 0;
-    jcp.dilate_h = cd.dilates[ndims-4];
+    jcp.dilate_h = (ndims == 3) ? 0 : cd.dilates[ndims-4];
     jcp.dilate_w = cd.dilates[ndims-3];
 
     jcp.src_fmt = src_d.format();
@@ -1066,15 +1069,16 @@ status_t jit_avx2_conv_bwd_weights_kernel_f32::init_conf(jit_conv_conf_t &jcp,
     }
 
     bool args_ok = true
-        && implication(flat, one_of(src_d.format(), nchw, nhwc, ncdhw, ndhwc)
-                && one_of(diff_weights_d.format(), Ohwi8o, gOhwi8o,
-                    Odhwi8o, gOdhwi8o))
-        && implication(mimo, one_of(src_d.format(), nChw8c, nCdhw8c)
-                && one_of(diff_weights_d.format(), OIhw8i8o, gOIhw8i8o,
-                    OIdhw8i8o, gOIdhw8i8o))
+        && IMPLICATION(flat, one_of(src_d.format(), ncw, nwc, nchw, nhwc, ncdhw,
+                ndhwc)
+                && one_of(diff_weights_d.format(), Owi8o, gOwi8o, Ohwi8o,
+                    gOhwi8o, Odhwi8o, gOdhwi8o))
+        && IMPLICATION(mimo, one_of(src_d.format(), nCw8c, nChw8c, nCdhw8c)
+                && one_of(diff_weights_d.format(), OIw8i8o, gOIw8i8o, OIhw8i8o,
+                    gOIhw8i8o, OIdhw8i8o, gOIdhw8i8o))
         && one_of(cd.bias_desc.format, memory_format::undef, any, x)
-        && one_of(diff_dst_d.format(), nChw8c, nCdhw8c)
-        && implication(mimo, jcp.ic % simd_w == 0)
+        && one_of(diff_dst_d.format(), nCw8c, nChw8c, nCdhw8c)
+        && IMPLICATION(mimo, jcp.ic % simd_w == 0)
         && jcp.oc % simd_w == 0
         && jcp.kw < 14
         && jcp.kh <= jcp.t_pad + jcp.ih /* [bwd_w:r1] */
@@ -1098,32 +1102,32 @@ status_t jit_avx2_conv_bwd_weights_kernel_f32::init_conf(jit_conv_conf_t &jcp,
 
 inline void jit_avx2_conv_bwd_weights_kernel_f32::od_step_comeback_pointers()
 {
-    Label kd_comeback_label;
-    mov(kj, jcp.kd); //FIXME, work only if f_pad = back_pad = 0 (Anton)
-    L(kd_comeback_label); {
-        const int inp_mult = one_of(jcp.src_fmt, nchw, ncdhw)
+    Label kd_comeback_loop;
+    mov(kj, jcp.kd); //FIXME (Anton): this works only if f_pad = back_pad = 0
+    L(kd_comeback_loop); {
+        const int inp_mult = one_of(jcp.src_fmt, ncw, nchw, ncdhw)
             ? 1 : jcp.ic_block;
         sub(aux_reg_input, sizeof(float) * jcp.iw * jcp.ih * inp_mult);
         sub(aux_reg_kernel, sizeof(float) * jcp.kw * jcp.kh * jcp.ic_block
                 * jcp.oc_block);
         dec(kj);
         cmp(kj, 0);
-        jg(kd_comeback_label, T_NEAR);
+        jg(kd_comeback_loop, T_NEAR);
     }
 }
 
-inline void jit_avx2_conv_bwd_weights_kernel_f32::oh_step_comeback_pointers(
-        const char *kh_comeback_label)
+inline void jit_avx2_conv_bwd_weights_kernel_f32::oh_step_comeback_pointers()
 {
     mov(kj, reg_kh);
-    L(kh_comeback_label); {
-        const int inp_mult = one_of(jcp.src_fmt, nchw, ncdhw)
+    Label kh_comeback_loop;
+    L(kh_comeback_loop); {
+        const int inp_mult = one_of(jcp.src_fmt, ncw, nchw, ncdhw)
             ? 1 : jcp.ic_block;
         sub(reg_input, sizeof(float) * jcp.iw * inp_mult);
         sub(reg_kernel, sizeof(float) * jcp.kw * jcp.ic_block * jcp.oc_block);
         dec(kj);
         cmp(kj, 0);
-        jg(kh_comeback_label, T_NEAR);
+        jg(kh_comeback_loop, T_NEAR);
     }
 }
 
@@ -1154,7 +1158,7 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_ic_block_step(
                 continue;
             for (int i_ic = 0; i_ic < ic_block_step; i_ic++) {
                 size_t i_off = (size_t)input_offset + sizeof(float)*(
-                    one_of(jcp.src_fmt, nchw, ncdhw)
+                    one_of(jcp.src_fmt, ncw, nchw, ncdhw)
                         ? (i_iw - pad_l) + i_ic
                         * ((size_t)jcp.id * jcp.ih * jcp.iw)
                         : (i_iw - pad_l) * ic_block + i_ic);
@@ -1177,12 +1181,10 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_ic_block_step(
         }
 }
 
-inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_step_disp(
-        const char *kh_label, const char *ic_block_label,
-        const char *ow_block_label, const char *kh_comeback_label)
+inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_step_disp()
 {
     int ic_block_step;
-    if (one_of(jcp.src_fmt, nchw, ncdhw)) {
+    if (one_of(jcp.src_fmt, ncw, nchw, ncdhw)) {
         ic_block_step = jcp.kw >= 5 ? 1 : jcp.ic_block;
     } else {
         ic_block_step = jcp.kw > 7 ? 1
@@ -1193,34 +1195,28 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_step_disp(
     const int max_ur_w = jcp.ow > 56 ? 14 : 28;
 
     if (jcp.ow <= max_ur_w)
-        compute_oh_step_unroll_ow(kh_label, ic_block_label, ow_block_label,
-                kh_comeback_label, ic_block_step, max_ur_w);
+        compute_oh_step_unroll_ow(ic_block_step, max_ur_w);
     else
-        compute_oh_step_common(kh_label, ic_block_label, ow_block_label,
-                kh_comeback_label, ic_block_step, max_ur_w);
+        compute_oh_step_common(ic_block_step, max_ur_w);
 
     if (jcp.ndims == 5) {
         od_step_comeback_pointers();
         mov(reg_input, aux_reg_input);
         mov(reg_kernel, aux_reg_kernel);
     } else {
-        oh_step_comeback_pointers(kh_comeback_label);
+        oh_step_comeback_pointers();
     }
 }
 
 inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_step_unroll_ow(
-        const char *kh_label, const char *ic_block_label,
-        const char *ow_block_label, const char *kh_comeback_label,
         int ic_block_step, int max_ur_w)
 {
-    UNUSED(ow_block_label);
-    UNUSED(kh_comeback_label);
     UNUSED(max_ur_w);
 
     const int ic_block = jcp.ic_block;
     const int oc_block = jcp.oc_block;
-    int inp_mul = one_of(jcp.src_fmt, nchw, ncdhw) ? 1 : jcp.ic_block;
-    Label kd_label;
+    int inp_mul = one_of(jcp.src_fmt, ncw, nchw, ncdhw) ? 1 : jcp.ic_block;
+    Label kd_loop;
 
     const int r_pad
         = nstl::max(0,
@@ -1230,26 +1226,29 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_step_unroll_ow(
         mov(aux_reg_input, reg_input);
         mov(aux_reg_kernel, reg_kernel);
         mov(ki, jcp.kd);
-        L(kd_label);
+        L(kd_loop);
         mov(reg_input, aux_reg_input);
         mov(reg_kernel, aux_reg_kernel);
     }
 
     mov(kj, reg_kh);
-    L(kh_label); {
+    Label kh_loop;
+    L(kh_loop); {
         xor_(b_ic, b_ic);
-        L(ic_block_label); {
+        Label ic_block_loop;
+        L(ic_block_loop); {
             compute_ic_block_step(jcp.ow, jcp.l_pad, r_pad, ic_block_step, 0,
                     0, 0);
             size_t inp_icblk_stride = sizeof(float) * ic_block_step
-                * (one_of(jcp.src_fmt, nchw, ncdhw) ? jcp.id*jcp.ih*jcp.iw : 1);
+                * (one_of(jcp.src_fmt, ncw, nchw, ncdhw)
+                ? jcp.id*jcp.ih*jcp.iw : 1);
             safe_add(reg_input, inp_icblk_stride, reg_long_offt);
             add(reg_kernel, sizeof(float) * ic_block_step * oc_block);
             add(b_ic, ic_block_step);
             cmp(b_ic, ic_block);
-            jl(ic_block_label, T_NEAR);
+            jl(ic_block_loop, T_NEAR);
         }
-        if(one_of(jcp.src_fmt, nchw, ncdhw)) {
+        if (one_of(jcp.src_fmt, ncw, nchw, ncdhw)) {
             size_t offt = sizeof(float) * jcp.id * jcp.ih * jcp.iw * ic_block;
             safe_sub(reg_input, offt, reg_long_offt);
             add(reg_input, sizeof(float) * jcp.iw);
@@ -1259,7 +1258,7 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_step_unroll_ow(
         add(reg_kernel, sizeof(float) * (jcp.kw - 1) * ic_block * oc_block);
         dec(kj);
         cmp(kj, 0);
-        jg(kh_label, T_NEAR);
+        jg(kh_loop, T_NEAR);
     }
 
     if (jcp.ndims == 5) {
@@ -1268,23 +1267,19 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_step_unroll_ow(
             * oc_block);
         dec(ki);
         cmp(ki, 0);
-        jg(kd_label, T_NEAR);
+        jg(kd_loop, T_NEAR);
     }
 
 }
 
 inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_step_common(
-        const char *kh_label, const char *ic_block_label,
-        const char *ow_block_label, const char *kh_comeback_label,
         int ic_block_step, int max_ur_w)
 {
-    UNUSED(kh_comeback_label);
-
     const int ic_block = jcp.ic_block;
     const int oc_block = jcp.oc_block;
     const int stride_w = jcp.stride_w;
-    int inp_mul = one_of(jcp.src_fmt, nchw, ncdhw) ? 1 : jcp.ic_block;
-    Label kd_label;
+    int inp_mul = one_of(jcp.src_fmt, ncw, nchw, ncdhw) ? 1 : jcp.ic_block;
+    Label kd_loop;
 
     const int r_pad
         = nstl::max(0,
@@ -1302,7 +1297,7 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_step_common(
             ur_w = ur_w / 2;
         }
     }
-    const int inp_mult = one_of(jcp.src_fmt, nchw, ncdhw) ? 1 : ic_block;
+    const int inp_mult = one_of(jcp.src_fmt, ncw, nchw, ncdhw) ? 1 : ic_block;
 
     int input_comeback = (ur_w_trips * ur_w * stride_w - jcp.l_pad) * inp_mult;
     int output_comeback = ur_w_trips * ur_w * oc_block;
@@ -1311,15 +1306,17 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_step_common(
         mov(aux_reg_input, reg_input);
         mov(aux_reg_kernel, reg_kernel);
         mov(ki, jcp.kd);
-        L(kd_label);
+        L(kd_loop);
         mov(reg_input, aux_reg_input);
         mov(reg_kernel, aux_reg_kernel);
     }
 
     mov(kj, reg_kh);
-    L(kh_label); {
+    Label kh_loop;
+    L(kh_loop); {
         xor_(b_ic, b_ic);
-        L(ic_block_label); {
+        Label ic_block_loop;
+        L(ic_block_loop); {
             if (jcp.l_pad != 0) {
                 ur_w_trips--;
                 compute_ic_block_step(ur_w,
@@ -1331,14 +1328,15 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_step_common(
 
             if (ur_w_trips > 0) {
                 xor_(reg_ur_w_trips, reg_ur_w_trips);
-                L(ow_block_label); {
+                Label ow_block_loop;
+                L(ow_block_loop); {
                     compute_ic_block_step(ur_w, 0, 0, ic_block_step, 0, 0, 0);
                     add(reg_input, sizeof(float) * ur_w * stride_w * inp_mult);
                     add(reg_output, sizeof(float) * ur_w * oc_block);
 
                     inc(reg_ur_w_trips);
                     cmp(reg_ur_w_trips, ur_w_trips);
-                    jl(ow_block_label, T_NEAR);
+                    jl(ow_block_loop, T_NEAR);
                 }
             }
 
@@ -1350,15 +1348,16 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_step_common(
             sub(reg_output, sizeof(float) * output_comeback);
 
             size_t inp_icblk_stride = sizeof(float) * ic_block_step
-                * (one_of(jcp.src_fmt, nchw, ncdhw) ? jcp.id*jcp.ih*jcp.iw : 1);
+                * (one_of(jcp.src_fmt, ncw, nchw, ncdhw)
+                ? jcp.id*jcp.ih*jcp.iw : 1);
             safe_add(reg_input, inp_icblk_stride, reg_long_offt);
             add(reg_kernel, sizeof(float) * ic_block_step * oc_block);
 
             add(b_ic, ic_block_step);
             cmp(b_ic, jcp.ic_block);
-            jl(ic_block_label, T_NEAR);
+            jl(ic_block_loop, T_NEAR);
         }
-        if (one_of(jcp.src_fmt, nchw, ncdhw)) {
+        if (one_of(jcp.src_fmt, ncw, nchw, ncdhw)) {
             size_t offt = sizeof(float) * jcp.id * jcp.ih * jcp.iw * ic_block;
             safe_sub(reg_input, offt, reg_long_offt);
             add(reg_input, sizeof(float) * jcp.iw);
@@ -1368,7 +1367,7 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_step_common(
         add(reg_kernel, sizeof(float) * (jcp.kw - 1) * ic_block * oc_block);
         dec(kj);
         cmp(kj, 0);
-        jg(kh_label, T_NEAR);
+        jg(kh_loop, T_NEAR);
     }
 
     if (jcp.ndims == 5) {
@@ -1377,7 +1376,7 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_step_common(
             * oc_block);
         dec(ki);
         cmp(ki, 0);
-        jg(kd_label, T_NEAR);
+        jg(kd_loop, T_NEAR);
     }
 
 }
@@ -1387,10 +1386,13 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_loop_common()
     const int icoc_block = jcp.ic_block * jcp.oc_block;
     const int t_pad = jcp.t_pad;
     const int stride_h = jcp.stride_h;
-    const int inp_mult = one_of(jcp.src_fmt, nchw, ncdhw) ? 1 : jcp.ic_block;
+    const int inp_mult = one_of(jcp.src_fmt, ncw, nchw, ncdhw)
+        ? 1 : jcp.ic_block;
     int b_pad
         = nstl::max(0, (jcp.oh - 1) * stride_h + jcp.kh - jcp.ih - t_pad);
 
+    Label oh_tpad_loop, oh_loop, oh_loop_end;
+
     mov(reg_kh, jcp.kh);
     xor_(reg_ih_count, reg_ih_count);
     xor_(reg_oj, reg_oj);
@@ -1399,9 +1401,8 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_loop_common()
         mov(reg_kh, jcp.kh <= t_pad + jcp.ih ? jcp.kh - t_pad : jcp.ih);
         add(reg_kernel, sizeof(float) * t_pad * jcp.kw * icoc_block);
 
-        L(".oh_tpad_label"); {
-            compute_oh_step_disp(".L_kh_top", "L.ic_block_top",
-                    "L.ow_block_top", "L.kh_comeback_top");
+        L(oh_tpad_loop); {
+            compute_oh_step_disp();
             add(reg_output, sizeof(float) * jcp.ow * jcp.oc_block);
             sub(reg_kernel, sizeof(float) * stride_h * jcp.kw * icoc_block);
 
@@ -1413,7 +1414,7 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_loop_common()
              * so far we do not support that (until we put constant here) */
             const int final_inp_ker_overlap = jcp.kh; /* [bwd_w:r2] */
             cmp(reg_kh, final_inp_ker_overlap);
-            jl(".oh_tpad_label", T_NEAR);
+            jl(oh_tpad_loop, T_NEAR);
         }
 
         if (t_pad % stride_h != 0) {
@@ -1423,14 +1424,13 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_loop_common()
         }
     }
     cmp(reg_ih_count, jcp.ih + t_pad - jcp.kh + 1);
-    jge(".oh_label_end", T_NEAR);
+    jge(oh_loop_end, T_NEAR);
     cmp(reg_oj, jcp.oh);
-    jge(".oh_label", T_NEAR);
+    jge(oh_loop, T_NEAR);
 
     mov(reg_kh, jcp.kh);
-    L(".oh_label"); {
-        compute_oh_step_disp(".L_kh_center", "L.ic_block_center",
-                "L.ow_block_center", "L.kh_comeback_center");
+    L(oh_loop); {
+        compute_oh_step_disp();
         add(reg_input, sizeof(float) * stride_h * jcp.iw * inp_mult);
         add(reg_output, sizeof(float) * jcp.ow * jcp.oc_block);
 
@@ -1438,33 +1438,33 @@ inline void jit_avx2_conv_bwd_weights_kernel_f32::compute_oh_loop_common()
         add(reg_ih_count, stride_h);
 
         cmp(reg_ih_count, jcp.ih + t_pad - jcp.kh + 1);
-        jge(".oh_label_end", T_NEAR);
+        jge(oh_loop_end, T_NEAR);
 
         cmp(reg_oj, jcp.oh);
-        jl(".oh_label", T_NEAR);
+        jl(oh_loop, T_NEAR);
     }
-    L(".oh_label_end");
+    L(oh_loop_end);
     if (b_pad > 0) {
+        Label oh_bpad_loop, oh_bpad_loop_end;
         cmp(reg_oj, jcp.oh);
-        jge(".oh_bpad_label_end", T_NEAR);
+        jge(oh_bpad_loop_end, T_NEAR);
 
         mov(reg_kh, jcp.ih + t_pad);
         sub(reg_kh, reg_ih_count);
-        L(".oh_bpad_label"); {
-            compute_oh_step_disp(".L_kh_bottom", "L.ic_block_bottom",
-                    "L.ow_block_bottom", "L.kh_comeback_bottom");
+        L(oh_bpad_loop); {
+            compute_oh_step_disp();
             add(reg_input, sizeof(float) * stride_h * jcp.iw * inp_mult);
             add(reg_output, sizeof(float) * jcp.ow * jcp.oc_block);
 
             sub(reg_kh, stride_h);
             cmp(reg_kh, 0);
-            jle(".oh_bpad_label_end", T_NEAR);
+            jle(oh_bpad_loop_end, T_NEAR);
 
             inc(reg_oj);
             cmp(reg_oj, jcp.oh);
-            jl(".oh_bpad_label", T_NEAR);
+            jl(oh_bpad_loop, T_NEAR);
         }
-        L(".oh_bpad_label_end");
+        L(oh_bpad_loop_end);
     }
 }
 
index 68947ce..f370054 100644 (file)
@@ -20,6 +20,7 @@
 #include "c_types_map.hpp"
 #include "jit_generator.hpp"
 #include "jit_primitive_conf.hpp"
+#include "cpu_memory.hpp"
 #include "jit_uni_eltwise.hpp"
 #include "jit_uni_depthwise.hpp"
 
@@ -146,7 +147,7 @@ private:
     reg64_t start_off_reg = aux1_reg_input;
 
     inline void hsw_iter(int ur_w, int l_overflow, int r_overflow,
-            int start_off, char hsw_iter_tag, char start_off_tag);
+            int start_off);
 
     void generate();
 };
@@ -186,19 +187,13 @@ private:
     reg64_t reg_long_offt = r11;
 
     inline void od_step_comeback_pointers();
-    inline void oh_step_comeback_pointers(const char *kh_comeback_label);
+    inline void oh_step_comeback_pointers();
     inline void compute_ic_block_step(int ur_w, int pad_l, int pad_r,
             int ic_block_step, int input_offset, int kernel_offset,
             int output_offset);
-    inline void compute_oh_step_disp(const char* kh_label,
-            const char* ic_block_label, const char* ow_block_label,
-            const char* kh_comeback_label);
-    inline void compute_oh_step_unroll_ow(const char* kh_label,
-            const char* ic_block_label, const char* ow_block_label,
-            const char* kh_comeback_label, int ic_block_step, int max_ur_w);
-    inline void compute_oh_step_common(const char* kh_label,
-            const char* ic_block_label, const char* ow_block_label,
-            const char* kh_comeback_label, int ic_block_step, int max_ur_w);
+    inline void compute_oh_step_disp();
+    inline void compute_oh_step_unroll_ow(int ic_block_step, int max_ur_w);
+    inline void compute_oh_step_common(int ic_block_step, int max_ur_w);
     inline void compute_oh_loop_common();
 
     void generate();
index 2874bf9..e9ccf6f 100644 (file)
@@ -31,19 +31,22 @@ using namespace mkldnn::impl::status;
 using namespace mkldnn::impl::memory_format;
 using namespace mkldnn::impl::utils;
 
-#define src_blk_off(f, n, c, d, h, w) \
-    conf_.ndims() == 5 \
-        ? (f).blk_off(n, c, d, h, w) \
-        : (f).blk_off(n, c, h, w)
 
+#define src_blk_off(f, n, c, d, h, w) \
+    (conf_.ndims() == 3) \
+    ? (f).blk_off(n, c, w) \
+    : (conf_.ndims() == 4) \
+    ? (f).blk_off(n, c, h, w) \
+    : (f).blk_off(n, c, d, h, w)
+
+#define wht_blk_off_(f, g, ...) \
+    conf_.with_groups() ? (f).blk_off(g, __VA_ARGS__) : (f).blk_off(__VA_ARGS__)
 #define wht_blk_off(f, g, oc, ic, kd, kh, kw) \
-    conf_.ndims() == 5 \
-        ? conf_.with_groups() \
-            ? (f).blk_off(g, oc, ic, kd, kh, kw) \
-            : (f).blk_off(oc, ic, kd, kh, kw) \
-        : conf_.with_groups() \
-            ? (f).blk_off(g, oc, ic, kh, kw) \
-            : (f).blk_off(oc, ic, kh, kw)
+    (conf_.ndims() == 3) \
+    ? wht_blk_off_(f, g, oc, ic, kw) \
+    : (conf_.ndims() == 4) \
+    ? wht_blk_off_(f, g, oc, ic, kh, kw) \
+    : wht_blk_off_(f, g, oc, ic, kd, kh, kw)
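The rewritten macros above dispatch on ndims via chained ternaries so the same call site serves 1D, 2D and 3D tensors. A toy sketch of the mechanism, using a fabricated descriptor type whose blk_off overloads stand in for the real memory_desc_wrapper:

    #include <cstdio>

    // Toy descriptor with blk_off overloads for 3, 4 and 5 dims; offsets
    // are fabricated and only illustrate which overload gets chosen.
    struct desc_t {
        long blk_off(int n, int c, int w) const
        { return (n * 16L + c) * 28 + w; }
        long blk_off(int n, int c, int h, int w) const
        { return ((n * 16L + c) * 28 + h) * 28 + w; }
        long blk_off(int n, int c, int d, int h, int w) const
        { return (((n * 16L + c) * 28 + d) * 28 + h) * 28 + w; }
    };

    #define SRC_BLK_OFF(ndims, f, n, c, d, h, w) \
        ((ndims) == 3 ? (f).blk_off(n, c, w) \
       : (ndims) == 4 ? (f).blk_off(n, c, h, w) \
                      : (f).blk_off(n, c, d, h, w))

    int main() {
        desc_t src;
        printf("%ld\n", SRC_BLK_OFF(3, src, 0, 1, 0, 0, 5)); // 1D: (n, c, w)
        printf("%ld\n", SRC_BLK_OFF(4, src, 0, 1, 0, 2, 5)); // 2D: (n, c, h, w)
        return 0;
    }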
 
 template <bool with_relu>
 void _jit_avx2_convolution_fwd_t<with_relu>::execute_forward() {
index 5e56f54..bd151dd 100644 (file)
@@ -58,7 +58,7 @@ struct _jit_avx2_convolution_fwd_t: public cpu_primitive_t {
                         this->cdesc_().src_desc.data_type,
                         this->cdesc_().weights_desc.data_type,
                         this->cdesc_().dst_desc.data_type)
-                && utils::implication(this->with_bias(),
+                && IMPLICATION(this->with_bias(),
                         data_type::f32 == this->cdesc_().bias_desc.data_type);
             if (!ok) return status::unimplemented;
 
@@ -94,20 +94,19 @@ struct _jit_avx2_convolution_fwd_t: public cpu_primitive_t {
             const int simd_w = 8;
             const bool flat = this->IC() < simd_w;
             if (this->src_pd_.desc()->format == any)
-                CHECK(this->src_pd_.set_format(this->ndims() == 4
-                            ? flat ? nchw : nChw8c
-                            : flat ? ncdhw : nCdhw8c));
+                CHECK(this->src_pd_.set_format(flat
+                    ? utils::pick(this->ndims() - 3, ncw, nchw, ncdhw)
+                    : utils::pick(this->ndims() - 3, nCw8c, nChw8c, nCdhw8c)));
             if (this->dst_pd_.desc()->format == any)
-                CHECK(this->dst_pd_.set_format(this->ndims() == 4
-                        ? nChw8c : nCdhw8c));
+                CHECK(this->dst_pd_.set_format(
+                    utils::pick(this->ndims() - 3, nCw8c, nChw8c, nCdhw8c)));
             if (this->weights_pd_.desc()->format == any)
-                CHECK(this->weights_pd_.set_format(this->ndims() == 4
-                        ? this->with_groups()
-                            ? (flat ? gOhwi8o : gOIhw8i8o)
-                            : (flat ? Ohwi8o : OIhw8i8o)
-                        : this->with_groups()
-                            ? (flat ? gOdhwi8o : gOIdhw8i8o)
-                            : (flat ? Odhwi8o : OIdhw8i8o)));
+                CHECK(this->weights_pd_.set_format(this->with_groups()
+                    ? utils::pick(2 * this->ndims() - 6 + flat, gOIw8i8o,
+                        gOwi8o, gOIhw8i8o, gOhwi8o, gOIdhw8i8o, gOdhwi8o)
+                    : utils::pick(2 * this->ndims() - 6 + flat, OIw8i8o, Owi8o,
+                        OIhw8i8o, Ohwi8o, OIdhw8i8o, Odhwi8o)));
+
             if (this->bias_pd_.desc()->format == any)
                 CHECK(this->bias_pd_.set_format(x));
             return status::success;
@@ -230,15 +229,17 @@ struct jit_avx2_convolution_bwd_data_t: public cpu_primitive_t {
             using namespace memory_format;
 
             if (this->diff_src_pd_.desc()->format == any)
-                CHECK(this->diff_src_pd_.set_format(this->ndims() == 4
-                            ? nChw8c : nCdhw8c));
+                CHECK(this->diff_src_pd_.set_format(
+                    utils::pick(this->ndims() - 3, nCw8c, nChw8c, nCdhw8c)));
             if (this->diff_dst_pd_.desc()->format == any)
-                CHECK(this->diff_dst_pd_.set_format(this->ndims() == 4
-                            ? nChw8c : nCdhw8c));
+                CHECK(this->diff_dst_pd_.set_format(
+                    utils::pick(this->ndims() - 3, nCw8c, nChw8c, nCdhw8c)));
             if (this->weights_pd_.desc()->format == any)
-                CHECK(this->weights_pd_.set_format(this->ndims() == 4
-                            ? this->with_groups() ? gOIhw8o8i : OIhw8o8i
-                            : this->with_groups() ? gOIdhw8o8i : OIdhw8o8i));
+                CHECK(this->weights_pd_.set_format(this->with_groups()
+                    ? utils::pick(this->ndims() - 3, gOIw8o8i, gOIhw8o8i,
+                        gOIdhw8o8i)
+                    : utils::pick(this->ndims() - 3, OIw8o8i, OIhw8o8i,
+                        OIdhw8o8i)));
             return status::success;
         }
     };
@@ -307,20 +308,18 @@ struct jit_avx2_convolution_bwd_weights_t: public cpu_primitive_t {
             const bool flat = this->IC() == 3;
 
             if (this->src_pd_.desc()->format == any)
-                CHECK(this->src_pd_.set_format(this->ndims() == 4
-                            ? flat ? nchw : nChw8c
-                            : flat ? ncdhw : nCdhw8c));
+                CHECK(this->src_pd_.set_format(flat
+                    ? utils::pick(this->ndims() - 3, ncw, nchw, ncdhw)
+                    : utils::pick(this->ndims() - 3, nCw8c, nChw8c, nCdhw8c)));
             if (this->diff_dst_pd_.desc()->format == any)
-                CHECK(this->diff_dst_pd_.set_format(this->ndims() == 4
-                            ? nChw8c : nCdhw8c));
+                CHECK(this->diff_dst_pd_.set_format(
+                    utils::pick(this->ndims() - 3, nCw8c, nChw8c, nCdhw8c)));
             if (this->diff_weights_pd_.desc()->format == any)
-                CHECK(this->diff_weights_pd_.set_format(this->ndims() == 4
-                        ? this->with_groups()
-                            ? (flat ? gOhwi8o : gOIhw8i8o)
-                            : (flat ? Ohwi8o : OIhw8i8o)
-                        : this->with_groups()
-                            ? (flat ? gOdhwi8o : gOIdhw8i8o)
-                            : (flat ? Odhwi8o : OIdhw8i8o)));
+                CHECK(this->diff_weights_pd_.set_format(this->with_groups()
+                    ? utils::pick(2 * this->ndims() - 6 + flat, gOIw8i8o,
+                        gOwi8o, gOIhw8i8o, gOhwi8o, gOIdhw8i8o, gOdhwi8o)
+                    : utils::pick(2 * this->ndims() - 6 + flat, OIw8i8o, Owi8o,
+                        OIhw8i8o, Ohwi8o, OIdhw8i8o, Odhwi8o)));
             if (this->diff_bias_pd_.desc()->format == any)
                 CHECK(this->diff_bias_pd_.set_format(x));
             return status::success;
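The index expression 2 * ndims() - 6 + flat used in the weights-format picks above maps (dimensionality, flag) pairs onto an interleaved list: ndims 3/4/5 contribute offsets 0/2/4, and the boolean selects between the two variants at each offset. A tiny sketch of the indexing (the enum values here are placeholders, not the real memory_format_t constants):

    #include <cstdio>

    // Sketch of indexing an interleaved format table with 2*ndims - 6 + flag:
    // entries alternate {blocked, flat} for 1D, 2D, 3D.
    static const char *names[] = {"OIw8i8o", "Owi8o", "OIhw8i8o", "Ohwi8o",
                                  "OIdhw8i8o", "Odhwi8o"};

    int main() {
        for (int ndims = 3; ndims <= 5; ++ndims)
            for (int flat = 0; flat <= 1; ++flat)
                printf("ndims=%d flat=%d -> %s\n", ndims, flat,
                       names[2 * ndims - 6 + flat]);
        return 0;
    }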
@@ -353,6 +352,8 @@ struct jit_avx2_convolution_bwd_weights_t: public cpu_primitive_t {
     }
     ~jit_avx2_convolution_bwd_weights_t() {
         delete kernel_;
+        delete reducer_weights_;
+        delete reducer_bias_;
         free(padded_bias_);
     };
 
index 3d227fd..30f1823 100644 (file)
@@ -681,6 +681,7 @@ status_t jit_avx512_common_1x1_conv_kernel::init_conf(
 
     const bool with_groups = weights_d.ndims() == src_d.ndims() + 1;
     const int simd_w = cpu_isa_traits<avx512_common>::vlen / sizeof(float);
+    const int ndims = src_d.ndims();
 
     jcp.prop_kind = cd.prop_kind;
 
@@ -699,19 +700,19 @@ status_t jit_avx512_common_1x1_conv_kernel::init_conf(
         jcp.ic = rnd_up(jcp.ic, simd_w);
     }
 
-    jcp.ih = src_d.dims()[2];
-    jcp.iw = src_d.dims()[3];
-    jcp.oh = dst_d.dims()[2];
-    jcp.ow = dst_d.dims()[3];
+    jcp.ih = (ndims == 3) ? 1 : src_d.dims()[2];
+    jcp.iw = src_d.dims()[ndims - 1];
+    jcp.oh = (ndims == 3) ? 1 : dst_d.dims()[2];
+    jcp.ow = dst_d.dims()[ndims - 1];
 
-    jcp.kh = weights_d.dims()[with_groups + 2];
-    jcp.kw = weights_d.dims()[with_groups + 3];
+    jcp.kh = (ndims == 3) ? 1 : weights_d.dims()[with_groups + 2];
+    jcp.kw = weights_d.dims()[with_groups + ndims - 1];
 
-    jcp.t_pad = cd.padding[0][0];
-    jcp.l_pad = cd.padding[0][1];
+    jcp.t_pad = (ndims == 3) ? 0 : cd.padding[0][0];
+    jcp.l_pad = cd.padding[0][ndims - 3];
 
-    jcp.stride_h = cd.strides[0];
-    jcp.stride_w = cd.strides[1];
+    jcp.stride_h = (ndims == 3) ? 1 : cd.strides[0];
+    jcp.stride_w = cd.strides[ndims - 3];
 
     jcp.src_fmt = src_d.format();
     jcp.with_bias = one_of(jcp.prop_kind, forward_training, forward_inference)
@@ -732,7 +733,8 @@ status_t jit_avx512_common_1x1_conv_kernel::init_conf(
 
     bool args_ok = true
         && jcp.ngroups == 1
-        && everyone_is(nChw16c, src_d.format(), dst_d.format())
+        && everyone_is(pick(ndims - 3, nCw16c, nChw16c), src_d.format(),
+            dst_d.format())
         && one_of(cd.bias_desc.format, memory_format::undef, any, x);
     if (!args_ok) return status::unimplemented;
 
@@ -756,12 +758,13 @@ status_t jit_avx512_common_1x1_conv_kernel::init_conf(
             && weights_d.data_type() == data_type::s16
             && dst_d.data_type() == data_type::s16)))
     {
-        constexpr memory_format_t weights_formats[2][2] = {
-            { OIhw8i16o2i, OIhw8o16i2o },
-            { gOIhw8i16o2i, gOIhw8o16i2o }
-        };
-        memory_format_t weights_format
-            = weights_formats[with_groups][jcp.prop_kind == backward_data];
+        const int is_bwd_d = jcp.prop_kind == backward_data;
+        memory_format_t weights_format = with_groups
+            ? pick(2 * ndims - 6 + is_bwd_d, gOIw8i16o2i, gOIw8o16i2o,
+                gOIhw8i16o2i, gOIhw8o16i2o)
+            : pick(2 * ndims - 6 + is_bwd_d, OIw8i16o2i, OIw8o16i2o,
+                OIhw8i16o2i, OIhw8o16i2o);
+
         if (weights_d.format() != weights_format)
             return status::unimplemented;
 
@@ -773,12 +776,12 @@ status_t jit_avx512_common_1x1_conv_kernel::init_conf(
     else if (everyone_is(data_type::f32, src_d.data_type(),
                             weights_d.data_type(), dst_d.data_type()))
     {
-        constexpr memory_format_t weights_formats[2][2] = {
-            { OIhw16i16o, IOhw16o16i },
-            { gOIhw16i16o, gIOhw16o16i }
-        };
-        memory_format_t weights_format
-            = weights_formats[with_groups][jcp.prop_kind == backward_data];
+        const int is_bwd_d = jcp.prop_kind == backward_data;
+        memory_format_t weights_format = with_groups
+            ? pick(2 * ndims - 6 + is_bwd_d, gOIw16i16o, gIOw16o16i,
+                gOIhw16i16o, gIOhw16o16i)
+            : pick(2 * ndims - 6 + is_bwd_d, OIw16i16o, IOw16o16i,
+                OIhw16i16o, IOhw16o16i);
 
         if (weights_d.format() != weights_format)
             return status::unimplemented;
@@ -892,6 +895,7 @@ status_t jit_avx512_common_1x1_conv_kernel::init_conf(
             bool is4ops = (jcp.ver == ver_4fma || jcp.ver == ver_4vnni);
 
 //            max_regs = is4ops ? 28 : 30;
+            // FIXME (ichuraev): it is a fix for densenet-121
             max_regs = 28;
             min_regs = 9;
             size_treshold = is4ops ? 28 : 14;
index ae171de..da38121 100644 (file)
@@ -32,6 +32,11 @@ using namespace mkldnn::impl::status;
 using namespace mkldnn::impl::memory_format;
 using namespace mkldnn::impl::utils;
 
+#define data_blk_off(f, n, c, h, w) \
+    ((ndims == 3) \
+    ? (f).blk_off(n, c, w) \
+    : (f).blk_off(n, c, h, w))
+
 namespace {
 template <typename T, typename U>
 void balance2D(U nthr, U ithr, T ny, T &ny_start, T &ny_end,
@@ -92,10 +97,11 @@ void _jit_avx512_common_1x1_convolution_fwd_t
     const memory_desc_wrapper dst_d(conf_.dst_pd());
     const memory_desc_wrapper weights_d(conf_.weights_pd(0));
 
-    const int stride_h = conf_.cdesc()->strides[0];
-    const int stride_w = conf_.cdesc()->strides[1];
-    const int pad_t = conf_.cdesc()->padding[0][0];
-    const int pad_l = conf_.cdesc()->padding[0][1];
+    const int ndims = src_d.ndims();
+    const int stride_h = (ndims == 3) ? 1 : conf_.cdesc()->strides[0];
+    const int stride_w = conf_.cdesc()->strides[ndims - 3];
+    const int pad_t = (ndims == 3) ? 0 : conf_.cdesc()->padding[0][0];
+    const int pad_l = conf_.cdesc()->padding[0][ndims - 3];
 
     auto &jcp = kernel_->jcp;
     const int MB = conf_.MB();
@@ -169,7 +175,7 @@ void _jit_avx512_common_1x1_convolution_fwd_t
     {
 
         const int _ocb = g * nb_oc + ocb;
-        const size_t dst_off = dst_d.blk_off(n, _ocb, oh, ow);
+        const size_t dst_off = data_blk_off(dst_d, n, _ocb, oh, ow);
 
         p.output_data = &dst[dst_off];
         p.bias_data = &bias[_ocb * jcp.oc_block];
@@ -182,12 +188,12 @@ void _jit_avx512_common_1x1_convolution_fwd_t
             rp.ws = scratch_ + ithr * ws_per_thread_
                 + _icb * jcp.is * jcp.ic_block;
             if (ocb == ocb_start) {
-                rp.src = src + src_d.blk_off(n, _icb, ih, iw);
+                rp.src = src + data_blk_off(src_d, n, _icb, ih, iw);
                 rtus_driver_->ker_(&rp);
             }
             p.bcast_data = rp.ws;
         } else
-            p.bcast_data = src + src_d.blk_off(n, _icb, ih, iw);
+            p.bcast_data = src + data_blk_off(src_d, n, _icb, ih, iw);
 
         p.oc_off = _ocb * jcp.oc_block * sizeof(dst_data_t);
 
@@ -291,16 +297,17 @@ void _jit_avx512_common_1x1_convolution_bwd_data_t
     const memory_desc_wrapper weights_d(conf_.weights_pd(0));
     const memory_desc_wrapper diff_src_d(conf_.diff_src_pd());
 
+    const int ndims = diff_src_d.ndims();
     const auto &jcp = kernel_->jcp;
     const int MB = conf_.MB();
 
     // TODO (Roma): remove this restriction
     assert(jcp.stride_w == 1 && jcp.stride_h == 1);
 
-    const int stride_h = conf_.desc()->strides[0];
-    const int stride_w = conf_.desc()->strides[1];
-    const int pad_t = conf_.desc()->padding[0][0];
-    const int pad_l = conf_.desc()->padding[0][1];
+    const int stride_h = (ndims == 3) ? 1 : conf_.desc()->strides[0];
+    const int stride_w = conf_.desc()->strides[ndims - 3];
+    const int pad_t = (ndims == 3) ? 0 : conf_.desc()->padding[0][0];
+    const int pad_l = conf_.desc()->padding[0][ndims - 3];
 
     const int nb_ic = jcp.nb_load;
     const int nb_oc = jcp.nb_reduce;
@@ -368,8 +375,7 @@ void _jit_avx512_common_1x1_convolution_bwd_data_t
                     rp.iw_start = iw;
 
                     const int _icb = g * nb_ic + icb;
-                    rp.src = diff_src + diff_src_d.blk_off(n, _icb, ih, iw);
-
+                    rp.src = diff_src + data_blk_off(diff_src_d, n, _icb, ih, iw);
                     if (conf_.rtus_.reduce_src_) {
                         rp.ws = scratch_ + ithr * ws_per_thread_;
                         p.output_data = rp.ws;
@@ -386,8 +392,7 @@ void _jit_avx512_common_1x1_convolution_bwd_data_t
                         int nb_oc_blocking_step = reduce_outer
                             ? cur_ocb_outer : cur_ocb_inner;
                         const int _ocb = g * nb_oc + ocb;
-                        size_t diff_dst_off =
-                            diff_dst_d.blk_off(n, _ocb, oh, ow);
+                        size_t diff_dst_off = data_blk_off(diff_dst_d, n, _ocb, oh, ow);
                         p.bcast_data = &diff_dst[diff_dst_off];
 
                         p.load_data = &weights[conf_.with_groups()
@@ -484,6 +489,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights()
     const memory_desc_wrapper diff_weights_d(conf_.diff_weights_pd(0));
 
     const auto &jcp = kernel_->jcp;
+    const int ndims = src_d.ndims();
     const int wei_size = jcp.ngroups * jcp.oc * jcp.ic;
 
     simple_barrier::ctx_t reduction_barrier;
@@ -501,10 +507,10 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights()
     const int sp_nb = jcp.nb_reduce;
     const int mb_sp_work = jcp.mb * sp_nb;
 
-    const int stride_h = conf_.desc()->strides[0];
-    const int stride_w = conf_.desc()->strides[1];
-    const int pad_t = conf_.desc()->padding[0][0];
-    const int pad_l = conf_.desc()->padding[0][1];
+    const int stride_h = (ndims == 3) ? 1 : conf_.desc()->strides[0];
+    const int stride_w = conf_.desc()->strides[ndims - 3];
+    const int pad_t = (ndims == 3) ? 0 : conf_.desc()->padding[0][0];
+    const int pad_l = conf_.desc()->padding[0][ndims - 3];
 
     auto step = [](int default_step, int remaining, int tail_step) {
         assert(default_step <= tail_step);
@@ -540,7 +546,8 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights()
         const int ih = is / jcp.iw;
         const int iw = is % jcp.iw;
 
-        data_t *src1 = (data_t *)&src[src_d.blk_off(img, _ic, ih, iw)];
+        const int src1_off = data_blk_off(src_d, img, _ic, ih, iw);
+        data_t *src1 = (data_t *)&src[src1_off];
         data_t *tr_src1 = &tr_src_[tr_src_off(ithr_mb, ic_b_tr, is)];
 
         assert(jcp.ic_block == 16);
@@ -565,7 +572,7 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights()
 
     auto ker = [&](const int ithr, const int nthr) {
         assert(nthr == jcp.nthr);
-        assert(utils::implication(!mkldnn_thr_syncable(), jcp.nthr_mb == 1));
+        assert(IMPLICATION(!mkldnn_thr_syncable(), jcp.nthr_mb == 1));
 
         const int ithr_ic_b = ithr % jcp.nthr_ic_b;
         const int ithr_oc_b = ithr / jcp.nthr_ic_b % jcp.nthr_oc_b;
@@ -693,10 +700,14 @@ void jit_avx512_common_1x1_convolution_bwd_weights_t::execute_backward_weights()
 
                             rp.ws = scratch_ + ithr * ws_per_thread_
                                     + sp * jcp.ic_block;
-                            rp.src = local_src
-                                    + ih * src_d.blocking_desc().strides[0][2]
-                                    + iw * src_d.blocking_desc().strides[0][3];
 
+                            if (ndims == 3)
+                                rp.src = local_src + iw
+                                    * src_d.blocking_desc().strides[0][2];
+                            else
+                                rp.src = local_src + ih
+                                    * src_d.blocking_desc().strides[0][2]
+                                    + iw * src_d.blocking_desc().strides[0][3];
                             rtus_driver_->ker_(&rp);
 
                             p.bcast_data = rp.ws;
index a367d46..7878697 100644 (file)
@@ -63,9 +63,9 @@ struct _jit_avx512_common_1x1_convolution_fwd_t : public cpu_primitive_t {
                 && this->cdesc_().src_desc.data_type == src_type
                 && this->cdesc_().weights_desc.data_type == wei_type
                 && this->cdesc_().dst_desc.data_type == dst_type
-                && implication(this->with_bias(),
+                && IMPLICATION(this->with_bias(),
                     dst_type == this->cdesc_().bias_desc.data_type)
-                && implication(with_relu && dst_type == data_type::s32
+                && IMPLICATION(with_relu && dst_type == data_type::s32
                     && everyone_is(data_type::s16, src_type, wei_type),
                     this->negative_slope() == 0.);
             if (!ok) return status::unimplemented;
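
Throughout this patch, the utils::implication function is replaced by an IMPLICATION macro. Assuming the macro is the usual logical implication, a minimal sketch of its semantics; unlike the function, the macro short-circuits, so the effect expression is not evaluated when the cause is false:

    #define IMPLICATION(cause, effect) (!(cause) || (effect))

    // Example: the bias data-type constraint holds vacuously without bias.
    bool with_bias = false, bias_is_f32 = false;
    bool ok_bias = IMPLICATION(with_bias, bias_is_f32);   // true
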
@@ -90,19 +90,23 @@ struct _jit_avx512_common_1x1_convolution_fwd_t : public cpu_primitive_t {
         virtual status_t set_default_params() override {
             using namespace memory_format;
             if (this->src_pd_.desc()->format == any)
-                CHECK(this->src_pd_.set_format(nChw16c));
+                CHECK(this->src_pd_.set_format(pick(this->ndims() - 3,
+                    nCw16c, nChw16c)));
             if (this->dst_pd_.desc()->format == any)
-                CHECK(this->dst_pd_.set_format(nChw16c));
+                CHECK(this->dst_pd_.set_format(pick(this->ndims() - 3,
+                    nCw16c, nChw16c)));
             if (this->weights_pd_.desc()->format == any) {
                 if (dst_type == data_type::f32 && src_type == data_type::f32
                     && wei_type == data_type::f32)
                         CHECK(this->weights_pd_.set_format(this->with_groups()
-                                                ? gOIhw16i16o : OIhw16i16o));
+                            ? pick(this->ndims() - 3, gOIw16i16o, gOIhw16i16o)
+                            : pick(this->ndims() - 3, OIw16i16o, OIhw16i16o)));
                 else if (dst_type == data_type::s32
                     && src_type == data_type::s16
                     && wei_type == data_type::s16)
                         CHECK(this->weights_pd_.set_format(this->with_groups()
-                                                ? gOIhw8i16o2i : OIhw8i16o2i));
+                            ? pick(this->ndims() - 3, gOIw8i16o2i, gOIhw8i16o2i)
+                            : pick(this->ndims() - 3, OIw8i16o2i, OIhw8i16o2i)));
             }
             if (this->bias_pd_.desc()->format == any)
                 CHECK(this->bias_pd_.set_format(x));
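
The repeated pick(ndims - 3, ...) calls replace the old ternary chains keyed on ndims == 5. Assuming pick() behaves as an argument selector returning its i-th trailing argument, a sketch of the idea:

    #include <utility>
    template <typename T>
    T pick(size_t, const T &x0) { return x0; }
    template <typename T, typename... Args>
    T pick(size_t i, const T &x0, Args &&... rest)
    { return i == 0 ? x0 : pick(i - 1, std::forward<Args>(rest)...); }

    // pick(ndims - 3, nCw16c, nChw16c) -> the 1D layout for ndims == 3,
    //                                     the 2D layout for ndims == 4.
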
@@ -224,21 +228,25 @@ struct _jit_avx512_common_1x1_convolution_bwd_data_t : public cpu_primitive_t {
             using namespace memory_format;
 
             if (this->diff_src_pd_.desc()->format == any)
-                CHECK(this->diff_src_pd_.set_format(nChw16c));
+                CHECK(this->diff_src_pd_.set_format(pick(this->ndims() - 3,
+                    nCw16c, nChw16c)));
             if (this->diff_dst_pd_.desc()->format == any)
-                CHECK(this->diff_dst_pd_.set_format(nChw16c));
+                CHECK(this->diff_dst_pd_.set_format(pick(this->ndims() - 3,
+                   nCw16c, nChw16c)));
             if (this->weights_pd_.desc()->format == any) {
                 if (diff_dst_type == data_type::f32
                     && diff_src_type == data_type::f32
                     && wei_type == data_type::f32) {
                     CHECK(this->weights_pd_.set_format(this->with_groups()
-                        ? gIOhw16o16i : IOhw16o16i));
+                        ? pick(this->ndims() - 3, gIOw16o16i, gIOhw16o16i)
+                        : pick(this->ndims() - 3, IOw16o16i, IOhw16o16i)));
                 }
                 else if (diff_dst_type == data_type::s16
                     && diff_src_type == data_type::s32
                     && wei_type == data_type::s16)
                         CHECK(this->weights_pd_.set_format(this->with_groups()
-                                                ? gOIhw8o16i2o : OIhw8o16i2o));
+                            ? pick(this->ndims() - 3, gOIw8o16i2o, gOIhw8o16i2o)
+                            : pick(this->ndims() - 3, OIw8o16i2o, OIhw8o16i2o)));
             }
 
             return status::success;
@@ -322,7 +330,7 @@ struct jit_avx512_common_1x1_convolution_bwd_weights_t : public cpu_primitive_t
                         this->desc()->src_desc.data_type,
                         this->desc()->diff_weights_desc.data_type,
                         this->desc()->diff_dst_desc.data_type)
-                && utils::implication(this->with_bias(),
+                && IMPLICATION(this->with_bias(),
                         data_type::f32 == desc()->diff_bias_desc.data_type);
             if (!ok) return status::unimplemented;
 
@@ -348,12 +356,15 @@ struct jit_avx512_common_1x1_convolution_bwd_weights_t : public cpu_primitive_t
             using namespace memory_format;
 
             if (this->src_pd_.desc()->format == any)
-                CHECK(this->src_pd_.set_format(nChw16c));
+                CHECK(this->src_pd_.set_format(pick(this->ndims() - 3,
+                    nCw16c, nChw16c)));
             if (this->diff_dst_pd_.desc()->format == any)
-                CHECK(this->diff_dst_pd_.set_format(nChw16c));
+                CHECK(this->diff_dst_pd_.set_format(pick(this->ndims() - 3,
+                    nCw16c, nChw16c)));
             if (this->diff_weights_pd_.desc()->format == any)
                 CHECK(this->diff_weights_pd_.set_format(this->with_groups()
-                                                ? gOIhw16i16o : OIhw16i16o));
+                    ? pick(this->ndims() - 3, gOIw16i16o, gOIhw16i16o)
+                    : pick(this->ndims() - 3, OIw16i16o, OIhw16i16o)));
             if (this->diff_bias_pd_.desc()->format == any)
                 CHECK(this->diff_bias_pd_.set_format(x));
             return status::success;
index 80206ce..7f00356 100644 (file)
@@ -44,18 +44,16 @@ inline void pick_loop_order(jit_conv_conf_t &jcp) {
                 forward_training, forward_inference, backward_data));
     auto w = (jcp.prop_kind == backward_data) ? jcp.iw : jcp.ow;
     auto h = (jcp.prop_kind == backward_data) ? jcp.ih : jcp.oh;
-    switch (jcp.ver) {
-    case ver_fma:
-        jcp.loop_order = loop_cgn;
-    case ver_4vnni:
-    case ver_vnni:
-        // TBD: Tune on HW
-    case ver_4fma:
-        jcp.loop_order
-            = (w <= small_spatial && h <= small_spatial) ? loop_cgn : loop_gnc;
-        break;
-    default:
-        assert(!"unsupported convolution version");
+
+    // ow-threading is currently implemented for the forward pass only;
+    // the previously meaningless switch over jcp.ver was removed
+    // TODO: unify fwd and bwd code once ow-threading exists for bwd
+    if (jcp.prop_kind == backward_data) {
+        jcp.loop_order = (w <= small_spatial && h <= small_spatial)
+            ? loop_cgn : loop_gnc;
+    } else {
+        jcp.loop_order = (w <= small_spatial && h <= small_spatial)
+            ? loop_cwgn : loop_gncw;
     }
 }
 
@@ -65,7 +63,19 @@ inline bool is_1stconv(const jit_conv_conf_t &jcp) {
     else
         return one_of(jcp.ic, 1, 3);
 }
-
+inline bool is_1D_conv(const jit_conv_conf_t &jcp) {
+    return (jcp.ih == 1 && jcp.kh == 1);
+}
+inline bool is_ow_threading_available(const jit_conv_conf_t &jcp) {
+    return (is_1D_conv(jcp) && one_of(jcp.ndims, 3, 4)
+        && !(jcp.ver == ver_fma && mayiuse(avx512_mic)));
+}
+inline bool is_ow_threading_on(const jit_conv_conf_t &jcp) {
+    return (jcp.nb_ow > 1);
+}
+inline bool is_1D_prefetching(const jit_conv_conf_t &jcp) {
+    return (jcp.ver == ver_4fma && is_1D_conv(jcp) && is_ow_threading_on(jcp));
+}
 }
 
 void jit_avx512_common_conv_fwd_kernel::prepare_output(int ur_w)
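
Taken together, these predicates gate the new output-width threading: a convolution counts as 1D whenever ih == kh == 1, ow-threading is available for such shapes on 3D/4D descriptors (except plain fma on avx512_mic), and the special 4fma prefetching only kicks in once ow is actually split. An illustration with hypothetical jcp values:

    jit_conv_conf_t jcp = {};
    jcp.ndims = 4; jcp.ih = 1; jcp.kh = 1;   // 2D descriptor, degenerate height
    jcp.ver = ver_4fma; jcp.nb_ow = 4;       // ow split into 4 blocks
    // is_1D_conv(jcp)         -> true  (ih == 1 && kh == 1)
    // is_ow_threading_on(jcp) -> true  (nb_ow > 1)
    // is_1D_prefetching(jcp)  -> true  (4fma && 1D && ow-threading)
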
@@ -74,9 +84,11 @@ void jit_avx512_common_conv_fwd_kernel::prepare_output(int ur_w)
         for (int j = 0; j < ur_w; j++) {
             Zmm zmm = zmm_out(j, k);
             vpxord(zmm, zmm, zmm);
-            size_t aux_output_offset = get_output_offset(j, k);
-            mic_prefetcht1(EVEX_compress_addr_safe(reg_out_prf,
-                        aux_output_offset, reg_out_long_offt));
+            if (!is_1D_prefetching(jcp)) {
+                size_t aux_output_offset = get_output_offset(j, k);
+                mic_prefetcht1(EVEX_compress_addr_safe(reg_out_prf,
+                            aux_output_offset, reg_out_long_offt));
+            }
         }
 }
 
@@ -171,14 +183,16 @@ void jit_avx512_common_conv_fwd_kernel::store_output(int ur_w)
                 ((size_t)k * jcp.od * jcp.oh * jcp.ow + j) * jcp.oc_block;
             vmovups(EVEX_compress_addr_safe(reg_out, aux_output_offset,
                         reg_out_long_offt), zmm);
-            mic_prefetcht0(EVEX_compress_addr_safe(reg_out_prf,
-                        aux_output_offset, reg_out_long_offt));
+            if (!is_1D_prefetching(jcp))
+                mic_prefetcht0(EVEX_compress_addr_safe(reg_out_prf,
+                            aux_output_offset, reg_out_long_offt));
         }
 }
 
 void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w,
         int pad_l, int pad_r)
 {
+    assert(jcp.dilate_d == 0 && jcp.dilate_h == 0 && jcp.dilate_w == 0);
 
     int iw = jcp.iw;
     int ih = jcp.ih;
@@ -191,14 +205,14 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w,
 
     prepare_output(ur_w);
 
-    if (jcp.ndims == 4) {
+    if (one_of(jcp.ndims, 3, 4)) {
         mov(aux_reg_inp, reg_inp);
         mov(aux_reg_ker, reg_ker);
         mov(aux_reg_inp_prf, reg_inp_prf);
     }
 
     size_t max_input_offset = (size_t)jcp.typesize_in
-        * ((size_t)(kw * (jcp.dilate_w + 1) + ur_w * stride_w - pad_l)
+        * ((size_t)(kw + ur_w * stride_w - pad_l)
                 + (size_t)ic_block * iw * ih * jcp.id);
     assert(reg_inp_prf == reg_long_offt);
     if (max_input_offset > INT_MAX) push(reg_inp_prf);
@@ -212,7 +226,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w,
         mov(aux_reg_inp_d, reg_inp);
         mov(aux_reg_inp_d_prf, reg_inp_prf);
 
-        if ((jcp.kd - 1) * (jcp.dilate_d + 1) < nstl::max(jcp.f_pad, jcp.back_pad)) {
+        if ((jcp.kd - 1) < nstl::max(jcp.f_pad, jcp.back_pad)) {
             cmp(reg_ki, 0);
             je(skip_kd_loop, T_NEAR);
         }
@@ -220,7 +234,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w,
     }
     mov(reg_kj, reg_kh);
     Label skip_kh_loop;
-    if ((jcp.kh - 1) * (jcp.dilate_h + 1) < nstl::max(jcp.t_pad, jcp.b_pad)) {
+    if ((jcp.kh - 1) < nstl::max(jcp.t_pad, jcp.b_pad)) {
         cmp(reg_kj, 0);
         je(skip_kh_loop, T_NEAR);
     }
@@ -250,7 +264,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w,
 
             for (int j = j_start, prf_count=0; j < j_end; j++) {
                 size_t aux_input_offset = (size_t)jcp.typesize_in
-                        * ((size_t)(ki * (jcp.dilate_w + 1) + j * stride_w
+                        * ((size_t)(ki + j * stride_w
                             - pad_l) + (size_t)ic * iw * ih * jcp.id);
                 v4fmaddps(zmm_out(j, 0), zmm_ker(0),
                         EVEX_compress_addr_safe(aux_reg_inp, aux_input_offset,
@@ -278,8 +292,8 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w,
         }
     }
     add(aux_reg_ker, jcp.typesize_in * kw * oc_block);
-    add(aux_reg_inp, jcp.typesize_in * (jcp.dilate_h + 1) * iw);
-    add(aux_reg_inp_prf, jcp.typesize_in * (jcp.dilate_h + 1) * iw);
+    add(aux_reg_inp, jcp.typesize_in * iw);
+    add(aux_reg_inp_prf, jcp.typesize_in * iw);
 
     dec(reg_kj);
     cmp(reg_kj, 0);
@@ -288,9 +302,9 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma_1st(int ur_w,
     L(skip_kh_loop);
 
     if (jcp.ndims == 5) {
-        add(aux_reg_inp_d, typesize * (jcp.dilate_d + 1) * jcp.ih * jcp.iw);
+        add(aux_reg_inp_d, typesize * jcp.ih * jcp.iw);
         add(aux_reg_ker_d, typesize * jcp.kw * jcp.kh * oc_block);
-        add(aux_reg_inp_d_prf, typesize * (jcp.dilate_d + 1) * jcp.ih * jcp.iw);
+        add(aux_reg_inp_d_prf, typesize * jcp.ih * jcp.iw);
 
         dec(reg_ki);
         cmp(reg_ki, 0);
@@ -352,7 +366,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma(int ur_w,
 
     prepare_output(ur_w);
 
-    if (jcp.ndims == 4) {
+    if (one_of(jcp.ndims, 3, 4)) {
         mov(aux_reg_inp, reg_inp);
         mov(aux_reg_ker, reg_ker);
         mov(aux_reg_ker_prf, reg_ker_prf);
@@ -479,30 +493,59 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_4fma(int ur_w,
             for (int ic = 0; ic < ic_block; ic += 4)
                 for (int kk = 0; kk < jcp.nb_oc_blocking; kk++) {
                     kernel_loads(ki, ic, kk);
-                    for (int oi = get_ow_start(ki, pad_l), prf_count_t1 = 0;
+                    for (int oi = get_ow_start(ki, pad_l),
+                            prf_count_t1 = 0, prf_count_t0 = 0;
                             oi < get_ow_end(ur_w, ki, pad_r); oi++) {
                         int aux_input_offset = typesize
                                 * ((ki * (jcp.dilate_w + 1) + oi * stride_w
-                                           - pad_l) * ic_block
-                                                       + ic);
+                                - pad_l) * ic_block + ic);
                         v4fmaddps(zmm_out(oi, kk), zmm_ker(0),
                             EVEX_compress_addr(aux_reg_inp,
                                 aux_input_offset));
-                        if ((oi % 2) && (prf_count_t1 < 4)) {
-                            mic_prefetcht1(EVEX_compress_addr(
-                                aux_reg_ker_prf, kernel_offset(kk,
-                                ic + prf_count_t1, ki)));
-                            prf_count_t1++;
-                        }
-                        if (pref_current_inp) {
-                            if (ki == 0 && ic == 0 && kk == 0)
+
+                        if (!is_1D_prefetching(jcp)) {
+                            if ((oi % 2) && (prf_count_t1 < 4)) {
+                                mic_prefetcht1(EVEX_compress_addr(
+                                    aux_reg_ker_prf, kernel_offset(kk,
+                                    ic + prf_count_t1, ki)));
+                                prf_count_t1++;
+                            }
+                        } else {
+                            if (!(ki == 0 && ic == 0)
+                                && !(ki == kw-1 && ic == 0) &&
+                                (oi % 2) && (prf_count_t1 < 4)
+                                ) {
                                 mic_prefetcht0(EVEX_compress_addr(
-                                    aux_reg_inp,
-                                    aux_input_offset+shift_input_ptr));
+                                    aux_reg_ker, kernel_offset(kk,
+                                    ic + 4 + prf_count_t0, ki)));
+                                prf_count_t0++;
+                            }
+                        }
+                        if (!is_1D_prefetching(jcp)) {
+                            if (pref_current_inp) {
+                                if (ki == 0 && ic == 0 && kk == 0)
+                                    mic_prefetcht0(EVEX_compress_addr(
+                                        aux_reg_inp,
+                                        aux_input_offset + shift_input_ptr));
+                            } else {
+                                if (ki == 1 && ic == 0 && kk == 0)
+                                    mic_prefetcht1(EVEX_compress_addr(
+                                        aux_reg_inp_prf, aux_input_offset));
+                            }
                         } else {
-                            if (ki == 1 && ic == 0 && kk == 0)
-                                mic_prefetcht1(EVEX_compress_addr(
-                                    aux_reg_inp_prf, aux_input_offset));
+                            int inp_mult = jcp.is_1stconv ? 1 : jcp.ic_block;
+                            int inp_shift
+                                = jcp.typesize_in * ur_w * stride_w * inp_mult;
+                            bool kk_pref_slot = kk ? oi % 2 : !(oi % 2);
+                            if (ki == 0 && ic == 0 && kk_pref_slot)
+                                    mic_prefetcht1(EVEX_compress_addr(
+                                        aux_reg_inp,
+                                        aux_input_offset + inp_shift));
+
+                            if (ki == kw - 1 && ic == 0 && kk_pref_slot)
+                                    mic_prefetcht0(EVEX_compress_addr(
+                                        aux_reg_inp,
+                                        aux_input_offset + inp_shift));
                         }
                     }
                 }
@@ -578,7 +621,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma(int ur_w,
 
     prepare_output(ur_w);
 
-    if (jcp.ndims == 4) {
+    if (one_of(jcp.ndims, 3, 4)) {
         mov(aux_reg_inp, reg_inp);
         mov(aux_reg_ker, reg_ker);
         mov(aux_reg_inp_prf, reg_inp_prf);
@@ -758,7 +801,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_fma_core(int ur_w,
 
     prepare_output(ur_w);
 
-    if (jcp.ndims == 4) {
+    if (one_of(jcp.ndims, 3, 4)) {
         mov(aux_reg_inp, reg_inp);
         mov(aux_reg_ker, reg_ker);
     }
@@ -867,7 +910,7 @@ void jit_avx512_common_conv_fwd_kernel::compute_loop_vnni(
 
     prepare_output(ur_w);
 
-    if (jcp.ndims == 4) {
+    if (one_of(jcp.ndims, 3, 4)) {
         mov(aux_reg_inp, reg_inp);
         mov(aux_reg_ker, reg_ker);
         mov(aux_reg_ker_prf, reg_ker_prf);
@@ -1046,6 +1089,8 @@ void jit_avx512_common_conv_fwd_kernel::generate()
 
     int iw = jcp.iw;
     int ow = jcp.ow;
+    int ow_block = jcp.ow_block;
+    int nb_ow = jcp.nb_ow;
     int kw = jcp.kw;
     int l_pad = jcp.l_pad;
     int ur_w = jcp.ur_w;
@@ -1056,6 +1101,7 @@ void jit_avx512_common_conv_fwd_kernel::generate()
     int inp_mult = jcp.is_1stconv ? 1 : jcp.ic_block;
     int inp_shift_pad = jcp.typesize_in * (ur_w * stride_w - l_pad) * inp_mult;
     int inp_shift = jcp.typesize_in * ur_w * stride_w * inp_mult;
+    int inp_shift_pad_second_block = -1 * jcp.typesize_in * l_pad * inp_mult;
     int out_shift = jcp.typesize_out * ur_w * jcp.oc_block;
 
     preamble();
@@ -1070,66 +1116,192 @@ void jit_avx512_common_conv_fwd_kernel::generate()
     int n_oi = ow / ur_w;
     int r_pad1 = (ur_w * n_oi - 1) * stride_w + (kw - 1) * dilate_w
             - (iw + l_pad - 1);
-    if (r_pad1 > 0) n_oi--;
 
-    if (ow == ur_w) {
-        mov(reg_inp_prf, ptr[param1 + GET_OFF(src_prf)]);
-        mov(reg_out_prf, ptr[param1 + GET_OFF(dst_prf)]);
-        compute_loop(ur_w, l_pad, r_pad);
-    } else {
-        //TODO: potentially suboptimal
-        mov(reg_inp_prf, reg_inp);
-        mov(reg_out_prf, reg_out);
-        if (n_oi == 0) {
-            add(reg_inp_prf, inp_shift_pad);
-            add(reg_out_prf, out_shift);
-            compute_loop(ur_w, l_pad, r_pad1);
-            add(reg_inp, inp_shift_pad);
-            add(reg_out, out_shift);
-            if (ur_w_tail != 0) {
-                add(reg_inp_prf, inp_shift);
-                add(reg_out_prf, out_shift);
-                compute_loop(ur_w_tail, 0, r_pad);
-            }
+    if (!is_ow_threading_on(jcp)) {
+        // ow is being processed as a whole - with left and right paddings
+        if (r_pad1 > 0) n_oi--;
+
+        if (ow == ur_w) {
+            mov(reg_inp_prf, ptr[param1 + GET_OFF(src_prf)]);
+            mov(reg_out_prf, ptr[param1 + GET_OFF(dst_prf)]);
+            compute_loop(ur_w, l_pad, r_pad);
         } else {
-            if (l_pad > 0) {
-                n_oi--;
+            mov(reg_inp_prf, reg_inp);
+            mov(reg_out_prf, reg_out);
+            if (n_oi == 0) {
                 add(reg_inp_prf, inp_shift_pad);
                 add(reg_out_prf, out_shift);
-                compute_loop(ur_w, l_pad, 0);
+                compute_loop(ur_w, l_pad, r_pad1);
                 add(reg_inp, inp_shift_pad);
                 add(reg_out, out_shift);
-            }
-            if (n_oi > 0) {
-                xor_(reg_oi, reg_oi);
-                Label ow_loop_label;
-                L(ow_loop_label);
-                {
+                if (ur_w_tail != 0) {
                     add(reg_inp_prf, inp_shift);
                     add(reg_out_prf, out_shift);
-                    compute_loop(ur_w, 0, 0);
+                    compute_loop(ur_w_tail, 0, r_pad);
+                }
+            } else {
+                if (l_pad > 0) {
+                    n_oi--;
+                    add(reg_inp_prf, inp_shift_pad);
+                    add(reg_out_prf, out_shift);
+                    compute_loop(ur_w, l_pad, 0);
+                    add(reg_inp, inp_shift_pad);
+                    add(reg_out, out_shift);
+                }
+                if (n_oi > 0) {
+                    xor_(reg_oi, reg_oi);
+                    Label ow_loop_label;
+                    L(ow_loop_label);
+                    {
+                        add(reg_inp_prf, inp_shift);
+                        add(reg_out_prf, out_shift);
+                        compute_loop(ur_w, 0, 0);
+                        add(reg_inp, inp_shift);
+                        add(reg_out, out_shift);
+                        inc(reg_oi);
+                        cmp(reg_oi, n_oi);
+                        jl(ow_loop_label, T_NEAR);
+                    }
+                }
+                if (r_pad1 > 0) {
+                    add(reg_inp_prf, inp_shift);
+                    add(reg_out_prf, out_shift);
+                    compute_loop(ur_w, 0, r_pad1);
                     add(reg_inp, inp_shift);
                     add(reg_out, out_shift);
-                    inc(reg_oi);
-                    cmp(reg_oi, n_oi);
-                    jl(ow_loop_label, T_NEAR);
                 }
-            }
-            if (r_pad1 > 0) {
-                add(reg_inp_prf, inp_shift);
-                add(reg_out_prf, out_shift);
-                compute_loop(ur_w, 0, r_pad1);
-                add(reg_inp, inp_shift);
-                add(reg_out, out_shift);
-            }
-            if (ur_w_tail != 0) {
-                add(reg_inp_prf, inp_shift);
-                add(reg_out_prf, out_shift);
-                compute_loop(ur_w_tail, 0, r_pad);
+                if (ur_w_tail != 0) {
+                    add(reg_inp_prf, inp_shift);
+                    add(reg_out_prf, out_shift);
+                    compute_loop(ur_w_tail, 0, r_pad);
+                }
             }
         }
-    }
+    } else {
+        // Only a single ow block is processed per kernel invocation.
+        // The block number is passed in via the owb parameter,
+        // and padding handling depends on that number.
+
+        Label end_label, last_oi_label, middle_ow_blocks_label, tail_label;
+        Label oi_loop_label, oi_loop_start_label, oi_loop_end_label;
+
+        assert(ow_block % ur_w == 0);
+        int n_oi_not_last_ow_block = ow_block / ur_w;
+        // to simplify the code (and general-purpose register usage),
+        // the ow block size must be >= 2 * ur_w
+        assert(n_oi_not_last_ow_block > 1);
+        int n_oi_next_last_ow_block = n_oi_not_last_ow_block;
+        int n_oi_first_ow_block = n_oi_not_last_ow_block;
 
+        int n_oi_last_ow_block = (ow - ow_block * (nb_ow-1)) / ur_w;
+
+        // prepare right padding
+        bool next_last_ow_block_padded = r_pad1 > 0 && n_oi_last_ow_block == 0;
+        bool first_ow_block_padded = next_last_ow_block_padded && jcp.nb_ow == 2;
+        bool last_ow_block_padded = r_pad1 > 0 && n_oi_last_ow_block > 0;
+
+        if (last_ow_block_padded) n_oi_last_ow_block--;
+        else if (first_ow_block_padded) n_oi_first_ow_block--;
+        else if (next_last_ow_block_padded) n_oi_next_last_ow_block--;
+
+        mov(reg_owb, ptr[param1 + GET_OFF(owb)]);
+        cmp(reg_owb, 0); // is this the first ow-block?
+        jg(middle_ow_blocks_label, T_NEAR);
+
+        // the first ow block, compute left padding
+
+        mov(reg_oi, n_oi_first_ow_block);
+        mov(reg_inp_prf, reg_inp);
+        mov(reg_out_prf, reg_out);
+
+        if (l_pad > 0) {
+            mov(reg_ker_prf, ptr[param1 + GET_OFF(filt_prf)]);
+            add(reg_inp_prf, inp_shift_pad);
+            add(reg_out_prf, out_shift);
+            compute_loop(ur_w, l_pad, 0);
+            add(reg_inp, inp_shift_pad);
+            add(reg_out, out_shift);
+            dec(reg_oi);
+        }
+        jmp(oi_loop_label, T_NEAR);
+
+        // middle or last ow block entry
+
+        L(middle_ow_blocks_label);
+
+        if (l_pad > 0) {
+            // only account for the left padding here; nothing is computed
+            add(reg_inp, inp_shift_pad_second_block);
+            add(reg_inp_prf, inp_shift_pad_second_block);
+        }
+
+        // set the number of iterations for the oi-loop
+        cmp(reg_owb, jcp.nb_ow - 1); // last ow-block ?
+        mov(reg_oi, n_oi_last_ow_block);
+        je(oi_loop_label, T_NEAR);
+        cmp(reg_owb, jcp.nb_ow - 2); // next to last ow-block ?
+        mov(reg_oi, n_oi_next_last_ow_block);
+        je(oi_loop_label, T_NEAR);
+        mov(reg_oi, n_oi_not_last_ow_block); // other middle ow-blocks
+
+        // oi loop w/o padding
+        L(oi_loop_label);
+        mov(reg_ker_prf, ptr[param1 + GET_OFF(filt_prf)]);
+        L(oi_loop_start_label);
+            cmp(reg_oi, 0);
+            jle(oi_loop_end_label, T_NEAR);
+
+            add(reg_inp_prf, inp_shift);
+            add(reg_out_prf, out_shift);
+            compute_loop(ur_w, 0, 0);
+            add(reg_inp, inp_shift);
+            add(reg_out, out_shift);
+            dec(reg_oi);
+            jmp(oi_loop_start_label, T_NEAR);
+        L(oi_loop_end_label);
+
+        mov(reg_owb, ptr[param1 + GET_OFF(owb)]);
+
+        cmp(reg_owb, 0); // first ow-block ?
+        if (first_ow_block_padded) {
+            je(last_oi_label, T_NEAR);
+        } else {
+            je(end_label, T_NEAR);
+        }
+        cmp(reg_owb, jcp.nb_ow - 2); // next to last ow-block ?
+        jl(end_label, T_NEAR);
+        if (next_last_ow_block_padded) {
+            je(last_oi_label, T_NEAR);
+        } else {
+            je(end_label, T_NEAR);
+        }
+        // this is the last block
+        if (!last_ow_block_padded) {
+            jmp(tail_label, T_NEAR);
+        }
+
+        // last oi block with right padding
+        L(last_oi_label);
+        mov(reg_ker_prf, ptr[param1 + GET_OFF(filt_prf)]);
+        add(reg_inp_prf, inp_shift);
+        add(reg_out_prf, out_shift);
+        compute_loop(ur_w, 0, r_pad1);
+        add(reg_inp, inp_shift);
+        add(reg_out, out_shift);
+
+        mov(reg_owb, ptr[param1 + GET_OFF(owb)]);
+        cmp(reg_owb, jcp.nb_ow - 1); // last ow_block?
+        jl(end_label, T_NEAR);
+
+        L(tail_label);
+        mov(reg_ker_prf, ptr[param1 + GET_OFF(filt_prf)]);
+        if (ur_w_tail != 0) {
+            add(reg_inp_prf, inp_shift);
+            add(reg_out_prf, out_shift);
+            compute_loop(ur_w_tail, 0, r_pad);
+        }
+        L(end_label);
+    }
     postamble();
 
     for (auto& inj : eltwise_injectors)
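
The new else-branch above dispatches on the runtime block index owb instead of unrolling the whole ow loop. A hypothetical scalar model of the classification it implements (names mirror the JIT variables; this is an illustration, not the generated code):

    int n_oi_for_block(int owb, int nb_ow, int n_full, int n_first,
                       int n_next_last, int n_last) {
        if (owb == 0)         return n_first;      // may absorb l_pad
        if (owb == nb_ow - 1) return n_last;       // may absorb r_pad
        if (owb == nb_ow - 2) return n_next_last;  // r_pad can spill here
        return n_full;                             // middle blocks: no padding
    }

Each n_* count is pre-decremented at code-generation time when the corresponding block has to handle right padding, which is why only one of the *_padded flags can steal an iteration.
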
@@ -1195,19 +1367,19 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf(
     jcp.oc_without_padding = jcp.oc;
     jcp.ic = src_d.dims()[1] / jcp.ngroups;
     jcp.id = (ndims == 5) ? src_d.dims()[2] : 1;
-    jcp.ih = src_d.dims()[ndims-2];
+    jcp.ih = (ndims == 3) ? 1 : src_d.dims()[ndims-2];
     jcp.iw = src_d.dims()[ndims-1];
     jcp.od = (ndims == 5) ? dst_d.dims()[2] : 1;
-    jcp.oh = dst_d.dims()[ndims-2];
+    jcp.oh = (ndims == 3) ? 1 : dst_d.dims()[ndims-2];
     jcp.ow = dst_d.dims()[ndims-1];
     jcp.kd = (ndims == 5) ? weights_d.dims()[with_groups + 2] : 1;
-    jcp.kh = weights_d.dims()[with_groups + ndims-2];
+    jcp.kh = (ndims == 3) ? 1 : weights_d.dims()[with_groups + ndims-2];
     jcp.kw = weights_d.dims()[with_groups + ndims-1];
     jcp.f_pad = (ndims == 5) ? cd.padding[0][0] : 0;
-    jcp.t_pad = cd.padding[0][ndims-4];
+    jcp.t_pad = (ndims == 3) ? 0 : cd.padding[0][ndims-4];
     jcp.l_pad = cd.padding[0][ndims-3];
     jcp.stride_d = (ndims == 5) ? cd.strides[0] : 1;
-    jcp.stride_h = cd.strides[ndims-4];
+    jcp.stride_h = (ndims == 3) ? 1 : cd.strides[ndims-4];
     jcp.stride_w = cd.strides[ndims-3];
     jcp.src_fmt = src_d.format();
     jcp.with_eltwise = with_relu;
@@ -1215,7 +1387,7 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf(
     jcp.eltwise_alpha = relu_negative_slope;
 
     jcp.dilate_d = (ndims == 5) ? cd.dilates[0] : 0;
-    jcp.dilate_h = cd.dilates[ndims-4];
+    jcp.dilate_h = (ndims == 3) ? 0 : cd.dilates[ndims-4];
     jcp.dilate_w = cd.dilates[ndims-3];
 
     jcp.b_pad = (jcp.oh - 1) * jcp.stride_h + (jcp.kh - 1) * (jcp.dilate_h + 1)
@@ -1249,17 +1421,13 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf(
     const auto &p = attr.post_ops_;
     jcp.with_sum = p.find(primitive_kind::sum) != -1;
 
-    jcp.is_1stconv = is_1stconv(jcp);
-    if (jcp.ic % simd_w != 0 && !jcp.is_1stconv)
-        return status::unimplemented;
-
-    auto src_format = (ndims == 5)
-        ? (jcp.is_1stconv) ? ncdhw : nCdhw16c
-        : (jcp.is_1stconv) ? nchw : nChw16c;
-    auto dst_format = (ndims == 5) ? nCdhw16c : nChw16c;
-    auto wei_format = (ndims == 5)
-        ? (with_groups) ? gOIdhw16i16o : OIdhw16i16o
-        : (with_groups) ? gOIhw16i16o : OIhw16i16o;
+    auto src_format = jcp.is_1stconv
+        ? pick(ndims - 3, ncw, nchw, ncdhw)
+        : pick(ndims - 3, nCw16c, nChw16c, nCdhw16c);
+    auto dst_format = pick(ndims - 3, nCw16c, nChw16c, nCdhw16c);
+    auto wei_format = with_groups
+        ? pick(ndims - 3, gOIw16i16o, gOIhw16i16o, gOIdhw16i16o)
+        : pick(ndims - 3, OIw16i16o, OIhw16i16o, OIdhw16i16o);
 
     if (src_d.format() == any)
         CHECK(src_pd.set_format(src_format));
@@ -1295,9 +1463,9 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf(
         jcp.typesize_in = sizeof(int16_t);
         jcp.typesize_out = sizeof(int32_t);
 
-        const auto w_format = (ndims == 5)
-            ? with_groups ? gOIdhw8i16o2i : OIdhw8i16o2i
-            : with_groups ? gOIhw8i16o2i : OIhw8i16o2i;
+        const auto w_format = with_groups
+            ? pick(ndims - 3, gOIw8i16o2i, gOIhw8i16o2i, gOIdhw8i16o2i)
+            : pick(ndims - 3, OIw8i16o2i, OIhw8i16o2i, OIdhw8i16o2i);
         if (weights_d.format() == any)
             CHECK(weights_pd.set_format(w_format));
         if (weights_d.format() != w_format)
@@ -1314,22 +1482,25 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf(
 
         if (jcp.is_1stconv) {
             // TODO: fix & remove constraints below
-            if (jcp.l_pad != 0 || jcp.r_pad != 0
-                || jcp.b_pad != 0 || jcp.t_pad != 0
-                || (jcp.kw < 7 && jcp.kh < 7))
+            bool not_for_4fma
+                    = IMPLICATION(everyone_is(0, jcp.l_pad, jcp.t_pad),
+                            nstl::max(jcp.kw, jcp.kh) < 7);
+            bool is_dilated
+                    = !everyone_is(0, jcp.dilate_d, jcp.dilate_h, jcp.dilate_w);
+            if (one_of(true, not_for_4fma, is_dilated))
                 jcp.ver = ver_fma;
             if (jcp.ver == ver_4fma) {
-                const auto w_format = (ndims == 5)
-                    ? (with_groups) ? gOidhw16o : Oidhw16o
-                    : (with_groups) ? gOihw16o : Oihw16o;
+                const auto w_format = with_groups
+                    ? pick(ndims - 3, gOiw16o, gOihw16o, gOidhw16o)
+                    : pick(ndims - 3, Oiw16o, Oihw16o, Oidhw16o);
                 if (weights_d.format() == any)
                     CHECK(weights_pd.set_format(w_format));
                 if (weights_d.format() != w_format)
                     return status::unimplemented;
             } else {
-                const auto w_format = (ndims == 5)
-                    ? (with_groups) ? gOdhwi16o : Odhwi16o
-                    : (with_groups) ? gOhwi16o : Ohwi16o;
+                const auto w_format = with_groups
+                    ? pick(ndims - 3, gOwi16o, gOhwi16o, gOdhwi16o)
+                    : pick(ndims - 3, Owi16o, Ohwi16o, Odhwi16o);
                 if (weights_d.format() == any)
                     CHECK(weights_pd.set_format(w_format));
                 if (weights_d.format() != w_format)
@@ -1432,11 +1603,18 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf(
                     break;
                 }
         }
+        if (jcp.ver == ver_4fma
+            && is_1D_conv(jcp) && one_of(jcp.ndims, 3, 4)) {
+            if (jcp.nb_oc % 2 == 0) {
+                jcp.nb_oc_blocking = 2;
+                jcp.ur_w = nstl::min(jcp.ow, regs / jcp.nb_oc_blocking);
+            }
+        }
     }
 
     if (jcp.ver == ver_fma && mayiuse(avx512_core)) {
         int try_nb_oc_blocking = 2;
-        unsigned int ker_inp_size = typesize * (jcp.iw / jcp.stride_w)
+        unsigned int ker_inp_size = typesize * div_up(jcp.iw, jcp.stride_w)
             * jcp.ic_block * jcp.kh * jcp.kd;
         unsigned int ker_out_size = typesize * jcp.ow * jcp.oc_block
             * try_nb_oc_blocking;
@@ -1452,9 +1630,8 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf(
 
         if (jcp.mb == 1) {
             jcp.kernel_kind = embd_bcast;
-            unsigned int inp_size = jcp.mb * (jcp.ih / jcp.stride_h)
-                    * (jcp.iw / jcp.stride_w) * jcp.ic;
-            if (inp_size == 0) inp_size = 1;
+            unsigned int inp_size = jcp.mb * div_up(jcp.ih, jcp.stride_h)
+                    * div_up(jcp.iw, jcp.stride_w) * jcp.ic;
             unsigned int wei_size = jcp.ic * jcp.oc * jcp.kh * jcp.kw;
 
             // Estimate whether we need to limit the number of threads
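
The switch from plain integer division to div_up also explains why the old "if (inp_size == 0) inp_size = 1;" guard could be dropped:

    // div_up(a, b) == (a + b - 1) / b, i.e. division rounded up.
    // With ih = 1, stride_h = 4:  1 / 4 == 0  but  div_up(1, 4) == 1,
    // so inp_size can no longer collapse to zero for small inputs.
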
@@ -1523,6 +1700,21 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf(
 
     jcp.ur_w_tail = jcp.ow % jcp.ur_w;
 
+    jcp.ow_block = jcp.ow;
+    if (is_ow_threading_available(jcp)) {
+        const int L1_part = get_cache_size(1) * 5 / 8;
+        int size_src_chunk = typesize * jcp.ic_block * jcp.ur_w;
+        int size_dst_chunk = typesize
+            * jcp.oc_block * jcp.nb_oc_blocking * jcp.ur_w;
+        int size_wei_chunk = typesize
+            * jcp.oc_block * jcp.ic_block * jcp.nb_oc_blocking * jcp.kw;
+        int nurw = (L1_part - size_wei_chunk)
+            / (size_dst_chunk + size_src_chunk);
+        // current design of generate() requires ow_block >= 2 * ur_w
+        jcp.ow_block = jcp.ur_w * nstl::max(2, nurw);
+    }
+    jcp.nb_ow = div_up(jcp.ow, jcp.ow_block);
+
     args_ok = true
         && jcp.l_pad <= jcp.ur_w
         && jcp.ic <= src_d.blocking_desc().padding_dims[1]
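
A worked sizing example for the new ow_block heuristic under assumed numbers (illustration only): with a 32 KiB L1, typesize = 4, ic_block = oc_block = 16, ur_w = 28, nb_oc_blocking = 2 and kw = 3:

    // L1_part        = 32768 * 5 / 8                  = 20480 bytes
    // size_src_chunk = 4 * 16 * 28                    = 1792
    // size_dst_chunk = 4 * 16 * 2 * 28                = 3584
    // size_wei_chunk = 4 * 16 * 16 * 2 * 3            = 6144
    // nurw           = (20480 - 6144) / (3584 + 1792) = 2
    // ow_block       = 28 * max(2, 2)                 = 56
    // nb_ow          = div_up(ow, 56)
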
@@ -1542,27 +1734,39 @@ status_t jit_avx512_common_conv_fwd_kernel::init_conf(
 
     jcp.nb_ic_L2 = jcp.nb_ic;
 
+    const int L2_size = get_cache_size(2, true) / sizeof(float);
+    // Source and output data need to fit in L2,
+    // leaving some space for weights and prefetching.
+    int h_L2 = int(((0.6f * L2_size) / simd_w
+                           - nstl::min(0, jcp.kh - jcp.stride_h) * jcp.iw)
+            / (jcp.stride_h * jcp.iw + jcp.ow));
+    jcp.h_blocking = nstl::max(1, nstl::min(jcp.oh, h_L2));
+
     // TODO check for 4vnni
     if (jcp.ver == ver_4fma) {
-        for (int divf = 2, temp_nb = jcp.nb_ic_L2; divf <= jcp.nb_ic;
-              divf++) {
-            size_t l2_src
-                = (size_t)jcp.iw * jcp.ic_block * jcp.ih * temp_nb * jcp.id;
-            size_t l2_dst = (size_t)jcp.ow * jcp.oc_block * jcp.nb_oc_blocking
-                * jcp.oh * jcp.od;
-            size_t l2_filt = (size_t)jcp.kw * jcp.oc_block * jcp.ic_block
-                * jcp.kh * jcp.nb_oc_blocking * temp_nb * jcp.kd;
-            if (4 * (l2_src + l2_dst + l2_filt) > KNx_L2_EFFECTIVE_CAPACITY) {
-                if (jcp.kh == 3 && jcp.oh == 7) {
-                    jcp.nb_ic_L2 = 1;
+        if (!is_ow_threading_on(jcp)) {
+            for (int divf = 2, temp_nb = jcp.nb_ic_L2; divf <= jcp.nb_ic;
+                  divf++) {
+                size_t l2_src
+                    = (size_t)jcp.iw * jcp.ic_block * jcp.ih * temp_nb * jcp.id;
+                size_t l2_dst = (size_t)jcp.ow * jcp.oc_block * jcp.nb_oc_blocking
+                    * jcp.oh * jcp.od;
+                size_t l2_filt = (size_t)jcp.kw * jcp.oc_block * jcp.ic_block
+                    * jcp.kh * jcp.nb_oc_blocking * temp_nb * jcp.kd;
+                if (4 * (l2_src + l2_dst + l2_filt) > KNx_L2_EFFECTIVE_CAPACITY) {
+                    if (jcp.kh == 3 && jcp.oh == 7) {
+                        jcp.nb_ic_L2 = 1;
+                        break;
+                    }
+                    temp_nb = (jcp.nb_ic_L2 % divf == 0 ? jcp.nb_ic_L2 / divf
+                                    : jcp.nb_ic_L2);
+                } else {
+                    jcp.nb_ic_L2 = temp_nb;
                     break;
                 }
-                temp_nb = (jcp.nb_ic_L2 % divf == 0 ? jcp.nb_ic_L2 / divf
-                                : jcp.nb_ic_L2);
-            } else {
-                jcp.nb_ic_L2 = temp_nb;
-                break;
             }
+        } else {
+            jcp.nb_ic_L2 = 2; /* according to performance data */
         }
     }
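
A worked example of the new h_blocking heuristic, under assumed numbers: a 1 MiB L2 gives L2_size = 262144 floats; with simd_w = 16, kh = 3, stride_h = 1 and iw = ow = 224:

    // usable        = 0.6 * 262144 / 16       = 9830.4
    // kh - stride_h = 2 > 0, so min(0, kh - stride_h) * iw contributes 0
    // h_L2          = 9830.4 / (1 * 224 + 224) = 21 (truncated)
    // h_blocking    = max(1, min(oh, 21))      = 21 for oh >= 21
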
 
@@ -1630,7 +1834,6 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_4fma(
     int iw_end_ipref = get_iw_end(ur_w, 0, r_overflow);
 
     bool check_last_kh = (jcp.kh > 3);
-
     auto kernel_offset = [=](int icb, int oc, int ki) {
         int blk_idx = icb * jcp.kh * jcp.kw * jcp.kd + ki;
         int blk_offset = blk_idx * jcp.oc_block * jcp.ic_block;
@@ -1656,7 +1859,7 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_4fma(
 
     prepare_output(ur_w);
 
-    if (jcp.ndims == 4) {
+    if (one_of(jcp.ndims, 3, 4)) {
         mov(aux_reg_dst, reg_dst);
         mov(aux_reg_ker, reg_ker);
         mov(aux_reg_dst_prf, reg_dst_prf);
@@ -1938,7 +2141,7 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_fma(
 
     prepare_output(ur_w);
 
-    if (jcp.ndims == 4) {
+    if (one_of(jcp.ndims, 3, 4)) {
         mov(aux_reg_dst, reg_dst);
         mov(aux_reg_ker, reg_ker);
 
@@ -2106,7 +2309,7 @@ void jit_avx512_common_conv_bwd_data_kernel_f32::compute_loop_fma_core(
 
     prepare_output(ur_w);
 
-    if (jcp.ndims == 4) {
+    if (one_of(jcp.ndims, 3, 4)) {
         mov(aux_reg_dst, reg_dst);
         mov(aux_reg_ker, reg_ker);
     }
@@ -2316,26 +2519,26 @@ status_t jit_avx512_common_conv_bwd_data_kernel_f32::init_conf(
     jcp.ic = diff_src_d.dims()[1] / jcp.ngroups;
 
     jcp.id = (ndims == 5) ? diff_src_d.dims()[2] : 1;
-    jcp.ih = diff_src_d.dims()[ndims-2];
+    jcp.ih = (ndims == 3) ? 1 : diff_src_d.dims()[ndims-2];
     jcp.iw = diff_src_d.dims()[ndims-1];
     jcp.od = (ndims == 5) ? diff_dst_d.dims()[2] : 1;
-    jcp.oh = diff_dst_d.dims()[ndims-2];
+    jcp.oh = (ndims == 3) ? 1 : diff_dst_d.dims()[ndims-2];
     jcp.ow = diff_dst_d.dims()[ndims-1];
 
     jcp.kd = (ndims == 5) ? weights_d.dims()[with_groups + 2] : 1;
-    jcp.kh = weights_d.dims()[with_groups + ndims - 2];
+    jcp.kh = (ndims == 3) ? 1 : weights_d.dims()[with_groups + ndims - 2];
     jcp.kw = weights_d.dims()[with_groups + ndims - 1];
 
     jcp.f_pad = (ndims == 5) ? cd.padding[0][0] : 0;
-    jcp.t_pad = cd.padding[0][ndims-4];
+    jcp.t_pad = (ndims == 3) ? 0 : cd.padding[0][ndims-4];
     jcp.l_pad = cd.padding[0][ndims-3];
 
     jcp.stride_d = (ndims == 5) ? cd.strides[0] : 1;
-    jcp.stride_h = cd.strides[ndims-4];
+    jcp.stride_h = (ndims == 3) ? 1 : cd.strides[ndims-4];
     jcp.stride_w = cd.strides[ndims-3];
 
     jcp.dilate_d = (ndims == 5) ? cd.dilates[0] : 0;
-    jcp.dilate_h = cd.dilates[ndims-4];
+    jcp.dilate_h = (ndims == 3) ? 0 : cd.dilates[ndims-4];
     jcp.dilate_w = cd.dilates[ndims-3];
     if ((jcp.dilate_w != 0 && jcp.stride_w != 1)
             || (jcp.dilate_d != 0 && jcp.stride_d != 1)
@@ -2365,11 +2568,10 @@ status_t jit_avx512_common_conv_bwd_data_kernel_f32::init_conf(
         jcp.ic = rnd_up(jcp.ic, jcp.ic_block);
     }
 
-    auto src_format = (ndims == 5) ? nCdhw16c : nChw16c;
-    auto wei_format = (ndims == 5)
-        ? (with_groups) ? gOIdhw16o16i : OIdhw16o16i
-        : (with_groups) ? gOIhw16o16i : OIhw16o16i;
-
+    auto src_format = pick(ndims - 3, nCw16c, nChw16c, nCdhw16c);
+    auto wei_format = with_groups
+        ? pick(ndims - 3, gOIw16o16i, gOIhw16o16i, gOIdhw16o16i)
+        : pick(ndims - 3, OIw16o16i, OIhw16o16i, OIdhw16o16i);
     bool args_ok = true
         && jcp.oc % jcp.oc_block == 0
         && jcp.ic % jcp.ic_block == 0
@@ -2585,7 +2787,8 @@ void jit_avx512_common_conv_bwd_weights_kernel_f32::od_step_comeback_pointers()
 {
     Label kd_comeback_label;
 
-    mov(kj, jcp.kd); //FIX, work only if f_pad = back_pad = 0 (Anton)
+    /* 'depth' loop count bound by 'kd_work_size' */
+    mov(kj, ptr[param + GET_OFF(kd_padding)]);
     L(kd_comeback_label); {
         int inp_mult = jcp.is_1stconv ? 1 : jcp.ic_block;
         int iw = (utils::one_of(jcp.ver, ver_4fma, ver_4vnni, ver_vnni))
@@ -2933,7 +3136,7 @@ void jit_avx512_common_conv_bwd_weights_kernel_f32
     if (jcp.ndims == 5) {
         mov(aux_reg_input, reg_input);
         mov(aux_reg_kernel, reg_kernel);
-        mov(ki, jcp.kd);
+        mov(ki, ptr[param + GET_OFF(kd_padding)]);
         L(kd_label);
         mov(reg_input, aux_reg_input);
         mov(reg_kernel, aux_reg_kernel);
@@ -2989,7 +3192,7 @@ void jit_avx512_common_conv_bwd_weights_kernel_f32
     if (jcp.ndims == 5) {
         mov(aux_reg_input, reg_input);
         mov(aux_reg_kernel, reg_kernel);
-        mov(ki, jcp.kd);
+        mov(ki, ptr[param + GET_OFF(kd_padding)]);
         L(kd_label);
         mov(reg_input, aux_reg_input);
         mov(reg_kernel, aux_reg_kernel);
@@ -3077,7 +3280,7 @@ void jit_avx512_common_conv_bwd_weights_kernel_f32
     if (jcp.ndims == 5) {
         mov(aux_reg_input, reg_input);
         mov(aux_reg_kernel, reg_kernel);
-        mov(ki, jcp.kd);
+        mov(ki, ptr[param + GET_OFF(kd_padding)]);
         L(kd_label);
         mov(reg_input, aux_reg_input);
         mov(reg_kernel, aux_reg_kernel);
@@ -3228,7 +3431,8 @@ void jit_avx512_common_conv_bwd_weights_kernel_f32::bias_kernel()
 
     L(skip_load_bias);
 
-    mov(reg_oi, ptr[param + GET_OFF(kh_padding)]);
+    mov(reg_oi, ptr[param + GET_OFF(d_worksize)]);
+    sub(reg_oi, ptr[param + GET_OFF(d_index)]);
     mov(reg_tmp, jcp.oc_block * jcp.ow * jcp.oh * jcp.typesize_out);
     imul(reg_oi, reg_tmp);
 
@@ -3248,44 +3452,48 @@ void jit_avx512_common_conv_bwd_weights_kernel_f32::bias_kernel()
 void jit_avx512_common_conv_bwd_weights_kernel_f32
     ::compute_oh_loop_common()
 {
+    int ic_block = jcp.ic_block;
+    int oc_block = jcp.oc_block;
     int back_pad = jcp.back_pad;
     int b_pad = jcp.b_pad;
     int t_pad = jcp.t_pad;
     bool is_dilated = jcp.dilate_h != 0;
     int dilate_h = jcp.dilate_h + 1;
     int stride_h = jcp.stride_h;
-    int idp = jcp.id + jcp.f_pad + back_pad;
     const int inp_mult = jcp.is_1stconv ? 1 : jcp.ic_block;
     int iw = utils::one_of(jcp.ver, ver_4fma, ver_4vnni, ver_vnni) ? jcp.tr_iw
         : jcp.iw;
+    const size_t io_overlap = jcp.od - back_pad;
     Label oh_label, oh_label_end, oh_tpad_label, oh_tpad_tail_label,
             oh_bpad_label, oh_bpad_label_end, od_label, od_label_end,
-            oh_dilate_label_shift, oh_dilate_label_noshift, oh_dilate_label_end;
+            oh_dilate_label_shift, oh_dilate_label_noshift, oh_dilate_label_end,
+            skip_neg_overlap_label, skip_fpad_label, skip_input_label;
 
     maybe_zero_kernel();
     if (jcp.ndims == 5 && jcp.with_bias) bias_kernel();
 
+    /* initially offset 'kd' by f_pad */
+    if (jcp.ndims == 5) add(reg_kernel, ptr[param + GET_OFF(kd_offset)]);
+
     int ow = (jcp.ver == ver_4vnni || jcp.ver == ver_vnni) ? jcp.tr_ow : jcp.ow;
 
     if (jcp.ndims == 5) {
         mov(reg_input_d, ptr[param + GET_OFF(src)]);
         mov(reg_output_d, ptr[param + GET_OFF(dst)]);
-
-        mov(reg_id_count, ptr[param + GET_OFF(kd_padding)]);
-        mov(reg_oi, ptr[param + GET_OFF(kh_padding)]);
+        mov(reg_d_index, ptr[param + GET_OFF(d_index)]);
         L(od_label);
 
         mov(reg_input, reg_input_d);
         mov(reg_output, reg_output_d);
         push(reg_input_d);
         push(reg_output_d);
-        push(reg_oi);
-        push(reg_id_count);
+        push(reg_d_index);
     }
 
     mov(reg_kh, jcp.kh);
     xor_(reg_ih_count, reg_ih_count);
     xor_(reg_oj, reg_oj);
+    /* Compute 'top' edge */
     if (t_pad > 0) {
         const int kh_range = 1 + (jcp.kh - 1) * dilate_h;
         const int overflow
@@ -3380,6 +3588,7 @@ void jit_avx512_common_conv_bwd_weights_kernel_f32
     cmp(reg_oj, jcp.oh);
     jge(oh_label, T_NEAR);
 
+    /* Compute middle block(s) */
     mov(reg_kh, jcp.kh);
     L(oh_label); {
         compute_oh_step_disp();
@@ -3397,6 +3606,7 @@ void jit_avx512_common_conv_bwd_weights_kernel_f32
     }
     L(oh_label_end);
 
+    /* Compute bottom edge */
     if (b_pad > 0) {
         cmp(reg_oj, jcp.oh);
         jge(oh_bpad_label_end, T_NEAR);
@@ -3433,22 +3643,47 @@ void jit_avx512_common_conv_bwd_weights_kernel_f32
     }
 
     if (jcp.ndims == 5) {
-        pop(reg_id_count);
-        pop(reg_oi);
+        pop(reg_d_index);
         pop(reg_output_d);
         pop(reg_input_d);
 
-        add(reg_input_d, jcp.typesize_in * jcp.stride_d * jcp.ih * iw * inp_mult);
+        mov(reg_kd_count, ptr[param + GET_OFF(kd_padding)]);
+
+        /* 'outer-depth loop' offset into next 'depth' index */
         add(reg_output_d, jcp.typesize_in * jcp.oh * ow * jcp.oc_block);
 
-        dec(reg_oi);
-        add(reg_id_count, jcp.stride_d);
+        /* only increase input address when convolution is not within the
+         * 'f_pad' region */
+        if (jcp.f_pad > 0) {
+            cmp(reg_d_index, jcp.f_pad);
+            jl(skip_input_label);
+        }
+        add(reg_input_d,
+                jcp.typesize_in * jcp.stride_d * jcp.ih * iw * inp_mult);
+        L(skip_input_label);
+
+        inc(reg_d_index);
+        cmp(reg_d_index, io_overlap);
+        jl(skip_neg_overlap_label);
 
-        cmp(reg_id_count, idp - back_pad - (jcp.kd - 1) * (jcp.dilate_d + 1));
-        jge(od_label_end, T_NEAR);
+        /* Reduce 'kd' count as convolution steps within 'back_pad' region */
+        dec(reg_kd_count);
+        jmp(skip_fpad_label);
 
-        cmp(reg_oi, 0);
-        jg(od_label, T_NEAR);
+        L(skip_neg_overlap_label);
+        cmp(reg_kd_count, jcp.kd);
+        jge(skip_fpad_label);
+
+        /* increase 'kd' count as convolution steps out of 'f_pad' region */
+        inc(reg_kd_count);
+        sub(reg_kernel,
+                jcp.typesize_out * jcp.kh * jcp.kw * ic_block * oc_block);
+
+        L(skip_fpad_label);
+        mov(ptr[param + GET_OFF(kd_padding)], reg_kd_count);
+
+        cmp(reg_d_index, ptr[param + GET_OFF(d_worksize)]);
+        jl(od_label, T_NEAR);
 
         L(od_label_end);
     }
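
The reworked bookkeeping above replaces the fixed kd loop count with runtime d_index/kd_count tracking, so the f_pad and back_pad regions shrink or grow the effective kernel depth as the outer loop advances. A hypothetical scalar model of one trip around od_label (illustration, not the JIT code):

    d_index++;
    if (d_index >= od - back_pad) {
        kd_count--;               // stepping into back_pad: one less kd tap
    } else if (kd_count < kd) {
        kd_count++;               // stepping out of f_pad: one more tap valid
        kernel -= kh * kw * ic_block * oc_block;   // move back to include it
    }
    kd_padding = kd_count;        // stored back for the inner kernels
    if (d_index < d_worksize) /* goto od_label */;
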
@@ -4256,36 +4491,36 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf(
     jcp.ic = src_d.dims()[1] / jcp.ngroups;
 
     jcp.id = (ndims == 5) ? src_d.dims()[2] : 1;
-    jcp.ih = src_d.dims()[ndims-2];
+    jcp.ih = (ndims == 3) ? 1 : src_d.dims()[ndims-2];
     jcp.iw = src_d.dims()[ndims-1];
     jcp.od = (ndims == 5) ? diff_dst_d.dims()[2] : 1;
-    jcp.oh = diff_dst_d.dims()[ndims-2];
+    jcp.oh = (ndims == 3) ? 1 : diff_dst_d.dims()[ndims-2];
     jcp.ow = diff_dst_d.dims()[ndims-1];
 
     jcp.kd = (ndims == 5) ? diff_weights_d.dims()[with_groups + 2] : 1;
-    jcp.kh = diff_weights_d.dims()[with_groups + ndims-2];
+    jcp.kh = (ndims == 3) ? 1 : diff_weights_d.dims()[with_groups + ndims-2];
     jcp.kw = diff_weights_d.dims()[with_groups + ndims-1];
 
     jcp.f_pad = (ndims == 5) ? cd.padding[0][0] : 0;
-    jcp.t_pad = cd.padding[0][ndims-4];
+    jcp.t_pad = (ndims == 3) ? 0 : cd.padding[0][ndims-4];
     jcp.l_pad = cd.padding[0][ndims-3];
 
     jcp.stride_d = (ndims == 5) ? cd.strides[0] : 1;
-    jcp.stride_h = cd.strides[ndims-4];
+    jcp.stride_h = (ndims == 3) ? 1 : cd.strides[ndims-4];
     jcp.stride_w = cd.strides[ndims-3];
 
     jcp.dilate_d = (ndims == 5) ? cd.dilates[0] : 0;
-    jcp.dilate_h = cd.dilates[ndims-4];
+    jcp.dilate_h = (ndims == 3) ? 0 : cd.dilates[ndims-4];
     jcp.dilate_w = cd.dilates[ndims-3];
 
     const int kh_range = 1 + (jcp.kh - 1) * (jcp.dilate_h + 1);
     bool ok = true
         // general condition to simplify dilations
-        && implication(jcp.dilate_d != 0, jcp.stride_d == 1)
-        && implication(jcp.dilate_h != 0, jcp.stride_h == 1)
-        && implication(jcp.dilate_w != 0, jcp.stride_w == 1)
+        && IMPLICATION(jcp.dilate_d != 0, jcp.stride_d == 1)
+        && IMPLICATION(jcp.dilate_h != 0, jcp.stride_h == 1)
+        && IMPLICATION(jcp.dilate_w != 0, jcp.stride_w == 1)
         // special condition to simplify dilations in compute_oh_loop_common
-        && implication(jcp.dilate_h != 0, kh_range <= jcp.ih);
+        && IMPLICATION(jcp.dilate_h != 0, kh_range <= jcp.ih);
     if (!ok)
         return status::unimplemented;
 
@@ -4296,8 +4531,9 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf(
     jcp.back_pad = nstl::max(0, (jcp.od - 1) * jcp.stride_d
             + (jcp.kd - 1) * (jcp.dilate_d + 1) - (jcp.id + jcp.f_pad - 1));
 
-    if ( ndims == 5 )
-        if (jcp.f_pad != 0 || jcp.back_pad != 0)
+    /* XXX: currently does not support stride_d > 1 or dilation > 0 */
+    if (ndims == 5)
+        if (jcp.stride_d > 1 || jcp.dilate_d > 0)
             return status::unimplemented;
 
     jcp.ihp = jcp.ih + jcp.t_pad + jcp.b_pad;
@@ -4321,11 +4557,10 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf(
     if (jcp.oc % jcp.oc_block)
         return status::unimplemented;
 
-    auto src_format = (ndims == 5) ? nCdhw16c : nChw16c;
-    auto wei_format = (ndims == 5)
-        ? (with_groups) ? gOIdhw16i16o : OIdhw16i16o
-        : (with_groups) ? gOIhw16i16o : OIhw16i16o;
-
+    auto src_format = pick(ndims - 3, nCw16c, nChw16c, nCdhw16c);
+    auto wei_format = with_groups
+        ? pick(ndims - 3, gOIw16i16o, gOIhw16i16o, gOIdhw16i16o)
+        : pick(ndims - 3, OIw16i16o, OIhw16i16o, OIdhw16i16o);
     /* conditions on bias memory */
     jcp.with_bias = cd.diff_bias_desc.format != memory_format::undef;
     if (jcp.with_bias) {
@@ -4362,7 +4597,7 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf(
     }
 
     if (jcp.is_1stconv) {
-        const auto want_src_format = (ndims == 5) ? ncdhw : nchw;
+        const auto want_src_format = pick(ndims - 3, ncw, nchw, ncdhw);
         if (src_d.format() == any)
             CHECK(src_pd.set_format(want_src_format));
 
@@ -4371,9 +4606,9 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf(
                 src_d.data_type(), diff_weights_d.data_type(),
                 diff_dst_d.data_type())
             && one_of(jcp.ic, 1, 3)
-            && implication(jcp.ic == 1, one_of(src_d.format(), want_src_format,
-                (ndims == 5) ? ndhwc : nhwc))
-            && implication(jcp.ic != 1, src_d.format() == want_src_format)
+            && IMPLICATION(jcp.ic == 1, one_of(src_d.format(), want_src_format,
+                pick(ndims - 3, nwc, nhwc, ndhwc)))
+            && IMPLICATION(jcp.ic != 1, src_d.format() == want_src_format)
             && jcp.ngroups == 1;
         if (!src_ok)
             return status::unimplemented;
@@ -4382,11 +4617,11 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf(
                     jcp.stride_w), 16);
         const int kh_step = nstl::max((28 - jcp.with_bias) / jcp.kw, 1);
         const int kh_step_rem = jcp.kh % kh_step;
-        const auto want_4fma_wfmt = (ndims == 5)
-            ? with_groups ? gOidhw16o : Oidhw16o
-            : with_groups ? gOihw16o : Oihw16o;
+        const auto want_4fma_wfmt = with_groups
+            ? pick(ndims - 3, gOiw16o, gOihw16o, gOidhw16o)
+            : pick(ndims - 3, Oiw16o, Oihw16o, Oidhw16o);
         const bool use_4fma = true
-            && ndims == 4
+            && one_of(ndims, 3, 4)
             && mayiuse(avx512_mic_4ops)
             && mkldnn_thr_syncable()
             && everyone_is(0, jcp.dilate_d, jcp.dilate_h, jcp.dilate_w)
@@ -4394,8 +4629,8 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf(
             && jcp.kw <= 28 - jcp.with_bias
             && jcp.stride_w == 4
             && tr_ld / simd_w <= 4 /* [bwd_w:tr_src:r1] */
-            && implication(jcp.with_bias, kh_step_rem == 1) /* [bwd_w:b:r1] */
-            && implication(diff_weights_d.format() != any,
+            && IMPLICATION(jcp.with_bias, kh_step_rem == 1) /* [bwd_w:b:r1] */
+            && IMPLICATION(diff_weights_d.format() != any,
                     diff_weights_d.format() == want_4fma_wfmt);
 
         if (use_4fma) {
@@ -4409,9 +4644,9 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf(
             jcp.ver = ver_fma;
             jcp.ic_block = jcp.ic;
 
-            const auto want_wfmt = (ndims == 5)
-                ? with_groups ? gOdhwi16o : Odhwi16o
-                : with_groups ? gOhwi16o : Ohwi16o;
+            const auto want_wfmt = with_groups
+                ? pick(ndims - 3, gOwi16o, gOhwi16o, gOdhwi16o)
+                : pick(ndims - 3, Owi16o, Ohwi16o, Odhwi16o);
             if (diff_weights_d.format() == any)
                 CHECK(diff_weights_pd.set_format(want_wfmt));
             if (diff_weights_d.format() != want_wfmt)
@@ -4439,7 +4674,7 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf(
         jcp.src_fmt = src_d.format();
         if ((mayiuse(avx512_mic_4ops) || mayiuse(avx512_core_vnni))
             && mkldnn_thr_syncable()
-            && ndims == 4
+            && one_of(ndims, 3, 4)
             && jcp.stride_w == 1
             && everyone_is(0, jcp.dilate_d, jcp.dilate_h, jcp.dilate_w)
             && ((src_d.data_type() == data_type::s16
@@ -4452,7 +4687,7 @@ status_t jit_avx512_common_conv_bwd_weights_kernel_f32::init_conf(
                     src_d.data_type(), diff_weights_d.data_type(),
                     diff_dst_d.data_type())) {
             jcp.ver = ver_fma;
-            if (ndims == 4 && mayiuse(avx512_mic_4ops) && jcp.stride_w == 1 &&
+            if (one_of(ndims, 3, 4) && mayiuse(avx512_mic_4ops) &&
+                    jcp.stride_w == 1 &&
                     everyone_is(0, jcp.dilate_d, jcp.dilate_h, jcp.dilate_w) &&
                     mkldnn_thr_syncable()) {
                 jcp.ver = ver_4fma;
index 42e26a9..ec6e185 100644
@@ -82,6 +82,7 @@ private:
     reg64_t reg_inp_prf = r11;
     reg64_t reg_ker_prf = r12;
     reg64_t reg_out_prf = r13;
+    reg64_t reg_owb = r12;
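+    // NB: reg_owb reuses r12, which is also reg_ker_prf above; the
+    // assumption is that the prefetch and owb paths are never live at once.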
 
     reg64_t aux_reg_inp = r14;
     reg64_t aux_reg_ker = r15;
@@ -380,8 +381,9 @@ private:
     reg64_t reg_long_offt = r14;
 
     reg64_t ki = r11;
+    reg64_t reg_kd_count = r12;
     reg64_t reg_oi = r12;
-    reg64_t reg_id_count = r13;
+    reg64_t reg_d_index = r13;
     reg64_t reg_input_d = r15;
     reg64_t reg_output_d = rbx;
     reg64_t aux_reg_input = r12;
index 41c79f5..0405eee 100644
@@ -91,8 +91,9 @@ struct prefetcher_t {
         int cache_latency;
         switch (cache_type_) {
         case L1: cache_latency = 14; break;
-        case L2: cache_latency = 250; break;
-        case L3: cache_latency = 250; break;
+        case L2:
+        case L3:
+        default: cache_latency = 250; break;
         }
 
         prefetch_distance_ = div_up(cache_latency, nb_cache_lines_to_prefetch_);
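Here `prefetch_distance_` amortizes the (approximate) cache latency over the number of cache lines prefetched per iteration, with `div_up` as ceiling division. A standalone check under an assumed count of 8 lines per iteration:

    #include <cstdio>

    // div_up(a, b) == ceil(a / b) for positive integers.
    static int div_up(int a, int b) { return (a + b - 1) / b; }

    int main() {
        // L2/L3 latency of 250 cycles, 8 lines per iteration (assumed):
        printf("%d\n", div_up(250, 8)); // 32 iterations of lookahead
        return 0;
    }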
@@ -636,12 +637,12 @@ bool jit_avx512_common_conv_winograd_fwd_kernel_f32::post_ops_ok(
         return true; // no post_ops
     case 1:
         return true // relu or sum
-                && implication(jcp.with_eltwise, is_sum(0))
-                && implication(!jcp.with_eltwise, is_eltwise(0) || is_sum(0));
+                && IMPLICATION(jcp.with_eltwise, is_sum(0))
+                && IMPLICATION(!jcp.with_eltwise, is_eltwise(0) || is_sum(0));
     case 2:
         return true // sum->relu or relu->sum
-                && implication(jcp.with_eltwise, is_sum(0) && is_eltwise(1))
-                && implication(!jcp.with_eltwise, false
+                && IMPLICATION(jcp.with_eltwise, is_sum(0) && is_eltwise(1))
+                && IMPLICATION(!jcp.with_eltwise, false
                                    || (is_sum(0) && is_eltwise(1))
                                    || (is_eltwise(0) && is_sum(1)));
     case 3:
index fc7a4e3..8767207 100644
@@ -54,10 +54,46 @@ inline void jit_conv_ker_pipeline(jit_conv_ker_t ker, jit_conv_call_s &p,
     if (p.src)
         ker(&p);
 }
+// Special case for the driver with ow-parallelization (FWD)
+// TODO: implement it for BWD_D and BWD_W too
+inline void jit_conv_ker_pipeline_ow_thr(jit_conv_ker_t ker, jit_conv_call_s &p,
+        const void *src, const void *dst, const void *filt, const void *bias,
+        int channel, int kh_padding, int owb, int oc_off)
+{
+    PIPELINE(src);
+    PIPELINE(dst);
+    PIPELINE(filt);
+    PIPELINE(bias);
+    PIPELINE(channel);
+    PIPELINE(kh_padding);
+    PIPELINE(owb);
+    PIPELINE(oc_off);
+
+    if (p.src)
+        ker(&p);
+}
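Both pipeline helpers lean on this file's `PIPELINE` macro, which double-buffers each kernel argument: the value staged on the previous call becomes current, and the new value is staged so the kernel can prefetch for the next iteration (hence the `if (p.src)` guard skipping the very first, empty call). A rough reconstruction of the pattern; the `_prf` field naming is an assumption:

    #include <cstdio>

    struct call_s { int channel, channel_prf; };

    // Sketch of the double-buffering idiom, not the library's macro:
    #define PIPELINE(field) do { p.field = p.field##_prf; p.field##_prf = field; } while (0)

    int main() {
        call_s p = {};
        int channel = 7;
        PIPELINE(channel);          // first call only stages the value
        printf("%d\n", p.channel);  // 0 -- nothing current yet
        channel = 8;
        PIPELINE(channel);          // staged value becomes current
        printf("%d\n", p.channel);  // 7
        return 0;
    }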
 
 inline void jit_conv_3d_ker_pipeline(jit_conv_ker_t ker, jit_conv_call_s &p,
         const void *src, const void *dst, const void *filt, const void *bias,
-        int channel, int kh_padding, int kd_padding)
+        int channel, int kh_padding, int kd_padding, int oc_off)
+{
+    PIPELINE(src);
+    PIPELINE(dst);
+    PIPELINE(filt);
+    PIPELINE(bias);
+    PIPELINE(channel);
+    PIPELINE(kh_padding);
+    PIPELINE(kd_padding);
+    PIPELINE(oc_off);
+
+    if (p.src)
+        ker(&p);
+}
+// Special case for the driver with ow-parallelization (FWD)
+// TODO: implement it for BWD_D and BWD_W too
+inline void jit_conv_3d_ker_pipeline_ow_thr(jit_conv_ker_t ker,
+        jit_conv_call_s &p, const void *src, const void *dst, const void *filt,
+        const void *bias, int channel, int kh_padding, int kd_padding,
+        int owb, int oc_off)
 {
     PIPELINE(src);
     PIPELINE(dst);
@@ -66,11 +102,30 @@ inline void jit_conv_3d_ker_pipeline(jit_conv_ker_t ker, jit_conv_call_s &p,
     PIPELINE(channel);
     PIPELINE(kh_padding);
     PIPELINE(kd_padding);
+    PIPELINE(owb);
+    PIPELINE(oc_off);
 
     if (p.src)
         ker(&p);
 }
 
+void jit_conv_3d_ker_bwd_w_pipeline(jit_conv_ker_t ker, jit_conv_call_s &p,
+        const void *src, const void *dst, const void *filt, const void *bias,
+        int channel, int d_index, int d_worksize,
+        int kd_padding /* kd_work_size */, size_t kd_offset) {
+    PIPELINE(src);
+    PIPELINE(dst);
+    PIPELINE(filt);
+    PIPELINE(bias);
+    PIPELINE(channel);
+    PIPELINE(kd_padding);
+    PIPELINE(d_worksize);
+    PIPELINE(d_index);
+    PIPELINE(kd_offset);
+
+    if (p.src)
+        ker(&p);
+}
 #define wht_blk_off(d, g, ...) \
         (conf_.with_groups() \
          ? (d).blk_off((g), __VA_ARGS__) \
@@ -79,7 +134,104 @@ inline void jit_conv_3d_ker_pipeline(jit_conv_ker_t ker, jit_conv_call_s &p,
 template <bool with_relu, data_type_t src_type, data_type_t wei_type,
           data_type_t dst_type>
 void _jit_avx512_common_convolution_fwd_t
-    <with_relu, src_type, wei_type, dst_type>::execute_forward()
+    <with_relu, src_type, wei_type, dst_type>::execute_forward_1d()
+{
+    auto src = reinterpret_cast<const src_data_t *>(this->input_memory(0));
+    auto weights = reinterpret_cast<const wei_data_t *>(this->input_memory(1));
+    auto bias = reinterpret_cast<const dst_data_t *>(this->input_memory(2));
+    auto dst = reinterpret_cast<dst_data_t *>(this->memory());
+
+    const memory_desc_wrapper src_d(conf_.src_pd());
+    const memory_desc_wrapper dst_d(conf_.dst_pd());
+    const memory_desc_wrapper weights_d(conf_.weights_pd(0));
+
+    const auto &jcp = kernel_->jcp;
+    assert(jcp.nb_oc % jcp.nb_oc_blocking == 0);
+
+    int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking;
+    int work_amount = jcp.mb * jcp.ngroups * oc_chunks * jcp.nb_ow;
+
+    int nthr;
+    if (jcp.aligned_threads)
+        nthr = jcp.aligned_threads;
+    else
+        nthr = mkldnn_get_max_threads();
+
+    if (conf_.want_padded_bias()) {
+        for (int oc = 0; oc < jcp.oc_without_padding; ++oc)
+            padded_bias_[oc] = bias[oc];
+        bias = padded_bias_;
+    }
+    parallel(nthr, [&](const int ithr, const int nthr) {
+        int start{0}, end{0}, start_copy;
+        balance211(work_amount, nthr, ithr, start, end);
+        start_copy = start;
+
+        auto par_conv = jit_conv_call_s();
+        size_t src_c_stride = src_d.blk_off(0, 1);
+        size_t wht_ic_stride = wht_blk_off(weights_d, 0, 0, 1);
+
+        for (int icb_l2 = 0 ; icb_l2 < jcp.nb_ic; icb_l2 += jcp.nb_ic_L2) {
+            start = start_copy;
+            int n{0}, g{0}, occ{0}, owb{0};
+
+            if (jcp.loop_order == loop_cwgn) {
+                int dummy{0};
+                nd_iterator_init(start, occ, oc_chunks, owb, jcp.nb_ow,
+                        g, jcp.ngroups, n, jcp.mb, dummy, 1);
+            } else if (jcp.loop_order == loop_gncw) {
+                int dummy{0};
+                nd_iterator_init(start, g, jcp.ngroups, n, jcp.mb, occ,
+                        oc_chunks, owb, jcp.nb_ow, dummy, 1);
+            } else {
+                assert(!"unsupported loop order");
+            }
+
+            while (start < end) {
+                int ocb = occ * jcp.nb_oc_blocking;
+                int g_ocb = g * jcp.nb_oc + ocb;
+                int g_oc = g_ocb * jcp.oc_block;
+                int g_icb = g * jcp.nb_ic;
+
+                int ow_s = owb * jcp.ow_block;
+                int iw_s = ow_s * jcp.stride_w;
+                auto bias_w = bias ? bias + g_oc : nullptr;
+                auto dst_w = dst + dst_d.blk_off(n, g_ocb, ow_s);
+                auto src_w = src + src_d.blk_off(n, g_icb + icb_l2, iw_s);
+                auto wht_w = weights + wht_blk_off(weights_d, g, ocb, icb_l2);
+
+                int oc_off = g_oc * sizeof(dst_data_t);
+
+                for (int icb = icb_l2;
+                     icb < min(jcp.nb_ic, icb_l2 + jcp.nb_ic_L2); ++icb) {
+                     jit_conv_ker_pipeline_ow_thr(kernel_->jit_ker, par_conv,
+                        src_w, dst_w, wht_w, bias_w, icb, 1, owb, oc_off);
+
+                    src_w += src_c_stride;
+                    wht_w += wht_ic_stride;
+                }
+                if (jcp.loop_order == loop_cwgn) {
+                    int dummy{0};
+                    nd_iterator_jump(start, end, occ, oc_chunks, owb, jcp.nb_ow,
+                            g, jcp.ngroups, n, jcp.mb, dummy, 1);
+                } else if (jcp.loop_order == loop_gncw) {
+                    int dummy{0};
+                    nd_iterator_jump(start, end, g, jcp.ngroups, n, jcp.mb,
+                            occ, oc_chunks, owb, jcp.nb_ow, dummy, 1);
+                } else {
+                    assert(!"unsupported loop order");
+                }
+            }
+        }
+        jit_conv_ker_pipeline_ow_thr(kernel_->jit_ker, par_conv,
+                src, dst, weights, bias, 0, 0, 0, 0);
+    });
+}
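The new 1d driver splits `work_amount = mb * ngroups * oc_chunks * nb_ow` across threads with `balance211`, which hands each thread a contiguous `[start, end)` range whose sizes differ by at most one. A sketch of that contract (a reconstruction, not the library's implementation):

    #include <cstdio>

    // Split n items over nthr threads; chunk sizes differ by at most 1.
    static void balance211(int n, int nthr, int ithr, int &start, int &end) {
        int big = (n + nthr - 1) / nthr;    // larger chunk size
        int small = big - 1;                // smaller chunk size
        int n_big = n - small * nthr;       // threads receiving a big chunk
        start = ithr < n_big ? ithr * big
                             : n_big * big + (ithr - n_big) * small;
        end = start + (ithr < n_big ? big : small);
    }

    int main() {
        int s, e;
        for (int t = 0; t < 4; ++t) {
            balance211(10, 4, t, s, e);
            printf("thr %d: [%d, %d)\n", t, s, e); // sizes 3, 3, 2, 2
        }
        return 0;
    }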
+
+template <bool with_relu, data_type_t src_type, data_type_t wei_type,
+          data_type_t dst_type>
+void _jit_avx512_common_convolution_fwd_t
+    <with_relu, src_type, wei_type, dst_type>::execute_forward_2d()
 {
     auto src = reinterpret_cast<const src_data_t *>(this->input_memory(0));
     auto weights = reinterpret_cast<const wei_data_t *>(this->input_memory(1));
@@ -95,7 +247,7 @@ void _jit_avx512_common_convolution_fwd_t
     assert(jcp.nb_oc % jcp.nb_oc_blocking == 0);
 
     int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking;
-    int work_amount = MB * jcp.ngroups * oc_chunks * jcp.oh;
+    int work_amount = MB * jcp.ngroups * oc_chunks * jcp.oh * jcp.nb_ow;
 
     int nthr;
     if (jcp.aligned_threads)
@@ -110,7 +262,7 @@ void _jit_avx512_common_convolution_fwd_t
     }
 
     parallel(nthr, [&](const int ithr, const int nthr) {
-        int start, end, start_copy;
+        int start{0}, end{0}, start_copy;
         balance211(work_amount, nthr, ithr, start, end);
         start_copy = start;
 
@@ -123,14 +275,14 @@ void _jit_avx512_common_convolution_fwd_t
 
         for (int icb_l2 = 0 ; icb_l2 < jcp.nb_ic; icb_l2 += jcp.nb_ic_L2) {
             start = start_copy;
-            int n{0}, g{0}, occ{0}, oh_s{0};
-
-            if (jcp.loop_order == loop_cgn)
-                nd_iterator_init(start,
-                    occ, oc_chunks, g, jcp.ngroups, n, MB, oh_s, jcp.oh);
-            else if (jcp.loop_order == loop_gnc)
-                nd_iterator_init(start,
-                    g, jcp.ngroups, n, MB, occ, oc_chunks, oh_s, jcp.oh);
+            int n{0}, g{0}, occ{0}, oh_s{0}, owb{0};
+
+            if (jcp.loop_order == loop_cwgn)
+                nd_iterator_init(start, occ, oc_chunks, owb, jcp.nb_ow,
+                    g, jcp.ngroups, n, MB, oh_s, jcp.oh);
+            else if (jcp.loop_order == loop_gncw)
+                nd_iterator_init(start, g, jcp.ngroups, n, MB,
+                    occ, oc_chunks, owb, jcp.nb_ow, oh_s, jcp.oh);
             else
                 assert(!"unsupported loop order");
 
@@ -141,57 +293,67 @@ void _jit_avx512_common_convolution_fwd_t
                 int g_icb = g * jcp.nb_ic;
 
                 int work_rem = end - start;
-                int ih_s = -jcp.t_pad + oh_s * jcp.stride_h;
-                int oh_e = oh_s + work_rem > jcp.oh ? jcp.oh : oh_s + work_rem;
 
+                int ow_s = owb * jcp.ow_block;
+                int iw_s = ow_s * jcp.stride_w;
+                int oh_e = oh_s + work_rem > jcp.oh ? jcp.oh : oh_s + work_rem;
                 auto bias_w = bias ? bias + g_oc : nullptr;
-                auto dst_w = dst + dst_d.blk_off(n, g_ocb, oh_s);
-                auto src_w = src + src_d.blk_off(n, g_icb + icb_l2, ih_s);
-                auto wht_w = weights + wht_blk_off(weights_d, g, ocb, icb_l2);
 
-                int oc_off = g_oc * sizeof(dst_data_t);
+                for (int oh_b = oh_s; oh_b < oh_e; oh_b += jcp.h_blocking) {
+                    int ih_b = -jcp.t_pad + oh_b * jcp.stride_h;
+
+                    auto dst_w = dst + dst_d.blk_off(n, g_ocb, oh_b, ow_s);
+                    auto src_w
+                        = src + src_d.blk_off(n, g_icb + icb_l2, ih_b, iw_s);
+                    auto wht_w
+                            = weights + wht_blk_off(weights_d, g, ocb, icb_l2);
+
+                    for (int icb = icb_l2;
+                            icb < min(jcp.nb_ic, icb_l2 + jcp.nb_ic_L2);
+                            ++icb) {
+                        auto src_c = src_w;
+                        auto dst_c = dst_w;
+                        for (int oj = oh_b, ij = ih_b;
+                                oj < min(oh_e, oh_b + jcp.h_blocking);
+                                ++oj, ij += jcp.stride_h) {
+                            int dilate_h = jcp.dilate_h + 1;
+                            int i_t_overflow = div_up(max(0, -ij), dilate_h);
+                            int i_b_overflow = div_up(max(0, ij - jcp.ih
+                                + (jcp.kh - 1) * dilate_h + 1), dilate_h);
+                            int kh_padding = nstl::max(
+                                    0, jcp.kh - i_t_overflow - i_b_overflow);
 
-                for (int icb = icb_l2;
-                     icb < min(jcp.nb_ic, icb_l2 + jcp.nb_ic_L2); ++icb) {
-                    auto src_c = src_w;
-                    auto dst_c = dst_w;
-                    for (int oj = oh_s, ij = ih_s;
-                            oj < oh_e; ++oj, ij += jcp.stride_h)
-                    {
-                        int dilate_h = jcp.dilate_h + 1;
-                        int i_t_overflow = div_up(max(0, -ij), dilate_h);
-                        int i_b_overflow = div_up(
-                                max(0, ij - jcp.ih + (jcp.kh - 1) * dilate_h
-                                                + 1),
-                                dilate_h);
-                        int kh_padding = nstl::max(0,
-                            jcp.kh - i_t_overflow - i_b_overflow);
+                            auto aux_src = src_c
+                                    + i_t_overflow * dilate_h * src_h_stride;
+                            auto aux_wht = wht_w + i_t_overflow * wht_h_stride;
 
-                        jit_conv_ker_pipeline(kernel_->jit_ker, par_conv,
-                                src_c + i_t_overflow * dilate_h * src_h_stride,
-                                dst_c, wht_w + i_t_overflow * wht_h_stride,
-                                bias_w, icb, kh_padding, oc_off);
+                            int oc_off = g_oc * sizeof(dst_data_t);
 
-                        src_c += src_h_stride * jcp.stride_h;
-                        dst_c += dst_h_stride;
+                            jit_conv_ker_pipeline_ow_thr(kernel_->jit_ker,
+                                par_conv, aux_src, dst_c, aux_wht, bias_w, icb,
+                                kh_padding, owb, oc_off);
+
+                            src_c += src_h_stride * jcp.stride_h;
+                            dst_c += dst_h_stride;
+                        }
+                        src_w += src_c_stride;
+                        wht_w += wht_ic_stride;
                     }
-                    src_w += src_c_stride;
-                    wht_w += wht_ic_stride;
                 }
 
-                if (jcp.loop_order == loop_cgn)
-                    nd_iterator_jump(start, end,
-                      occ, oc_chunks, g, jcp.ngroups, n, MB, oh_s, jcp.oh);
-                else if (jcp.loop_order == loop_gnc)
-                    nd_iterator_jump(start, end,
-                      g, jcp.ngroups, n, MB, occ, oc_chunks, oh_s, jcp.oh);
+                if (jcp.loop_order == loop_cwgn)
+                    nd_iterator_jump(start, end, occ, oc_chunks, owb, jcp.nb_ow,
+                        g, jcp.ngroups, n, MB, oh_s, jcp.oh);
+                else if (jcp.loop_order == loop_gncw)
+                    nd_iterator_jump(start, end, g, jcp.ngroups, n, MB, occ,
+                        oc_chunks, owb, jcp.nb_ow, oh_s, jcp.oh);
                 else
                     assert(!"unsupported loop order");
             }
         }
 
-        jit_conv_ker_pipeline(kernel_->jit_ker, par_conv,
-                src, dst, weights, bias, 0, 0, 0);
+        jit_conv_ker_pipeline_ow_thr(kernel_->jit_ker, par_conv,
+                src, dst, weights, bias, 0, 0, 0, 0);
     });
 }
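The `oj`/`ij` loop above clips the kernel height to the rows that actually intersect the padded input. The same formulas, checked standalone with hypothetical sizes (ih = 5, kh = 3, t_pad = 1, unit stride, no dilation):

    #include <algorithm>
    #include <cstdio>

    static int div_up(int a, int b) { return (a + b - 1) / b; }

    int main() {
        const int ih = 5, kh = 3, t_pad = 1, stride_h = 1;
        const int dilate_h = 0 + 1; // jcp.dilate_h + 1, as in the driver
        for (int oj = 0, ij = -t_pad; oj < 5; ++oj, ij += stride_h) {
            int i_t_overflow = div_up(std::max(0, -ij), dilate_h);
            int i_b_overflow = div_up(
                    std::max(0, ij - ih + (kh - 1) * dilate_h + 1), dilate_h);
            int kh_padding = std::max(0, kh - i_t_overflow - i_b_overflow);
            printf("oj=%d kh_padding=%d\n", oj, kh_padding); // 2 3 3 3 2
        }
        return 0;
    }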
 
@@ -222,8 +384,9 @@ void _jit_avx512_common_convolution_fwd_t
 
     parallel(0, [&](const int ithr, const int nthr) {
         int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking;
-        int start, end, start_copy;
-        int work_amount = MB * jcp.ngroups * oc_chunks * jcp.od * jcp.oh;
+        int start{0}, end{0}, start_copy;
+        int work_amount = MB * jcp.ngroups * oc_chunks * jcp.od * jcp.oh
+            * jcp.nb_ow;
         balance211(work_amount, nthr, ithr, start, end);
         start_copy = start;
 
@@ -235,18 +398,19 @@ void _jit_avx512_common_convolution_fwd_t
         size_t wht_d_stride = wht_blk_off(weights_d, 0, 0, 0, 1);
         size_t wht_h_stride = wht_blk_off(weights_d, 0, 0, 0, 0, 1);
         size_t wht_ic_stride = wht_blk_off(weights_d, 0, 0, 1);
+
         for (int icb_l2 = 0 ; icb_l2 < jcp.nb_ic; icb_l2 += jcp.nb_ic_L2) {
             start = start_copy;
-            int n{0}, g{0}, occ{0}, oh_s{0}, od_s{0};
+            int n{0}, g{0}, occ{0}, oh_s{0}, od_s{0}, owb{0};
 
-            if (jcp.loop_order == loop_cgn)
+            if (jcp.loop_order == loop_cwgn)
                 nd_iterator_init(start,
-                    occ, oc_chunks, g, jcp.ngroups, n, MB, od_s, jcp.od,
-                    oh_s, jcp.oh);
-            else if (jcp.loop_order == loop_gnc)
+                    occ, oc_chunks, owb, jcp.nb_ow, g, jcp.ngroups, n, MB,
+                    od_s, jcp.od, oh_s, jcp.oh);
+            else if (jcp.loop_order == loop_gncw)
                 nd_iterator_init(start,
-                    g, jcp.ngroups, n, MB, occ, oc_chunks, od_s, jcp.od,
-                    oh_s, jcp.oh);
+                    g, jcp.ngroups, n, MB, occ, oc_chunks, owb, jcp.nb_ow,
+                    od_s, jcp.od, oh_s, jcp.oh);
             else
                 assert(!"unsupported loop order");
 
@@ -258,6 +422,8 @@ void _jit_avx512_common_convolution_fwd_t
 
                 int work_rem = end - start;
                 int ih_s = -jcp.t_pad + oh_s * jcp.stride_h;
+                int ow_s = owb * jcp.ow_block;
+                int iw_s = ow_s * jcp.stride_w;
                 int oh_e = oh_s + work_rem > jcp.oh ? jcp.oh : oh_s + work_rem;
 
                 int id_s = -jcp.f_pad + od_s * jcp.stride_d;
@@ -271,9 +437,9 @@ void _jit_avx512_common_convolution_fwd_t
                     jcp.kd - d_t_overflow - d_b_overflow);
 
                 auto bias_w = bias ? bias + bias_d.blk_off(g_oc) : 0;
-                auto dst_w = dst + dst_d.blk_off(n, g_ocb, od_s, oh_s);
-                auto src_w = src + src_d.blk_off(n, g_icb + icb_l2, id_s, ih_s)
-                        + d_t_overflow * dilate_d * src_d_stride;
+                auto dst_w = dst + dst_d.blk_off(n, g_ocb, od_s, oh_s, ow_s);
+                auto src_w = src + src_d.blk_off(n, g_icb + icb_l2, id_s, ih_s,
+                    iw_s) + d_t_overflow * dilate_d * src_d_stride;
                 auto wht_w = weights + wht_blk_off(weights_d, g, ocb, icb_l2)
                     + d_t_overflow * wht_d_stride;
 
@@ -292,10 +458,14 @@ void _jit_avx512_common_convolution_fwd_t
                                 dilate_h);
                         int kh_padding = nstl::max(0,
                             jcp.kh - i_t_overflow - i_b_overflow);
-                        jit_conv_3d_ker_pipeline(kernel_->jit_ker, par_conv,
-                                src_c + i_t_overflow * dilate_h * src_h_stride,
-                                dst_c, wht_w + i_t_overflow * wht_h_stride,
-                                bias_w, icb, kh_padding, kd_padding);
+
+                        int oc_off = g_oc * sizeof(dst_data_t);
+
+                        jit_conv_3d_ker_pipeline_ow_thr(kernel_->jit_ker,
+                            par_conv,
+                            src_c + i_t_overflow * dilate_h * src_h_stride,
+                            dst_c, wht_w + i_t_overflow * wht_h_stride,
+                            bias_w, icb, kh_padding, kd_padding, owb, oc_off);
 
                         src_c += src_h_stride * jcp.stride_h;
                         dst_c += dst_h_stride;
@@ -304,20 +474,20 @@ void _jit_avx512_common_convolution_fwd_t
                     wht_w += wht_ic_stride;
                 }
 
-                if (jcp.loop_order == loop_cgn)
+                if (jcp.loop_order == loop_cwgn)
                     nd_iterator_jump(start, end,
-                      occ, oc_chunks, g, jcp.ngroups, n, MB, od_s, jcp.od,
-                      oh_s, jcp.oh);
-                else if (jcp.loop_order == loop_gnc)
+                      occ, oc_chunks, owb, jcp.nb_ow, g, jcp.ngroups, n, MB,
+                      od_s, jcp.od, oh_s, jcp.oh);
+                else if (jcp.loop_order == loop_gncw)
                     nd_iterator_jump(start, end,
-                      g, jcp.ngroups, n, MB, occ, oc_chunks, od_s, jcp.od,
-                      oh_s, jcp.oh);
+                      g, jcp.ngroups, n, MB, occ, oc_chunks, owb, jcp.nb_ow,
+                      od_s, jcp.od, oh_s, jcp.oh);
                 else
                     assert(!"unsupported loop order");
             }
         }
         jit_conv_3d_ker_pipeline(kernel_->jit_ker, par_conv,
-                src, dst, weights, bias, 0, 0, 0);
+                src, dst, weights, bias, 0, 0, 0, 0);
     });
 }
 
@@ -331,7 +501,85 @@ template struct _jit_avx512_common_convolution_fwd_t<true, data_type::s16,
 template <data_type_t diff_dst_type, data_type_t wei_type,
           data_type_t diff_src_type>
 void jit_avx512_common_convolution_bwd_data_t<diff_dst_type, wei_type,
-          diff_src_type>::execute_backward_data() {
+          diff_src_type>::execute_backward_data_1d() {
+    auto diff_dst = reinterpret_cast<const diff_dst_data_t *>
+                                                       (this->input_memory(0));
+    auto weights = reinterpret_cast<const wei_data_t *>(this->input_memory(1));
+    auto diff_src = reinterpret_cast<diff_src_data_t*>(this->memory());
+
+    const memory_desc_wrapper diff_dst_d(conf_.diff_dst_pd());
+    const memory_desc_wrapper diff_src_d(conf_.diff_src_pd());
+    const memory_desc_wrapper weights_d(conf_.weights_pd(0));
+
+    const auto &jcp = kernel_->jcp;
+
+    parallel(0, [&](const int ithr, const int nthr) {
+        int start{0}, end{0}, start_copy;
+        int ic_chunks = jcp.nb_ic / jcp.nb_ic_blocking;
+        int work_amount = jcp.ngroups * jcp.mb * ic_chunks * jcp.ih;
+        balance211(work_amount, nthr, ithr, start, end);
+        start_copy = start;
+
+        auto par_conv = jit_conv_call_s();
+        size_t diff_dst_c_stride = diff_dst_d.blk_off(0, 1);
+        size_t wht_oc_stride = wht_blk_off(weights_d, 0, 1);
+
+        for (int ocb_l2 = 0; ocb_l2 < jcp.nb_oc; ocb_l2 += jcp.nb_oc_L2) {
+            start = start_copy;
+            int n{0}, g{0}, icc{0};
+            if (jcp.loop_order == loop_cgn) {
+                int dummy{0};
+                nd_iterator_init(start, icc, ic_chunks, g, jcp.ngroups, n,
+                        jcp.mb, dummy, 1);
+            } else if (jcp.loop_order == loop_gnc) {
+                int dummy{0};
+                nd_iterator_init(start, g, jcp.ngroups, n, jcp.mb, icc,
+                        ic_chunks, dummy, 1);
+            } else {
+                assert(!"unsupported loop order");
+            }
+
+            while (start < end) {
+                int icb = icc * jcp.nb_ic_blocking;
+                int g_icb = g * jcp.nb_ic + icb;
+                int g_ocb = g * jcp.nb_oc;
+
+                auto diff_src_w = diff_src + diff_src_d.blk_off(n, g_icb);
+                auto diff_dst_w = diff_dst
+                    + diff_dst_d.blk_off(n, g_ocb + ocb_l2);
+                auto wht_w = weights + wht_blk_off(weights_d, g, ocb_l2, icb);
+
+                for (int ocb = ocb_l2;
+                      ocb < min(jcp.nb_oc, ocb_l2 + jcp.nb_oc_L2); ++ocb) {
+                    jit_conv_ker_pipeline(kernel_->jit_ker, par_conv,
+                            diff_src_w, diff_dst_w, wht_w, 0, ocb, 1, 0);
+                    diff_dst_w += diff_dst_c_stride;
+                    wht_w += wht_oc_stride;
+                }
+
+                if (jcp.loop_order == loop_cgn) {
+                    int dummy{0};
+                    nd_iterator_jump(start, end, icc, ic_chunks, g, jcp.ngroups,
+                            n, jcp.mb, dummy, 1);
+                } else if (jcp.loop_order == loop_gnc) {
+                    int dummy{0};
+                    nd_iterator_jump(start, end, g, jcp.ngroups, n, jcp.mb, icc,
+                            ic_chunks, dummy, 1);
+                } else {
+                    assert(!"unsupported loop order");
+                }
+            }
+        }
+
+        jit_conv_ker_pipeline(kernel_->jit_ker, par_conv,
+                diff_src, diff_dst, weights, 0, 0, 1, 0);
+    });
+}
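`nd_iterator_init` and `nd_iterator_jump` treat the flat work index as a mixed-radix number over the nested loop dimensions, so each thread can resume mid-nest. A fixed-arity sketch of the decomposition (the real helpers are variadic):

    #include <cstdio>

    // Decompose a flat index into three coordinates, innermost last,
    // i.e. start == (x0 * X1 + x1) * X2 + x2.
    static void nd_iterator_init_3(int start, int &x0, int X0,
            int &x1, int X1, int &x2, int X2) {
        x2 = start % X2; start /= X2;
        x1 = start % X1; start /= X1;
        x0 = start % X0; (void)X0;
    }

    int main() {
        int icc, g, n;
        nd_iterator_init_3(13, icc, /*ic_chunks=*/4, g, /*ngroups=*/2,
                n, /*mb=*/3);
        printf("icc=%d g=%d n=%d\n", icc, g, n); // 2 0 1
        return 0;
    }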
+
+template <data_type_t diff_dst_type, data_type_t wei_type,
+          data_type_t diff_src_type>
+void jit_avx512_common_convolution_bwd_data_t<diff_dst_type, wei_type,
+          diff_src_type>::execute_backward_data_2d() {
     auto diff_dst = reinterpret_cast<const diff_dst_data_t *>
                                                        (this->input_memory(0));
     auto weights = reinterpret_cast<const wei_data_t *>(this->input_memory(1));
@@ -345,7 +593,7 @@ void jit_avx512_common_convolution_bwd_data_t<diff_dst_type, wei_type,
     const int MB = conf_.MB();
 
     parallel(0, [&](const int ithr, const int nthr) {
-        int start, end, start_copy;
+        int start{0}, end{0}, start_copy;
         int ic_chunks = jcp.nb_ic / jcp.nb_ic_blocking;
         int work_amount = jcp.ngroups * MB * ic_chunks * jcp.ih;
         balance211(work_amount, nthr, ithr, start, end);
@@ -470,7 +718,7 @@ void jit_avx512_common_convolution_bwd_data_t<diff_dst_type, wei_type,
     const int MB = conf_.MB();
 
     parallel(0, [&](const int ithr, const int nthr) {
-        int start, end, start_copy;
+        int start{0}, end{0}, start_copy;
         int ic_chunks = jcp.nb_ic / jcp.nb_ic_blocking;
         int work_amount = jcp.ngroups * MB * ic_chunks * jcp.id * jcp.ih;
         balance211(work_amount, nthr, ithr, start, end);
@@ -601,7 +849,7 @@ void jit_avx512_common_convolution_bwd_data_t<diff_dst_type, wei_type,
                                 diff_src_w + ij * diff_src_h_stride,
                                 diff_dst_w + oj * diff_dst_h_stride,
                                 wht_w + k_lo * wht_h_stride,
-                                0, ocb, k_len, d_len);
+                                0, ocb, k_len, d_len, 0);
                     }
                     diff_dst_w += diff_dst_c_stride;
                     wht_w += wht_oc_stride;
@@ -621,7 +869,7 @@ void jit_avx512_common_convolution_bwd_data_t<diff_dst_type, wei_type,
         }
 
         jit_conv_3d_ker_pipeline(kernel_->jit_ker, par_conv,
-                diff_src, diff_dst, weights, 0, 0, 1, 1);
+                diff_src, diff_dst, weights, 0, 0, 1, 1, 0);
     });
 }
 
@@ -732,10 +980,10 @@ struct jit_avx512_common_convolution_bwd_weights_t<src_type, diff_dst_type,
     int ithr_but_oc;
     int ithr_but_ic;
 
-    int img_start, img_end, img_work;
-    int g_start, g_end, g_work;
-    int oc_b_start, oc_b_end, oc_b_work;
-    int ic_b_start, ic_b_end, ic_b_work;
+    int img_start = 0, img_end = 0, img_work;
+    int g_start = 0, g_end = 0, g_work;
+    int oc_b_start = 0, oc_b_end = 0, oc_b_work;
+    int ic_b_start = 0, ic_b_end = 0, ic_b_work;
 
     thread_info_t(const jit_avx512_common_convolution_bwd_weights_t *self,
             int ithr): ithr(ithr) {
@@ -961,7 +1209,7 @@ void jit_avx512_common_convolution_bwd_weights_t<src_type, diff_dst_type,
         tr_ctx.tr_src = tr_src_
             + ti->ithr_but_oc * jcp.ih * jcp.stride_w * jcp.tr_ld;
 
-        assert(utils::implication(!mkldnn_thr_syncable(), nthr_oc_b_ == 1));
+        assert(IMPLICATION(!mkldnn_thr_syncable(), nthr_oc_b_ == 1));
         tr_ctx.nthr_oc_b = nthr_oc_b_;
         int ih_start{0}, ih_end{0};
         balance211(jcp.ih, nthr_oc_b_, ti->ithr_oc_b, ih_start, ih_end);
@@ -1081,7 +1329,8 @@ void jit_avx512_common_convolution_bwd_weights_t<src_type, diff_dst_type,
     const memory_desc_wrapper diff_weights_d(conf_.diff_weights_pd(0));
 
     const auto &jcp = kernel_->jcp;
-    const int wei_size = jcp.ngroups * jcp.oc * jcp.ic * jcp.kh*jcp.kw*jcp.kd;
+    const int wei_size
+            = jcp.ngroups * jcp.oc * jcp.ic * jcp.kh * jcp.kw * jcp.kd;
 
     diff_weights_data_t *diff_wei = ti->ithr_mb == 0
         ? (diff_weights_data_t*)ti->diff_weights
@@ -1105,38 +1354,42 @@ void jit_avx512_common_convolution_bwd_weights_t<src_type, diff_dst_type,
         int work_rem = img_end - img_start;
         const int od_e = od_s + work_rem > jcp.od ? jcp.od : od_s + work_rem;
         const int id_s = od_s * jcp.stride_d;
-        const int idp = jcp.id + jcp.f_pad + jcp.back_pad;
-
-        if (id_s < idp - jcp.back_pad - (jcp.kd - 1) * (jcp.dilate_d)) {
-            for (int g = ti->g_start; g < ti->g_end; ++g) {
-            for (int oc_b = ti->oc_b_start; oc_b < ti->oc_b_end; ++oc_b) {
-            for (int ic_b = ti->ic_b_start; ic_b < ti->ic_b_end; ++ic_b) {
-                const int _oc = g * jcp.nb_oc + oc_b;
-                const int _ic = g * jcp.nb_ic + ic_b;
-
-                auto src = &ti->src[src_d.blk_off(img, _ic)
-                        + od_s * input_step];
-                auto dst = &ti->diff_dst[diff_dst_d.blk_off(img, _oc)
+        const int ik_overlap = nstl::max(0, id_s - jcp.f_pad);
+        const int kd_front_pad = nstl::max(0, jcp.f_pad - id_s);
+        const int kd_back_pad = nstl::max(0, id_s + 1 + jcp.back_pad - jcp.od);
+        int kd_pad_off = kd_front_pad * jcp.kh * jcp.kw * jcp.ic_block
+                * jcp.oc_block * jcp.typesize_out;
+
+        for (int g = ti->g_start; g < ti->g_end; ++g) {
+        for (int oc_b = ti->oc_b_start; oc_b < ti->oc_b_end; ++oc_b) {
+        for (int ic_b = ti->ic_b_start; ic_b < ti->ic_b_end; ++ic_b) {
+            const int _oc = g * jcp.nb_oc + oc_b;
+            const int _ic = g * jcp.nb_ic + ic_b;
+
+            auto src = &ti->src[src_d.blk_off(img, _ic)
+                    + ik_overlap * input_step];
+            auto dst = &ti->diff_dst[diff_dst_d.blk_off(img, _oc)
                     + od_s * output_step];
 
-                jit_conv_3d_ker_pipeline(kernel_->jit_ker, p, src, dst,
+            jit_conv_3d_ker_bwd_w_pipeline(kernel_->jit_ker, p, src, dst,
                     diff_wei + wht_blk_off(diff_weights_d, g, oc_b, ic_b),
-                    diff_bia + _oc*16, (img == img_first), od_e-od_s, id_s );
-                if (ic_b == 0) p.flags = 0;
-                else p.flags = 1;
-            }
-            }
-            }
+                    diff_bia + _oc * 16, (img == img_first), od_s, od_e,
+                    jcp.kd - nstl::max(kd_front_pad, kd_back_pad), kd_pad_off);
 
-            const int _oc = ti->g_start * jcp.nb_oc + ti->oc_b_start;
-            const int _ic = ti->g_start * jcp.nb_ic + ti->ic_b_start;
-            jit_conv_3d_ker_pipeline(kernel_->jit_ker, p,
-                    &ti->src[src_d.blk_off(img + 1, _ic)],
-                    &ti->diff_dst[diff_dst_d.blk_off(img + 1, _oc)],
-                    diff_wei + wht_blk_off(diff_weights_d, ti->g_start,
-                        ti->oc_b_start, ti->ic_b_start),
-                    diff_bia, 0, 0, 0);
+            if (ic_b == 0) p.flags = 0;
+            else p.flags = 1;
+        }
+        }
         }
+
+        const int _oc = ti->g_start * jcp.nb_oc + ti->oc_b_start;
+        const int _ic = ti->g_start * jcp.nb_ic + ti->ic_b_start;
+        jit_conv_3d_ker_bwd_w_pipeline(kernel_->jit_ker, p,
+                &ti->src[src_d.blk_off(img + 1, _ic)],
+                &ti->diff_dst[diff_dst_d.blk_off(img + 1, _oc)],
+                diff_wei + wht_blk_off(diff_weights_d, ti->g_start,
+                    ti->oc_b_start, ti->ic_b_start),
+                diff_bia, 0, 0, 0, 0, 0);
         nd_iterator_jump(img_start, img_end, img, jcp.mb, od_s, jcp.od);
     }
 }
@@ -1337,14 +1590,16 @@ void jit_avx512_common_convolution_bwd_weights_t<src_type, diff_dst_type,
 
         thread_info_t thread_info(this, ithr);
 
-        if (conf_.ndims() == 4) {
+        if (utils::one_of(conf_.ndims(), 3, 4)) {
             compute_diff_weights(&thread_info);
             if (nthr_mb_ > 1) reduce_diff_weights(&thread_info);
             if (conf_.with_bias()) compute_diff_bias(&thread_info);
-        } else {
+        } else if (conf_.ndims() == 5) {
             compute_diff_weights_3d(&thread_info);
             if (nthr_mb_ > 1) reduce_diff_weights_3d(&thread_info);
             if (conf_.with_bias()) compute_diff_bias_3d(&thread_info);
+        } else {
+            assert(false);
         }
     });
 
@@ -1483,7 +1738,7 @@ void jit_avx512_common_convolution_bwd_weights_t<src_type, diff_dst_type,
         nthr_mb_ = min(j.mb * j.od, max_threads);
     nthr_ = nthr_mb_ * nthr_g_ * nthr_oc_b_ * nthr_ic_b_;
     assert(nthr_ <= max_threads);
-    assert(utils::implication(!mkldnn_thr_syncable(), nthr_mb_ == 1));
+    assert(IMPLICATION(!mkldnn_thr_syncable(), nthr_mb_ == 1));
 }
 
 template struct jit_avx512_common_convolution_bwd_weights_t<data_type::f32>;
index 466548c..42080cc 100644
@@ -59,7 +59,7 @@ struct _jit_avx512_common_convolution_fwd_t : public cpu_primitive_t {
                     && this->cdesc_().src_desc.data_type == src_type
                     && this->cdesc_().weights_desc.data_type == wei_type
                     && this->cdesc_().dst_desc.data_type == dst_type
-                    && utils::implication(this->with_bias(), dst_type
+                    && IMPLICATION(this->with_bias(), dst_type
                                        == this->cdesc_().bias_desc.data_type)
                     && !(with_relu && this->negative_slope()!= 0.
                                    && dst_type == data_type::s32
@@ -106,13 +106,20 @@ struct _jit_avx512_common_convolution_fwd_t : public cpu_primitive_t {
 
     virtual void execute(event_t *e)
     {
-        if (conf_.ndims() == 4) execute_forward();
-        else                    execute_forward_3d();
+        if (conf_.ndims() == 3)
+            execute_forward_1d();
+        else if (conf_.ndims() == 4)
+            execute_forward_2d();
+        else if (conf_.ndims() == 5)
+            execute_forward_3d();
+        else
+            assert(false);
         e->set_state(event_t::ready);
     }
 
 private:
-    void execute_forward();
+    void execute_forward_1d();
+    void execute_forward_2d();
     void execute_forward_3d();
     pd_t conf_;
     jit_avx512_common_conv_fwd_kernel *kernel_;
@@ -169,7 +176,7 @@ struct jit_avx512_common_convolution_bwd_data_t: public cpu_primitive_t {
         inline memory_format_t src_format()
         {
             using namespace memory_format;
-            return (ndims() == 4) ? nChw16c : nCdhw16c;
+            return utils::pick(ndims() - 3, nCw16c, nChw16c, nCdhw16c);
         }
         inline memory_format_t wei_format()
         {
@@ -179,9 +186,11 @@ struct jit_avx512_common_convolution_bwd_data_t: public cpu_primitive_t {
                 && wei_type == data_type::s16) {
                 return  this->with_groups() ? gOIhw8o16i2o : OIhw8o16i2o;
             } else {
-                return (ndims() == 4)
-                    ? this->with_groups() ? gOIhw16o16i : OIhw16o16i
-                    : this->with_groups() ? gOIdhw16o16i : OIdhw16o16i;
+                return this->with_groups()
+                    ? utils::pick(ndims() - 3, gOIw16o16i, gOIhw16o16i,
+                          gOIdhw16o16i)
+                    : utils::pick(ndims() - 3, OIw16o16i, OIhw16o16i,
+                          OIdhw16o16i);
             }
         }
 
@@ -216,8 +225,14 @@ struct jit_avx512_common_convolution_bwd_data_t: public cpu_primitive_t {
     virtual void execute(event_t *e) {
         switch (conf_.desc()->prop_kind) {
         case prop_kind::backward_data:
-            if (conf_.ndims() == 4) execute_backward_data();
-            else                    execute_backward_data_3d();
+            if (conf_.ndims() == 3)
+                execute_backward_data_1d();
+            else if (conf_.ndims() == 4)
+                execute_backward_data_2d();
+            else if (conf_.ndims() == 5)
+                execute_backward_data_3d();
+            else
+                assert(false);
             break;
         default:
             assert(!"invalid prop_kind");
@@ -226,7 +241,8 @@ struct jit_avx512_common_convolution_bwd_data_t: public cpu_primitive_t {
     }
 
 private:
-    void execute_backward_data();
+    void execute_backward_data_1d();
+    void execute_backward_data_2d();
     void execute_backward_data_3d();
     pd_t conf_;
     jit_avx512_common_conv_bwd_data_kernel_f32 *kernel_;
@@ -269,14 +285,16 @@ struct jit_avx512_common_convolution_bwd_weights_t: public cpu_primitive_t {
         inline memory_format_t src_format()
         {
             using namespace memory_format;
-            return (ndims() == 4) ? nChw16c : nCdhw16c;
+            return utils::pick(ndims() - 3, nCw16c, nChw16c, nCdhw16c);
         }
         inline memory_format_t wei_format()
         {
             using namespace memory_format;
-            return (ndims() == 4)
-                ? this->with_groups() ? gOIhw16o16i : OIhw16o16i
-                : this->with_groups() ? gOIdhw16o16i : OIdhw16o16i;
+            return this->with_groups()
+                ? utils::pick(ndims() - 3, gOIw16o16i, gOIhw16o16i,
+                      gOIdhw16o16i)
+                : utils::pick(ndims() - 3, OIw16o16i, OIhw16o16i,
+                      OIdhw16o16i);
         }
 
 
@@ -305,6 +323,8 @@ struct jit_avx512_common_convolution_bwd_weights_t: public cpu_primitive_t {
         delete kernel_;
         if (trans_kernel_)
             delete trans_kernel_;
+        if (trans_dst_kernel_)
+            delete trans_dst_kernel_;
         if (acc_ker_)
             delete acc_ker_;
         delete reducer_bias_;
@@ -315,6 +335,8 @@ struct jit_avx512_common_convolution_bwd_weights_t: public cpu_primitive_t {
 
         free(tr_src_bctx_);
         free(tr_diff_dst_bctx_);
+
+        free(tr_diff_dst_);
     }
 
     typedef typename prec_traits<src_type>::type src_data_t;
index f31e072..fbdf9eb 100644
@@ -233,12 +233,9 @@ struct _jit_avx512_common_convolution_winograd_fwd_t
                                this->cdesc_().src_desc.data_type,
                                this->cdesc_().weights_desc.data_type,
                                this->cdesc_().dst_desc.data_type)
-                    && utils::implication(this->with_bias(), data_type::f32
+                    && IMPLICATION(this->with_bias(), data_type::f32
                                        == this->cdesc_().bias_desc.data_type)
                     && mkldnn_thr_syncable();
-
-            ok = ok && this->dst_pd_.desc()->format == memory_format::nChw16c &&
-                 this->src_pd_.desc()->format == memory_format::nChw16c;
             if (!ok)
                 return status::unimplemented;
 
index 3597560..3d1701f 100644
@@ -263,13 +263,11 @@ struct jit_avx512_common_lrn_fwd_t::jit_avx512_common_lrn_kernel_f32:
         movq(xk, imm_addr64);
         vbroadcastss(zk, xk);
 
-        char tag = '\0';
         if (is_first || is_single) {
             vxorps(xmm2, xmm2, xmm2);
             for(int irb = 0; irb < FWD_RBC; irb++) {
                 vmovups(ptr[t + irb*BUFFER_BLOCK], xmm2);
             }
-            tag = 'f';
         }
         if (is_last || is_single) {
             vxorps(xmm2, xmm2, xmm2);
@@ -277,13 +275,12 @@ struct jit_avx512_common_lrn_fwd_t::jit_avx512_common_lrn_kernel_f32:
                 vmovups(ptr[t + irb*BUFFER_BLOCK + BUFFER_NEXT_OFFSET],
                     xmm2);
             }
-            tag = 'l';
         }
 
         int LSREST = LSB % FWD_RBC;
         int LS = LSB - LSREST;
 
-        jit_tagged_label lrn_loop("lrn_loop", tag);
+        Label lrn_loop;
 
         if (LS > 0) {
             mov(hw, LS);
@@ -675,26 +672,23 @@ struct jit_avx512_common_lrn_bwd_t::jit_avx512_common_lrn_kernel_f32:
         is_last  = J.version == +1 || J.version == +2;
         is_single = J.version == 3;
 
-        char tag = '\0';
         if (is_first || is_single) {
             vxorps(xmm1, xmm1, xmm1);
             for(int irb = 0; irb < BWD_RBC; irb++) {
                 vmovups(ptr[t + irb*BUFFER_BLOCK], xmm1);
             }
-            tag = 'f';
         }
         if (is_last || is_single) {
             vxorps(xmm1, xmm1, xmm1);
             for(int irb = 0; irb < BWD_RBC; irb++) {
                 vmovups(ptr[t + irb*BUFFER_BLOCK + BUFFER_NEXT_OFFSET], xmm1);
             }
-            tag = 'l';
         }
 
         int LSREST = LSB % BWD_RBC;
         int LS = LSB - LSREST;
 
-        jit_tagged_label lrn_loop("lrn_loop", tag);
+        Label lrn_loop;
 
         if (LS > 0) {
             mov(hw, LS);
index 39f602e..1239186 100644
@@ -236,7 +236,7 @@ struct jit_avx512_core_fp32_wino_conv_2x3_dst_trans_t: public jit_generator {
     Reg64 reg_oc_block = r8;
 
     Reg64 reg_ptr_bias = rbx;
-    Reg64 reg_ptr_scales = rcx;
+    Reg64 reg_ptr_scales = abi_not_param1;
     Reg64 reg_ptr_sum_scale = rdx;
 };
 
@@ -458,11 +458,11 @@ bool jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t::post_ops_ok(
    switch (p.len_) {
     case 0: return true;
     case 1: return true
-                && implication(jcp.with_relu, p.contain(sum, 0))
-                && implication(!jcp.with_relu, is_relu(0) || p.contain(sum, 0));
+                && IMPLICATION(jcp.with_relu, p.contain(sum, 0))
+                && IMPLICATION(!jcp.with_relu, is_relu(0) || p.contain(sum, 0));
     case 2: return true
-                && implication(jcp.with_relu, p.contain(sum, 0) && is_relu(1))
-                && implication(!jcp.with_relu, false
+                && IMPLICATION(jcp.with_relu, p.contain(sum, 0) && is_relu(1))
+                && IMPLICATION(!jcp.with_relu, false
                         || (p.contain(sum, 0) && is_relu(1))
                         || (p.contain(sum, 1) && is_relu(0)));
     case 3: return true
@@ -614,6 +614,14 @@ status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::init_conf(
     jcp.r = 3;
     jcp.alpha = jcp.m + jcp.r - 1;
     int simdw = 16;
+    jcp.src_fmt = src_d.format();
+    jcp.with_bias = cd.bias_desc.format != memory_format::undef;
+    jcp.with_relu = with_relu;
+    jcp.relu_negative_slope = relu_negative_slope;
+    if (!IMPLICATION(with_relu, relu_negative_slope == 0.))
+        return status::unimplemented;
+    if (!post_ops_ok(jcp, attr))
+        return status::unimplemented;
 
     bool ok_to_pad_channels = jcp.ngroups == 1;
     if (ok_to_pad_channels) {
@@ -621,6 +629,12 @@ status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::init_conf(
         jcp.ic = rnd_up(jcp.ic, simdw);
     }
 
+    if (src_d.format() != nChw16c
+            || dst_d.format() != nChw16c
+            || !IMPLICATION(jcp.with_bias,
+                bias_d.format() == x))
+        return status::unimplemented;
+
     jcp.ver = ver_avx512_core;
     if (!(mayiuse(avx512_core)))
         return status::unimplemented;
@@ -674,15 +688,6 @@ status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::init_conf(
         || (jcp.small_mb && sp_sz > 196))
         return unimplemented;
 
-    jcp.src_fmt = src_d.format();
-    jcp.with_bias = cd.bias_desc.format != memory_format::undef;
-    jcp.with_relu = with_relu;
-    jcp.relu_negative_slope = relu_negative_slope;
-    if (!implication(with_relu, relu_negative_slope == 0.))
-        return status::unimplemented;
-    if (!post_ops_ok(jcp, attr))
-        return status::unimplemented;
-
     jcp.bia_dt = jcp.with_bias ? cd.bias_desc.data_type : data_type::undef;
     jcp.dst_dt = cd.dst_desc.data_type;
 
@@ -809,7 +814,7 @@ status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::init_conf(
 
     const auto &oscales = attr.output_scales_;
     jcp.is_oc_scale = oscales.mask_ == 1 << 1;
-    assert(utils::implication(!jcp.is_oc_scale, oscales.mask_ == 0));
+    assert(IMPLICATION(!jcp.is_oc_scale, oscales.mask_ == 0));
 
     /* re-create weights primitive descriptor
                                     and set weights wino_blocking */
@@ -826,6 +831,7 @@ status_t jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t ::init_conf(
     wd.oc_block = jcp.oc_block;
     wd.oc2_block = jcp.n2_block;
     wd.ic2_block = 1;
+    wd.adj_scale = 1.f;
     size_t max_size = sizeof(float) * jcp.alpha * jcp.alpha * jcp.ic * jcp.oc;
     wd.size = max_size;
 
@@ -847,7 +853,7 @@ _jit_avx512_core_fp32_wino_conv_2x3_fwd_t<with_relu>::
         _jit_avx512_core_fp32_wino_conv_2x3_fwd_t(const pd_t *pd,
                 const input_vector &inputs, const output_vector &outputs)
     : cpu_primitive_t(&conf_, inputs, outputs)
-    , conf_(*pd) {
+    , conf_(*pd), padded_bias_(nullptr) {
     const int nthreads = mkldnn_get_max_threads();
     kernel_ = new jit_avx512_core_fp32_wino_conv_2x3_fwd_ker_t(
             conf_.jcp_, *conf_.attr());
@@ -884,9 +890,7 @@ _jit_avx512_core_fp32_wino_conv_2x3_fwd_t<with_relu>
 
     free(wino_src_);
     free(wino_dst_);
-    if (conf_.want_padded_bias()) {
-        free(padded_bias_);
-    }
+    free(padded_bias_);
 }
 
 template <bool with_relu>
index 17435bb..cd4d5da 100644
@@ -61,12 +61,9 @@ struct _jit_avx512_core_fp32_wino_conv_2x3_fwd_t : public cpu_primitive_t {
                     && this->cdesc_().src_desc.data_type == data_type::f32
                     && this->cdesc_().dst_desc.data_type == data_type::f32
                     && this->cdesc_().weights_desc.data_type == data_type::f32
-                    && utils::implication(this->with_bias(),
+                    && IMPLICATION(this->with_bias(),
                                utils::one_of(this->cdesc_().bias_desc.data_type,
                                        data_type::f32));
-
-            ok = ok && this->dst_pd_.desc()->format == memory_format::nChw16c &&
-                    this->src_pd_.desc()->format == memory_format::nChw16c;
             if (!ok)
                 return status::unimplemented;
 
index 617a2aa..4b9fbd6 100644
@@ -724,7 +724,7 @@ _execute_backward_weights_SDGtWo() {
        1.13777777777778f};
     float G_O_3x3_4x4[4] = {2.25f, 0.625f, 1.5f, 0.390625f};
 
-#pragma omp parallel firstprivate(trans_ker_p, I, T)
+#pragma omp parallel num_threads(nthreads) firstprivate(trans_ker_p, I, T)
 {
     if (jcp.with_bias) {
         parallel_nd_in_omp(nthreads, jcp.oc / simd_w,
@@ -878,8 +878,8 @@ _execute_backward_weights_S_D_Giot_W() {
     array_offset_calculator<float, 2> diff_bias_prv(
             (float *)(scratchpad_->bias_ptr()), nthreads, jcp.oc);
 
-    size_t input_starts[max_threads_number];
-    size_t input_ends[max_threads_number];
+    size_t input_starts[max_threads_number] = {0};
+    size_t input_ends[max_threads_number] = {0};
     size_t first_tblk = 0;
 
     auto trans_ker_p = jit_wino_transform_call_s();
index e2f71f8..e4ef286 100644
@@ -89,6 +89,8 @@ struct winograd_scratchpad_avx512_core_t {
                     * jcp.oc * sizeof(float);
                 break;
             case WSCHED_WEI_SDGtWo:
+                nthreads_ = nstl::min(mkldnn_get_max_threads(), jcp.tile_block);
+
                 U_sz_ = nthreads_
                     * (alpha * alpha * jcp.oc * (jcp.ic / jcp.nb_ic)
                       + jcp.ic * jcp.oc * jcp.kh * jcp.kw)
@@ -207,12 +209,9 @@ struct _jit_avx512_core_fp32_wino_conv_4x3_fwd_t
                                this->cdesc_().src_desc.data_type,
                                this->cdesc_().weights_desc.data_type,
                                this->cdesc_().dst_desc.data_type)
-                    && utils::implication(this->with_bias(), data_type::f32
+                    && IMPLICATION(this->with_bias(), data_type::f32
                                        == this->cdesc_().bias_desc.data_type)
                     && mkldnn_thr_syncable();
-
-            ok = ok && this->dst_pd_.desc()->format == memory_format::nChw16c &&
-                 this->src_pd_.desc()->format == memory_format::nChw16c;
             if (!ok)
                 return status::unimplemented;
 
index 1346d7f..831f182 100644
@@ -1374,12 +1374,12 @@ bool jit_avx512_core_fp32_wino_conv_4x3_fwd_kernel::post_ops_ok(
         return true; // no post_ops
     case 1:
         return true // relu or sum
-                && implication(jcp.with_eltwise, is_sum(0))
-                && implication(!jcp.with_eltwise, is_eltwise(0) || is_sum(0));
+                && IMPLICATION(jcp.with_eltwise, is_sum(0))
+                && IMPLICATION(!jcp.with_eltwise, is_eltwise(0) || is_sum(0));
     case 2:
         return true // sum->relu or relu->sum
-                && implication(jcp.with_eltwise, is_sum(0) && is_eltwise(1))
-                && implication(!jcp.with_eltwise, false
+                && IMPLICATION(jcp.with_eltwise, is_sum(0) && is_eltwise(1))
+                && IMPLICATION(!jcp.with_eltwise, false
                                    || (is_sum(0) && is_eltwise(1))
                                    || (is_eltwise(0) && is_sum(1)));
     case 3:
@@ -1460,6 +1460,7 @@ status_t jit_avx512_core_fp32_wino_conv_4x3_fwd_kernel::init_conf(
         wd.oc2_block = jcp.dimM_block * jcp.dimM_reg_block;
         size_t max_size = sizeof(float) * wd.alpha * wd.alpha * jcp.ic * jcp.oc;
         wd.size = max_size;
+        wd.adj_scale = 1.f;
 
         cpu_memory_t::pd_t new_weights_pd(
             weights_pd.engine(), &expect_wei_md);
index 28f42d5..eb9d7fd 100644
@@ -99,7 +99,7 @@ protected:
     reg64_t param = abi_param1;
 
     /* registers used for output_transform_data_ker */
-    reg64_t oreg_temp = rcx;
+    reg64_t oreg_temp = abi_not_param1;
     reg64_t oreg_Ow = r9;
     reg64_t oreg_src = r11;
     reg64_t oreg_tile_block = r12;
@@ -115,7 +115,7 @@ protected:
     reg64_t imm_addr64 = rax;
 
     /* registers used for input_transform_data_ker */
-    reg64_t ireg_temp = rcx;
+    reg64_t ireg_temp = abi_not_param1;
     reg64_t ireg_jtiles = rax;
     reg64_t ireg_itiles = rbx;
     reg64_t ireg_I = r8;
@@ -136,7 +136,7 @@ protected:
     reg64_t ireg_output = r15;
 
     /* registers used for wei transform */
-    reg64_t wreg_temp = rcx;
+    reg64_t wreg_temp = abi_not_param1;
     reg64_t wreg_F = r8;
     reg64_t wreg_src = r9;
     reg64_t wreg_MT = r15;
@@ -253,7 +253,7 @@ private:
     /*registers common to transforms*/
     reg64_t reg_transp = abi_param1;
     reg64_t reg_ti = rbx;
-    reg64_t reg_tj = rcx;
+    reg64_t reg_tj = abi_not_param1;
     reg64_t reg_src = r8;
     reg64_t reg_dst = r9;
     reg64_t reg_G = rsi; /*TODO: check if this is ok*/
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_convolution.cpp
deleted file mode 100644
index 90bf608..0000000
+++ /dev/null
@@ -1,165 +0,0 @@
-/*******************************************************************************
-* Copyright 2016-2018 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#include "mkldnn_types.h"
-#include "c_types_map.hpp"
-#include "mkldnn_thread.hpp"
-#include "type_helpers.hpp"
-#include "utils.hpp"
-
-#include "jit_avx512_core_u8s8s32x_convolution.hpp"
-
-namespace mkldnn {
-namespace impl {
-namespace cpu {
-
-using namespace mkldnn::impl::status;
-using namespace mkldnn::impl::memory_format;
-using namespace mkldnn::impl::utils;
-
-using namespace nstl;
-
-using jit_conv_ker_t = void (*)(jit_conv_call_s *);
-
-#define wht_blk_off(d, g, ...) \
-        (conf_.with_groups() \
-         ? (d).blk_off((g), __VA_ARGS__) \
-         : (d).blk_off(__VA_ARGS__))
-
-template <bool with_relu, data_type_t dst_type>
-void _jit_avx512_core_u8s8s32x_convolution_fwd_t<with_relu, dst_type>::
-execute_forward()
-{
-    auto src = reinterpret_cast<const src_data_t *>(this->input_memory(0));
-    auto weights = reinterpret_cast<const wei_data_t *>(this->input_memory(1));
-    auto bias = reinterpret_cast<const char *>(this->input_memory(2));
-    auto dst = reinterpret_cast<dst_data_t *>(this->memory());
-
-    const memory_desc_wrapper src_d(conf_.src_pd());
-    const memory_desc_wrapper dst_d(conf_.dst_pd());
-    const memory_desc_wrapper weights_d(conf_.weights_pd(0));
-    const memory_desc_wrapper bias_d(conf_.weights_pd(1));
-
-    const size_t bia_dt_size = conf_.with_bias()
-        ? types::data_type_size(conf_.cdesc()->bias_desc.data_type) : 0;
-
-    const auto &jcp = kernel_->jcp;
-    assert(jcp.nb_oc % jcp.nb_oc_blocking == 0);
-
-    const auto &oscales = conf_.attr()->output_scales_;
-
-    parallel(0, [&](const int ithr, const int nthr) {
-        int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking;
-        int nb_groups = jcp.nb_ch;
-        int group_block = jcp.ch_block;
-
-        int start{0}, end{0};
-        int work_amount = jcp.mb * nb_groups * oc_chunks * jcp.oh;
-        balance211(work_amount, nthr, ithr, start, end);
-
-        auto p = jit_conv_call_s();
-
-        size_t src_h_stride = src_d.blk_off(0, 0, 1);
-        size_t dst_h_stride = dst_d.blk_off(0, 0, 1);
-        size_t wht_h_stride = wht_blk_off(weights_d, 0, 0, 0, 1);
-
-        int n{0}, gb{0}, occ{0}, oh_s{0};
-        if (jcp.loop_order == loop_cgn)
-            nd_iterator_init(start, occ, oc_chunks, gb, nb_groups, n, jcp.mb,
-                    oh_s, jcp.oh);
-        else if (jcp.loop_order == loop_gnc)
-            nd_iterator_init(start, gb, nb_groups, n, jcp.mb, occ, oc_chunks,
-                    oh_s, jcp.oh);
-        else if (jcp.loop_order == loop_ngc)
-            nd_iterator_init(start, n, jcp.mb, gb, nb_groups, occ, oc_chunks,
-                    oh_s, jcp.oh);
-        else
-            assert(!"unsupported loop order");
-        while (start < end) {
-            int ocb = occ * jcp.nb_oc_blocking;
-            int g = gb * group_block;
-            int g_oc = (g * jcp.nb_oc + ocb) * jcp.oc_block;
-
-            int g_ic = g * jcp.nb_ic * jcp.oc_block;
-
-            int work_rem = end - start;
-            int ih_s = -jcp.t_pad + oh_s * jcp.stride_h;
-            int oh_e = oh_s + work_rem > jcp.oh ? jcp.oh : oh_s + work_rem;
-
-            auto bias_w = bias ? bias + (bias_d.blk_off(g_oc) * bia_dt_size) : 0;
-
-            auto dst_w = dst + dst_d.blk_off(n, g_oc, oh_s);
-            auto src_w = src + src_d.blk_off(n, g_ic, ih_s);
-            auto wht_w = weights + wht_blk_off(weights_d, gb, ocb, 0);
-
-            auto scales = &oscales.scales_[jcp.is_oc_scale * g_oc];
-
-            for (int oj = oh_s, ij = ih_s;
-                    oj < oh_e; ++oj, ij += jcp.stride_h)
-            {
-                int dilate_h = jcp.dilate_h + 1;
-                int i_t_overflow = div_up(max(0, -ij), dilate_h);
-                int i_b_overflow = div_up(
-                        max(0, ij - jcp.ih + (jcp.kh - 1) * dilate_h + 1),
-                        dilate_h);
-                int kh_padding = nstl::max(0,
-                    jcp.kh - i_t_overflow - i_b_overflow);
-
-                p.src = src_w + i_t_overflow * dilate_h * src_h_stride;
-                p.dst = dst_w;
-                p.filt = wht_w + i_t_overflow * wht_h_stride;
-                p.bias = bias_w;
-                p.oc_blocks = jcp.is_depthwise ? gb : ocb;
-                p.kh_padding = kh_padding;
-                p.scales = scales;
-
-                kernel_->jit_ker(&p);
-
-                src_w += src_h_stride * jcp.stride_h;
-                dst_w += dst_h_stride;
-            }
-            if (jcp.loop_order == loop_cgn)
-                nd_iterator_jump(start, end, occ, oc_chunks, gb, nb_groups, n,
-                        jcp.mb, oh_s, jcp.oh);
-            else if (jcp.loop_order == loop_gnc)
-                nd_iterator_jump(start, end, gb, nb_groups, n, jcp.mb, occ,
-                        oc_chunks, oh_s, jcp.oh);
-            else if (jcp.loop_order == loop_ngc)
-                nd_iterator_jump(start, end, n, jcp.mb, gb, nb_groups, occ,
-                        oc_chunks, oh_s, jcp.oh);
-            else
-                assert(!"unsupported loop order");
-        }
-    });
-}
-
-template struct _jit_avx512_core_u8s8s32x_convolution_fwd_t<false, data_type::u8>;
-template struct _jit_avx512_core_u8s8s32x_convolution_fwd_t<true, data_type::u8>;
-
-template struct _jit_avx512_core_u8s8s32x_convolution_fwd_t<false, data_type::s8>;
-template struct _jit_avx512_core_u8s8s32x_convolution_fwd_t<true, data_type::s8>;
-
-template struct _jit_avx512_core_u8s8s32x_convolution_fwd_t<false, data_type::s32>;
-template struct _jit_avx512_core_u8s8s32x_convolution_fwd_t<true, data_type::s32>;
-
-template struct _jit_avx512_core_u8s8s32x_convolution_fwd_t<false, data_type::f32>;
-template struct _jit_avx512_core_u8s8s32x_convolution_fwd_t<true, data_type::f32>;
-
-}
-}
-}
-
-// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
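Note: both the convolution driver removed above and the deconvolution driver added below flatten the (mb, group, oc-chunk, oh) space into one work amount and split it across threads with balance211(). A minimal stand-alone sketch of that partitioning contract (assuming the usual "first work % nthr threads take one extra item" split; the real helper lives in mkl-dnn's utils):

    #include <algorithm>
    #include <cstdio>

    // Sketch of a balance211-style split: thread ithr of nthr gets the
    // half-open range [start, end); the first (work % nthr) threads get
    // one extra item so per-thread loads differ by at most one.
    static void balance211_sketch(int work, int nthr, int ithr,
                                  int &start, int &end) {
        int base = work / nthr;
        int extra = work % nthr;
        start = ithr * base + std::min(ithr, extra);
        end = start + base + (ithr < extra ? 1 : 0);
    }

    int main() {
        for (int ithr = 0; ithr < 4; ++ithr) {
            int start, end;
            balance211_sketch(10, 4, ithr, start, end);
            std::printf("thr %d: [%d, %d)\n", ithr, start, end); // 3,3,2,2 items
        }
        return 0;
    }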
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_deconvolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_deconvolution.cpp
new file mode 100644 (file)
index 0000000..6ea1542
--- /dev/null
@@ -0,0 +1,602 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "jit_avx512_core_u8s8s32x_deconvolution.hpp"
+
+#define GET_OFF(field) offsetof(jit_deconv_call_s, field)
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using namespace mkldnn::impl::status;
+using namespace mkldnn::impl::memory_format;
+using namespace mkldnn::impl::utils;
+
+using namespace nstl;
+
+#define wht_blk_off(d, g, ...) \
+        (conf_.with_groups() \
+         ? (d).blk_off((g), __VA_ARGS__) \
+         : (d).blk_off(__VA_ARGS__))
+
+status_t jit_avx512_core_u8s8s32x_deconv_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
+        const deconvolution_desc_t &cd, cpu_memory_t::pd_t &src_pd,
+        cpu_memory_t::pd_t &weights_pd, cpu_memory_t::pd_t &dst_pd,
+        const bool with_bias, cpu_memory_t::pd_t &bias_pd,
+        const primitive_attr_t &attr) {
+    const memory_desc_wrapper src_d(&src_pd);
+    const memory_desc_wrapper dst_d(&dst_pd);
+    const memory_desc_wrapper weights_d(&weights_pd);
+    const memory_desc_wrapper bias_d(&bias_pd);
+
+    if (!(mayiuse(avx512_core) &&
+            src_d.data_type() == data_type::u8
+         && weights_d.data_type() == data_type::s8
+         && one_of(dst_d.data_type(), data_type::f32, data_type::s32,
+            data_type::s8, data_type::u8)))
+        return status::unimplemented;
+
+    jcp = zero<decltype(jcp)>();
+
+    const bool with_groups = weights_d.ndims() == src_d.ndims() + 1;
+
+    jcp.ngroups = with_groups ? weights_d.dims()[0] : 1;
+    jcp.oc = dst_d.dims()[1] / jcp.ngroups;
+    jcp.ic = src_d.dims()[1] / jcp.ngroups;
+    jcp.oc_without_padding = dst_d.dims()[1] / jcp.ngroups;
+    jcp.ic_without_padding = src_d.dims()[1] / jcp.ngroups;
+    jcp.is_depthwise = true && with_groups && utils::everyone_is(1,
+            jcp.ic_without_padding, jcp.oc_without_padding);
+
+    const auto w_format = with_groups
+        ? (jcp.is_depthwise ? Goihw16g : gOIhw4i16o4i)
+        : OIhw4i16o4i;
+
+    if (dst_d.format() == any)
+        CHECK(dst_pd.set_format(nhwc));
+    if (dst_d.format() != nhwc)
+        return status::unimplemented;
+    if (src_d.format() == any)
+        CHECK(src_pd.set_format(nhwc));
+    if (src_d.format() != nhwc)
+        return status::unimplemented;
+    if (weights_d.format() == any)
+        CHECK(weights_pd.set_format(w_format));
+    if (weights_d.format() != w_format)
+        return status::unimplemented;
+
+    jcp.with_bias = with_bias;
+    if (jcp.with_bias) {
+        if (bias_d.format() == any)
+            CHECK(bias_pd.set_format(x));
+        if (bias_d.format() != x)
+            return status::unimplemented;
+    }
+
+    jcp.ndims = dst_d.ndims();
+    jcp.prop_kind = cd.prop_kind;
+    jcp.mb = src_d.dims()[0];
+    jcp.ih = src_d.dims()[2];
+    jcp.iw = src_d.dims()[3];
+    jcp.oh = dst_d.dims()[2];
+    jcp.ow = dst_d.dims()[3];
+    jcp.kh = weights_d.dims()[with_groups + 2];
+    jcp.kw = weights_d.dims()[with_groups + 3];
+    jcp.t_pad = cd.padding[0][0];
+    jcp.l_pad = cd.padding[0][1];
+    jcp.stride_h = cd.strides[0];
+    jcp.stride_w = cd.strides[1];
+    jcp.src_fmt = src_d.format();
+    jcp.with_eltwise = false; /* TODO: support post-ops */
+
+    if (jcp.is_depthwise) {
+        jcp.ch_block = 16;
+        jcp.oc_block = 1;
+        jcp.ic_block = 1;
+    } else {
+        jcp.ch_block = 1;
+        jcp.oc_block = 16;
+        jcp.ic_block = 16;
+
+        if (jcp.ngroups == 1) {
+            jcp.oc = utils::rnd_up(jcp.oc_without_padding, jcp.oc_block);
+            jcp.ic = utils::rnd_up(jcp.ic_without_padding, jcp.ic_block);
+        }
+        if (jcp.ic % jcp.ic_block != 0)
+            return status::unimplemented;
+    }
+
+    jcp.dilate_h = cd.dilates[0];
+    jcp.dilate_w = cd.dilates[1];
+
+    if (!IMPLICATION(jcp.dilate_h, jcp.stride_h == 1)
+            || !IMPLICATION(jcp.dilate_w, jcp.stride_w == 1))
+            return status::unimplemented;
+
+    /* bottom and right padding */
+    jcp.b_pad = (jcp.ih - 1) * jcp.stride_h + (jcp.kh - 1) * (jcp.dilate_h + 1)
+            - (jcp.oh + jcp.t_pad - 1);
+    jcp.r_pad = (jcp.iw - 1) * jcp.stride_w + (jcp.kw - 1) * (jcp.dilate_w + 1)
+            - (jcp.ow + jcp.l_pad - 1);
+
+    if (!attr.post_ops_.has_default_values())
+        return status::unimplemented;
+
+    jcp.ver = ver_avx512_core;
+    if (mayiuse(avx512_core_vnni))
+        jcp.ver = ver_vnni;
+    const auto &oscales = attr.output_scales_;
+    jcp.is_oc_scale = oscales.mask_ == 1 << 1;
+
+    jcp.dst_dt = dst_d.data_type();
+    jcp.bia_dt = jcp.with_bias ? bias_d.data_type() : data_type::undef;
+    jcp.typesize_bia = jcp.with_bias ? types::data_type_size(bias_d.data_type()) : 0;
+    jcp.typesize_in = types::data_type_size(src_d.data_type());
+    jcp.typesize_out = types::data_type_size(dst_d.data_type());
+
+    jcp.nb_ch = div_up(jcp.ngroups, jcp.ch_block);
+    jcp.nb_oc = jcp.oc / jcp.oc_block;
+    jcp.nb_ic = jcp.ic / jcp.ic_block;
+
+    /*kernel blocking params*/
+    const int regs = jcp.ver == ver_vnni ? 31 : 29;
+    jcp.nb_oc_blocking = nstl::min(4, jcp.nb_oc);
+    for (; jcp.nb_oc_blocking > 1; jcp.nb_oc_blocking--)
+        if (jcp.nb_oc % jcp.nb_oc_blocking == 0
+                && jcp.l_pad <= regs / (jcp.nb_oc_blocking + 1))
+            break;
+
+    jcp.ur_w = regs / (jcp.nb_oc_blocking + 1);
+    int l_overflow = max(0, ((jcp.kw - 1) * (jcp.dilate_w + 1) - jcp.l_pad) / jcp.stride_w);
+    int r_overflow = max(0, ((jcp.kw - 1) * (jcp.dilate_w + 1)
+                     - max(0, jcp.r_pad)) / jcp.stride_w);
+    if (jcp.ow < jcp.ur_w)
+        jcp.ur_w = jcp.ow;
+    for (; jcp.ur_w > 1; jcp.ur_w--)
+        if (jcp.ur_w % jcp.stride_w == 0
+                && max(l_overflow,
+                    r_overflow - (jcp.ow % jcp.ur_w) / jcp.stride_w) * jcp.stride_w <= jcp.ur_w)
+            break;
+    jcp.ur_w_tail = jcp.ow % jcp.ur_w;
+
+    jcp.loop_order = jcp.ngroups > 1 ? loop_ngc : loop_cgn;
+    return status::success;
+}
+
+void jit_avx512_core_u8s8s32x_deconv_fwd_kernel::compute_ker(
+        int ur_w, int l_overflow, int r_overflow, ker_block_t last_block) {
+
+    int ch_block_all = jcp.ch_block * jcp.ic_block * jcp.oc_block;
+    int shift_src_ih = jcp.typesize_in * (jcp.dilate_h + 1)
+        * jcp.iw * jcp.ngroups * jcp.ic_without_padding;
+    int shift_filt_kh = jcp.typesize_in *  jcp.kw * jcp.stride_h * ch_block_all;
+
+    auto src_offset = [=] (int oj, int icb, int ki) {
+         return jcp.typesize_in *
+           (((oj + jcp.l_pad - ki * (jcp.dilate_w + 1)) / jcp.stride_w) * jcp.ngroups * jcp.ic_without_padding + icb * 4);
+    };
+
+    auto kernel_offset = [=] (int ocb, int icb, int ki) {
+        return jcp.typesize_in *
+            (ocb * jcp.nb_ic * jcp.kh * jcp.kw * ch_block_all + icb * jcp.oc_block * jcp.ic_block/4
+             + ki * ch_block_all);
+    };
+
+    auto compute = [=](zmm_t vreg_acc, zmm_t vreg_wei, zmm_t vreg_src) {
+        if (jcp.ver == ver_vnni) {
+            vpdpbusd(vreg_acc, vreg_src, vreg_wei);
+        } else if (jcp.is_depthwise) {
+            vpmulld(zmm_tmp, vreg_src, vreg_wei);
+            vpaddd(vreg_acc, vreg_acc, zmm_tmp);
+        } else {
+            vpmaddubsw(zmm_tmp, vreg_src, vreg_wei);
+            vpmaddwd(zmm_tmp, zmm_tmp, zmm_one);
+            vpaddd(vreg_acc, vreg_acc, zmm_tmp);
+        }
+    };
+
+    mov(aux_reg_src, reg_src);
+    mov(aux_reg_filt, reg_filt);
+    mov(reg_kj, reg_kh);
+    Xbyak::Label kh_loop_label;
+    L(kh_loop_label); {
+       for (int ki = 0; ki < jcp.kw; ki++) {
+           int jj_start = get_ow_start(ki, l_overflow);
+           int jj_end = get_ow_end(ur_w, ki, r_overflow);
+           int tail_size = jcp.ic_without_padding % 4;
+           int n_ic_blocks = jcp.is_depthwise
+                           ? 1
+                           : (last_block &  ~no_last_block
+                                   ? div_up(jcp.ic_without_padding % jcp.ic_block, 4)
+                                   : jcp.ic_block / 4);
+           for (int icb1 = 0; icb1 < n_ic_blocks; icb1++) {
+               for (int jj = jj_start; jj < jj_end; jj += jcp.stride_w) {
+                    assert((jj + jcp.l_pad - ki) % jcp.stride_w == 0);
+
+                   int aux_src_off = src_offset(jj, icb1, ki);
+                   if (jcp.is_depthwise) {
+                       vpmovzxbd(zmm_inp(jj, jcp.nb_oc_blocking),
+                                   EVEX_compress_addr(aux_reg_src, aux_src_off));
+                   } else if ((last_block & last_sp_block)
+                           && tail_size != 0 && icb1 == n_ic_blocks - 1) {
+                       xmm_t xmm_tmp = xmm_t(zmm_inp(jj, jcp.nb_oc_blocking).getIdx());
+                       for (int r = 0; r < tail_size; ++r)
+                           vpinsrb(xmm_tmp, xmm_tmp,
+                                   ptr[aux_reg_src + aux_src_off + r], r);
+                       vpbroadcastd(zmm_inp(jj, jcp.nb_oc_blocking), xmm_tmp);
+                   } else {
+                       vpbroadcastd(zmm_inp(jj, jcp.nb_oc_blocking),
+                               EVEX_compress_addr(aux_reg_src, aux_src_off));
+                   }
+               }
+
+               for (int ocb = 0; ocb < jcp.nb_oc_blocking; ocb++) {
+                   int aux_filt_off = kernel_offset(ocb, icb1, ki);
+                   if (jj_end - jj_start > 0) {
+                       if (jcp.is_depthwise)
+                           vpmovsxbd(zmm_wei,
+                               EVEX_compress_addr(aux_reg_filt, aux_filt_off));
+                       else
+                           vmovups(zmm_wei,
+                                   EVEX_compress_addr(aux_reg_filt, aux_filt_off));
+                   }
+                   for (int jj = jj_start; jj < jj_end; jj += jcp.stride_w) {
+                       compute(zmm_out(jj, ocb),
+                               zmm_wei, zmm_inp(jj, jcp.nb_oc_blocking));
+                   }
+               }
+           }
+       }
+       sub(aux_reg_src, shift_src_ih);
+       add(aux_reg_filt, shift_filt_kh);
+       dec(reg_kj);
+       cmp(reg_kj, 0);
+       jg(kh_loop_label, T_NEAR);
+    }
+}
+
+void jit_avx512_core_u8s8s32x_deconv_fwd_kernel::prepare_output(int ur_w) {
+    for (int ocb = 0; ocb < jcp.nb_oc_blocking; ocb++) {
+        for (int ur = 0; ur < ur_w; ur++) {
+                zmm_t zmm = zmm_out(ur, ocb);
+                vpxord(zmm, zmm, zmm);
+        }
+    }
+}
+
+void jit_avx512_core_u8s8s32x_deconv_fwd_kernel::cvt2ps(data_type_t type_in,
+        zmm_t zmm_in, const Xbyak::Operand &op, bool mask_flag) {
+    zmm_t zmm = mask_flag ? zmm_in | ktail_mask | T_z : zmm_in;
+    switch (type_in) {
+    case data_type::f32:
+    case data_type::s32: vmovups(zmm, op); break;
+    case data_type::s8: vpmovsxbd(zmm, op); break;
+    case data_type::u8: vpmovzxbd(zmm, op); break;
+    default: assert(!"unsupported data type");
+    }
+    if (type_in != data_type::f32)
+        vcvtdq2ps(zmm_in, zmm_in);
+}
+
+void jit_avx512_core_u8s8s32x_deconv_fwd_kernel::store_output(int ur_w, bool last_oc_block) {
+    mov(reg_bias, ptr[param1 + GET_OFF(bias)]);
+    mov(reg_ptr_scales, ptr[param1 + GET_OFF(scales)]);
+
+    vpxord(zmm_zero, zmm_zero, zmm_zero);
+    for (int ocb = 0; ocb < jcp.nb_oc_blocking; ocb++) {
+        const bool mask_flag = last_oc_block && ocb == jcp.nb_oc_blocking - 1;
+        int scale_offset = jcp.is_oc_scale * (sizeof(float) * ocb * jcp.oc_block);
+
+        auto zmm_bias = zmm_tmp;
+        if (jcp.with_bias) {
+            int bias_offset = jcp.typesize_bia * ocb * jcp.oc_block;
+            auto bias_addr = EVEX_compress_addr(reg_bias, bias_offset);
+            cvt2ps(jcp.bia_dt, zmm_bias, bias_addr, mask_flag);
+        }
+
+        for (int ur = 0; ur < ur_w; ur++) {
+            zmm_t zmm = zmm_out(ur, ocb);
+            vcvtdq2ps(zmm, zmm);
+            if (jcp.with_bias) vaddps(zmm, zmm, zmm_bias);
+            zmm_t mask_zmm = mask_flag
+                           ? zmm | ktail_mask | T_z
+                           : zmm;
+            vmulps(mask_zmm, zmm,
+                    EVEX_compress_addr(reg_ptr_scales, scale_offset));
+
+            if (jcp.dst_dt == data_type::u8) vmaxps(zmm, zmm_zero, zmm);
+
+            if (jcp.dst_dt != data_type::f32) {
+                if (attr_.round_mode_ == round_mode::nearest)
+                    vcvtps2dq(zmm | T_rn_sae, zmm);
+                else if (attr_.round_mode_ == round_mode::down)
+                    vcvtps2dq(zmm | T_rd_sae, zmm);
+                else
+                    assert(!"unimplemented");
+            }
+        }
+        for (int ur = 0; ur < ur_w; ur++) {
+            int aux_dst_off = jcp.typesize_out
+                * (ur * jcp.ngroups * jcp.oc_without_padding + ocb * jcp.oc_block);
+            auto addr = EVEX_compress_addr(reg_dst, aux_dst_off);
+
+            zmm_t zmm = zmm_out(ur, ocb);
+            zmm_t r_zmm = mask_flag
+                        ? zmm | ktail_mask
+                        : zmm;
+            switch (jcp.dst_dt) {
+            case data_type::f32:
+            case data_type::s32: vmovups(addr, r_zmm); break;
+            case data_type::s8: vpmovsdb(addr, r_zmm); break;
+            case data_type::u8: vpmovusdb(addr, r_zmm); break;
+            default: assert(!"unknown dst_dt");
+            }
+        }
+    }
+}
+
+void jit_avx512_core_u8s8s32x_deconv_fwd_kernel::compute_loop(
+        int ur_w, int l_overflow, int r_overflow, bool is_last_sp_block) {
+
+    int shift_src_icb = jcp.typesize_in * jcp.ic_block;
+    int shift_filt_icb = jcp.typesize_in * jcp.kh * jcp.kw * jcp.ic_block * jcp.oc_block;
+
+    prepare_output(ur_w);
+
+    Xbyak::Label icb_loop_label;
+    mov(reg_icb, jcp.nb_ic);
+    L(icb_loop_label); {
+
+        if (jcp.ic_without_padding != jcp.ic) {
+            Xbyak::Label common_ker, end_ker;
+            cmp(reg_icb, 1);
+            jg(common_ker, T_NEAR);
+
+            compute_ker(ur_w, l_overflow, r_overflow,
+                    is_last_sp_block ? last_sp_block : last_ic_block);
+            jmp(end_ker, T_NEAR);
+
+            L(common_ker);
+            compute_ker(ur_w, l_overflow, r_overflow, no_last_block);
+
+            L(end_ker);
+        } else {
+            compute_ker(ur_w, l_overflow, r_overflow, no_last_block);
+        }
+
+        add(reg_src, shift_src_icb);
+        add(reg_filt, shift_filt_icb);
+        dec(reg_icb);
+        cmp(reg_icb, 0);
+        jg(icb_loop_label, T_NEAR);
+    }
+    sub(reg_src, jcp.nb_ic * shift_src_icb);
+    sub(reg_filt, jcp.nb_ic * shift_filt_icb);
+
+    if (jcp.ngroups % jcp.ch_block != 0 || jcp.oc_without_padding != jcp.oc) {
+        Xbyak::Label common_store, end_store;
+        mov(reg_oc_blocks, ptr[param1 + GET_OFF(oc_blocks)]);
+        if (jcp.is_depthwise)
+            cmp(reg_oc_blocks, jcp.nb_ch - 1);
+        else
+            cmp(reg_oc_blocks, jcp.nb_oc - jcp.nb_oc_blocking);
+        jne(common_store, T_NEAR);
+
+        store_output(ur_w, true);
+        jmp(end_store, T_NEAR);
+
+        L(common_store);
+        store_output(ur_w, false);
+
+        L(end_store);
+
+    } else {
+        store_output(ur_w, false);
+    }
+}
+
+void jit_avx512_core_u8s8s32x_deconv_fwd_kernel::generate() {
+    preamble();
+
+    Xbyak::Reg16 _t = reg_scratch.cvt16();
+    mov(_t, 0x1);
+    vpbroadcastw(zmm_one, _t);
+
+    if (jcp.ngroups % jcp.ch_block != 0 || jcp.oc_without_padding != jcp.oc) {
+        int tail_size = jcp.is_depthwise
+            ? jcp.ngroups % jcp.ch_block
+            : jcp.oc_without_padding % jcp.oc_block;
+        int mask = (1 << tail_size) - 1;
+        Xbyak::Reg32 regw_tmp = reg_nur_w.cvt32();
+        mov(regw_tmp, mask);
+        kmovw(ktail_mask, regw_tmp);
+    }
+
+    mov(reg_src, ptr[param1 + GET_OFF(src)]);
+    mov(reg_filt, ptr[param1 + GET_OFF(filt)]);
+    mov(reg_dst, ptr[param1 + GET_OFF(dst)]);
+    mov(reg_kh, ptr[param1 + GET_OFF(kh_padding)]);
+
+    int dst_shift = jcp.typesize_out * jcp.ur_w * jcp.ngroups * jcp.oc_without_padding;
+    int src_shift = jcp.typesize_in * (jcp.ur_w / jcp.stride_w) * jcp.ngroups * jcp.ic_without_padding;
+
+    int l_overflow = max(0, ((jcp.kw - 1) * (jcp.dilate_w + 1) - jcp.l_pad) / jcp.stride_w);
+    int r_overflow = max(0, ((jcp.kw - 1) * (jcp.dilate_w + 1)
+                     - max(0, jcp.r_pad)) / jcp.stride_w);
+
+    int r_overflow1 = nstl::max(0, ((jcp.kw -1) * (jcp.dilate_w + 1)
+                - nstl::max(0, jcp.r_pad) - jcp.ur_w_tail) / jcp.stride_w);
+    int nur_w = jcp.ow / jcp.ur_w;
+    if (r_overflow1 > 0) nur_w--;
+
+    if (jcp.ur_w == jcp.ow) {
+        compute_loop(jcp.ur_w, l_overflow, r_overflow, true);
+    } else if (nur_w == 0) {
+        compute_loop(jcp.ur_w, l_overflow, r_overflow1, jcp.ur_w_tail == 0);
+        add(reg_src, src_shift);
+        add(reg_dst, dst_shift);
+        if (jcp.ur_w_tail != 0)
+            compute_loop(jcp.ur_w_tail, 0, r_overflow, true);
+    } else {
+        xor_(reg_nur_w, reg_nur_w);
+        if (l_overflow > 0) {
+            compute_loop(jcp.ur_w, l_overflow, 0, false);
+            add(reg_src, src_shift);
+            add(reg_dst, dst_shift);
+            inc(reg_nur_w);
+        }
+        if ((l_overflow <= 0 && nur_w > 0)
+                || (l_overflow > 0 && nur_w > 1)) {
+            Xbyak::Label ow_loop_label;
+            L(ow_loop_label); {
+                compute_loop(jcp.ur_w, 0, 0, false);
+                add(reg_src, src_shift);
+                add(reg_dst, dst_shift);
+                inc(reg_nur_w);
+                cmp(reg_nur_w, nur_w);
+                jl(ow_loop_label, T_NEAR);
+            }
+        }
+        if (r_overflow1 > 0) {
+            compute_loop(jcp.ur_w, 0, r_overflow1, jcp.ur_w_tail == 0);
+            add(reg_src, src_shift);
+            add(reg_dst, dst_shift);
+        }
+        if (jcp.ur_w_tail != 0) {
+            compute_loop(jcp.ur_w_tail, 0, r_overflow, true);
+        }
+    }
+    postamble();
+}
+
+template <data_type_t dst_type>
+void _jit_avx512_core_u8s8s32x_deconvolution_fwd_t<dst_type>::
+execute_forward()
+{
+    auto src = reinterpret_cast<const src_data_t *>(this->input_memory(0));
+    auto weights = reinterpret_cast<const wei_data_t *>(this->input_memory(1));
+    auto bias = reinterpret_cast<const char *>(this->input_memory(2));
+    auto dst = reinterpret_cast<dst_data_t *>(this->memory());
+
+    const memory_desc_wrapper src_d(conf_.src_pd());
+    const memory_desc_wrapper dst_d(conf_.dst_pd());
+    const memory_desc_wrapper weights_d(conf_.weights_pd(0));
+    const memory_desc_wrapper bias_d(conf_.weights_pd(1));
+
+    auto &jcp = kernel_->jcp;
+
+    int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking;
+    int nb_groups = jcp.nb_ch;
+
+    size_t src_h_stride = src_d.blk_off(0, 0, 1);
+    size_t dst_h_stride = dst_d.blk_off(0, 0, 1);
+    size_t wht_kh_stride = wht_blk_off(weights_d, 0, 0, 0, 1);
+
+    const auto &oscales = conf_.attr()->output_scales_;
+
+    parallel(0,
+            [&](const int ithr, const int nthr) {
+            int start{0}, end{0};
+            int work_amount = jcp.mb * nb_groups * oc_chunks * jcp.oh;
+            balance211(work_amount, nthr, ithr, start, end);
+
+            auto p = jit_deconv_call_s();
+
+            /*loop order = cgn*/
+            int n{0}, g{0}, occ{0}, oh_s{0};
+            if (jcp.loop_order == loop_ngc)
+                nd_iterator_init(start, n, jcp.mb, g, nb_groups, occ, oc_chunks,
+                    oh_s, jcp.oh);
+            else if (jcp.loop_order == loop_cgn)
+                nd_iterator_init(start, occ, oc_chunks, g, nb_groups, n, jcp.mb,
+                    oh_s, jcp.oh);
+            else
+                assert(!"unsupported loop order");
+            while (start < end) {
+
+                int ocb = occ * jcp.nb_oc_blocking;
+                int g_oc = (g * jcp.ch_block * jcp.nb_oc + ocb) * jcp.oc_block;
+                int g_ic = g * jcp.ch_block * jcp.ic;
+                int work_rem = end - start;
+                int oh_e = oh_s + work_rem > jcp.oh ? jcp.oh : oh_s + work_rem;
+
+                auto dst_w = dst + dst_d.blk_off(n, g_oc);
+                auto src_w = src + src_d.blk_off(n, g_ic);
+                auto wht_w = weights + wht_blk_off(weights_d, g, ocb, 0);
+                auto bias_w = jcp.with_bias
+                            ? bias + (bias_d.blk_off(g_oc) * jcp.typesize_bia)
+                            : 0;
+
+                auto scales = &oscales.scales_[jcp.is_oc_scale * g_oc];
+                for (int oj = oh_s; oj < oh_e; oj++) {
+                    int ih_max, kh_lo, kh_len;
+                    if (jcp.dilate_h != 0 && jcp.stride_h == 1) {
+                            int dilate_h = jcp.dilate_h + 1;
+                            // Note: use div_up to account for "holes" in filter
+                            int o_t_overflow
+                                = div_up(max(0, (jcp.kh - 1) * dilate_h
+                                        - oj - jcp.t_pad), dilate_h);
+                            int o_b_overflow
+                                = div_up(max(0, (jcp.kh - 1) * dilate_h + 1
+                                        - jcp.ih + oj - jcp.b_pad), dilate_h);
+                            kh_len = jcp.kh - o_t_overflow - o_b_overflow;
+                            kh_lo = o_b_overflow;
+                            ih_max = oj + jcp.t_pad - o_b_overflow * dilate_h;
+                    } else {
+                        int o_t_overflow = max(0,
+                                (jcp.kh - (oj + 1 + jcp.t_pad)) / jcp.stride_h); 
+                        int o_b_overflow = max(0,
+                                ((oj + 1 + jcp.kh - 1)
+                                 - (jcp.oh + jcp.b_pad)) / jcp.stride_h);
+                        int overflow_kh_hi = jcp.kh - 1
+                            - abs(jcp.oh + jcp.b_pad - (oj + 1)) % jcp.stride_h;
+                        int overflow_kh_lo = ((oj + 1 + jcp.t_pad) - 1) % jcp.stride_h;
+
+                        kh_len = (overflow_kh_hi - overflow_kh_lo) / jcp.stride_h
+                            + 1 - o_t_overflow - o_b_overflow;
+                        kh_lo = overflow_kh_lo + o_b_overflow * jcp.stride_h;
+                        ih_max = (oj + jcp.t_pad - kh_lo) / jcp.stride_h;
+                    }
+
+                    p.src = src_w + ih_max * src_h_stride;
+                    p.dst = dst_w + oj * dst_h_stride;
+                    p.filt = wht_w + kh_lo * wht_kh_stride;
+                    p.bias = bias_w;
+                    p.kh_padding = kh_len;
+                    p.scales = scales;
+                    p.oc_blocks = jcp.is_depthwise ? g : ocb;
+                    kernel_->jit_ker(&p);
+                }
+                if (jcp.loop_order == loop_ngc)
+                    nd_iterator_jump(start, end,
+                            n, jcp.mb, g, nb_groups, occ, oc_chunks, oh_s, jcp.oh);
+                else if (jcp.loop_order == loop_cgn)
+                    nd_iterator_jump(start, end,
+                            occ, oc_chunks, g, nb_groups, n, jcp.mb, oh_s, jcp.oh);
+                else
+                    assert(!"unsupported loop order");
+            }
+    });
+}
+
+template struct _jit_avx512_core_u8s8s32x_deconvolution_fwd_t<data_type::u8>;
+template struct _jit_avx512_core_u8s8s32x_deconvolution_fwd_t<data_type::s8>;
+template struct _jit_avx512_core_u8s8s32x_deconvolution_fwd_t<data_type::f32>;
+template struct _jit_avx512_core_u8s8s32x_deconvolution_fwd_t<data_type::s32>;
+}
+}
+}
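Note: the compute lambda in compute_ker() above emits a single vpdpbusd on VNNI-capable parts and otherwise emulates it with the vpmaddubsw/vpmaddwd/vpaddd triple (where vpmaddubsw can saturate its intermediate 16-bit sums). A scalar model of the u8 x s8 multiply-accumulate both paths compute per group of four bytes (a sketch, not the kernel code):

    #include <cstdint>
    #include <cstdio>

    // One vpdpbusd "lane": four u8 x s8 products accumulated into s32.
    static int32_t dot4_u8s8(const uint8_t *u, const int8_t *s, int32_t acc) {
        for (int k = 0; k < 4; ++k)
            acc += int32_t(u[k]) * int32_t(s[k]);
        return acc;
    }

    int main() {
        const uint8_t u[4] = {1, 2, 3, 4};
        const int8_t s[4] = {-1, 1, -1, 1};
        std::printf("%d\n", dot4_u8s8(u, s, 0)); // -1 + 2 - 3 + 4 = 2
        return 0;
    }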
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_deconvolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_deconvolution.hpp
new file mode 100644 (file)
index 0000000..17f3a52
--- /dev/null
@@ -0,0 +1,201 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_JIT_AVX512_CORE_U8S8S32X_DECONVOLUTION_HPP
+#define CPU_JIT_AVX512_CORE_U8S8S32X_DECONVOLUTION_HPP
+
+
+#include "c_types_map.hpp"
+#include "cpu_engine.hpp"
+#include "cpu_memory.hpp"
+#include "mkldnn_thread.hpp"
+#include "type_helpers.hpp"
+#include "utils.hpp"
+#include "nstl.hpp"
+
+#include "cpu_deconvolution_pd.hpp"
+#include "jit_generator.hpp"
+#include "jit_primitive_conf.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+typedef enum {
+    no_last_block = 0x1U,
+    last_ic_block = 0x2U,
+    last_sp_block = 0x4U,
+    last_ic
+} ker_block_t;
+
+struct jit_avx512_core_u8s8s32x_deconv_fwd_kernel : public jit_generator {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8s8s32x_deconv_fwd_ker_t);
+
+    jit_avx512_core_u8s8s32x_deconv_fwd_kernel(jit_conv_conf_t ajcp,
+            const primitive_attr_t &attr) : jcp(ajcp), attr_(attr) {
+        generate();
+        jit_ker = (void (*)(jit_deconv_call_s *))getCode();
+    }
+
+    static status_t init_conf(jit_conv_conf_t &jcp,
+            const deconvolution_desc_t &cd,
+            cpu_memory_t::pd_t &src_pd,
+            cpu_memory_t::pd_t &weights_pd,
+            cpu_memory_t::pd_t &dst_pd,
+            const bool with_bias,
+            cpu_memory_t::pd_t &bias_pd,
+            const primitive_attr_t &attr);
+
+    jit_conv_conf_t jcp;
+    const primitive_attr_t &attr_;
+    void (*jit_ker)(jit_deconv_call_s *);
+private:
+    using reg64_t = const Xbyak::Reg64;
+    using zmm_t = const Xbyak::Zmm;
+    using xmm_t = const Xbyak::Xmm;
+
+    reg64_t reg_src = r8;
+    reg64_t reg_filt = r9;
+    reg64_t reg_dst = r10;
+    reg64_t param1 = abi_param1;
+    reg64_t reg_kh = abi_not_param1;
+    reg64_t reg_nur_w = rbx;
+    reg64_t reg_bias = rdx;
+    reg64_t reg_icb = reg_bias;
+    reg64_t reg_ptr_scales = rax;
+    reg64_t reg_oc_blocks = rsi;
+
+    reg64_t reg_scratch = r14;
+    reg64_t aux_reg_src = r11;
+    reg64_t aux_reg_filt = r12;
+    reg64_t reg_kj = rax;
+
+    Xbyak::Opmask ktail_mask = Xbyak::Opmask(2);
+    zmm_t zmm_tmp = zmm_t(29);
+    zmm_t zmm_one = zmm_t(30);
+    zmm_t zmm_zero = zmm_t(31);
+    zmm_t zmm_wei = zmm_t(31);
+
+    zmm_t zmm_out(int i_ur, int i_oc) {
+        int idx = i_ur * jcp.nb_oc_blocking + i_oc;
+        assert(idx < 31);
+        return zmm_t(idx);
+    }
+    zmm_t zmm_inp(int i_ic, int nb_x_blocking) {
+        int idx = i_ic + nb_x_blocking * jcp.ur_w;
+        assert(idx < 31);
+        return zmm_t(idx);
+    }
+
+    int get_ow_start(int ki, int l_overflow) {
+        int res = (jcp.ow - 1 + jcp.r_pad) % jcp.stride_w
+                + l_overflow * jcp.stride_w
+                - (jcp.kw - 1 - ki) * (jcp.dilate_w + 1);
+        while (res < 0)
+            res += jcp.stride_w;
+        return res;
+    }
+
+    int get_ow_end(int ur_w, int ki, int r_overflow) {
+        if (utils::one_of(ur_w, jcp.ow, jcp.ur_w_tail))
+                ur_w += nstl::min(0, jcp.r_pad);
+        int res = (ur_w - 1 + jcp.l_pad) % jcp.stride_w
+            + r_overflow * jcp.stride_w - ki * (jcp.dilate_w + 1);
+        while (res < 0)
+            res += jcp.stride_w;
+        return ur_w - res;
+    }
+
+    void prepare_output(int ur_w);
+    void store_output(int ur_w, bool last_oc_block);
+    void compute_ker(int ur_w, int pad_l, int pad_r, ker_block_t last_ker_block);
+    void compute_loop(int ur_w, int pad_l, int pad_r, bool last_block);
+    void generate();
+    void cvt2ps(data_type_t type_in, zmm_t zmm_in, const Xbyak::Operand &op,
+        bool mask_flag);
+};
+
+template <impl::data_type_t dst_type>
+struct _jit_avx512_core_u8s8s32x_deconvolution_fwd_t : public cpu_primitive_t {
+    struct pd_t : public cpu_deconvolution_fwd_pd_t {
+        pd_t(engine_t *engine,
+                const deconvolution_desc_t *adesc,
+                const primitive_attr_t *attr,
+                const deconvolution_fwd_pd_t *hint_fwd_pd)
+            : cpu_deconvolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) {}
+
+        DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_deconvolution:", avx512_core, ""),
+                _jit_avx512_core_u8s8s32x_deconvolution_fwd_t<dst_type>);
+
+        virtual status_t init() override {
+            assert(this->engine()->kind() == engine_kind::cpu);
+
+            bool ok = true
+                && utils::one_of(this->desc()->prop_kind, prop_kind::forward_training,
+                            prop_kind::forward_inference)
+                && this->desc()->alg_kind & alg_kind::deconvolution_direct
+                && this->desc()->dst_desc.data_type == dst_type
+                && IMPLICATION(this->with_bias(), utils::one_of(
+                            this->desc()->bias_desc.data_type, data_type::f32,
+                            data_type::s32, data_type::s8, data_type::u8))
+                && this->desc()->accum_data_type == data_type::s32;
+            if (!ok) return status::unimplemented;
+
+            /*TODO: support signed input and postops */
+            return jit_avx512_core_u8s8s32x_deconv_fwd_kernel::init_conf(
+                    jcp_, *this->desc(), this->src_pd_,
+                    this->weights_pd_, this->dst_pd_,
+                    this->with_bias(), this->bias_pd_,
+                    *this->attr());
+        }
+        jit_conv_conf_t jcp_;
+    };
+
+    _jit_avx512_core_u8s8s32x_deconvolution_fwd_t(const pd_t *pd,
+           const input_vector &inputs, const output_vector &outputs)
+       : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {
+           kernel_ = new jit_avx512_core_u8s8s32x_deconv_fwd_kernel(conf_.jcp_,
+                   *conf_.attr());
+       }
+
+    ~_jit_avx512_core_u8s8s32x_deconvolution_fwd_t() {
+        delete kernel_;
+    }
+
+    typedef typename prec_traits<data_type::u8>::type src_data_t;
+    typedef typename prec_traits<data_type::s8>::type wei_data_t;
+    typedef typename prec_traits<dst_type>::type dst_data_t;
+
+    virtual void execute(event_t *e)
+    {
+        execute_forward();
+        e->set_state(event_t::ready);
+    }
+
+private:
+    void execute_forward();
+    pd_t conf_;
+    jit_avx512_core_u8s8s32x_deconv_fwd_kernel *kernel_;
+};
+
+
+}
+}
+}
+
+#endif
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
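Note: when the output channels do not fill a whole 16-lane block, generate() in the kernel above loads ktail_mask with (1 << tail_size) - 1, and store_output() applies it (zmm | ktail_mask) so only the valid remainder lanes are read and written. The mask arithmetic, with hypothetical sizes for illustration:

    #include <cstdio>

    int main() {
        // Hypothetical shapes, only to show the arithmetic.
        const int oc_without_padding = 37, oc_block = 16;
        const int tail_size = oc_without_padding % oc_block; // 5 valid lanes
        const unsigned mask = (1u << tail_size) - 1u;        // low bits set
        std::printf("tail = %d, mask = 0x%x\n", tail_size, mask); // 5, 0x1f
        return 0;
    }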
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_wino_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_u8s8s32x_wino_convolution.cpp
index d4bd41b..45f516c 100644 (file)
@@ -36,6 +36,17 @@ using namespace mkldnn::impl::memory_format;
 using namespace mkldnn::impl::utils;
 using namespace Xbyak;
 
+namespace {
+    // The scales below are applied to the source and weights data because
+    // this Winograd implementation transforms the source (which may grow
+    // values up to 4x) and the weights (which may grow values up to 9/4x).
+    const float adj_src_scale = 1.f / 4.f;
+    const float adj_wei_scale = 4.f / 9.f;
+    // Winograd transforms need ic and oc to be multiples of 16
+    const int load_block = 16;
+}
+
 /// SRC TRANSFORMS /////////////////////////////////////////////////////////////
 struct jit_avx512_core_u8s8s32x_wino_conv_src_trans_t: public jit_generator {
     DECLARE_CPU_JIT_AUX_FUNCTIONS(
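Note: because the adjusted scales above are folded into the quantized source (x 1/4) and weights (x 4/9) before the int8 GEMM, every accumulator carries the combined factor 1/4 * 4/9 = 1/9, and the bias must be pre-multiplied by the same factor before it is added (the dst transform further down loads float2int(adj_src_scale * adj_wei_scale) for exactly this). A one-line check of the bookkeeping:

    #include <cstdio>

    int main() {
        const float adj_src_scale = 1.f / 4.f; // applied to the source
        const float adj_wei_scale = 4.f / 9.f; // applied to the weights
        // Combined factor carried by every s32 accumulator, and hence the
        // factor the bias is scaled by before the add (~= 1/9).
        std::printf("bias_alpha = %f\n", adj_src_scale * adj_wei_scale);
        return 0;
    }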
@@ -60,10 +71,19 @@ struct jit_avx512_core_u8s8s32x_wino_conv_src_trans_t: public jit_generator {
     }
     void generate();
 
-    Xmm vreg_inp(int i) {
+    int reg_inp_ind(int i) {
         assert(i < jcp.alpha * jcp.alpha);
-        return Xmm(31 - i);
+        return (31 - i);
+    }
+
+    Xmm vreg_inp(int i) {
+        return Xmm(reg_inp_ind(i));
     }
+
+    Zmm zmm_inp(int i) {
+        return Zmm(reg_inp_ind(i));
+    }
+
     Xmm vreg_tmp(int i) {
         assert(i < jcp.alpha * jcp.alpha);
         return Xmm(15 - i);
@@ -93,11 +113,15 @@ struct jit_avx512_core_u8s8s32x_wino_conv_src_trans_t: public jit_generator {
     Reg64 reg_ic_block = r8;
 
     int unsign_val_in_wino_domain;
+
+    Reg64 reg_scratch_src_alpha = rdx;
+    Xmm xmm_src_alpha = Xmm(0);
+    Zmm zmm_src_alpha = Zmm(0);
 };
+
 void jit_avx512_core_u8s8s32x_wino_conv_src_trans_t::generate() {
     Label ic_block_label;
 
-    const int load_block = 16;
     int out_offset = 0, inp_offset = 0;
     preamble();
 
@@ -119,20 +143,30 @@ void jit_avx512_core_u8s8s32x_wino_conv_src_trans_t::generate() {
         kmovw(x_mask(i), ptr[reg_ptr_v_x_masks + sizeof(int16_t) * i]);
     }
 
+    mov(reg_scratch_src_alpha, float2int(adj_src_scale));
+
     mov(reg_ic_block, jcp.ic / load_block);
     L(ic_block_label);
     {
+        vmovq(xmm_src_alpha, reg_scratch_src_alpha);
+        vbroadcastss(zmm_src_alpha, xmm_src_alpha);
+
         for(int y = 0; y < jcp.alpha; y++) {
             kmovw(y_mask, ptr[reg_ptr_v_y_masks + sizeof(int16_t) * y]);
             for(int x = 0; x < jcp.alpha; x++) {
-                vpxord(vreg_inp(y*jcp.alpha + x), vreg_inp(y*jcp.alpha + x),
-                    vreg_inp(y*jcp.alpha + x));
+                Zmm zmm_i = zmm_inp(y*jcp.alpha + x);
+                Xmm vreg_i = vreg_inp(y*jcp.alpha + x);
+                vpxord(vreg_i, vreg_i, vreg_i);
                 kandw(r_mask, y_mask, x_mask(x));
                 inp_offset = sizeof(uint8_t) *
                    ((-jcp.t_pad + y) * jcp.iw * jcp.ic
                         + (-jcp.l_pad + x) * jcp.ic);
-                vmovdqu8(vreg_inp(y*jcp.alpha + x) | r_mask,
-                            EVEX_compress_addr(reg_aux_ptr_src, inp_offset));
+                vmovdqu8(vreg_i | r_mask, EVEX_compress_addr(reg_aux_ptr_src, inp_offset));
+                vpmovzxbd(zmm_i, vreg_i); // to int32
+                vcvtdq2ps(zmm_i, zmm_i); // to fp32
+                vmulps(zmm_i, zmm_i, zmm_src_alpha); // *alpha
+                vcvtps2dq(zmm_i | T_rn_sae, zmm_i); // to int32
+                vpmovusdb(vreg_i, zmm_i); // to u8
             }
         }
         for(int y = 0; y < 4; y++) {
@@ -163,8 +197,7 @@ void jit_avx512_core_u8s8s32x_wino_conv_src_trans_t::generate() {
         add(reg_aux_ptr_dst, sizeof(uint8_t) * load_block);
     }
     dec(reg_ic_block);
-    cmp(reg_ic_block, 0);
-    jg(ic_block_label, T_NEAR);
+    jnz(ic_block_label, T_NEAR);
 
     postamble();
 }
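Note: the new scaling path in the source transform widens each u8 lane to f32, multiplies by adj_src_scale, rounds back to s32 under T_rn_sae, and repacks to u8 with unsigned saturation (vpmovzxbd -> vcvtdq2ps -> vmulps -> vcvtps2dq -> vpmovusdb). A scalar sketch of one lane, with the default round-to-nearest-even standing in for T_rn_sae:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Scalar model of the per-lane pipeline added above.
    static uint8_t scale_u8(uint8_t v, float alpha) {
        const float scaled = float(v) * alpha; // vcvtdq2ps + vmulps
        long r = std::lrintf(scaled);          // vcvtps2dq (round to nearest)
        if (r < 0) r = 0;                      // vpmovusdb saturates to [0,255]
        if (r > 255) r = 255;
        return uint8_t(r);
    }

    int main() {
        std::printf("%u\n", scale_u8(255, 0.25f)); // 255 * 1/4 = 63.75 -> 64
        return 0;
    }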
@@ -204,29 +237,30 @@ struct jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t: public jit_generator {
     }
     Zmm vreg_stg(int id) { // 8
         const int id_reg_stg = jcp.alpha * jcp.alpha + id;
-        assert(id_reg_stg < jcp.alpha * jcp.alpha + 8);
+        assert(id < 8);
         return Zmm(31 - id_reg_stg);
     }
     Zmm vreg_out(int id) { // 4
         const int id_reg_out = jcp.alpha * jcp.alpha + 8 + id;
-        assert(id_reg_out < jcp.alpha * jcp.alpha + 12);
+        assert(id < 4);
         return Zmm(31 - id_reg_out);
     }
     Xmm xmm_out(int id) { // 4
         const int id_reg_out = jcp.alpha * jcp.alpha + 8 + id;
-        assert(id_reg_out < jcp.alpha * jcp.alpha + 12);
+        assert(id < 4);
         return Xmm(31 - id_reg_out);
     }
     Zmm vreg_tmp(int id) { // 2
         const int id_reg_tmp = jcp.alpha * jcp.alpha + 12 + id;
-        assert(id_reg_tmp < jcp.alpha * jcp.alpha + 14);
+        assert(id < 2);
         return Zmm(31 - id_reg_tmp);
     }
 
     Zmm vreg_zero = Zmm(0);
     Zmm vreg_bias = Zmm(1);
     Zmm vreg_prev_dst = Zmm(2);
-
+    Zmm zmm_bias_alpha = Zmm(2);
+    Xmm xmm_bias_alpha = Xmm(2);
 
     Opmask y_mask = Opmask(1);
     Opmask r_mask = Opmask(2);
@@ -234,6 +268,9 @@ struct jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t: public jit_generator {
         assert(id < 4);
         return Opmask(3 + id);
     }
+
+    Reg64 reg_scratch_bias_alpha = r15;
+
     Reg64 reg_ptr_src = r14;
     Reg64 reg_ptr_dst = r13;
 
@@ -246,9 +283,10 @@ struct jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t: public jit_generator {
     Reg64 reg_oc_block = r8;
 
     Reg64 reg_ptr_bias = rbx;
-    Reg64 reg_ptr_scales = rcx;
+    Reg64 reg_ptr_scales = abi_not_param1;
     Reg64 reg_ptr_sum_scale = rdx;
 };
+
 bool jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t::maybe_relu(int position) {
     using namespace primitive_kind;
     const auto &p = attr_.post_ops_;
@@ -273,11 +311,10 @@ bool jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t::maybe_relu(int position) {
 
     return false;
 }
+
 void jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t::generate() {
     Label oc_block_label;
 
-    const int load_block = 16;
-
     auto loop_body = [=]() {
         const auto &p = attr_.post_ops_;
         const int sum_idx = p.find(primitive_kind::sum);
@@ -309,6 +346,9 @@ void jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t::generate() {
 
 
         if (jcp.with_bias) {
+            vmovq(xmm_bias_alpha, reg_scratch_bias_alpha);
+            vbroadcastss(zmm_bias_alpha, xmm_bias_alpha);
+
             auto bias_addr = ptr [ reg_ptr_bias ];
             switch (jcp.bia_dt) {
             case data_type::f32:
@@ -319,6 +359,7 @@ void jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t::generate() {
             }
             if (jcp.bia_dt != data_type::f32)
                 vcvtdq2ps(vreg_bias, vreg_bias);
+            vmulps(vreg_bias, vreg_bias, zmm_bias_alpha); // *alpha
         }
         for(int y = 0; y < jcp.m; y++) {
             kmovw(y_mask, ptr[ reg_ptr_v_y_masks + sizeof(int16_t) * y ]);
@@ -394,6 +435,9 @@ void jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t::generate() {
     READ_PARAM(reg_ptr_scales, scales);
 #   undef READ_PARAM
 
+    if (jcp.with_bias)
+        mov(reg_scratch_bias_alpha, float2int(adj_src_scale * adj_wei_scale));
+
     mov(reg_aux_ptr_src, reg_ptr_src);
     mov(reg_aux_ptr_dst, reg_ptr_dst);
 
@@ -415,8 +459,7 @@ void jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t::generate() {
         add(reg_ptr_bias, sizeof(jcp.typesize_bia) * load_block);
     }
     dec(reg_oc_block);
-    cmp(reg_oc_block, 0);
-    jg(oc_block_label, T_NEAR);
+    jnz(oc_block_label, T_NEAR);
 
     sub(reg_ptr_scales, jcp.is_oc_scale *  sizeof(float) * load_block);
     sub(reg_ptr_bias, oc_blocks * sizeof(jcp.typesize_bia) * load_block);
@@ -464,7 +507,8 @@ struct jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t: public jit_generator {
         return Zmm(31 - id_reg_out);
     }
     Zmm vreg_wei(int i) {
-        assert(31 - jcp.n2_block * jcp.m_block - i > 2);
+        assert(31 - jcp.n2_block * jcp.m_block - i
+                > (jcp.ver == ver_vnni ? 0 : 2));
         return Zmm(31 - jcp.n2_block * jcp.m_block - i);
     }
 
@@ -473,20 +517,20 @@ struct jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t: public jit_generator {
     Zmm vreg_tmp = Zmm(2);
 
     Reg64 reg_ptr_src = r15;
-    Reg64 reg_ptr_dst = r14;
-    Reg64 reg_ptr_wei = r13;
-    Reg64 reg_ptr_dst_b = r12;
 
-    Reg64 reg_aux_dst = r11;
+    Reg64 reg_aux_dst_b = r13;
+    Reg64 reg_aux_dst = r12;
+    Reg64 reg_aux_dst2 = r11;
     Reg64 reg_aux_wei = r10;
-    Reg64 reg_aux_dst_b = r9;
+    Reg64 reg_aux_wei2 = r9;
     Reg64 reg_aux_src = r8;
-    Reg64 reg_aux_wei2 = rax;
+    Reg64 reg_aux_src2 = rax;
+    Reg64 reg_mb = rbx;
+    Reg64 reg_nnb = abi_not_param1;
     Reg64 reg_scratch = rdx;
-    Reg64 reg_nnb = rcx;
     Reg64 reg_K = rsi;
-
 };
+
 bool jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t::post_ops_ok(
         jit_conv_conf_2x3_wino_t &jcp, const primitive_attr_t &attr) {
     using namespace primitive_kind;
@@ -502,11 +546,11 @@ bool jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t::post_ops_ok(
    switch (p.len_) {
     case 0: return true;
     case 1: return true
-                && implication(jcp.with_relu, p.contain(sum, 0))
-                && implication(!jcp.with_relu, is_relu(0) || p.contain(sum, 0));
+                && IMPLICATION(jcp.with_relu, p.contain(sum, 0))
+                && IMPLICATION(!jcp.with_relu, is_relu(0) || p.contain(sum, 0));
     case 2: return true
-                && implication(jcp.with_relu, p.contain(sum, 0) && is_relu(1))
-                && implication(!jcp.with_relu, false
+                && IMPLICATION(jcp.with_relu, p.contain(sum, 0) && is_relu(1))
+                && IMPLICATION(!jcp.with_relu, false
                         || (p.contain(sum, 0) && is_relu(1))
                         || (p.contain(sum, 1) && is_relu(0)));
     case 3: return true
@@ -517,8 +561,9 @@ bool jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t::post_ops_ok(
 
     return false;
 }
+
 void jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t::generate() {
-    Label nnb_loop_label, K_loop_label[2];
+    Label nnb_loop_label, K_loop_label, mb_loop_label;
 
     auto compute = [=](Zmm vreg_acc, Zmm vreg_wei, Zmm vreg_src) {
         if (jcp.ver == ver_vnni) {
@@ -534,82 +579,85 @@ void jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t::generate() {
 #   define READ_PARAM(reg, field) \
         mov(reg, ptr[abi_param1 + offsetof(call_params_t, field)])
     READ_PARAM(reg_ptr_src, src);
-    READ_PARAM(reg_ptr_dst, dst);
-    READ_PARAM(reg_ptr_wei, wei);
-    READ_PARAM(reg_ptr_dst_b, dst_b);
+    READ_PARAM(reg_aux_dst, dst);
+    READ_PARAM(reg_aux_wei, wei);
+    READ_PARAM(reg_aux_dst_b, dst_b);
 #   undef READ_PARAM
 
-    xor_(reg_scratch, reg_scratch);
-    Reg16 _t = reg_scratch.cvt16();
-    mov(_t, 0x1);
-    vpbroadcastw(vreg_one, _t);
-
-    mov(reg_aux_dst, reg_ptr_dst);
-    mov(reg_aux_wei, reg_ptr_wei);
-    mov(reg_aux_dst_b, reg_ptr_dst_b);
+    if (jcp.ver != ver_vnni) {
+        xor_(reg_scratch, reg_scratch);
+        Reg16 _t = reg_scratch.cvt16();
+        mov(_t, 0x1);
+        vpbroadcastw(vreg_one, _t);
+    }
 
     if (!jcp.small_mb) {
         mov(reg_nnb, jcp.n_chunks);
         L(nnb_loop_label);
     }
-        for (int mb = 0; mb < jcp.M / jcp.m_block; mb++)
-        {
-            for (int nb2 = 0; nb2 < jcp.n2_block; nb2++) {
-                for (int m = 0; m < jcp.m_block; m++) {
-                    int offset = jcp.typesize_acc * nb2 * jcp.n_block;
-                    vmovups(vreg_out(nb2, m),
+    mov(reg_aux_dst2, reg_aux_dst);
+    mov(reg_aux_src, reg_ptr_src);
+    mov(reg_mb, jcp.M / jcp.m_block);
+    L(mb_loop_label);
+    {
+        for (int nb2 = 0; nb2 < jcp.n2_block; nb2++) {
+            for (int m = 0; m < jcp.m_block; m++) {
+                int offset = jcp.typesize_acc * nb2 * jcp.n_block;
+                vmovups(vreg_out(nb2, m),
                         EVEX_compress_addr(reg_aux_dst_b, offset));
-                }
             }
-            mov(reg_aux_src, reg_ptr_src);
-            mov(reg_aux_wei2, reg_aux_wei);
-            mov(reg_K, jcp.k_chunks);
-            L(K_loop_label[mb]); {
-                for (int k = 0; k < jcp.k2_block; k += 4)
-                {
-                    for (int nb2 = 0; nb2 < jcp.n2_block; nb2++) {
-                        int wei_offset = jcp.typesize_in *
-                                            ((nb2 * jcp.n_block) * jcp.K);
-                        vmovups(vreg_wei(nb2),
+        }
+        mov(reg_aux_src2, reg_aux_src);
+        mov(reg_aux_wei2, reg_aux_wei);
+        mov(reg_K, jcp.k_chunks);
+        L(K_loop_label);
+        {
+            for (int k = 0; k < jcp.k2_block; k += 4) {
+                for (int nb2 = 0; nb2 < jcp.n2_block; nb2++) {
+                    int wei_offset
+                            = jcp.typesize_in * (nb2 * jcp.n_block * jcp.K);
+                    vmovups(vreg_wei(nb2),
                             EVEX_compress_addr(reg_aux_wei2, wei_offset));
-                    }
-                    for (int m = 0; m < jcp.m_block; m++) {
-                        int inp_offset  = jcp.typesize_in *
-                                          (m + mb * jcp.m_block) * jcp.K;
-                        vpbroadcastd(vreg_src,
-                            EVEX_compress_addr(reg_aux_src,inp_offset));
-                        for (int nb2 = 0; nb2 < jcp.n2_block; nb2++)
-                            compute(vreg_out(nb2, m), vreg_wei(nb2), vreg_src);
-                    }
-                    add(reg_aux_src, jcp.typesize_in * 4);
-                    add(reg_aux_wei2, jcp.typesize_in * 4 * jcp.n_block);
                 }
+                for (int m = 0; m < jcp.m_block; m++) {
+                    int inp_offset = jcp.typesize_in * m * jcp.K;
+                    vpbroadcastd(vreg_src,
+                            EVEX_compress_addr(reg_aux_src2, inp_offset));
+                    for (int nb2 = 0; nb2 < jcp.n2_block; nb2++)
+                        compute(vreg_out(nb2, m), vreg_wei(nb2), vreg_src);
+                }
+                add(reg_aux_src2, jcp.typesize_in * 4);
+                add(reg_aux_wei2, jcp.typesize_in * 4 * jcp.n_block);
             }
-            dec(reg_K);
-            cmp(reg_K, 0);
-            jg(K_loop_label[mb], T_NEAR);
+        }
+        dec(reg_K);
+        jnz(K_loop_label, T_NEAR);
 
-            for (int m = 0; m < jcp.m_block; m++) {
-                for (int nb2 = 0; nb2 < jcp.n2_block; nb2++) {
-                    int offset = jcp.typesize_acc *
-                        ((mb * jcp.m_block + m) * jcp.N + nb2 * jcp.n_block);
-                    vmovups(EVEX_compress_addr(reg_aux_dst,offset),
-                                vreg_out(nb2, m));
-                }
+        for (int m = 0; m < jcp.m_block; m++) {
+            for (int nb2 = 0; nb2 < jcp.n2_block; nb2++) {
+                int offset = jcp.typesize_acc * (m * jcp.N + nb2 * jcp.n_block);
+                vmovups(EVEX_compress_addr(reg_aux_dst2, offset),
+                        vreg_out(nb2, m));
             }
         }
+        add(reg_aux_src, jcp.typesize_in * jcp.m_block * jcp.K);
+        add(reg_aux_dst2, jcp.typesize_acc * jcp.m_block * jcp.N);
+    }
+    dec(reg_mb);
+    jnz(mb_loop_label, T_NEAR);
+
     if (!jcp.small_mb) {
         add(reg_aux_dst, jcp.typesize_acc * jcp.n2_block * jcp.n_block);
         add(reg_aux_dst_b, jcp.typesize_acc * jcp.n2_block * jcp.n_block);
         add(reg_aux_wei, jcp.typesize_in * jcp.n2_block * jcp.n_block * jcp.K);
 
         dec(reg_nnb);
-        cmp(reg_nnb, 0);
-        jg(nnb_loop_label, T_NEAR);
+        jnz(nnb_loop_label, T_NEAR);
     }
 
     postamble();
 }
+
 status_t jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t
 ::init_conf(jit_conv_conf_2x3_wino_t &jcp,
             const convolution_desc_t &cd, cpu_memory_t::pd_t &src_pd,
@@ -652,25 +700,27 @@ status_t jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t
     if (mayiuse(avx512_core_vnni))
         jcp.ver = ver_vnni;
 
+    // block sizes needed for GEMM kernel
     jcp.ic_block = 4;
     jcp.oc_block = 16;
 
     bool ok = true
-        && jcp.kh == 3 && jcp.kw == 3
         && jcp.ngroups == 1
+        && jcp.oc % load_block == 0 && jcp.ic % load_block == 0
         && jcp.oc % jcp.oc_block == 0 && jcp.ic % jcp.ic_block == 0
-        && jcp.stride_h == 1 && jcp.stride_w == 1
-        && jcp.dilate_h == 0 && jcp.dilate_w == 0
+        && everyone_is(3, jcp.kh, jcp.kw)
+        && everyone_is(1, jcp.stride_h, jcp.stride_w)
+        && everyone_is(0, jcp.dilate_h, jcp.dilate_w)
         && jcp.t_pad == jcp.b_pad && jcp.l_pad == jcp.r_pad
-        && jcp.t_pad < 2 && jcp.t_pad >= 0
-        && jcp.l_pad < 2 && jcp.l_pad >= 0;
+        && one_of(jcp.t_pad, 0, 1)
+        && one_of(jcp.l_pad, 0, 1);
     if (!ok) return status::unimplemented;
 
     jcp.src_fmt = src_d.format();
     jcp.with_bias = cd.bias_desc.format != memory_format::undef;
     jcp.with_relu = with_relu;
     jcp.relu_negative_slope = relu_negative_slope;
-    if (!implication(with_relu, relu_negative_slope == 0.))
+    if (!IMPLICATION(with_relu, relu_negative_slope == 0.))
         return status::unimplemented;
     if (!post_ops_ok(jcp, attr))
         return status::unimplemented;
@@ -692,29 +742,131 @@ status_t jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t
     jcp.r = 3;
     jcp.alpha = jcp.m + jcp.r - 1;
 
-    jcp.yb = 1;
-    int opt_val = 14, cur_val = 0;
-    for (int i = 14; i >= 8; i -= 2) {
-        cur_val = ((jcp.oh / i) * i + i) - jcp.oh;
-        if (jcp.oh % i == 0) {
-            jcp.yb = i; break;
-        }  else if (cur_val < opt_val)  {
-            jcp.yb = i;
-            opt_val = cur_val;
+    int aa = jcp.alpha * jcp.alpha;
+    int nthr = mkldnn_get_max_threads();
+    int L1_cap = get_cache_size(1, true);
+    int L2_cap = get_cache_size(2, true);
+    // need 1 extra reg for bcast, and 2 tmp regs for non-vnni
+    int free_regs = jcp.ver == ver_vnni ? 31 : 29;
+
+    auto get_thr_eff = [&](int small_mb, int ix, int iy, int n2_b) {
+        float thr_eff;
+        float Z = (float)jcp.ic + jcp.oc;
+        float Y = (float)jcp.ic * jcp.oc;
+        if (small_mb == 0) { // outer par
+            int nblocks = jcp.mb * div_up(jcp.oh, iy) * div_up(jcp.ow, ix);
+            thr_eff = (float)nblocks / rnd_up(nblocks, nthr);
+        } else { // inner par
+            int tranw = iy * ix / jcp.alpha;
+            int gemmw = aa * (jcp.nb_oc / n2_b);
+            int tranw_r = rnd_up(tranw, nthr);
+            int gemmw_r = rnd_up(gemmw, nthr);
+            thr_eff = (Z * tranw / tranw_r + Y * gemmw / gemmw_r) / (Z + Y);
         }
-    }
+        return thr_eff;
+    };
 
-    const int nthreads = mkldnn_get_max_threads();
-    jcp.xb = 4;
-    int oh_blocks = (jcp.oh < jcp.yb) ? 1 : (jcp.oh / jcp.yb);
-    int ow_blocks = (jcp.ow < jcp.xb) ? 1 : (jcp.ow / jcp.xb);
+    auto get_mem_eff = [&](int small_mb, int ix, int iy, int n2_b) {
+        float mem_eff, req_mem;
+        int M = ix * iy / jcp.alpha;
+        if (small_mb == 0) { // outer parallelization strategy
+            // memory for wino transforms (other memory has poor reuse)
+            req_mem = (float)aa * M * (jcp.ic + jcp.typesize_acc * jcp.oc);
+            mem_eff = req_mem < L1_cap ? 1.f : req_mem < L2_cap ? 0.5f : 0.f;
+        } else { // inner parallelization strategy
+            // memory used during gemm
+            int N = jcp.oc_block * n2_b;
+            req_mem = (float)jcp.ic * (M + N) + jcp.typesize_acc * M * N;
+            mem_eff = nstl::min(1.f, L2_cap / req_mem);
+            // memory used during wino transforms
+            int M_per_thr = div_up(M, nthr);
+            req_mem = (float)aa * M_per_thr
+                    * (jcp.ic + jcp.typesize_acc * jcp.oc);
+            if (req_mem > L2_cap)
+                mem_eff = 0.1f;
+        }
+        return mem_eff;
+    };
+
+    auto get_tot_eff = [&](int small_mb, float thr_eff, float work_eff,
+            float mem_eff, float reg_eff) {
+        // these coefficients are chosen empirically
+        float mem_fac = 0.1f, reg_fac = 0.2f;
+        // normalized overhead relative to memory and register components
+        float tot_eff = 1.f + mem_fac * mem_eff + reg_fac * reg_eff;
+        // thread and work components affect all others
+        tot_eff *= thr_eff * work_eff;
+        return tot_eff;
+    };
+
+    auto find_m_n2_blocks = [&](bool small_mb, int ix, int iy, float work_eff,
+            int &m_block, int &n2_block, float &tot_eff) {
+        int M = (ix * iy) / jcp.alpha;
+        int max_m_block = nstl::min(M, free_regs);
+        int max_n2_block = nstl::min(jcp.nb_oc, free_regs);
+        tot_eff = 0.f;
+        for (int im = max_m_block; im > 0; im--) {
+            if (M % im)
+                continue;
+            for (int in2 = max_n2_block; in2 > 0; in2--) {
+                int used_regs = (im + 1) * in2;
+                float mem_eff = get_mem_eff(small_mb, ix, iy, in2);
+                float reg_eff = (float)(im * in2) / (im + in2);
+                float thr_eff = get_thr_eff(small_mb, ix, iy, in2);
+                float cur_tot_eff = get_tot_eff(
+                        small_mb, thr_eff, work_eff, mem_eff, reg_eff);
+                if (jcp.nb_oc % in2 || used_regs > free_regs
+                        || cur_tot_eff <= tot_eff)
+                    continue;
+                tot_eff = cur_tot_eff;
+                m_block = im;
+                n2_block = in2;
+            }
+        }
+    };
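The register constraint here presumably counts m_block * n2_block zmm accumulators plus one extra row of n2_block registers for loaded operands, hence used_regs = (im + 1) * in2. With free_regs = 31 on VNNI hardware, for instance, im = 4 with in2 = 6 needs (4 + 1) * 6 = 30 registers and is accepted, while in2 = 7 would need 35 and is skipped.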
+
+    /* Selecting xb and yb blocking */
+    int min_yb = jcp.m;
+    int min_xb = jcp.m;
+    int max_yb = nstl::max(min_yb, rnd_up(jcp.oh, 2));
+    int max_xb = nstl::max(min_xb, rnd_up(jcp.ow, 2));
+    float best_eff = 0.f;
+    for (int ix = min_xb; ix <= max_xb; ix += 2) {
+        assert(rnd_up(jcp.ow, ix) >= jcp.iw - 2);
+        for (int iy = max_yb; iy >= min_yb; iy -= 2) {
+            assert(rnd_up(jcp.oh, iy) >= jcp.ih - 2);
+
+            int m_b[2];
+            int n2_b[2];
+            bool small_mb;
+            float inner_eff, outer_eff, work_eff;
+
+            int tiled_area = rnd_up(jcp.oh, iy) * rnd_up(jcp.ow, ix);
+            work_eff = (float)jcp.oh * jcp.ow / tiled_area;
+            if (best_eff > 0.f && work_eff < 4.f / 9.f)
+                continue; // no gain from Winograd transformation
+
+            /* outer parallelization */
+            find_m_n2_blocks(0, ix, iy, work_eff, m_b[0], n2_b[0], outer_eff);
+
+            /* inner parallelization */
+            find_m_n2_blocks(1, ix, iy, work_eff, m_b[1], n2_b[1], inner_eff);
+
+            small_mb = inner_eff > outer_eff;
+            float eff = small_mb ? inner_eff : outer_eff;
+            if (eff > best_eff) {
+                best_eff = eff;
+                jcp.yb = iy;
+                jcp.xb = ix;
+                jcp.m_block = m_b[small_mb];
+                jcp.n2_block = n2_b[small_mb];
+                jcp.small_mb = small_mb;
+            }
+        }
+    }
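The 4/9 threshold matches the arithmetic gain of F(2x2, 3x3) Winograd: a 2x2 output tile costs 2 * 2 * 9 = 36 multiplies directly but only 4 * 4 = 16 in the transform domain, a ratio of 16/36 = 4/9. If padding the tiled area wastes so much work that work_eff drops below that ratio, the transform no longer pays for itself, which is what the "no gain" early-continue above encodes.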
 
-    const int work_amount = jcp.mb * oh_blocks * ow_blocks;
-    if (work_amount < nthreads && jcp.ow < 24) {
-        jcp.small_mb = true;
-        jcp.xb = (jcp.ow < 9) ? jcp.yb : 4;
-    } else
-        jcp.small_mb = false;
+    assert((jcp.m_block + 1) * jcp.n2_block <= free_regs);
+    assert(jcp.xb % 2 == 0 && jcp.yb % 2 == 0);
 
     jcp.inp_stride = jcp.yb * jcp.xb / 4 * jcp.ic;
     jcp.out_stride = jcp.yb * jcp.xb / 4 * jcp.oc;
@@ -725,31 +877,20 @@ status_t jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t
     jcp.N = jcp.oc;
     jcp.K = jcp.ic;
 
-    jcp.m_block = jcp.xb * jcp.yb / 8;
     jcp.n_block = jcp.oc_block;
     jcp.k_block = jcp.ic_block;
 
-    int n_nblock = jcp.N / jcp.n_block;
-    jcp.n2_block = (!(n_nblock % 4))
-                    ? 4
-                    : (!(n_nblock % 2)) ? 2 : 1;
-    const int skx_free_regs = 28;
-    if (jcp.n2_block * jcp.m_block > (skx_free_regs - jcp.n2_block)) {
-        jcp.n2_block /= 2;
-    }
-    jcp.n_chunks = n_nblock / jcp.n2_block;
+    jcp.n_chunks = (jcp.N / jcp.n_block) / jcp.n2_block;
 
-    int k_nblock = jcp.K / jcp.k_block;
-    jcp.k2_block = 1;
-    for (int i = 16; i >= 2; i /= 2)
-        if (!(k_nblock % i)) {
-            jcp.k2_block = i; break;
-        }
+    // We need jcp.k2_block to be a multiple of jcp.k_block = jcp.ic_block = 4
+    // and jcp.K = jcp.ic to be a multiple of jcp.k2_block. Since jcp.ic is
+    // a multiple of load_block = 16, we just use that for now.
+    jcp.k2_block = load_block;
     jcp.k_chunks = jcp.K / jcp.k2_block;
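A quick sanity check of the reduction blocking described in the comment above, for an illustrative shape with ic = 64:

    // k_block  = ic_block  = 4   (the 4i inner block of OIhw4i16o4i)
    // k2_block = load_block = 16 -> multiple of k_block, divides jcp.K
    // k_chunks = K / k2_block = 64 / 16 = 4 reduction chunks per GEMM call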
 
     const auto &oscales = attr.output_scales_;
     jcp.is_oc_scale = oscales.mask_ == 1 << 1;
-    assert(utils::implication(!jcp.is_oc_scale, oscales.mask_ == 0));
+    assert(IMPLICATION(!jcp.is_oc_scale, oscales.mask_ == 0));
 
     /* re-create weights primitive descriptor
                                     and set weights wino_blocking */
@@ -767,6 +908,8 @@ status_t jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t
     wd.oc_block = jcp.oc_block;
     wd.oc2_block = jcp.n2_block;
     wd.ic2_block = 1;
+    wd.adj_scale = adj_wei_scale;
+
     size_t max_size = types::data_type_size(data_type::s8) *
                         jcp.alpha * jcp.alpha * jcp.ic * jcp.oc;
     max_size += types::data_type_size(data_type::s32) *
@@ -797,7 +940,8 @@ _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t<with_relu, dst_data_type>::
         _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t(const pd_t *pd,
                 const input_vector &inputs, const output_vector &outputs)
     : cpu_primitive_t(&conf_, inputs, outputs)
-    , conf_(*pd) {
+    , conf_(*pd)
+    , scratchpad_(nullptr) {
     const int nthreads = mkldnn_get_max_threads();
     kernel_ = new jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t(
             conf_.jcp_, *conf_.attr());
@@ -806,25 +950,27 @@ _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t<with_relu, dst_data_type>::
     dst_trans_ = new jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t(
             conf_.jcp_, *conf_.attr());
 
-    int wino_size_offset = (conf_.jcp_.yb / 2) * (conf_.jcp_.xb / 2)
-                                + (conf_.jcp_.xb);
-    size_wino_wei = conf_.jcp_.alpha * conf_.jcp_.alpha * conf_.jcp_.oc
-                        * conf_.jcp_.ic;
-    size_wino_src = (conf_.jcp_.ic * 16) * (wino_size_offset);
-    size_wino_dst = (conf_.jcp_.oc * 16) * (wino_size_offset);
+    const int tilesize = conf_.jcp_.alpha * conf_.jcp_.alpha;
+    const int numtiles = (conf_.jcp_.yb / 2) * (conf_.jcp_.xb / 2);
+    const int alltiles = tilesize * numtiles;
+    size_wino_wei_ = tilesize * conf_.jcp_.oc * conf_.jcp_.ic;
+    size_wino_src_ = sizeof(src_data_t) * alltiles * conf_.jcp_.ic;
+    size_wino_src_ = rnd_up(size_wino_src_, PAGE_4K);
+    size_wino_src_ /= sizeof(src_data_t);
+    size_wino_dst_ = alltiles * conf_.jcp_.oc;
 
-    size_t workspace_size = nthreads
-                    * (sizeof(src_data_t) * size_wino_src
-                     + sizeof(acc_data_t) * size_wino_dst);
+    size_t workspace_size = (conf_.jcp_.small_mb ? 1 : nthreads)
+            * (sizeof(src_data_t) * size_wino_src_
+                                    + sizeof(acc_data_t) * size_wino_dst_);
 
-    workspace = malloc(workspace_size, 4096);
-    char *_t = static_cast<char *>(workspace);
+    scratchpad_ = create_scratchpad(workspace_size);
+    assert(scratchpad_); // TODO: add proper check and raise exception?
 
-    size_t shift = 0;
-    wino_src_ = (src_data_t *)(_t + shift);
+    wino_shift_ = (conf_.jcp_.small_mb ? 1 : nthreads) * sizeof(src_data_t)
+            * size_wino_src_;
 
-    shift += nthreads * sizeof(src_data_t) * size_wino_src;
-    wino_dst_ = (acc_data_t *)(_t + shift);
+    updated_output_scales_ = conf_.attr()->output_scales_;
+    updated_output_scales_.scale(1.f / (adj_src_scale * adj_wei_scale));
 }
 
 template <bool with_relu, data_type_t dst_data_type>
@@ -833,8 +979,7 @@ _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t<with_relu,
     delete kernel_;
     delete src_trans_;
     delete dst_trans_;
-
-    free(workspace);
+    delete scratchpad_;
 }
 
 template <bool with_relu, data_type_t dst_data_type>
@@ -856,30 +1001,32 @@ void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t<with_relu,
     auto dst = reinterpret_cast<dst_data_t *>(memory(0));
 
     const auto &jcp = kernel_->jcp;
-    const auto &oscales = conf_.attr()->output_scales_;
+    const auto &oscales = updated_output_scales_;
 
-    wino_wei_ = wei;
-    dst_bias_ = (const acc_data_t*)(wei + size_wino_wei);
+    auto wino_wei = wei;
+    auto dst_bias = (const acc_data_t *)(wei + size_wino_wei_);
+    auto wino_src_base = (src_data_t *)scratchpad_->get();
+    auto wino_dst_base = (acc_data_t *)(scratchpad_->get() + wino_shift_);
 
     parallel_nd(jcp.mb, div_up(jcp.oh, jcp.yb), div_up(jcp.ow, jcp.xb),
-        [&](int mb, int tile_y_b, int tile_x_b) {
+            [&](int mb, int tile_y_b, int tile_x_b) {
 
         int tile_y = tile_y_b * jcp.yb;
         int tile_x = tile_x_b * jcp.xb;
 
         int ithr = mkldnn_get_thread_num();
-        auto wino_src = wino_src_ + size_wino_src * ithr;
-        auto wino_dst = wino_dst_ + size_wino_dst * ithr;
-
-        auto src_trans_p = jit_avx512_core_u8s8s32x_wino_conv_src_trans_t
-            ::call_params_t();
-        auto dst_trans_p = jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t
-            ::call_params_t();
-        auto gemm_p = jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t
-            ::call_params_t();
-
-        /* transformation of input tensor to winograd domain */
-            for (int y_in_block = 0; y_in_block < jcp.yb; y_in_block += 2) {
+        auto wino_src = wino_src_base + size_wino_src_ * ithr;
+        auto wino_dst = wino_dst_base + size_wino_dst_ * ithr;
+
+        auto src_trans_p =
+            jit_avx512_core_u8s8s32x_wino_conv_src_trans_t::call_params_t();
+        auto dst_trans_p =
+            jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t::call_params_t();
+        auto gemm_p =
+            jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t::call_params_t();
+
+        /* transformation of input tensor to winograd domain */
+        for (int y_in_block = 0; y_in_block < jcp.yb; y_in_block += 2) {
             for (int x_in_block = 0; x_in_block < jcp.xb; x_in_block += 2) {
                 unsigned short v_y_masks[4], v_x_masks[4];
 
@@ -889,19 +1036,20 @@ void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t<with_relu,
 
                 int v_ys = nstl::max(0, jcp.t_pad - y);
                 int v_ye = nstl::min(jcp.alpha,
-                    nstl::max(0, jcp.ih + jcp.t_pad - y));
+                        nstl::max(0, jcp.ih + jcp.t_pad - y));
 
                 int v_xs = nstl::max(0, jcp.l_pad - x);
                 int v_xe = nstl::min(jcp.alpha,
-                    nstl::max(0, jcp.iw + jcp.l_pad - x));
+                        nstl::max(0, jcp.iw + jcp.l_pad - x));
 
-                #pragma unroll(4)
+#pragma unroll(4)
                 for (int i = 0; i < jcp.alpha; i++) {
                     v_y_masks[i] = (i < v_ys || i >= v_ye) ? 0 : 0xffff;
                     v_x_masks[i] = (i < v_xs || i >= v_xe) ? 0 : 0xffff;
                 }
-                auto local_s = src + mb * jcp.ih * jcp.iw * jcp.ic
-                                            + y * jcp.iw * jcp.ic + x * jcp.ic;
+                auto local_s = src
+                        + mb * jcp.ih * jcp.iw * jcp.ic
+                        + y * jcp.iw * jcp.ic + x * jcp.ic;
                 auto local_w = wino_src + m * jcp.ic;
 
                 src_trans_p.src = local_s;
@@ -910,20 +1058,22 @@ void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t<with_relu,
                 src_trans_p.v_x_masks = v_x_masks;
 
                 src_trans_->ker_(&src_trans_p);
-            }}
-        }
-        {  /* gemms */
-            for (int tile_ij = 0; tile_ij < 16; tile_ij++) {
-                gemm_p.src = wino_src + jcp.inp_stride * tile_ij;
-                gemm_p.dst = wino_dst + jcp.out_stride * tile_ij;
-                gemm_p.wei = wino_wei_ + jcp.wei_stride * tile_ij;
-                gemm_p.dst_b = dst_bias_ + jcp.bia_stride * tile_ij;
-
-                kernel_->ker_(&gemm_p);
             }
         }
-        { /* transformation from winograd domain to output tensor */
-            for (int y_in_block = 0; y_in_block < jcp.yb; y_in_block += 2) {
+        /* gemms */
+        for (int tile_ij = 0; tile_ij < 16; tile_ij++) {
+            // start threads at different GEMMs to help bring weights into LLC
+            int offset = (tile_ij + ithr) % 16;
+            gemm_p.src = wino_src + jcp.inp_stride * offset;
+            gemm_p.dst = wino_dst + jcp.out_stride * offset;
+            gemm_p.wei = wino_wei + jcp.wei_stride * offset;
+            gemm_p.dst_b = dst_bias + jcp.bia_stride * offset;
+
+            kernel_->ker_(&gemm_p);
+        }
+
+        /* transformation from winograd domain to output tensor */
+        for (int y_in_block = 0; y_in_block < jcp.yb; y_in_block += 2) {
             for (int x_in_block = 0; x_in_block < jcp.xb; x_in_block += 2) {
                 unsigned short v_y_masks[2], v_x_masks[2];
 
@@ -931,13 +1081,14 @@ void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t<with_relu,
                 int x = x_in_block + tile_x;
                 int m = (y_in_block / 2) * (jcp.xb / 2) + (x_in_block / 2);
 
-                #pragma unroll(2)
+#pragma unroll(2)
                 for (int i = 0; i < jcp.m; i++) {
                     v_x_masks[i] = (x + i < jcp.ow) ? 0xffff : 0;
                     v_y_masks[i] = (y + i < jcp.oh) ? 0xffff : 0;
                 }
-                auto local_d = dst + mb * jcp.oh * jcp.ow * jcp.oc
-                                            + y * jcp.ow * jcp.oc + x * jcp.oc;
+                auto local_d = dst
+                        + mb * jcp.oh * jcp.ow * jcp.oc
+                        + y * jcp.ow * jcp.oc + x * jcp.oc;
                 auto local_w = wino_dst + m * jcp.oc;
 
                 auto scales = oscales.scales_;
@@ -950,7 +1101,7 @@ void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t<with_relu,
                 dst_trans_p.bias = bia;
 
                 dst_trans_->ker_(&dst_trans_p);
-            }}
+            }
         }
     });
 }
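One detail worth calling out in the GEMM loop of execute_forward above: each thread starts at a rotated tile index, offset = (tile_ij + ithr) % 16, so at any moment different threads stream different 1/16 slices of the transformed weights. That staggering is what the "bring weights into LLC" comment refers to; the work per thread is identical, only the visit order changes:

    // ithr = 0 visits weight tiles 0, 1, 2, ..., 15
    // ithr = 1 visits weight tiles 1, 2, ..., 15, 0
    // ithr = 2 visits weight tiles 2, 3, ..., 0, 1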
@@ -964,113 +1115,110 @@ void _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t<with_relu,
     auto dst = reinterpret_cast<dst_data_t *>(memory(0));
 
     const auto &jcp = kernel_->jcp;
-    const auto &oscales = conf_.attr()->output_scales_;
+    const auto &oscales = updated_output_scales_;
 
-    wino_wei_ = wei;
-    dst_bias_ = (const acc_data_t*)(wei + size_wino_wei);
+    auto wino_wei = wei;
+    auto dst_bias = (const acc_data_t *)(wei + size_wino_wei_);
+    auto wino_src = (src_data_t *)scratchpad_->get();
+    auto wino_dst = (acc_data_t *)(scratchpad_->get() + wino_shift_);
 
     for (int mb = 0; mb < jcp.mb; mb++) {
     for (int tile_y = 0; tile_y < jcp.oh; tile_y += jcp.yb) {
     for (int tile_x = 0; tile_x < jcp.ow; tile_x += jcp.xb) {
-        { /* transformation of input tensor to winograd domain */
-
-            parallel_nd(div_up(jcp.yb, 2), div_up(jcp.xb, 2),
-                [&](int y_in_block_b, int x_in_block_b) {
-
-                int y_in_block = y_in_block_b * 2;
-                int x_in_block = x_in_block_b * 2;
-                auto src_trans_p =
-                    jit_avx512_core_u8s8s32x_wino_conv_src_trans_t
-                    ::call_params_t();
+        /* transformation of input tensor to winograd domain */
+        parallel_nd(div_up(jcp.yb, 2), div_up(jcp.xb, 2),
+            [&](int y_in_block_b, int x_in_block_b) {
+            int y_in_block = y_in_block_b * 2;
+            int x_in_block = x_in_block_b * 2;
 
-                unsigned short v_y_masks[4], v_x_masks[4];
+            auto src_trans_p =
+                jit_avx512_core_u8s8s32x_wino_conv_src_trans_t::call_params_t();
 
-                int y = y_in_block + tile_y;
-                int x = x_in_block + tile_x;
-                int m = (y_in_block / 2) * (jcp.xb / 2) + (x_in_block / 2);
+            unsigned short v_y_masks[4], v_x_masks[4];
 
-                int v_ys = nstl::max(0, jcp.t_pad - y);
-                int v_ye = nstl::min(jcp.alpha,
-                    nstl::max(0, jcp.ih + jcp.t_pad - y));
+            int y = y_in_block + tile_y;
+            int x = x_in_block + tile_x;
+            int m = (y_in_block / 2) * (jcp.xb / 2) + (x_in_block / 2);
 
-                int v_xs = nstl::max(0, jcp.l_pad - x);
-                int v_xe = nstl::min(jcp.alpha,
-                    nstl::max(0, jcp.iw + jcp.l_pad - x));
+            int v_ys = nstl::max(0, jcp.t_pad - y);
+            int v_ye = nstl::min(
+                    jcp.alpha, nstl::max(0, jcp.ih + jcp.t_pad - y));
 
-                #pragma unroll(4)
-                for (int i = 0; i < jcp.alpha; i++) {
-                    v_y_masks[i] = (i < v_ys || i >= v_ye) ? 0 : 0xffff;
-                    v_x_masks[i] = (i < v_xs || i >= v_xe) ? 0 : 0xffff;
-                }
-                auto local_s = src + mb * jcp.ih * jcp.iw * jcp.ic
-                                            + y * jcp.iw * jcp.ic + x * jcp.ic;
-                auto local_w = wino_src_ + m * jcp.ic;
+            int v_xs = nstl::max(0, jcp.l_pad - x);
+            int v_xe = nstl::min(
+                    jcp.alpha, nstl::max(0, jcp.iw + jcp.l_pad - x));
 
-                src_trans_p.src = local_s;
-                src_trans_p.wino_src = local_w;
-                src_trans_p.v_y_masks = v_y_masks;
-                src_trans_p.v_x_masks = v_x_masks;
-
-                src_trans_->ker_(&src_trans_p);
-            });
-        }
-        {  /* gemms */
-            parallel_nd(16, jcp.n_chunks, [&](int tile_ij, int nnb) {
-                auto gemm_p = jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t
-                    ::call_params_t();
-
-                auto _t_src = wino_src_ + jcp.inp_stride * tile_ij;
-                auto _t_dst = wino_dst_ + jcp.out_stride * tile_ij;
-                auto _t_wei = wino_wei_ + jcp.wei_stride * tile_ij;
-                auto _t_dst_b = dst_bias_ + jcp.bia_stride * tile_ij;
-
-                gemm_p.src = _t_src;
-                gemm_p.dst = _t_dst + nnb * jcp.n2_block * jcp.n_block;
-                gemm_p.wei = _t_wei + nnb * jcp.n2_block * jcp.n_block * jcp.K;
-                gemm_p.dst_b = _t_dst_b + nnb * jcp.n2_block * jcp.n_block;
-
-                kernel_->ker_(&gemm_p);
-            });
-        }
-        { /* transformation from winograd domain to output tensor */
-            parallel_nd(div_up(jcp.yb, 2), div_up(jcp.xb, 2),
-                [&](int y_in_block_b, int x_in_block_b) {
-                int y_in_block = y_in_block_b * 2;
-                int x_in_block = x_in_block_b * 2;
-
-                auto dst_trans_p =
-                    jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t
-                    ::call_params_t();
-
-                unsigned short v_y_masks[2], v_x_masks[2];
-
-                int y = y_in_block + tile_y;
-                int x = x_in_block + tile_x;
-                int m = (y_in_block / 2) * (jcp.xb / 2) + (x_in_block / 2);
-
-                #pragma unroll(2)
-                for (int i = 0; i < jcp.m; i++) {
-                    v_x_masks[i] = (x + i < jcp.ow) ? 0xffff : 0;
-                    v_y_masks[i] = (y + i < jcp.oh) ? 0xffff : 0;
-                }
-                auto local_d = dst + mb * jcp.oh * jcp.ow * jcp.oc
-                                            + y * jcp.ow * jcp.oc + x * jcp.oc;
-                auto local_w = wino_dst_ + m * jcp.oc;
-
-                auto scales = oscales.scales_;
-                dst_trans_p.dst = local_d;
-                dst_trans_p.wino_dst = local_w;
-                dst_trans_p.v_y_masks = v_y_masks;
-                dst_trans_p.v_x_masks = v_x_masks;
-
-                dst_trans_p.scales = scales;
-                dst_trans_p.bias = bia;
-
-                dst_trans_->ker_(&dst_trans_p);
-            });
-        }
-    }}
-    }
+#pragma unroll(4)
+            for (int i = 0; i < jcp.alpha; i++) {
+                v_y_masks[i] = (i < v_ys || i >= v_ye) ? 0 : 0xffff;
+                v_x_masks[i] = (i < v_xs || i >= v_xe) ? 0 : 0xffff;
+            }
+            auto local_s = src
+                    + mb * jcp.ih * jcp.iw * jcp.ic
+                    + y * jcp.iw * jcp.ic + x * jcp.ic;
+            auto local_w = wino_src + m * jcp.ic;
+
+            src_trans_p.src = local_s;
+            src_trans_p.wino_src = local_w;
+            src_trans_p.v_y_masks = v_y_masks;
+            src_trans_p.v_x_masks = v_x_masks;
+
+            src_trans_->ker_(&src_trans_p);
+        });
+
+        /* gemms */
+        parallel_nd(16, jcp.n_chunks, [&](int tile_ij, int nnb) {
+            auto gemm_p = jit_avx512_core_u8s8s32x_wino_conv_fwd_ker_t::
+                    call_params_t();
+
+            gemm_p.src = wino_src + jcp.inp_stride * tile_ij;
+            gemm_p.dst = wino_dst + jcp.out_stride * tile_ij
+                    + nnb * jcp.n2_block * jcp.n_block;
+            gemm_p.wei = wino_wei + jcp.wei_stride * tile_ij
+                    + nnb * jcp.n2_block * jcp.n_block * jcp.K;
+            gemm_p.dst_b = dst_bias + jcp.bia_stride * tile_ij
+                    + nnb * jcp.n2_block * jcp.n_block;
+
+            kernel_->ker_(&gemm_p);
+        });
+
+        /* transformation from winograd domain to output tensor */
+        parallel_nd(div_up(jcp.yb, 2), div_up(jcp.xb, 2),
+            [&](int y_in_block_b, int x_in_block_b) {
+            int y_in_block = y_in_block_b * 2;
+            int x_in_block = x_in_block_b * 2;
+
+            auto dst_trans_p =
+                jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t::call_params_t();
+
+            unsigned short v_y_masks[2], v_x_masks[2];
+
+            int y = y_in_block + tile_y;
+            int x = x_in_block + tile_x;
+            int m = (y_in_block / 2) * (jcp.xb / 2) + (x_in_block / 2);
+
+#pragma unroll(2)
+            for (int i = 0; i < jcp.m; i++) {
+                v_x_masks[i] = (x + i < jcp.ow) ? 0xffff : 0;
+                v_y_masks[i] = (y + i < jcp.oh) ? 0xffff : 0;
+            }
+            auto local_d = dst
+                    + mb * jcp.oh * jcp.ow * jcp.oc
+                    + y * jcp.ow * jcp.oc + x * jcp.oc;
+            auto local_w = wino_dst + m * jcp.oc;
+
+            auto scales = oscales.scales_;
+            dst_trans_p.dst = local_d;
+            dst_trans_p.wino_dst = local_w;
+            dst_trans_p.v_y_masks = v_y_masks;
+            dst_trans_p.v_x_masks = v_x_masks;
+
+            dst_trans_p.scales = scales;
+            dst_trans_p.bias = bia;
+
+            dst_trans_->ker_(&dst_trans_p);
+        });
+    }}}
 }
 
 template struct _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t<true,
index 391bafb..83392ab 100644 (file)
@@ -23,6 +23,7 @@
 #include "cpu_convolution_pd.hpp"
 #include "cpu_engine.hpp"
 #include "mkldnn_thread.hpp"
+#include "scratchpad.hpp"
 #include "type_helpers.hpp"
 #include "utils.hpp"
 
@@ -66,7 +67,7 @@ struct _jit_avx512_core_u8s8s32x_wino_convolution_fwd_t : public cpu_primitive_t
                 && this->cdesc_().src_desc.data_type == data_type::u8
                 && this->cdesc_().dst_desc.data_type == dst_data_type
                 && this->cdesc_().weights_desc.data_type == data_type::s8
-                && utils::implication(this->with_bias(),
+                && IMPLICATION(this->with_bias(),
                     utils::one_of(this->cdesc_().bias_desc.data_type,
                                                 data_type::f32, data_type::s32,
                                                 data_type::s8, data_type::u8))
@@ -118,17 +119,14 @@ private:
     jit_avx512_core_u8s8s32x_wino_conv_src_trans_t *src_trans_;
     jit_avx512_core_u8s8s32x_wino_conv_dst_trans_t *dst_trans_;
 
-    size_t size_wino_wei;
-    size_t size_wino_src;
-    size_t size_wino_dst;
+    size_t size_wino_wei_;
+    size_t size_wino_src_;
+    size_t size_wino_dst_;
+    size_t wino_shift_;
 
-    const wei_data_t *wino_wei_;
-    const acc_data_t *dst_bias_;
+    scratchpad_t *scratchpad_;
 
-    src_data_t *wino_src_;
-    acc_data_t *wino_dst_;
-
-    void *workspace;
+    mkldnn::impl::scales_t updated_output_scales_;
 };
 
 template <impl::data_type_t dst_type>
@@ -22,7 +22,7 @@
 #include "cpu_memory.hpp"
 
 #include "jit_uni_1x1_conv_utils.hpp"
-#include "jit_avx512_core_u8s8s32x_1x1_conv_kernel.hpp"
+#include "jit_avx512_core_x8s8s32x_1x1_conv_kernel.hpp"
 
 #define GET_OFF(field) offsetof(jit_1x1_conv_call_s, field)
 
@@ -35,7 +35,7 @@ using namespace mkldnn::impl::utils;
 
 using namespace Xbyak;
 
-bool jit_avx512_core_u8s8s32x_1x1_conv_kernel::maybe_relu(int position)
+bool jit_avx512_core_x8s8s32x_1x1_conv_kernel::maybe_relu(int position)
 {
     using namespace primitive_kind;
     const auto &p = attr_.post_ops_;
@@ -61,13 +61,13 @@ bool jit_avx512_core_u8s8s32x_1x1_conv_kernel::maybe_relu(int position)
     return false;
 }
 
-void jit_avx512_core_u8s8s32x_1x1_conv_kernel::bcast_loop(int load_loop_blk)
+void jit_avx512_core_x8s8s32x_1x1_conv_kernel::bcast_loop(int load_loop_blk)
 {
     mov(aux1_reg_bcast_data, reg_bcast_data);
     mov(aux_reg_bcast_data, reg_bcast_data);
 
     mov(aux_reg_output_data, reg_output_data);
-    mov(bcast_loop_iter, EVEX_compress_addr(rsp, bcast_loop_work_offt));
+    mov(bcast_loop_iter, EVEX_compress_addr(rsp, bcast_loop_work_off));
 
     Label bcast_loop;
     Label bcast_loop_tail;
@@ -109,7 +109,7 @@ void jit_avx512_core_u8s8s32x_1x1_conv_kernel::bcast_loop(int load_loop_blk)
     }
 }
 
-void jit_avx512_core_u8s8s32x_1x1_conv_kernel::cvt2ps(data_type_t type_in,
+void jit_avx512_core_x8s8s32x_1x1_conv_kernel::cvt2ps(data_type_t type_in,
         zmm_t zmm_in, const Xbyak::Operand &op, bool mask_flag) {
     zmm_t zmm = mask_flag ? zmm_in | ktail_mask | T_z : zmm_in;
     switch (type_in) {
@@ -123,7 +123,7 @@ void jit_avx512_core_u8s8s32x_1x1_conv_kernel::cvt2ps(data_type_t type_in,
         vcvtdq2ps(zmm_in, zmm_in);
 }
 
-void jit_avx512_core_u8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk,
+void jit_avx512_core_x8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk,
          int ur, int substep, bool wraparound)
 {
     auto vreg_load = [=](int i_load) {
@@ -134,10 +134,23 @@ void jit_avx512_core_u8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk,
         return Zmm(i_ur * load_loop_blk + i_load);
     };
 
+    auto zmm_bias_alpha = [=]() {
+        return Zmm(ur * load_loop_blk);
+    };
+
+    auto xmm_bias_alpha = [=]() {
+        return Xmm(ur * load_loop_blk);
+    };
     auto bias_ptr = [=](int i_load) {
         return EVEX_compress_addr(reg_bias_data,
                                   jcp.typesize_bia * jcp.oc_block * i_load);
     };
+
+    auto comp_ptr = [=](int i_load) {
+        return EVEX_compress_addr(reg_comp_data,
+                                  sizeof(int32_t) * jcp.oc_block * i_load);
+    };
+
     auto scale_ptr = [=](int i_load) {
         return EVEX_compress_addr(reg_ptr_scales,
                     jcp.is_oc_scale * (sizeof(float) * jcp.oc_block * i_load));
@@ -167,7 +180,8 @@ void jit_avx512_core_u8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk,
 
     auto output_ptr = [=](int i_load, int i_ur) {
         return EVEX_compress_addr(aux_reg_output_data,
-            jcp.typesize_out * (jcp.oc_without_padding * i_ur + i_load * jcp.load_block));
+            jcp.typesize_out * (jcp.oc_without_padding * i_ur
+                                + i_load * jcp.load_block));
     };
 
     auto init = [=]() {
@@ -176,6 +190,12 @@ void jit_avx512_core_u8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk,
                 auto r = vreg_accum(i_load, i_ur);
                 vpxord(r, r, r);
             }
+        if (jcp.signed_input) {
+            xor_(reg_scratch, reg_scratch);
+            Reg8 _t8 = reg_scratch.cvt8();
+            mov(_t8, (int8_t)-128);
+            vpbroadcastb(zmm_shift, _t8);
+        }
     };
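For signed (s8) activations the kernel reuses the unsigned u8*s8 multiply path: zmm_shift is loaded with -128, and vpsubb(zmm_bcast, zmm_bcast, zmm_shift) later in the reduce loop adds 128 to every input byte, mapping s8 [-128, 127] onto u8 [0, 255]. The shift is undone after accumulation by the per-output-channel compensation term, since shifting distributes over the dot product. A scalar model of the identity (illustrative, not the JIT code):

    // acc_shifted = sum_k (x[k] + 128) * w[k]
    //             = sum_k x[k] * w[k] + 128 * sum_k w[k]
    // acc = acc_shifted + comp, where comp = -128 * sum_k w[k]
    //       (comp is stored with the reordered *_s8s8 weights)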
 
     auto store = [=](const bool mask_flag_in) {
@@ -190,25 +210,45 @@ void jit_avx512_core_u8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk,
             mov(EVEX_compress_addr(rsp, reg_load_data_off), reg_load_data);
             mov(reg_ptr_sum_scale, (size_t)p_sum_scale);
         }
-        vpxord(zmm_zero, zmm_zero, zmm_zero);
+        if (jcp.signed_input && jcp.ver != ver_vnni) {
+            mov(reg_scratch, float2int(jcp.wei_adj_scale));
+            vmovq(xmm_bias_alpha(), reg_scratch);
+            vbroadcastss(zmm_bias_alpha(), xmm_bias_alpha());
+        }
         for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
             const bool mask_flag = mask_flag_in && i_load == load_loop_blk - 1;
             auto zmm_bias = zmm_tmp;
-            if (jcp.with_bias)
+            auto zmm_comp = zmm_bcast;
+            if (jcp.with_bias) {
+                if (jcp.signed_input)
+                    mov(reg_bias_data,
+                        EVEX_compress_addr(rsp,reg_bias_data_off));
                 cvt2ps(jcp.bia_dt, zmm_bias, bias_ptr(i_load), mask_flag);
+                if (jcp.signed_input && jcp.ver != ver_vnni)
+                    vmulps(zmm_bias, zmm_bias, zmm_bias_alpha());
+            }
+            if (jcp.signed_input) {
+                mov(reg_comp_data, EVEX_compress_addr(rsp, reg_comp_data_off));
+                cvt2ps(data_type::s32, zmm_comp, comp_ptr(i_load), mask_flag);
+            }
 
             for (int i_ur = 0; i_ur < ur; ++i_ur) {
                 auto r = vreg_accum(i_load, i_ur);
                 vcvtdq2ps(r, r);
+                if (jcp.signed_input)
+                    vaddps(r, r, zmm_comp);
                 if (jcp.with_bias)
                     vaddps(r, r, zmm_bias);
 
                 zmm_t mask_zmm = mask_flag ? r | ktail_mask | T_z : r;
                 vmulps(mask_zmm, r, scale_ptr(i_load));
-                if (maybe_relu(0))
+                if (maybe_relu(0)) {
+                    vpxord(zmm_zero, zmm_zero, zmm_zero);
                     vmaxps(r, zmm_zero, r);
+                }
                 if (p_sum_scale) { // post_op: sum
-                    auto zmm_prev_dst = zmm_bcast;
+                    vpxord(zmm_zero, zmm_zero, zmm_zero);
+                    auto zmm_prev_dst = zmm_zero;
 
                     cvt2ps(jcp.dst_dt, zmm_prev_dst, output_ptr(i_load, i_ur),
                         mask_flag);
@@ -218,8 +258,10 @@ void jit_avx512_core_u8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk,
                     else
                         vfmadd231ps(r, zmm_prev_dst, zword_b[reg_ptr_sum_scale]);
                 }
-                if (maybe_relu(1))
+                if (maybe_relu(1)) {
+                    vpxord(zmm_zero, zmm_zero, zmm_zero);
                     vmaxps(r, zmm_zero, r);
+                }
                 if (jcp.dst_dt != data_type::f32) {
                     if (attr_.round_mode_ == round_mode::nearest) {
                         vcvtps2dq(r | T_rn_sae, r);
@@ -280,6 +322,8 @@ void jit_avx512_core_u8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk,
                 } else {
                     vpbroadcastd(zmm_bcast, bcast_ptr(i_reduce, i_ur, false));
                 }
+                if (jcp.signed_input)
+                    vpsubb(zmm_bcast, zmm_bcast, zmm_shift);
                 for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
                     compute(vreg_accum(i_load, i_ur),
                                 vreg_load(i_load), zmm_bcast);
@@ -342,7 +386,7 @@ void jit_avx512_core_u8s8s32x_1x1_conv_kernel::reduce_loop(int load_loop_blk,
     }
 }
 
-void jit_avx512_core_u8s8s32x_1x1_conv_kernel::generate()
+void jit_avx512_core_x8s8s32x_1x1_conv_kernel::generate()
 {
     preamble();
 
@@ -363,7 +407,11 @@ void jit_avx512_core_u8s8s32x_1x1_conv_kernel::generate()
 
     if (jcp.with_bias)
         mov(reg_bias_data, ptr[param1 + GET_OFF(bias_data)]);
-
+    if (jcp.signed_input) {
+        mov(EVEX_compress_addr(rsp, reg_bias_data_off), reg_bias_data);
+        mov(reg_comp_data, ptr[param1 + GET_OFF(compensation)]);
+        mov(EVEX_compress_addr(rsp, reg_comp_data_off), reg_comp_data);
+    }
     mov(reg_ptr_scales, ptr[param1 + GET_OFF(scales)]);
     mov(EVEX_compress_addr(rsp, reg_ptr_sum_scale_off), reg_ptr_scales);
     mov(reg_bcast_data, ptr[param1 + GET_OFF(bcast_data)]);
@@ -372,7 +420,7 @@ void jit_avx512_core_u8s8s32x_1x1_conv_kernel::generate()
 
     mov(reg_load_loop_work, ptr[param1 + GET_OFF(load_dim)]);
     mov(reg_bcast_loop_work, ptr[param1 + GET_OFF(bcast_dim)]);
-    mov(EVEX_compress_addr(rsp, bcast_loop_work_offt), reg_bcast_loop_work);
+    mov(EVEX_compress_addr(rsp, bcast_loop_work_off), reg_bcast_loop_work);
     mov(reg_reduce_loop_work, ptr[param1 + GET_OFF(reduce_dim)]);
     mov(reg_reduce_pos_flag, ptr[param1 + GET_OFF(first_last_flag)]);
 
@@ -380,9 +428,20 @@ void jit_avx512_core_u8s8s32x_1x1_conv_kernel::generate()
     auto load_loop_body = [=](int load_loop_blk) {
         bcast_loop(load_loop_blk);
         add(reg_load_data, load_loop_blk * jcp.load_loop_load_step);
-        if (jcp.with_bias)
+        if (jcp.with_bias) {
+            if (jcp.signed_input)
+                mov(reg_bias_data, EVEX_compress_addr(rsp, reg_bias_data_off));
             add(reg_bias_data,
                 load_loop_blk * jcp.load_block * jcp.typesize_bia);
+            if (jcp.signed_input)
+                mov(EVEX_compress_addr(rsp, reg_bias_data_off), reg_bias_data);
+        }
+        if (jcp.signed_input) {
+            mov(reg_comp_data, EVEX_compress_addr(rsp, reg_comp_data_off));
+            add(reg_comp_data,
+                load_loop_blk * jcp.load_block * sizeof(int32_t));
+            mov(EVEX_compress_addr(rsp, reg_comp_data_off), reg_comp_data);
+        }
         mov(EVEX_compress_addr(rsp, reg_bcast_data_off), reg_bcast_data);
         mov(reg_ptr_scales, EVEX_compress_addr(rsp, reg_ptr_sum_scale_off));
         add(reg_ptr_scales,
@@ -446,7 +505,7 @@ void jit_avx512_core_u8s8s32x_1x1_conv_kernel::generate()
     postamble();
 }
 
-bool jit_avx512_core_u8s8s32x_1x1_conv_kernel::post_ops_ok(
+bool jit_avx512_core_x8s8s32x_1x1_conv_kernel::post_ops_ok(
         jit_1x1_conv_conf_t &jcp, const primitive_attr_t &attr) {
     using namespace primitive_kind;
     const auto &p = attr.post_ops_;
@@ -461,11 +520,11 @@ bool jit_avx512_core_u8s8s32x_1x1_conv_kernel::post_ops_ok(
     switch (p.len_) {
     case 0: return true;
     case 1: return true
-                && implication(jcp.with_eltwise, p.contain(sum, 0))
-                && implication(!jcp.with_eltwise, is_relu(0) || p.contain(sum, 0));
+                && IMPLICATION(jcp.with_eltwise, p.contain(sum, 0))
+                && IMPLICATION(!jcp.with_eltwise, is_relu(0) || p.contain(sum, 0));
     case 2: return true
-                && implication(jcp.with_eltwise, p.contain(sum, 0) && is_relu(1))
-                && implication(!jcp.with_eltwise, false
+                && IMPLICATION(jcp.with_eltwise, p.contain(sum, 0) && is_relu(1))
+                && IMPLICATION(!jcp.with_eltwise, false
                         || (p.contain(sum, 0) && is_relu(1))
                         || (p.contain(sum, 1) && is_relu(0)));
     case 3: return true
@@ -477,7 +536,7 @@ bool jit_avx512_core_u8s8s32x_1x1_conv_kernel::post_ops_ok(
     return false;
 }
 
-status_t jit_avx512_core_u8s8s32x_1x1_conv_kernel::init_conf(
+status_t jit_avx512_core_x8s8s32x_1x1_conv_kernel::init_conf(
         jit_1x1_conv_conf_t &jcp, const convolution_desc_t &cd,
         const memory_desc_wrapper &src_d, const memory_desc_wrapper &weights_d,
         const memory_desc_wrapper &dst_d, const memory_desc_wrapper &bias_d,
@@ -487,14 +546,15 @@ status_t jit_avx512_core_u8s8s32x_1x1_conv_kernel::init_conf(
     if (!mayiuse(avx512_core)) return status::unimplemented;
 
     const bool with_groups = weights_d.ndims() == src_d.ndims() + 1;
-    if (src_d.data_type() != data_type::u8
+    if (!one_of(src_d.data_type(), data_type::u8, data_type::s8)
         || weights_d.data_type() != data_type::s8
         || !one_of(dst_d.data_type(),
             data_type::f32, data_type::s32, data_type::s8, data_type::u8))
         return status::unimplemented;
-    if (!one_of(weights_d.format(), gOIhw4i16o4i, OIhw4i16o4i))
+    if (!one_of(weights_d.format(), gOIhw4i16o4i, OIhw4i16o4i,
+                gOIhw4i16o4i_s8s8, OIhw4i16o4i_s8s8)) {
         return status::unimplemented;
-
+    }
     jcp.ver = ver_avx512_core;
     if (mayiuse(avx512_core_vnni))
         jcp.ver = ver_vnni;
@@ -519,9 +579,11 @@ status_t jit_avx512_core_u8s8s32x_1x1_conv_kernel::init_conf(
     jcp.with_bias = cd.bias_desc.format != memory_format::undef;
     jcp.with_eltwise = with_relu;
     jcp.eltwise_alpha = relu_negative_slope;
-    if (!implication(with_relu, relu_negative_slope == 0.))
+    if (!IMPLICATION(with_relu, relu_negative_slope == 0.))
         return status::unimplemented;
 
+    jcp.signed_input = (src_d.data_type() == data_type::s8);
+
     jcp.os = jcp.oh * jcp.ow;
     jcp.is = jcp.ih * jcp.iw;
     jcp.tr_is = rnd_up(jcp.is, 4);
@@ -717,7 +779,9 @@ status_t jit_avx512_core_u8s8s32x_1x1_conv_kernel::init_conf(
 
     const auto &oscales = attr.output_scales_;
     jcp.is_oc_scale = oscales.mask_ == 1 << 1;
-    assert(utils::implication(!jcp.is_oc_scale, oscales.mask_ == 0));
+    assert(IMPLICATION(!jcp.is_oc_scale, oscales.mask_ == 0));
+
+    jcp.wei_adj_scale = (jcp.signed_input) ? (1.f / 2.f) : 1.f;
 
     return status::success;
 }
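A note on wei_adj_scale = 1/2: without VNNI, u8*s8 products are pair-summed into signed 16-bit lanes by vpmaddubsw, and with activations shifted into u8 the worst case 2 * 255 * 128 lies far outside int16 range. Halving the weights at reorder time keeps the intermediate representable, and the factor is cancelled downstream, where the bias is pre-multiplied by wei_adj_scale and the output scales by 1 / wei_adj_scale (the local_scales_ buffer in the primitive). Schematically:

    // acc' = sum x * (w / 2)
    // out  = (acc' + bias / 2) * (2 * scale) = (sum x * w + bias) * scale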
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef JIT_AVX512_CORE_U8S8S32X_1X1_CONV_KERNEL_HPP
-#define JIT_AVX512_CORE_U8S8S32X_1X1_CONV_KERNEL_HPP
+#ifndef JIT_AVX512_CORE_X8S8S32X_1X1_CONV_KERNEL_HPP
+#define JIT_AVX512_CORE_X8S8S32X_1X1_CONV_KERNEL_HPP
 
 #include "c_types_map.hpp"
 #include "jit_generator.hpp"
@@ -25,9 +25,9 @@ namespace mkldnn {
 namespace impl {
 namespace cpu {
 
-struct jit_avx512_core_u8s8s32x_1x1_conv_kernel: public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8s8s32x_1x1_conv_fwd_ker_t)
-    jit_avx512_core_u8s8s32x_1x1_conv_kernel(jit_1x1_conv_conf_t ajcp,
+struct jit_avx512_core_x8s8s32x_1x1_conv_kernel: public jit_generator {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_x8s8s32x_1x1_conv_fwd_ker_t)
+    jit_avx512_core_x8s8s32x_1x1_conv_kernel(jit_1x1_conv_conf_t ajcp,
             const primitive_attr_t &attr) : jcp(ajcp), attr_(attr)
     {
         this->generate();
@@ -77,6 +77,7 @@ struct jit_avx512_core_u8s8s32x_1x1_conv_kernel: public jit_generator {
     reg64_t reg_ptr_sum_scale = r10;
     reg64_t reg_reduce_loop_work = r11;
     reg64_t reg_bias_data = r12;
+    reg64_t reg_comp_data = r12;
     reg64_t reg_scratch = r13;
     reg64_t aux_reg_bcast_data = r14;
     reg64_t aux_reg_load_data = r15;
@@ -98,14 +99,19 @@ struct jit_avx512_core_u8s8s32x_1x1_conv_kernel: public jit_generator {
     Xbyak::Zmm zmm_one = Xbyak::Zmm(29);
     Xbyak::Zmm zmm_zero = Xbyak::Zmm(30);
     Xbyak::Zmm zmm_bcast = Xbyak::Zmm(31);
+    Xbyak::Zmm zmm_shift = Xbyak::Zmm(30);
 
-    int bcast_loop_work_offt = 0;
-    int reg_bias_data_offt = 8;
+    Xbyak::Zmm zmm_bias_alpha = Xbyak::Zmm(31);
+    Xbyak::Xmm xmm_bias_alpha = Xbyak::Xmm(31);
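Note the deliberate register aliasing above: zmm_shift shares Zmm(30) with zmm_zero, zmm_bias_alpha shares Zmm(31) with zmm_bcast, and reg_comp_data shares r12 with reg_bias_data. That aliasing appears to be why the updated kernel body re-materializes zmm_zero with vpxord right before each vmaxps, moves zmm_prev_dst off zmm_bcast, and spills the bias and compensation pointers to the stack around their uses: under signed input those registers hold the -128 shift, the compensation values, and the second pointer at other points of the loop.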
+
+    int bcast_loop_work_off = 0;
+    int reg_bias_data_off = 8;
     int reg_bcast_data_off = 16;
     int reg_load_data_off = 24;
     int reg_ptr_sum_scale_off = 32;
     int reg_last_load_off = 40;
-    int stack_space_needed = 48;
+    int reg_comp_data_off = 48;
+    int stack_space_needed = 56;
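Each *_off value above names an 8-byte spill slot addressed from rsp via EVEX_compress_addr; appending reg_comp_data_off at byte 48 is what grows stack_space_needed from 48 to 56. Sketch of the resulting layout (slot names abbreviated):

    // rsp+0  bcast_loop_work   rsp+24 load_data
    // rsp+8  bias_data         rsp+32 ptr_sum_scale
    // rsp+16 bcast_data        rsp+40 last_load
    // rsp+48 comp_data         => 56 bytes reserved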
 
     void bcast_loop(int load_loop_blk);
     void reduce_loop(int load_loop_blk, int ur, int substep, bool wraparound);
@@ -22,7 +22,7 @@
 #include "type_helpers.hpp"
 #include "jit_generator.hpp"
 
-#include "jit_avx512_core_u8s8s32x_1x1_convolution.hpp"
+#include "jit_avx512_core_x8s8s32x_1x1_convolution.hpp"
 
 namespace mkldnn {
 namespace impl {
@@ -56,8 +56,9 @@ void balance2D(U nthr, U ithr, T ny, T &ny_start, T &ny_end,
 }
 
 /* convolution forward */
-template <bool with_relu, data_type_t dst_type>
-void _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<with_relu, dst_type>::execute_forward()
+template <bool with_relu, data_type_t src_type, data_type_t dst_type>
+void _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t
+                              <with_relu, src_type, dst_type>::execute_forward()
 {
     auto src = reinterpret_cast<const src_data_t *>(this->input_memory(0));
     auto weights =
@@ -69,8 +70,8 @@ void _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<with_relu, dst_type>::execu
     });
 }
 
-template <bool with_relu, data_type_t dst_type>
-void _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<with_relu, dst_type>
+template <bool with_relu, data_type_t src_type, data_type_t dst_type>
+void _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<with_relu, src_type, dst_type>
 ::execute_forward_thr(const int ithr, const int nthr, const src_data_t *src,
         const wei_data_t *weights, const char *bias, dst_data_t *dst) {
     const memory_desc_wrapper src_d(conf_.src_pd());
@@ -91,6 +92,12 @@ void _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<with_relu, dst_type>
 
     const auto &oscales = conf_.attr()->output_scales_;
 
+    int offset = jcp.ngroups * (jcp.oc / jcp.oc_block) * (jcp.ic / jcp.ic_block)
+        * jcp.oc_block * jcp.ic_block;
+    wei_data_t *w = const_cast<wei_data_t *>(weights);
+    int32_t *compensation = (jcp.signed_input)
+        ? reinterpret_cast<int32_t *>(w + offset) : nullptr;
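The compensation vector produced by the *_s8s8 weights reorder sits directly after the packed weights, so offset above is simply the total number of int8 weight elements; reinterpreting the tail as int32_t then yields one compensation entry per output channel, consumed below as &compensation[_ocb * jcp.oc_block].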
+
     auto step = [](int default_step, int remaining, int tail_step) {
         assert(default_step <= tail_step);
         return remaining < tail_step ? remaining : default_step;
@@ -164,7 +171,11 @@ void _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<with_relu, dst_type>
             ? weights_d.blk_off(g, ocb, icb)
             : weights_d.blk_off(ocb, icb)];
         p.bias_data = &bias[_ocb * jcp.oc_block * bia_dt_size];
-        p.scales = &oscales.scales_[jcp.is_oc_scale * _ocb * jcp.oc_block];
+        p.compensation = (jcp.signed_input)
+            ? &compensation[_ocb * jcp.oc_block] : nullptr;
+        p.scales = (jcp.signed_input && jcp.ver != ver_vnni)
+            ? &local_scales_[jcp.is_oc_scale * _ocb * jcp.oc_block]
+            : &oscales.scales_[jcp.is_oc_scale * _ocb * jcp.oc_block];
         if (conf_.rtus_.reduce_src_) {
             rp.ws = scratch_ + ithr * ws_per_thread_
                 + _icb * jcp.is * jcp.ic_block;
@@ -244,19 +255,38 @@ void _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<with_relu, dst_type>
     }
 }
 
-
-template struct _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<false, data_type::u8>;
-template struct _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<true, data_type::u8>;
-
-template struct _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<false, data_type::s8>;
-template struct _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<true, data_type::s8>;
-
-template struct _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<false, data_type::s32>;
-template struct _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<true, data_type::s32>;
-
-template struct _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<false, data_type::f32>;
-template struct _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<true, data_type::f32>;
-
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<false,
+                                                  data_type::u8, data_type::u8>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<true,
+                                                  data_type::u8, data_type::u8>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<false,
+                                                  data_type::s8, data_type::u8>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<true,
+                                                  data_type::s8, data_type::u8>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<false,
+                                                  data_type::u8, data_type::s8>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<true,
+                                                  data_type::u8, data_type::s8>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<false,
+                                                  data_type::s8, data_type::s8>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<true,
+                                                  data_type::s8, data_type::s8>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<false,
+                                                 data_type::u8, data_type::s32>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<true,
+                                                 data_type::u8, data_type::s32>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<false,
+                                                 data_type::s8, data_type::s32>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<true,
+                                                 data_type::s8, data_type::s32>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<false,
+                                                 data_type::u8, data_type::f32>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<true,
+                                                 data_type::u8, data_type::f32>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<false,
+                                                 data_type::s8, data_type::f32>;
+template struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<true,
+                                                 data_type::s8, data_type::f32>;
 }
 }
 }
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_JIT_AVX512_CORE_U8S8S32X_1X1_CONVOLUTION_HPP
-#define CPU_JIT_AVX512_CORE_U8S8S32X_1X1_CONVOLUTION_HPP
+#ifndef CPU_JIT_AVX512_CORE_X8S8S32X_1X1_CONVOLUTION_HPP
+#define CPU_JIT_AVX512_CORE_X8S8S32X_1X1_CONVOLUTION_HPP
 
 #include "c_types_map.hpp"
 #include "cpu_convolution_pd.hpp"
 #include "utils.hpp"
 
 #include "jit_uni_1x1_conv_utils.hpp"
-#include "jit_avx512_core_u8s8s32x_1x1_conv_kernel.hpp"
+#include "jit_avx512_core_x8s8s32x_1x1_conv_kernel.hpp"
 
 namespace mkldnn {
 namespace impl {
 namespace cpu {
 
-template <bool with_relu, impl::data_type_t dst_type>
-struct _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t : public cpu_primitive_t {
+template<bool with_relu, impl::data_type_t src_type, impl::data_type_t dst_type>
+struct _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t : public cpu_primitive_t {
     struct pd_t: public _cpu_convolution_fwd_pd_t<with_relu> {
         pd_t(engine_t *engine,
                 const typename pd_t::base_desc_t *adesc,
@@ -43,9 +43,9 @@ struct _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t : public cpu_primitive_t
             , jcp_(), rtus_() {}
 
         DECLARE_COMMON_PD_T(
-                JIT_IMPL_NAME_HELPER("jit_1x1:", avx512_core, ""),
-                _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<with_relu,
-                dst_type>);
+                JIT_IMPL_NAME_HELPER("jit_int8_1x1:", avx512_core, ""),
+                _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<with_relu,
+                src_type, dst_type>);
 
         virtual status_t init() override {
             using namespace prop_kind;
@@ -57,10 +57,10 @@ struct _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t : public cpu_primitive_t
                         forward_inference)
                 && this->cdesc_().alg_kind == alg_kind::convolution_direct
                 && !this->has_zero_dim_memory()
-                && this->cdesc_().src_desc.data_type == data_type::u8
+                && this->cdesc_().src_desc.data_type == src_type
                 && this->cdesc_().dst_desc.data_type == dst_type
                 && this->cdesc_().weights_desc.data_type == data_type::s8
-                && utils::implication(this->with_bias(), utils::one_of(
+                && IMPLICATION(this->with_bias(), utils::one_of(
                             this->cdesc_().bias_desc.data_type, data_type::f32,
                             data_type::s32, data_type::s8, data_type::u8))
                 && this->cdesc_().accum_data_type == data_type::s32;
@@ -70,7 +70,7 @@ struct _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t : public cpu_primitive_t
             const convolution_desc_t *conv_d = &this->cdesc_();
             const memory_desc_t *src_d = this->src_pd_.desc();
             rtus_prepare(this, conv_d, src_d, this->dst_pd_.desc());
-            return jit_avx512_core_u8s8s32x_1x1_conv_kernel::init_conf(jcp_,
+            return jit_avx512_core_x8s8s32x_1x1_conv_kernel::init_conf(jcp_,
                     *conv_d, *src_d, *this->weights_pd_.desc(),
                     *this->dst_pd_.desc(), *this->bias_pd_.desc(), *this->attr(),
                     with_relu, this->negative_slope(),
@@ -83,42 +83,57 @@ struct _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t : public cpu_primitive_t
             bool reduce_src_;
         } rtus_;
 
-      protected:
-        virtual status_t set_default_params() override {
-            using namespace memory_format;
-            if (this->src_pd_.desc()->format == any)
-                CHECK(this->src_pd_.set_format(nhwc));
-            if (this->dst_pd_.desc()->format == any)
-                CHECK(this->dst_pd_.set_format(nhwc));
-            if (this->weights_pd_.desc()->format == any)
-                CHECK(this->weights_pd_.set_format(this->with_groups()
-                                        ? gOIhw4i16o4i : OIhw4i16o4i));
-            if (this->bias_pd_.desc()->format == any)
-                CHECK(this->bias_pd_.set_format(x));
-            return status::success;
-        }
+        protected:
+            virtual status_t set_default_params() override {
+                using namespace memory_format;
+                bool is_sign_input =
+                    this->cdesc_().src_desc.data_type == data_type::s8;
+                if (this->src_pd_.desc()->format == any)
+                    CHECK(this->src_pd_.set_format(nhwc));
+                if (this->dst_pd_.desc()->format == any)
+                    CHECK(this->dst_pd_.set_format(nhwc));
+                if (this->weights_pd_.desc()->format == any)
+                    CHECK(this->weights_pd_.set_format(this->with_groups()
+                        ? ((is_sign_input) ? gOIhw4i16o4i_s8s8 : gOIhw4i16o4i)
+                        : ((is_sign_input) ? OIhw4i16o4i_s8s8 : OIhw4i16o4i)));
+                if (this->bias_pd_.desc()->format == any)
+                    CHECK(this->bias_pd_.set_format(x));
+                return status::success;
+            }
     };
 
     template <cpu_isa_t isa, typename conv_t>
     friend void init_rtus_driver(conv_t *self);
-    _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t(const pd_t *pd,
+    _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t(const pd_t *pd,
                                           const input_vector &inputs,
                                           const output_vector &outputs)
         : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd)
         , kernel_(nullptr), rtus_driver_(nullptr), ws_per_thread_(0)
-        , scratch_(nullptr)
+        , scratch_(nullptr), local_scales_(nullptr)
     {
-        kernel_ = new jit_avx512_core_u8s8s32x_1x1_conv_kernel(conf_.jcp_,
+        kernel_ = new jit_avx512_core_x8s8s32x_1x1_conv_kernel(conf_.jcp_,
                     *conf_.attr());
         init_rtus_driver<avx512_common>(this);
+        if (conf_.jcp_.signed_input && conf_.jcp_.ver != ver_vnni) {
+            size_t scales_size = ((conf_.attr()->output_scales_.count_ == 1)
+                    ? 16
+                    : conf_.attr()->output_scales_.count_);
+            local_scales_ = (float *)malloc(sizeof(float) * scales_size, 64);
+            for (size_t i = 0; i < scales_size; i++) {
+                local_scales_[i] = conf_.attr()->output_scales_.scales_[i] *
+                                        (1.f / conf_.jcp_.wei_adj_scale);
+            }
+        }
     }
-    ~_jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t() {
+    ~_jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t() {
         delete kernel_;
         delete rtus_driver_;
         free(scratch_);
+        if (local_scales_) free(local_scales_);
     }
 
-    typedef typename prec_traits<data_type::u8>::type src_data_t;
+    typedef typename prec_traits<src_type>::type src_data_t;
     typedef typename prec_traits<data_type::s8>::type wei_data_t;
     typedef typename prec_traits<dst_type>::type dst_data_t;
     typedef typename prec_traits<data_type::s32>::type acc_data_t;
@@ -134,21 +149,21 @@ struct _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t : public cpu_primitive_t
             const src_data_t *src, const wei_data_t *weights,
             const char *bias, dst_data_t *dst);
     pd_t conf_;
-    jit_avx512_core_u8s8s32x_1x1_conv_kernel *kernel_;
+    jit_avx512_core_x8s8s32x_1x1_conv_kernel *kernel_;
 
     rtus_driver_t<avx512_common> *rtus_driver_;
     size_t ws_per_thread_;
     src_data_t *scratch_;
+    float *local_scales_;
 };
 
-template <impl::data_type_t dst_type>
-using jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t =
-    _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<false, dst_type>;
-
-template <impl::data_type_t dst_type>
-using jit_avx512_core_u8s8s32x_1x1_convolution_relu_t =
-    _jit_avx512_core_u8s8s32x_1x1_convolution_fwd_t<true, dst_type>;
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t =
+    _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<false, src_type, dst_type>;
 
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_avx512_core_x8s8s32x_1x1_convolution_relu_t =
+    _jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t<true, src_type, dst_type>;
 }
 }
 }
@@ -20,7 +20,7 @@
 #include "utils.hpp"
 #include "cpu_memory.hpp"
 
-#include "jit_avx512_core_u8s8s32x_conv_kernel.hpp"
+#include "jit_avx512_core_x8s8s32x_conv_kernel.hpp"
 
 #define GET_OFF(field) offsetof(jit_conv_call_s, field)
 
@@ -35,13 +35,13 @@ using namespace Xbyak;
 namespace {
 void pick_loop_order(jit_conv_conf_t &jcp)
 {
-    jcp.loop_order = loop_cgn;
+    jcp.loop_order = loop_cwgn;
     if (jcp.ngroups > 1)
-        jcp.loop_order = loop_ngc;
+        jcp.loop_order = loop_ngcw;
 }
 }
 
-bool jit_avx512_core_u8s8s32x_fwd_kernel::maybe_relu(int position)
+bool jit_avx512_core_x8s8s32x_fwd_kernel::maybe_relu(int position)
 {
     using namespace primitive_kind;
     const auto &p = attr_.post_ops_;
@@ -67,16 +67,22 @@ bool jit_avx512_core_u8s8s32x_fwd_kernel::maybe_relu(int position)
     return false;
 }
 
-void jit_avx512_core_u8s8s32x_fwd_kernel::prepare_output(int ur_w)
+void jit_avx512_core_x8s8s32x_fwd_kernel::prepare_output(int ur_w)
 {
     for (int k = 0; k < jcp.nb_oc_blocking; k++)
         for (int j = 0; j < ur_w; j++) {
             Zmm zmm = zmm_out(j, k);
             vpxord(zmm, zmm, zmm);
         }
+    if (jcp.signed_input) {
+        xor_(reg_scratch, reg_scratch);
+        Reg8 _t8 = reg_scratch.cvt8();
+        mov(_t8, (int8_t)-128);
+        vpbroadcastb(zmm_shift, _t8);
+    }
 }
 
-void jit_avx512_core_u8s8s32x_fwd_kernel::cvt2ps(data_type_t type_in,
+void jit_avx512_core_x8s8s32x_fwd_kernel::cvt2ps(data_type_t type_in,
         zmm_t zmm_in, const Xbyak::Operand &op, bool mask_flag) {
     zmm_t zmm = mask_flag ? zmm_in | ktail_mask | T_z : zmm_in;
     switch (type_in) {
@@ -90,13 +96,15 @@ void jit_avx512_core_u8s8s32x_fwd_kernel::cvt2ps(data_type_t type_in,
         vcvtdq2ps(zmm_in, zmm_in);
 }
 
-void jit_avx512_core_u8s8s32x_fwd_kernel::store_output(int ur_w,
+void jit_avx512_core_x8s8s32x_fwd_kernel::store_output(int ur_w,
         int last_oc_block_flag)
 {
     int nb_oc_block = jcp.nb_oc_blocking;
 
     mov(reg_bias, ptr[param1 + GET_OFF(bias)]);
     mov(reg_ptr_scales, ptr[param1 + GET_OFF(scales)]);
+    if (jcp.signed_input)
+        mov(reg_compensation, ptr[param1 + GET_OFF(compensation)]);
 
     const auto &p = attr_.post_ops_;
     const int sum_idx = p.find(primitive_kind::sum);
@@ -106,16 +114,30 @@ void jit_avx512_core_u8s8s32x_fwd_kernel::store_output(int ur_w,
     if (p_sum_scale && *p_sum_scale != 1.f)
         mov(reg_ptr_sum_scale, (size_t)p_sum_scale);
 
-    vpxord(zmm_zero, zmm_zero, zmm_zero);
+    if (jcp.signed_input && jcp.ver != ver_vnni) {
+        mov(reg_bias_alpha, float2int(jcp.wei_adj_scale));
+        vmovq(xmm_bias_alpha(), reg_bias_alpha);
+        vbroadcastss(zmm_bias_alpha(), xmm_bias_alpha());
+    }
+
     for (int k = 0; k < nb_oc_block; k++) {
         const bool mask_flag = last_oc_block_flag == 1 && k == nb_oc_block - 1;
         int scale_offset = jcp.is_oc_scale * (sizeof(float) * k * jcp.oc_block);
         auto zmm_bias = zmm_tmp;
+        auto zmm_comp = zmm_shift;
         if (jcp.with_bias) {
             int bias_offset = jcp.typesize_bia * k * jcp.oc_block;
             auto bias_addr = EVEX_compress_addr(reg_bias, bias_offset);
 
             cvt2ps(jcp.bia_dt, zmm_bias, bias_addr, mask_flag);
+            if (jcp.signed_input && jcp.ver != ver_vnni)
+                vmulps(zmm_bias, zmm_bias, zmm_bias_alpha());
+        }
+        if (jcp.signed_input) {
+            int comp_offset = sizeof(int32_t) * k * jcp.oc_block;
+            auto comp_addr = EVEX_compress_addr(reg_compensation, comp_offset);
+
+            cvt2ps(data_type::s32, zmm_comp, comp_addr, mask_flag);
         }
         for (int j = 0; j < ur_w; j++) {
             int aux_output_offset
@@ -125,27 +147,31 @@ void jit_avx512_core_u8s8s32x_fwd_kernel::store_output(int ur_w,
 
             Zmm zmm = zmm_out(j, k);
             vcvtdq2ps(zmm, zmm);
+            if (jcp.signed_input)
+                vaddps(zmm, zmm, zmm_comp);
             if (jcp.with_bias)
                 vaddps(zmm, zmm, zmm_bias);
 
             zmm_t mask_zmm = mask_flag ? zmm | ktail_mask | T_z : zmm;
             vmulps(mask_zmm, zmm,
                     EVEX_compress_addr(reg_ptr_scales, scale_offset));
-            if (maybe_relu(0))
+            if (maybe_relu(0)) {
+                vpxord(zmm_zero, zmm_zero, zmm_zero);
                 vmaxps(zmm, zmm_zero, zmm);
+            }
             if (p_sum_scale) { // post_op: sum
-                auto zmm_prev_dst = zmm_bcast;
-
+                vpxord(zmm_zero, zmm_zero, zmm_zero);
+                auto zmm_prev_dst = zmm_zero;
                 cvt2ps(jcp.dst_dt, zmm_prev_dst, addr, mask_flag);
-
                 if (*p_sum_scale == 1.f)
                     vaddps(zmm, zmm_prev_dst);
                 else
                     vfmadd231ps(zmm, zmm_prev_dst, zword_b[reg_ptr_sum_scale]);
             }
-            if (maybe_relu(1))
+            if (maybe_relu(1)) {
+                vpxord(zmm_zero, zmm_zero, zmm_zero);
                 vmaxps(zmm, zmm_zero, zmm);
-
+            }
             if (jcp.dst_dt != data_type::f32) {
                 if (attr_.round_mode_ == round_mode::nearest)
                     vcvtps2dq(zmm | T_rn_sae, zmm);
@@ -174,8 +200,8 @@ void jit_avx512_core_u8s8s32x_fwd_kernel::store_output(int ur_w,
     }
 }
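store_output interleaves the new compensation add with the existing bias/scale/post-op pipeline. A hedged scalar restatement of the per-element order of operations, fixing the destination to u8 and the round mode to nearest for brevity; the final saturating downconvert and all parameter names are illustrative, and the kernel decides each relu position separately via maybe_relu(0)/maybe_relu(1):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t store_one(int32_t acc, float comp, float bias, float scale,
                      float sum_scale, uint8_t prev, bool signed_input,
                      bool with_bias, bool with_sum, bool relu) {
        float x = (float)acc;                       // vcvtdq2ps
        if (signed_input) x += comp;                // add -128 * sum(wei) compensation
        if (with_bias)    x += bias;                // bias converted by cvt2ps
        x *= scale;                                 // per-tensor or per-oc output scale
        if (relu) x = std::max(x, 0.f);             // maybe_relu(0)
        if (with_sum) x += sum_scale * (float)prev; // post-op: sum
        if (relu) x = std::max(x, 0.f);             // maybe_relu(1)
        int32_t q = (int32_t)std::nearbyint(x);     // vcvtps2dq with T_rn_sae
        return (uint8_t)std::min(std::max(q, 0), 255); // saturating downconvert (assumed)
    }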
 
-void jit_avx512_core_u8s8s32x_fwd_kernel::compute_ker(int ur_w,
-    int pad_l, int pad_r, int last_ic_block_flag)
+void jit_avx512_core_x8s8s32x_fwd_kernel::compute_ker(int ur_w,
+    int pad_l, int pad_r, int last_ic_block_flag, bool h_padded)
 {
     int kw = jcp.kw;
     int stride_w = jcp.stride_w;
@@ -185,12 +211,6 @@ void jit_avx512_core_u8s8s32x_fwd_kernel::compute_ker(int ur_w,
 
     int nb_oc_block = jcp.nb_oc_blocking;
 
-    Label kh_label, skip_kh_loop;
-
-    int shift_kernel_ptr = jcp.typesize_in * jcp.kw * ch_block_all;
-    int shift_input_ptr = jcp.typesize_in * (jcp.dilate_h + 1) * jcp.iw
-        * jcp.ic_without_padding * jcp.ngroups;
-
     auto input_offset = [=](int oi, int ic, int ki) {
         return jcp.typesize_in
                 * ((ki * (jcp.dilate_w + 1) + oi * stride_w - pad_l)
@@ -215,64 +235,111 @@ void jit_avx512_core_u8s8s32x_fwd_kernel::compute_ker(int ur_w,
         }
     };
 
-    mov(aux_reg_inp, reg_inp);
-    mov(aux_reg_ker, reg_ker);
-
-    mov(reg_kj, reg_kh);
-    if ((jcp.kh - 1) * (jcp.dilate_h + 1) < nstl::max(jcp.t_pad, jcp.b_pad)) {
-        cmp(reg_kj, 0);
-        je(skip_kh_loop, T_NEAR);
-    }
-    L(kh_label); {
-        for (int ki = 0; ki < kw; ki++) {
-            int jj_start = get_ow_start(ki, pad_l);
-            int jj_end = get_ow_end(ur_w, ki, pad_r);
-            int tail_size = jcp.ic_without_padding % 4;
-            /* Skip the last loads of input if (ic%16)/4 < ic_block/4 */
-            int icb = jcp.is_depthwise
-                ? 1
-                : (last_ic_block_flag != no_last_block)
-                    ? div_up((jcp.ic_without_padding % ic_block), 4)
-                    : ic_block / 4;
-            for (int ic = 0; ic < icb; ic++) {
-                for (int jj = jj_start; jj < jj_end; jj++) {
+    for (int ki = 0; ki < kw; ki++) {
+        int jj_start = get_ow_start(ki, pad_l);
+        int jj_end = get_ow_end(ur_w, ki, pad_r);
+        int tail_size = jcp.ic_without_padding % 4;
+        int _start = (jcp.signed_input) ? 0 : jj_start;
+        int _end = (jcp.signed_input) ? ur_w : jj_end;
+        /* Skip the last loads of input if (ic%16)/4 < ic_block/4 */
+        int icb = jcp.is_depthwise
+            ? 1
+            : (last_ic_block_flag != no_last_block)
+                ? div_up((jcp.ic_without_padding % ic_block), 4)
+                : ic_block / 4;
+        for (int ic = 0; ic < icb; ic++) {
+            if (h_padded) {
+                Zmm inp = zmm_inp(0, nb_oc_block);
+                vpxord(inp, inp, inp);
+                vpsubb(inp, inp, zmm_shift);
+            } else {
+                for (int jj = _start; jj < _end; jj++) {
                     int aux_input_offset = input_offset(jj, ic, ki);
-                    if (jcp.is_depthwise) {
-                        vpmovzxbd(zmm_inp(jj, nb_oc_block),
-                                EVEX_compress_addr(
-                                          aux_reg_inp, aux_input_offset));
-                    } else if (last_ic_block_flag == last_sp_block
-                            && tail_size != 0 && ic == icb - 1) {
-                        Xmm xmm_tmp = Xmm(zmm_inp(jj, nb_oc_block).getIdx());
-                        for (int r = 0; r < tail_size; ++r)
-                            vpinsrb(xmm_tmp, xmm_tmp,
+                    if (jj >= jj_start && jj < jj_end) {
+                        if (jcp.is_depthwise) {
+                            vpmovzxbd(zmm_inp(jj, nb_oc_block),
+                                    EVEX_compress_addr(
+                                              aux_reg_inp, aux_input_offset));
+                        } else if (last_ic_block_flag == last_sp_block
+                                && tail_size != 0 && ic == icb - 1) {
+                            Xmm xmm_tmp = Xmm(zmm_inp(jj, nb_oc_block).getIdx());
+                            for (int r = 0; r < tail_size; ++r)
+                                vpinsrb(xmm_tmp, xmm_tmp,
                                     ptr[aux_reg_inp + aux_input_offset + r], r);
-                        vpbroadcastd(zmm_inp(jj, nb_oc_block), xmm_tmp);
+                            vpbroadcastd(zmm_inp(jj, nb_oc_block), xmm_tmp);
+                        } else {
+                            vpbroadcastd(zmm_inp(jj, nb_oc_block),
+                                    EVEX_compress_addr(
+                                                 aux_reg_inp, aux_input_offset));
+                        }
+                        if (jcp.signed_input)
+                            vpsubb(zmm_inp(jj, nb_oc_block),
+                                   zmm_inp(jj, nb_oc_block), zmm_shift);
                     } else {
-                        vpbroadcastd(zmm_inp(jj, nb_oc_block),
-                                EVEX_compress_addr(
-                                             aux_reg_inp, aux_input_offset));
+                        if (jcp.signed_input) {
+                            Zmm inp = zmm_inp(jj, nb_oc_block);
+                            vpxord(inp, inp, inp);
+                            vpsubb(inp, inp, zmm_shift);
+                        }
                     }
                 }
-
-                for (int ii = 0; ii < nb_oc_block; ii++) {
-                    int aux_kernel_offset = kernel_offset(ii, ic, ki);
-                    if (jj_end - jj_start > 0) {
-                        if (jcp.is_depthwise)
-                            vpmovsxbd(
-                                    zmm_wei, EVEX_compress_addr(aux_reg_ker,
-                                                     aux_kernel_offset));
-                        else
-                            vmovups(zmm_wei, EVEX_compress_addr(aux_reg_ker,
-                                                     aux_kernel_offset));
-                    }
-                    for (int jj = jj_start; jj < jj_end; jj++) {
-                        compute(zmm_out(jj, ii), zmm_wei,
-                                                 zmm_inp(jj, nb_oc_block));
-                    }
+            }
+            for (int ii = 0; ii < nb_oc_block; ii++) {
+                int aux_kernel_offset = kernel_offset(ii, ic, ki);
+                if (jcp.is_depthwise)
+                    vpmovsxbd(
+                            zmm_wei, EVEX_compress_addr(aux_reg_ker,
+                                             aux_kernel_offset));
+                else
+                    vmovups(zmm_wei, EVEX_compress_addr(aux_reg_ker,
+                                             aux_kernel_offset));
+                for (int jj = _start; jj < _end; jj++) {
+                    Zmm inp = h_padded
+                        ? zmm_inp(0, nb_oc_block) : zmm_inp(jj, nb_oc_block);
+                    compute(zmm_out(jj, ii), zmm_wei, inp);
                 }
             }
         }
+    }
+}
+void jit_avx512_core_x8s8s32x_fwd_kernel::kh_loop(int ur_w,
+    int pad_l, int pad_r, int last_ic_block_flag)
+{
+    Label kh_label, skip_kh_loop;
+    Label t_overflow_label, no_t_overflow_label,
+          b_overflow_label, no_b_overflow_label;
+
+    int ch_block_all = jcp.ch_block * jcp.ic_block * jcp.oc_block;
+    int shift_kernel_ptr = jcp.typesize_in * jcp.kw * ch_block_all;
+    int shift_input_ptr = jcp.typesize_in * (jcp.dilate_h + 1) * jcp.iw
+        * jcp.ic_without_padding * jcp.ngroups;
+
+    mov(aux_reg_inp, reg_inp);
+    mov(aux_reg_ker, reg_ker);
+
+    if (jcp.signed_input) {
+        mov(reg_overflow, ptr[param1 + GET_OFF(t_overflow)]);
+        cmp(reg_overflow, 0);
+        je(no_t_overflow_label, T_NEAR);
+        L(t_overflow_label); {
+            compute_ker(ur_w, pad_l, pad_r, last_ic_block_flag, true);
+
+            add(aux_reg_ker, shift_kernel_ptr);
+            dec(reg_overflow);
+            cmp(reg_overflow, 0);
+            jg(t_overflow_label, T_NEAR);
+        }
+        L(no_t_overflow_label);
+    }
+    mov(reg_kj, ptr[param1 + GET_OFF(kh_padding)]);
+    if (jcp.signed_input ||
+       (jcp.kh - 1) * (jcp.dilate_h + 1) < nstl::max(jcp.t_pad, jcp.b_pad)) {
+        cmp(reg_kj, 0);
+        je(skip_kh_loop, T_NEAR);
+    }
+    L(kh_label); {
+        compute_ker(ur_w, pad_l, pad_r, last_ic_block_flag, false);
+
         add(aux_reg_ker, shift_kernel_ptr);
         add(aux_reg_inp, shift_input_ptr);
         dec(reg_kj);
@@ -280,9 +347,23 @@ void jit_avx512_core_u8s8s32x_fwd_kernel::compute_ker(int ur_w,
         jg(kh_label, T_NEAR);
     }
     L(skip_kh_loop);
+    if (jcp.signed_input) {
+        mov(reg_overflow, ptr[param1 + GET_OFF(b_overflow)]);
+        cmp(reg_overflow, 0);
+        je(no_b_overflow_label, T_NEAR);
+        L(b_overflow_label); {
+            compute_ker(ur_w, pad_l, pad_r, last_ic_block_flag, true);
+
+            add(aux_reg_ker, shift_kernel_ptr);
+            dec(reg_overflow);
+            cmp(reg_overflow, 0);
+            jg(b_overflow_label, T_NEAR);
+        }
+        L(no_b_overflow_label);
+    }
 }
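The new kh_loop wraps the kernel-height iteration that previously lived inside compute_ker and adds two overflow loops for signed input: padded rows cannot simply be skipped, because every input byte has been shifted by +128, so the implicitly zero rows must still contribute 128 * wei for the uniform -128 * sum(wei) compensation to cancel exactly. Only the kernel pointer advances in the overflow loops. The structure as a standalone sketch (names illustrative):

    #include <functional>

    void kh_loop_sketch(int t_overflow, int kh_padding, int b_overflow,
                        const std::function<void(bool /*h_padded*/)> &compute_row) {
        for (int k = 0; k < t_overflow; ++k) compute_row(true);  // rows above the input
        for (int k = 0; k < kh_padding; ++k) compute_row(false); // real input rows
        for (int k = 0; k < b_overflow; ++k) compute_row(true);  // rows below the input
    }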
 
-void jit_avx512_core_u8s8s32x_fwd_kernel::compute_loop(
+void jit_avx512_core_x8s8s32x_fwd_kernel::icb_loop(
         int ur_w, int pad_l, int pad_r, bool is_last_sp_block)
 {
     prepare_output(ur_w);
@@ -297,16 +378,16 @@ void jit_avx512_core_u8s8s32x_fwd_kernel::compute_loop(
         cmp(reg_icb, 1); // The last IC block
         jne(common_ker, T_NEAR);
 
-        compute_ker(ur_w, pad_l, pad_r,
+        kh_loop(ur_w, pad_l, pad_r,
                 is_last_sp_block ? last_sp_block : last_ic_block);
         jmp(end_ker, T_NEAR);
 
         L(common_ker);
-        compute_ker(ur_w, pad_l, pad_r, no_last_block);
+        kh_loop(ur_w, pad_l, pad_r, no_last_block);
 
         L(end_ker);
     } else {
-        compute_ker(ur_w, pad_l, pad_r, no_last_block);
+        kh_loop(ur_w, pad_l, pad_r, no_last_block);
     }
     // End of IC Loop
     int inp_step = jcp.ic_block;
@@ -343,10 +424,12 @@ void jit_avx512_core_u8s8s32x_fwd_kernel::compute_loop(
     }
 }
 
-void jit_avx512_core_u8s8s32x_fwd_kernel::generate()
+void jit_avx512_core_x8s8s32x_fwd_kernel::generate()
 {
     int inp_shift_pad = jcp.typesize_in * (jcp.ur_w * jcp.stride_w - jcp.l_pad)
         * jcp.ic_without_padding * jcp.ngroups;
+    int inp_shift_pad_second_block = -1 * jcp.typesize_in * jcp.l_pad
+        * jcp.ic_without_padding * jcp.ngroups;
     int inp_shift = jcp.typesize_in *
                         (jcp.ur_w * jcp.stride_w * jcp.ic_without_padding
                          * jcp.ngroups);
@@ -355,14 +438,13 @@ void jit_avx512_core_u8s8s32x_fwd_kernel::generate()
     preamble();
 
     xor_(reg_scratch, reg_scratch);
-    Reg16 _t = reg_scratch.cvt16();
-    mov(_t, 0x1);
-    vpbroadcastw(zmm_one, _t);
+    Reg16 _t16 = reg_scratch.cvt16();
+    mov(_t16, 0x1);
+    vpbroadcastw(zmm_one, _t16);
 
     mov(reg_inp, ptr[param1 + GET_OFF(src)]);
     mov(reg_out, ptr[param1 + GET_OFF(dst)]);
     mov(reg_ker, ptr[param1 + GET_OFF(filt)]);
-    mov(reg_kh, ptr[param1 + GET_OFF(kh_padding)]);
 
     if (jcp.ngroups % jcp.ch_block != 0 || jcp.oc_without_padding != jcp.oc) {
         int tail_size = jcp.is_depthwise
@@ -380,56 +462,173 @@ void jit_avx512_core_u8s8s32x_fwd_kernel::generate()
                     - (jcp.iw + jcp.l_pad - 1));
     int n_oi = jcp.ow / jcp.ur_w;
     int r_pad1 = (jcp.ur_w * n_oi - 1) * jcp.stride_w
-            + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1);
-    if (r_pad1 > 0 || jcp.ur_w_tail == 0)
-        n_oi--;
+        + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1);
 
-    xor_(reg_oi, reg_oi);
-    if (jcp.ow == jcp.ur_w) {
-        compute_loop(jcp.ur_w, jcp.l_pad, r_pad, true);
-    } else {
-        if (n_oi == 0) {
-            compute_loop(jcp.ur_w, jcp.l_pad, r_pad1, jcp.ur_w_tail == 0);
-            add(reg_inp, inp_shift_pad);
-            add(reg_out, out_shift);
-            if (jcp.ur_w_tail != 0) {
-                compute_loop(jcp.ur_w_tail, 0, r_pad, true);
-            }
+    if (jcp.nb_ow == 1) {
+        if (r_pad1 > 0 || jcp.ur_w_tail == 0)
+            n_oi--;
+
+        xor_(reg_oi, reg_oi);
+        if (jcp.ow == jcp.ur_w) {
+            icb_loop(jcp.ur_w, jcp.l_pad, r_pad, true);
         } else {
-            if (jcp.l_pad > 0) {
-                compute_loop(jcp.ur_w, jcp.l_pad, 0, false);
+            if (n_oi == 0) {
+                icb_loop(jcp.ur_w, jcp.l_pad, r_pad1, jcp.ur_w_tail == 0);
                 add(reg_inp, inp_shift_pad);
                 add(reg_out, out_shift);
-
-                inc(reg_oi);
-            }
-            if ((jcp.l_pad <= 0 && n_oi > 0) || (jcp.l_pad > 0 && n_oi > 1)) {
-                Label ow_loop_label;
-                L(ow_loop_label); {
-                    compute_loop(jcp.ur_w, 0, 0, false);
-                    add(reg_inp, inp_shift);
+                if (jcp.ur_w_tail != 0) {
+                    icb_loop(jcp.ur_w_tail, 0, r_pad, true);
+                }
+            } else {
+                if (jcp.l_pad > 0) {
+                    icb_loop(jcp.ur_w, jcp.l_pad, 0, false);
+                    add(reg_inp, inp_shift_pad);
                     add(reg_out, out_shift);
 
                     inc(reg_oi);
-                    cmp(reg_oi, n_oi);
-                    jl(ow_loop_label, T_NEAR);
                 }
-            }
-            if (r_pad1 > 0 || jcp.ur_w_tail == 0) {
-                compute_loop(jcp.ur_w, 0, r_pad1, jcp.ur_w_tail == 0);
-                add(reg_inp, inp_shift);
-                add(reg_out, out_shift);
-            }
-            if (jcp.ur_w_tail != 0) {
-                compute_loop(jcp.ur_w_tail, 0, r_pad, true);
+                if ((jcp.l_pad <= 0 && n_oi > 0) || (jcp.l_pad > 0 && n_oi > 1))
+                {
+                    Label ow_loop_label;
+                    L(ow_loop_label); {
+                        icb_loop(jcp.ur_w, 0, 0, false);
+                        add(reg_inp, inp_shift);
+                        add(reg_out, out_shift);
+
+                        inc(reg_oi);
+                        cmp(reg_oi, n_oi);
+                        jl(ow_loop_label, T_NEAR);
+                    }
+                }
+                if (r_pad1 > 0 || jcp.ur_w_tail == 0) {
+                    icb_loop(jcp.ur_w, 0, r_pad1, jcp.ur_w_tail == 0);
+                    add(reg_inp, inp_shift);
+                    add(reg_out, out_shift);
+                }
+                if (jcp.ur_w_tail != 0) {
+                    icb_loop(jcp.ur_w_tail, 0, r_pad, true);
+                }
             }
         }
-    }
+    } else {
+        // Only a single ow block is processed per kernel invocation.
+        // The block index is passed in the owb parameter,
+        // and padding handling depends on it.
+        Label end_label, last_oi_label, middle_ow_blocks_label, tail_label,
+            oi_loop_label, oi_loop_end_label;
+
+        assert(jcp.ow_block % jcp.ur_w == 0);
+        int n_oi_not_last_ow_block = jcp.ow_block / jcp.ur_w;
+        // to simplify the code (and general-purpose register usage),
+        // the ow block size must be >= 2 * ur_w
+        assert(n_oi_not_last_ow_block > 1);
+        int n_oi_next_last_ow_block = n_oi_not_last_ow_block;
+        int n_oi_first_ow_block = n_oi_not_last_ow_block;
+        int n_oi_last_ow_block
+            = (jcp.ow - jcp.ow_block * (jcp.nb_ow - 1)) / jcp.ur_w;
+        // prepare right padding
+        bool next_last_ow_block_padded = r_pad1 > 0 && n_oi_last_ow_block == 0;
+        bool first_ow_block_padded
+                = next_last_ow_block_padded && jcp.nb_ow == 2;
+        bool last_ow_block_padded
+                = (r_pad1 > 0 || jcp.ur_w_tail == 0) && n_oi_last_ow_block > 0;
+
+        if (last_ow_block_padded) n_oi_last_ow_block--;
+        else if (first_ow_block_padded) n_oi_first_ow_block--;
+        else if (next_last_ow_block_padded) n_oi_next_last_ow_block--;
+
+        mov(reg_owb, ptr[param1 + GET_OFF(owb)]);
+        cmp(reg_owb, 0); // is this the first ow-block?
+        jg(middle_ow_blocks_label, T_NEAR);
+
+        // the first ow block, compute left padding
+        mov(reg_oi, n_oi_first_ow_block);
+        if (jcp.l_pad > 0) {
+            icb_loop(jcp.ur_w, jcp.l_pad, 0, false);
+            add(reg_inp, inp_shift_pad);
+            add(reg_out, out_shift);
+
+            dec(reg_oi);
+        }
+        jmp(oi_loop_label, T_NEAR);
+
+        // middle or last ow block entry
+        L(middle_ow_blocks_label);
+
+        if (jcp.l_pad > 0) {
+            // account for the left padding here; nothing is computed
+            add(reg_inp, inp_shift_pad_second_block);
+        }
+
+        // set the number of iterations for the oi-loop
+        if (n_oi_last_ow_block != n_oi_not_last_ow_block) {
+            cmp(reg_owb, jcp.nb_ow - 1); // last ow-block?
+            mov(reg_oi, n_oi_last_ow_block);
+            je(oi_loop_label, T_NEAR);
+        }
+
+        if (n_oi_next_last_ow_block != n_oi_not_last_ow_block) {
+            cmp(reg_owb, jcp.nb_ow - 2); // next-to-last ow-block?
+
+            mov(reg_oi, n_oi_next_last_ow_block);
+            je(oi_loop_label, T_NEAR);
+        }
+        mov(reg_oi, n_oi_not_last_ow_block); // other middle ow-blocks
 
+        // oi loop w/o padding
+        L(oi_loop_label); {
+            cmp(reg_oi, 0);
+            jle(oi_loop_end_label, T_NEAR);
+
+            icb_loop(jcp.ur_w, 0, 0, false);
+
+            add(reg_inp, inp_shift);
+            add(reg_out, out_shift);
+            dec(reg_oi);
+
+            jmp(oi_loop_label, T_NEAR);
+        }
+        L(oi_loop_end_label);
+
+        mov(reg_owb, ptr[param1 + GET_OFF(owb)]);
+        cmp(reg_owb, 0); // first ow-block?
+        if (first_ow_block_padded)
+            je(last_oi_label, T_NEAR);
+        else
+            je(end_label, T_NEAR);
+
+        cmp(reg_owb, jcp.nb_ow - 2); // next-to-last ow-block?
+        jl(end_label, T_NEAR);
+        if (next_last_ow_block_padded)
+            je(last_oi_label, T_NEAR);
+        else
+            je(end_label, T_NEAR);
+
+        // this is the last ow-block
+        if (!last_ow_block_padded)
+            jmp(tail_label, T_NEAR);
+
+        // last oi block with right padding
+        L(last_oi_label);
+        icb_loop(jcp.ur_w, 0, r_pad1, jcp.ur_w_tail == 0);
+        add(reg_inp, inp_shift);
+        add(reg_out, out_shift);
+
+        mov(reg_owb, ptr[param1 + GET_OFF(owb)]);
+        cmp(reg_owb, jcp.nb_ow - 1); // last ow-block?
+        jl(end_label, T_NEAR);
+
+        // ur_w tail
+        L(tail_label);
+        if (jcp.ur_w_tail != 0) {
+            icb_loop(jcp.ur_w_tail, 0, r_pad, true);
+        }
+        L(end_label);
+    }
     postamble();
 }
 
-bool jit_avx512_core_u8s8s32x_fwd_kernel::post_ops_ok(
+bool jit_avx512_core_x8s8s32x_fwd_kernel::post_ops_ok(
         jit_conv_conf_t &jcp, const primitive_attr_t &attr)
 {
     using namespace primitive_kind;
@@ -445,11 +644,11 @@ bool jit_avx512_core_u8s8s32x_fwd_kernel::post_ops_ok(
     switch (p.len_) {
     case 0: return true;
     case 1: return true
-                && implication(jcp.with_eltwise, p.contain(sum, 0))
-                && implication(!jcp.with_eltwise, is_relu(0) || p.contain(sum, 0));
+                && IMPLICATION(jcp.with_eltwise, p.contain(sum, 0))
+                && IMPLICATION(!jcp.with_eltwise, is_relu(0) || p.contain(sum, 0));
     case 2: return true
-                && implication(jcp.with_eltwise, p.contain(sum, 0) && is_relu(1))
-                && implication(!jcp.with_eltwise, false
+                && IMPLICATION(jcp.with_eltwise, p.contain(sum, 0) && is_relu(1))
+                && IMPLICATION(!jcp.with_eltwise, false
                         || (p.contain(sum, 0) && is_relu(1))
                         || (p.contain(sum, 1) && is_relu(0)));
     case 3: return true
@@ -461,11 +660,11 @@ bool jit_avx512_core_u8s8s32x_fwd_kernel::post_ops_ok(
     return false;
 }
 
-status_t jit_avx512_core_u8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
+status_t jit_avx512_core_x8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
             const convolution_desc_t &cd, cpu_memory_t::pd_t &src_pd,
             cpu_memory_t::pd_t &weights_pd, cpu_memory_t::pd_t &dst_pd,
             cpu_memory_t::pd_t &bias_pd, const primitive_attr_t &attr,
-            bool with_relu, float relu_negative_slope)
+            int nthreads, bool with_relu, float relu_negative_slope)
 {
     using namespace prop_kind;
 
@@ -476,8 +675,8 @@ status_t jit_avx512_core_u8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
 
     const bool with_groups = weights_d.ndims() == src_d.ndims() + 1;
 
-    if (!(mayiuse(avx512_core) &&
-            src_d.data_type() == data_type::u8
+    if (!(mayiuse(avx512_core)
+         && one_of(src_d.data_type(), data_type::u8, data_type::s8)
          && weights_d.data_type() == data_type::s8
          && one_of(dst_d.data_type(), data_type::f32, data_type::s32,
             data_type::s8, data_type::u8)))
@@ -507,10 +706,18 @@ status_t jit_avx512_core_u8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
     jcp.eltwise_alpha = relu_negative_slope;
     jcp.ur_h = 1;
 
-    if (!implication(with_relu, relu_negative_slope == 0.))
+    jcp.dilate_h = cd.dilates[0];
+    jcp.dilate_w = cd.dilates[1];
+
+    if (!IMPLICATION(with_relu, relu_negative_slope == 0.))
         return status::unimplemented;
 
+    jcp.signed_input = src_d.data_type() == data_type::s8;
     jcp.is_depthwise = true && with_groups && everyone_is(1, jcp.ic, jcp.oc);
+
+    if (jcp.is_depthwise && jcp.signed_input)
+        return status::unimplemented;
+
     if (jcp.is_depthwise) {
         jcp.ch_block = 16;
         jcp.ic_block = 1;
@@ -529,9 +736,6 @@ status_t jit_avx512_core_u8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
             return status::unimplemented;
     }
 
-    jcp.dilate_h = cd.dilates[0];
-    jcp.dilate_w = cd.dilates[1];
-
     jcp.b_pad = (jcp.oh - 1) * jcp.stride_h + (jcp.kh - 1) * (jcp.dilate_h + 1)
             - (jcp.ih + jcp.t_pad - 1);
 
@@ -545,7 +749,9 @@ status_t jit_avx512_core_u8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
     const int regs = (jcp.ver == ver_vnni && !jcp.is_depthwise) ? 31 : 28;
 
     const auto w_format = with_groups
-        ? (jcp.is_depthwise ? Goihw16g : gOIhw4i16o4i) : OIhw4i16o4i;
+        ? (jcp.is_depthwise ? Goihw16g
+                : (jcp.signed_input) ? gOIhw4i16o4i_s8s8 : gOIhw4i16o4i)
+        : (jcp.signed_input) ? OIhw4i16o4i_s8s8 : OIhw4i16o4i;
     if (weights_d.format() == any)
         CHECK(weights_pd.set_format(w_format));
     if (weights_d.format() != w_format)
@@ -584,20 +790,47 @@ status_t jit_avx512_core_u8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
     // factor smaller than the left padding (special requirement for SSD:fc6),
     // then search for a smaller OC blocking that satisfies both constraints.
     jcp.nb_oc_blocking = nstl::min(4, jcp.nb_oc);
-    for (; jcp.nb_oc_blocking > 1; jcp.nb_oc_blocking--)
+    for (; jcp.nb_oc_blocking > 1; jcp.nb_oc_blocking--) {
+        int ur_w = regs / (jcp.nb_oc_blocking + 1);
         if (jcp.nb_oc % jcp.nb_oc_blocking == 0
-                && jcp.l_pad <= regs / (jcp.nb_oc_blocking + 1))
+                && (jcp.l_pad <= ur_w
+                         && IMPLICATION(jcp.ow != 1, jcp.ow % ur_w != 1)))
             break;
+    }
 
     jcp.ur_w = regs / (jcp.nb_oc_blocking + 1);
     if (jcp.ow < jcp.ur_w)
         jcp.ur_w = jcp.ow;
     jcp.ur_w_tail = jcp.ow % jcp.ur_w;
 
+    jcp.ow_block = jcp.ow;
+    int base_work_amount
+            = jcp.mb * jcp.nb_ch * jcp.oh * (jcp.nb_oc / jcp.nb_oc_blocking);
+    float best_thr_eff
+            = (float)base_work_amount / rnd_up(base_work_amount, nthreads);
+    int max_nb_ow = div_up(jcp.ow, 2 * jcp.ur_w);
+    for (int nb_ow = 1; nb_ow <= max_nb_ow; nb_ow++) {
+        int ow_block
+                = nstl::min(rnd_up(div_up(jcp.ow, nb_ow), jcp.ur_w), jcp.ow);
+        if (ow_block < jcp.nb_oc_blocking * jcp.oc_block && best_thr_eff > 0.8f)
+            break;
+        if (div_up(jcp.ow, ow_block) != nb_ow)
+            continue;
+        auto work_amount = base_work_amount * nb_ow;
+        float thr_eff = (float)work_amount / rnd_up(work_amount, nthreads);
+        if (ow_block >= 2 * jcp.ur_w && thr_eff > 1.1f * best_thr_eff) {
+            jcp.ow_block = ow_block;
+            best_thr_eff = thr_eff;
+        }
+        if (best_thr_eff > 0.9f)
+            break;
+    }
+    jcp.nb_ow = div_up(jcp.ow, jcp.ow_block);
+
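The search above trades block size against thread-level load balance: thr_eff is the fraction of scheduled thread slots doing useful work, and a split is only accepted when it beats the current best by 10% while keeping blocks at least 2 * ur_w wide. The same heuristic as a standalone sketch (jcp fields become plain parameters; the early 0.8f break tied to the oc blocking is omitted):

    #include <algorithm>

    static int div_up(int a, int b) { return (a + b - 1) / b; }
    static int rnd_up(int a, int b) { return div_up(a, b) * b; }

    int pick_ow_block(int ow, int ur_w, int base_work, int nthreads) {
        int ow_block = ow;
        float best = (float)base_work / rnd_up(base_work, nthreads);
        for (int nb_ow = 1; nb_ow <= div_up(ow, 2 * ur_w); ++nb_ow) {
            int blk = std::min(rnd_up(div_up(ow, nb_ow), ur_w), ow);
            if (div_up(ow, blk) != nb_ow) continue; // split not realizable with ur_w rounding
            int work = base_work * nb_ow;
            float eff = (float)work / rnd_up(work, nthreads);
            if (blk >= 2 * ur_w && eff > 1.1f * best) { ow_block = blk; best = eff; }
            if (best > 0.9f) break; // good enough
        }
        return ow_block;
    }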
     bool args_ok = true
         && jcp.oc % jcp.oc_block == 0
         && jcp.l_pad <= jcp.ur_w
-        && implication(!jcp.is_1stconv, jcp.ic % jcp.ic_block == 0);
+        && IMPLICATION(!jcp.is_1stconv, jcp.ic % jcp.ic_block == 0);
     if (!args_ok)
         return status::unimplemented;
 
@@ -614,7 +847,9 @@ status_t jit_avx512_core_u8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
     const auto &oscales = attr.output_scales_;
     jcp.is_oc_scale = oscales.mask_ == 1 << 1;
 
-    assert(utils::implication(!jcp.is_oc_scale, oscales.mask_ == 0));
+    assert(IMPLICATION(!jcp.is_oc_scale, oscales.mask_ == 0));
+
+    jcp.wei_adj_scale = (jcp.signed_input) ? (1.f / 2.f) : 1.f;
 
     return status::success;
 }
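wei_adj_scale = 1/2 works around a pre-VNNI overflow hazard: on the vpmaddubsw path two u8 * s8 products are summed into a signed 16-bit intermediate, and after the +128 input shift the worst case no longer fits. Halving the weights at reorder time keeps the intermediate in range; the lost factor of two is restored by scaling the bias (zmm_bias_alpha) and the output scales (the local_scales_ array introduced below) by 1/wei_adj_scale. A quick worst-case check:

    #include <cassert>
    #include <cstdint>

    int main() {
        // vpmaddubsw sums two u8*s8 products into one signed 16-bit lane.
        assert(255 * 127 + 255 * 127 > INT16_MAX); // 64770: overflows without adjustment
        assert(255 * 63 + 255 * 63 <= INT16_MAX);  // 32130: fits once weights are halved
        return 0;
    }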
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_JIT_AVX512_CORE_U8S8S32X_CONV_KERNEL_HPP
-#define CPU_JIT_AVX512_CORE_U8S8S32X_CONV_KERNEL_HPP
+#ifndef CPU_JIT_AVX512_CORE_X8S8S32X_CONV_KERNEL_HPP
+#define CPU_JIT_AVX512_CORE_X8S8S32X_CONV_KERNEL_HPP
 
 #include "c_types_map.hpp"
 #include "cpu_memory.hpp"
@@ -27,12 +27,12 @@ namespace mkldnn {
 namespace impl {
 namespace cpu {
 
-struct jit_avx512_core_u8s8s32x_fwd_kernel : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8s8s32x_conv_fwd_ker_t)
+struct jit_avx512_core_x8s8s32x_fwd_kernel : public jit_generator {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_x8s8s32x_conv_fwd_ker_t)
 
     enum { STATE_FIRST_DST_LOAD = 0x1U };
 
-    jit_avx512_core_u8s8s32x_fwd_kernel(jit_conv_conf_t ajcp,
+    jit_avx512_core_x8s8s32x_fwd_kernel(jit_conv_conf_t ajcp,
             const primitive_attr_t &attr) : jcp(ajcp), attr_(attr)
     {
         generate();
@@ -47,6 +47,7 @@ struct jit_avx512_core_u8s8s32x_fwd_kernel : public jit_generator {
             cpu_memory_t::pd_t &dst_pd,
             cpu_memory_t::pd_t &bias_pd,
             const primitive_attr_t &attr,
+            int nthreads,
             bool with_relu = false,
             float relu_negative_slope = 0.);
 
@@ -74,24 +75,29 @@ private:
     reg64_t aux_reg_inp = r11;
     reg64_t reg_ptr_sum_scale = r11;
     reg64_t aux_reg_ker = r12;
+    reg64_t reg_owb = r12;
+
     reg64_t reg_scratch = r14;
     reg64_t reg_kj = rax;
+    reg64_t reg_overflow = rax;
     reg64_t reg_ptr_scales = rax;
     reg64_t reg_oi = rbx;
     reg64_t reg_bias = rdx;
+    reg64_t reg_compensation = reg_scratch;
     reg64_t reg_kh = abi_not_param1;
     reg64_t param = abi_param1;
     reg64_t reg_tmp = rbp;
     reg64_t imm_addr64 = r15;
     reg64_t reg_oc_blocks = rsi;
     reg64_t reg_icb = reg_bias;
+    reg64_t reg_bias_alpha = reg_kh;
 
     Xbyak::Opmask ktail_mask = Xbyak::Opmask(2);
 
     zmm_t zmm_tmp = zmm_t(28);
     zmm_t zmm_one = zmm_t(29);
     zmm_t zmm_scales = zmm_t(30);
-    zmm_t zmm_bcast = zmm_t(30);
+    zmm_t zmm_shift = zmm_t(30);
     zmm_t zmm_zero = zmm_t(31);
     zmm_t zmm_wei = zmm_t(31);
 
@@ -110,6 +116,12 @@ private:
         assert(idx < 31);
         return zmm_t(idx);
     }
+    zmm_t zmm_bias_alpha() {
+        return zmm_t(jcp.nb_oc_blocking * jcp.ur_w);
+    }
+    xmm_t xmm_bias_alpha() {
+        return xmm_t(jcp.nb_oc_blocking * jcp.ur_w);
+    }
     int get_ow_start(int ki, int pad_l) {
         return nstl::max(0,
                 utils::div_up(pad_l - ki * (jcp.dilate_w + 1), jcp.stride_w));
@@ -123,8 +135,10 @@ private:
     bool maybe_relu(int position);
     void prepare_output(int ur_w);
     void store_output(int ur_w, int last_oc_block_flag);
-    void compute_ker(int ur_w, int pad_l, int pad_r, int last_ic_block_flag);
-    void compute_loop(
+    void compute_ker(int ur_w, int pad_l, int pad_r, int last_ic_block_flag,
+                                                        bool h_padded = false);
+    void kh_loop(int ur_w, int pad_l, int pad_r, int last_ic_block_flag);
+    void icb_loop(
             int ur_w, int pad_l, int pad_r, bool is_last_spatial_block);
     void generate();
     void cvt2ps(data_type_t type_in, zmm_t zmm_in, const Xbyak::Operand &op,
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_x8s8s32x_convolution.cpp
new file mode 100644 (file)
index 0000000..8d1297f
--- /dev/null
@@ -0,0 +1,203 @@
+/*******************************************************************************
+* Copyright 2016-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "mkldnn_types.h"
+#include "c_types_map.hpp"
+#include "mkldnn_thread.hpp"
+#include "type_helpers.hpp"
+#include "utils.hpp"
+
+#include "jit_avx512_core_x8s8s32x_convolution.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using namespace mkldnn::impl::status;
+using namespace mkldnn::impl::memory_format;
+using namespace mkldnn::impl::utils;
+
+using namespace nstl;
+
+using jit_conv_ker_t = void (*)(jit_conv_call_s *);
+
+#define wht_blk_off(d, g, ...) \
+        (conf_.with_groups() \
+         ? (d).blk_off((g), __VA_ARGS__) \
+         : (d).blk_off(__VA_ARGS__))
+
+template <bool with_relu, data_type_t src_type, data_type_t dst_type>
+void _jit_avx512_core_x8s8s32x_convolution_fwd_t<with_relu, src_type, dst_type>::
+execute_forward()
+{
+    auto src = reinterpret_cast<const src_data_t *>(this->input_memory(0));
+    auto weights = reinterpret_cast<const wei_data_t *>(this->input_memory(1));
+    auto bias = reinterpret_cast<const char *>(this->input_memory(2));
+    auto dst = reinterpret_cast<dst_data_t *>(this->memory());
+
+    const memory_desc_wrapper src_d(conf_.src_pd());
+    const memory_desc_wrapper dst_d(conf_.dst_pd());
+    const memory_desc_wrapper weights_d(conf_.weights_pd(0));
+    const memory_desc_wrapper bias_d(conf_.weights_pd(1));
+
+    const size_t bia_dt_size = conf_.with_bias()
+        ? types::data_type_size(conf_.cdesc()->bias_desc.data_type) : 0;
+
+    const auto &jcp = kernel_->jcp;
+    assert(jcp.nb_oc % jcp.nb_oc_blocking == 0);
+
+    size_t offset = (size_t)jcp.ngroups * jcp.oc * jcp.ic * jcp.kh * jcp.kw;
+    auto w = const_cast<wei_data_t *>(weights);
+    int32_t* compensation = (jcp.signed_input)
+                                ? reinterpret_cast<int32_t *>(&w[offset]) : 0;
+    const auto &oscales = conf_.attr()->output_scales_;
+    int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking;
+    int nb_groups = jcp.nb_ch;
+    int group_block = jcp.ch_block;
+    int work_amount = jcp.mb * nb_groups * oc_chunks * jcp.oh * jcp.nb_ow;
+
+    parallel(0, [&](const int ithr, const int nthr) {
+
+        int start{0}, end{0};
+        balance211(work_amount, nthr, ithr, start, end);
+
+        auto p = jit_conv_call_s();
+
+        size_t src_h_stride = src_d.blk_off(0, 0, 1);
+        size_t dst_h_stride = dst_d.blk_off(0, 0, 1);
+        size_t wht_h_stride = wht_blk_off(weights_d, 0, 0, 0, 1);
+
+        int n{ 0 }, gb{ 0 }, occ{ 0 }, oh_s{ 0 }, owb{ 0 };
+        if (jcp.loop_order == loop_cwgn)
+            nd_iterator_init(start, occ, oc_chunks, owb, jcp.nb_ow, gb,
+                    nb_groups, n, jcp.mb, oh_s, jcp.oh);
+        else if (jcp.loop_order == loop_gncw)
+            nd_iterator_init(start, gb, nb_groups, n, jcp.mb, occ, oc_chunks,
+                    owb, jcp.nb_ow, oh_s, jcp.oh);
+        else if (jcp.loop_order == loop_ngcw)
+            nd_iterator_init(start, n, jcp.mb, gb, nb_groups, occ, oc_chunks,
+                    owb, jcp.nb_ow, oh_s, jcp.oh);
+        else
+            assert(!"unsupported loop order");
+        while (start < end) {
+            int ocb = occ * jcp.nb_oc_blocking;
+            int g = gb * group_block;
+            int g_oc = (g * jcp.nb_oc + ocb) * jcp.oc_block;
+
+            int g_ic = g * jcp.nb_ic * jcp.ic_block;
+
+            int work_rem = end - start;
+            int ih_s = -jcp.t_pad + oh_s * jcp.stride_h;
+            int oh_e = oh_s + work_rem > jcp.oh ? jcp.oh : oh_s + work_rem;
+            int ow_s = owb * jcp.ow_block;
+            int iw_s = ow_s * jcp.stride_w;
+
+            auto bias_w = bias
+                ? bias + (bias_d.blk_off(g_oc) * bia_dt_size)
+                : 0;
+            int32_t *compensation_w = (jcp.signed_input)
+                                                    ? compensation + g_oc : 0;
+
+            auto dst_w = dst + dst_d.blk_off(n, g_oc, oh_s, ow_s);
+            auto src_w = src + src_d.blk_off(n, g_ic, ih_s, iw_s);
+            auto wht_w = weights + wht_blk_off(weights_d, gb, ocb, 0);
+
+            auto scales = (jcp.signed_input && jcp.ver != ver_vnni)
+                ? &local_scales_[jcp.is_oc_scale * g_oc]
+                : &oscales.scales_[jcp.is_oc_scale * g_oc];
+
+            for (int oj = oh_s, ij = ih_s; oj < oh_e;
+                ++oj, ij += jcp.stride_h) {
+                int dilate_h = jcp.dilate_h + 1;
+                int i_t_overflow = nstl::min(jcp.kh,
+                                                div_up(max(0, -ij), dilate_h));
+                int i_b_overflow = nstl::min(jcp.kh, div_up(
+                        max(0, ij - jcp.ih + (jcp.kh - 1) * dilate_h + 1),
+                        dilate_h));
+                int kh_padding = nstl::max(0,
+                    jcp.kh - i_t_overflow - i_b_overflow);
+
+                size_t wei_stride = (!jcp.signed_input)
+                                            ? i_t_overflow * wht_h_stride : 0;
+                p.src = src_w + i_t_overflow * dilate_h * src_h_stride;
+                p.dst = dst_w;
+                p.filt = wht_w + wei_stride;
+                p.bias = bias_w;
+                p.compensation = compensation_w;
+                p.oc_blocks = jcp.is_depthwise ? gb : ocb;
+                p.kh_padding = kh_padding;
+                p.scales = scales;
+                p.t_overflow = i_t_overflow;
+                p.b_overflow = i_b_overflow;
+                p.owb = owb;
+
+                kernel_->jit_ker(&p);
+
+                src_w += src_h_stride * jcp.stride_h;
+                dst_w += dst_h_stride;
+            }
+            if (jcp.loop_order == loop_cwgn)
+                nd_iterator_jump(start, end, occ, oc_chunks, owb, jcp.nb_ow, gb,
+                        nb_groups, n, jcp.mb, oh_s, jcp.oh);
+            else if (jcp.loop_order == loop_gncw)
+                nd_iterator_jump(start, end, gb, nb_groups, n, jcp.mb, occ,
+                        oc_chunks, owb, jcp.nb_ow, oh_s, jcp.oh);
+            else if (jcp.loop_order == loop_ngcw)
+                nd_iterator_jump(start, end, n, jcp.mb, gb, nb_groups, occ,
+                        oc_chunks, owb, jcp.nb_ow, oh_s, jcp.oh);
+            else
+                assert(!"unsupported loop order");
+        }
+    });
+}
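Per output row the driver counts how many kernel taps land in the top and bottom padding and hands them to the kernel as t_overflow/b_overflow next to the remaining kh_padding. The same arithmetic as a standalone sketch, with parameter names mirroring the jcp fields they stand in for:

    #include <algorithm>

    static int div_up(int a, int b) { return (a + b - 1) / b; }

    // ij = oj * stride_h - t_pad is the input row for output row oj.
    void kh_padding_sketch(int ij, int ih, int kh, int dilate_h /* jcp.dilate_h + 1 */,
                           int &t_overflow, int &b_overflow, int &kh_padding) {
        t_overflow = std::min(kh, div_up(std::max(0, -ij), dilate_h));
        b_overflow = std::min(kh,
                div_up(std::max(0, ij - ih + (kh - 1) * dilate_h + 1), dilate_h));
        kh_padding = std::max(0, kh - t_overflow - b_overflow);
    }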
+
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<false,
+                                                data_type::s8, data_type::u8>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<true,
+                                                data_type::s8, data_type::u8>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<false,
+                                                data_type::u8, data_type::u8>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<true,
+                                                data_type::u8, data_type::u8>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<false,
+                                                data_type::s8, data_type::s8>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<true,
+                                                data_type::s8, data_type::s8>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<false,
+                                                data_type::u8, data_type::s8>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<true,
+                                                data_type::u8, data_type::s8>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<false,
+                                                data_type::s8, data_type::s32>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<true,
+                                                data_type::s8, data_type::s32>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<false,
+                                                data_type::u8, data_type::s32>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<true,
+                                                data_type::u8, data_type::s32>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<false,
+                                                data_type::s8, data_type::f32>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<true,
+                                                data_type::s8, data_type::f32>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<false,
+                                                data_type::u8, data_type::f32>;
+template struct _jit_avx512_core_x8s8s32x_convolution_fwd_t<true,
+                                                data_type::u8, data_type::f32>;
+}
+}
+}
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
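The compensation pointer computed at the top of execute_forward assumes the *_s8s8 weight formats append a per-output-channel int32 array directly after the int8 weight data, so a single allocation carries both. A sketch of that pointer arithmetic (the blocked formats pad oc/ic to multiples of 16, which keeps the int32 view aligned):

    #include <cstddef>
    #include <cstdint>

    const int32_t *compensation_ptr(const int8_t *weights, int groups, int oc,
                                    int ic, int kh, int kw) {
        size_t wei_count = (size_t)groups * oc * ic * kh * kw;
        return reinterpret_cast<const int32_t *>(weights + wei_count);
    }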
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_JIT_AVX512_CORE_U8S8S32X_CONVOLUTION_HPP
-#define CPU_JIT_AVX512_CORE_U8S8S32X_CONVOLUTION_HPP
+#ifndef CPU_JIT_AVX512_CORE_X8S8S32X_CONVOLUTION_HPP
+#define CPU_JIT_AVX512_CORE_X8S8S32X_CONVOLUTION_HPP
 
 #include "c_types_map.hpp"
 #include "cpu_convolution_pd.hpp"
 #include "cpu_reducer.hpp"
 #include "cpu_barrier.hpp"
 
-#include "jit_avx512_core_u8s8s32x_conv_kernel.hpp"
+#include "jit_avx512_core_x8s8s32x_conv_kernel.hpp"
 
 namespace mkldnn {
 namespace impl {
 namespace cpu {
 
-template <bool with_relu, impl::data_type_t dst_type>
-struct _jit_avx512_core_u8s8s32x_convolution_fwd_t : public cpu_primitive_t {
+template <bool with_relu, impl::data_type_t src_type, impl::data_type_t dst_type>
+struct _jit_avx512_core_x8s8s32x_convolution_fwd_t : public cpu_primitive_t {
     struct pd_t : public _cpu_convolution_fwd_pd_t<with_relu> {
         pd_t(engine_t *engine, const typename pd_t::base_desc_t *adesc,
                 const primitive_attr_t *attr,
@@ -42,8 +42,8 @@ struct _jit_avx512_core_u8s8s32x_convolution_fwd_t : public cpu_primitive_t {
         {
         }
         DECLARE_COMMON_PD_T(
-                JIT_IMPL_NAME_HELPER("jit:", avx512_core, ""),
-                _jit_avx512_core_u8s8s32x_convolution_fwd_t<with_relu,
+                JIT_IMPL_NAME_HELPER("jit_int8:", avx512_core, ""),
+                _jit_avx512_core_x8s8s32x_convolution_fwd_t<with_relu, src_type,
                 dst_type>);
 
         virtual status_t init() override
@@ -55,36 +55,50 @@ struct _jit_avx512_core_u8s8s32x_convolution_fwd_t : public cpu_primitive_t {
                                forward_inference)
                     && this->cdesc_().alg_kind == alg_kind::convolution_direct
                     && !this->has_zero_dim_memory()
+                    && this->cdesc_().src_desc.data_type == src_type
                     && this->cdesc_().dst_desc.data_type == dst_type
-                    && utils::implication(this->with_bias(), utils::one_of(
+                    && IMPLICATION(this->with_bias(), utils::one_of(
                             this->cdesc_().bias_desc.data_type, data_type::f32,
                             data_type::s32, data_type::s8, data_type::u8))
                     && this->cdesc_().accum_data_type == data_type::s32;
             if (!ok)
                 return status::unimplemented;
 
-            return jit_avx512_core_u8s8s32x_fwd_kernel::init_conf(
+            return jit_avx512_core_x8s8s32x_fwd_kernel::init_conf(
                     jcp_, this->cdesc_(), this->src_pd_, this->weights_pd_,
                     this->dst_pd_,this->bias_pd_, *this->attr(),
+                    mkldnn_get_max_threads(),
                     with_relu, this->negative_slope());
         }
 
         jit_conv_conf_t jcp_;
     };
 
-    _jit_avx512_core_u8s8s32x_convolution_fwd_t(const pd_t *pd,
+    _jit_avx512_core_x8s8s32x_convolution_fwd_t(const pd_t *pd,
             const input_vector &inputs, const output_vector &outputs)
         : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd)
+        , local_scales_(nullptr)
     {
-        kernel_ = new jit_avx512_core_u8s8s32x_fwd_kernel(conf_.jcp_,
+        kernel_ = new jit_avx512_core_x8s8s32x_fwd_kernel(conf_.jcp_,
                     *conf_.attr());
+        if (conf_.jcp_.signed_input && conf_.jcp_.ver != ver_vnni) {
+            size_t scales_size = (conf_.attr()->output_scales_.count_ == 1)
+                ? 16
+                : conf_.attr()->output_scales_.count_;
+            local_scales_ = (float *)malloc(sizeof(float) * scales_size, 64);
+            for (size_t i = 0; i < scales_size; i++) {
+                local_scales_[i] = conf_.attr()->output_scales_.scales_[i] *
+                                        (1.f / conf_.jcp_.wei_adj_scale);
+            }
+        }
     }
 
-    ~_jit_avx512_core_u8s8s32x_convolution_fwd_t() {
+    ~_jit_avx512_core_x8s8s32x_convolution_fwd_t() {
         delete kernel_;
+        if (local_scales_) free(local_scales_);
     };
 
-    typedef typename prec_traits<data_type::u8>::type src_data_t;
+    typedef typename prec_traits<src_type>::type src_data_t;
     typedef typename prec_traits<data_type::s8>::type wei_data_t;
     typedef typename prec_traits<dst_type>::type dst_data_t;
 
@@ -97,16 +111,17 @@ struct _jit_avx512_core_u8s8s32x_convolution_fwd_t : public cpu_primitive_t {
 private:
     void execute_forward();
     pd_t conf_;
-    jit_avx512_core_u8s8s32x_fwd_kernel *kernel_;
+    jit_avx512_core_x8s8s32x_fwd_kernel *kernel_;
+    float *local_scales_;
 };
 
-template <impl::data_type_t dst_type>
-using jit_avx512_core_u8s8s32x_convolution_fwd_t =
-    _jit_avx512_core_u8s8s32x_convolution_fwd_t<false, dst_type>;
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_avx512_core_x8s8s32x_convolution_fwd_t =
+    _jit_avx512_core_x8s8s32x_convolution_fwd_t<false, src_type, dst_type>;
 
-template <impl::data_type_t dst_type>
-using jit_avx512_core_u8s8s32x_convolution_relu_t =
-    _jit_avx512_core_u8s8s32x_convolution_fwd_t<true, dst_type>;
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_avx512_core_x8s8s32x_convolution_relu_t =
+    _jit_avx512_core_x8s8s32x_convolution_fwd_t<true, src_type, dst_type>;
 
 }
 }
index 9f8f7c0..b72ed2d 100644 (file)
@@ -142,42 +142,11 @@ struct all_same<T, T, Us...> : all_same<T, Us...> { };
 template <typename T>
 struct all_same<T, T> : std::true_type {};
 
-template <size_t len = 64>
-class jit_tagged_label_base {
-public:
-    enum { maxlen = len };
-    template <size_t n, typename... Tags,
-             typename = std::enable_if<all_same<char, Tags...>::value>>
-    jit_tagged_label_base(const char (&base)[n], Tags... tags) {
-        // XXX: This code is ugly but useful
-        constexpr size_t ntags = sizeof...(tags);
-        static_assert(n + ntags < maxlen, "resulting label may be too long");
-        // paste tags first in case base has unexpected null chars
-        paste_tags(tags...);
-        for (size_t i = 0; i < n; i++)
-            label_name_[ntags + i] = base[i];
-        // don't assume that the base string is 0-terminated
-        label_name_[ntags + n] = '\0';
-    }
-    operator const char*() const { return label_name_; }
-    const char *c_str() const { return label_name_; }
-private:
-    char label_name_[maxlen];
-    void paste_tags() { }
-    template <typename... Tags>
-    void paste_tags(char tag, Tags... tags) {
-        label_name_[sizeof...(tags)] = tag;
-        paste_tags(tags...);
-    }
-};
-
 struct jit_code_injection {
     const Xbyak::uint8* code;
     size_t size;
 };
 
-typedef jit_tagged_label_base<> jit_tagged_label;
-
 class jit_generator : public Xbyak::CodeGenerator
 {
 private:
@@ -331,16 +300,8 @@ public:
         }
     }
 
-    // Provide overrides for custom jit_tagged_label and C strings rather than
-    // implement a conversion of jit_tagge_label to std::string to avoid
-    // additional C++ runtime dependency
-
-    template <size_t len>
-    void L(const jit_tagged_label_base<len> &label) {
-        Xbyak::CodeGenerator::L(label.c_str());
-    }
-
-    void L(const char *label) { Xbyak::CodeGenerator::L(label); }
+    // Disallow char-based labels completely
+    void L(const char *label) = delete;
     void L(const Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); }
 
     void uni_vpxor(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2,
@@ -407,7 +368,7 @@ public:
         shufps(x, x, 0x0);
     }
     void uni_vbroadcastss(const Xbyak::Ymm &x, const Xbyak::Operand &op) {
-        if (mayiuse(avx2)) {
+        if (op.isMEM() || mayiuse(avx2)) {
             vbroadcastss(x, op);
         } else {
             Xbyak::Xmm t(x.getIdx());
@@ -668,6 +629,108 @@ public:
         vmovmskps(x1, x2);
     }
 
+    void uni_vpmovsxbd(const Xbyak::Xmm &x, const Xbyak::Operand &op) {
+        pmovsxbd(x, op);
+    }
+    void uni_vpmovsxbd(const Xbyak::Ymm &x, const Xbyak::Operand &op) {
+        vpmovsxbd(x, op);
+    }
+
+    void uni_vpmovzxbd(const Xbyak::Xmm &x, const Xbyak::Operand &op) {
+        pmovzxbd(x, op);
+    }
+    void uni_vpmovzxbd(const Xbyak::Ymm &x, const Xbyak::Operand &op) {
+        vpmovzxbd(x, op);
+    }
+
+    void uni_vpackssdw(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        packssdw(x1, op);
+    }
+    void uni_vpackssdw(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op) {
+        vpackssdw(x1, x2, op);
+    }
+
+    void uni_vpackusdw(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        packusdw(x1, op);
+    }
+    void uni_vpackusdw(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op) {
+        vpackusdw(x1, x2, op);
+    }
+
+    void uni_vpacksswb(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        packsswb(x1, op);
+    }
+    void uni_vpacksswb(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op) {
+        vpacksswb(x1, x2, op);
+    }
+
+    void uni_vpackuswb(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        packuswb(x1, op);
+    }
+    void uni_vpackuswb(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op) {
+        vpackuswb(x1, x2, op);
+    }
+
+    void uni_vpmaxsd(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        pmaxsd(x1, op);
+    }
+    void uni_vpmaxsd(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op) {
+        vpmaxsd(x1, x2, op);
+    }
+
+    void uni_vpmaxsb(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        pmaxsb(x1, op);
+    }
+    void uni_vpmaxsb(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op) {
+        vpmaxsb(x1, x2, op);
+    }
+
+    void uni_vpmaxub(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        pmaxub(x1, op);
+    }
+    void uni_vpmaxub(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op) {
+        vpmaxub(x1, x2, op);
+    }
+
+    void uni_vpmaddubsw(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        pmaddubsw(x1, op);
+    }
+    void uni_vpmaddubsw(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op) {
+        vpmaddubsw(x1, x2, op);
+    }
+
+    void uni_vpmaddwd(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        pmaddwd(x1, op);
+    }
+    void uni_vpmaddwd(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op) {
+        vpmaddwd(x1, x2, op);
+    }
+
+    void uni_vpmulld(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        pmulld(x1, op);
+    }
+    void uni_vpmulld(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op) {
+        vpmulld(x1, x2, op);
+    }
+
+    void uni_vpsubb(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op) {
+        assert(x1.getIdx() == x2.getIdx());
+        psubb(x1, op);
+    }
+    void uni_vpsubb(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op) {
+        vpsubb(x1, x2, op);
+    }
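The new uni_vp* wrappers follow the established pattern in this header: one overload per register width, hiding the fact that legacy SSE encodings are destructive (two operands) while AVX is non-destructive (three operands), hence the assert that the destination aliases the first source in each Xmm overload. An illustrative fragment, assumed to sit inside a jit_generator-derived class:

    // The same generator source serves both ISAs by switching register type:
    uni_vpmaxsd(xmm0, xmm0, ptr[rax]); // SSE4.1: pmaxsd xmm0, [rax] (dst must equal src1)
    uni_vpmaxsd(ymm0, ymm1, ptr[rax]); // AVX2:   vpmaxsd ymm0, ymm1, [rax]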
+
     void mul_by_const(const Xbyak::Reg &out,
             const Xbyak::Reg64 &tmp, int value) {
         // Generates a shift + add sequence for multiplying the contents of the
index a67558c..47c9799 100644 (file)
@@ -26,7 +26,8 @@ namespace cpu {
 /* convolution */
 enum conv_version_t {ver_unused, ver_fma, ver_avx512_core, ver_4fma, ver_4vnni,
                      ver_vnni};
-enum conv_loop_order_t {loop_cgn, loop_gnc, loop_ngc};
+enum conv_loop_order_t {loop_cgn, loop_gnc, loop_ngc, loop_gncw, loop_cwgn,
+                            loop_ngcw};
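The three added loop orders name the nesting of the driver loops; reading them off the nd_iterator_init calls introduced elsewhere in this diff:

    // Outermost loop first; 'w' is the new ow-block dimension, oh is innermost:
    //   loop_cwgn : oc-chunks -> ow-blocks -> groups    -> minibatch -> oh
    //   loop_gncw : groups    -> minibatch -> oc-chunks -> ow-blocks -> oh
    //   loop_ngcw : minibatch -> groups    -> oc-chunks -> ow-blocks -> oh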
 enum conv_1x1_loop_order_t {loop_rbl, loop_rlb, loop_lbr, loop_lrb, loop_blr,
                             loop_brl};
 enum conv_kernel_kind_t {embd_bcast, expl_bcast};
@@ -37,6 +38,14 @@ enum {
     FLAG_IC_FIRST = 1 << 4, FLAG_IC_LAST = 1 << 5,
     FLAG_SP_FIRST = 1 << 6, FLAG_SP_LAST = 1 << 7,
     FLAG_REDUCE_FIRST = 1<<8, FLAG_REDUCE_LAST = 1<<9,
+    FLAG_ZERO_FILTER = 1 << 0, /* Controls whether the inner kernel skips
+                                   loading weights-data from memory; this
+                                   needs to happen on the first Group/16
+                                   iteration. */
+    FLAG_ZERO_BIAS = 1 << 1, /* Controls whether the inner kernel skips
+                               loading bias data from memory */
+    FLAG_COMPUTE_BIAS = 1 << 2, /* Controls bias computation during execution
+                                    pass */
 };
 
 struct jit_conv_conf_t {
@@ -47,6 +56,7 @@ struct jit_conv_conf_t {
     int ndims;
     int mb;
     int ngroups, ic, oc, oc_without_padding, ic_without_padding;
+    int oc_padded;
     int id, ih, iw, od, oh, ow;
     int f_pad, l_pad, t_pad;
     int back_pad, r_pad, b_pad;
@@ -83,9 +93,11 @@ struct jit_conv_conf_t {
 
     int nb_ic, ic_block;
     int nb_oc, oc_block;
+    int nb_ow, ow_block;
     int nb_ic_blocking, nb_oc_blocking; // blocking of nb_ic and nb_ic
     int nb_ic_blocking_max;
     int nb_ic_L2;
+    int h_blocking;
     int nb_oc_L2;
     int ur_h, ur_w;
     int ur_w_tail;
@@ -121,6 +133,12 @@ struct jit_conv_conf_t {
     int nb_ch, ch_block, nb_ch_blocking;
     bool is_depthwise;
     int aligned_threads;
+    // large spatial
+    int oh_blk_size;
+    int ow_blk_size;
+    // s8s8 convolution
+    bool signed_input;
+    float wei_adj_scale;
 };
 
 struct jit_conv_conf_2x3_wino_t {
@@ -260,17 +278,30 @@ struct jit_conv_call_s {
     const void *bias_prf;
     const void *scales;
     const void *acc_s32;
+    const void *compensation;
+    size_t kd_offset;
+    size_t kd_offset_prf;
+    size_t d_index;
+    size_t d_index_prf;
+    size_t d_worksize;
+    size_t d_worksize_prf;
     size_t kd_padding;
     size_t kd_padding_prf;
     size_t kh_padding;
     size_t kh_padding_prf;
+    size_t owb;
+    size_t owb_prf;
     size_t kw_padding;
     size_t channel;
     size_t channel_prf;
     size_t oc_blocks;
+    size_t oc_work;
     size_t ur_w;
     size_t ur_str_w;
     size_t ch_blocks;
+    size_t ch_work;
+    size_t t_overflow;
+    size_t b_overflow;
     int flags;
 
     const void *src_row0; /* hack, non-const for backward_data */
@@ -281,6 +312,36 @@ struct jit_conv_call_s {
     size_t oc_off_prf;
 };
 
+struct jit_deconv_call_s {
+    const void *src; /* hack, non-const for backward_data */
+    const void *dst; /* hack, non-const for forward */
+    const void *filt; /* hack, non-const for backward_weights */
+    const void *bias; /* hack, non-const for backward_bias */
+    const void *scales;
+    size_t kh_padding;
+    size_t oc_blocks;
+};
+
+struct jit_dw_conv_call_s {
+    const void *input;
+    const void *output;
+    const void *filter;
+    const void *bias;
+    union {
+        size_t table_flags; /* This allows both bytes to be read
+                               simultaneously */
+        struct {
+            unsigned char
+                    table_idx; /* Indicates the table entry for the
+                                        JIT-generated values that control the
+                                        inner loop execution. The entry is
+                                        determined by the oh_block execution. */
+            unsigned char
+                    exec_flag; /* Flags passed by driver execution to inner kernel */
+        };
+    };
+};
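The union lets the driver set both control bytes with one integer store while the kernel reads them individually. A minimal sketch, using the same anonymous-struct extension the header itself relies on (byte order as on little-endian x86):

    #include <cstdio>

    int main() {
        union {
            size_t table_flags;
            struct {
                unsigned char table_idx;
                unsigned char exec_flag;
            };
        } u;
        u.table_flags = 3 | (1u << 8); // one store: table_idx = 3, exec_flag = 1
        std::printf("%d %d\n", u.table_idx, u.exec_flag); // prints: 3 1
        return 0;
    }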
+
 struct jit_wino_transform_call_s {
     size_t tile_block;
     size_t tile_block_ur;
@@ -368,6 +429,18 @@ struct jit_1x1_conv_conf_t {
     int is_oc_scale;
     data_type_t bia_dt;
     data_type_t dst_dt;
+    bool signed_input;
+    float wei_adj_scale;
+
+    /* u8s8s32x */
+    int ic_dim, nb_ic, nb_ic_blocking, nb_ic_blocking_max;
+    int oc_dim, nb_oc, nb_oc_blocking, nb_oc_blocking_max;
+    int is_dim, os_block, nb_oh_blocking, nb_oh_blocking_max;
+    int ow_tail;
+
+    int ic_loop_unroll, ic_loop_src_step, ic_loop_wei_step;
+    int os_loop_dst_step, os_loop_src_step, os_loop_acc_step;
+    int os_loop_src_tail_step, os_loop_dst_tail_step, os_loop_acc_tail_step;
 };
 
 struct jit_gemm_conv_conf_t {
@@ -390,6 +463,8 @@ struct jit_gemm_conv_conf_t {
     int nthr;
     ptrdiff_t im2col_sz;
     bool need_wei_reduction;
+    bool signed_input;
+    float wei_adj_scale;
 };
 
 struct jit_1x1_conv_call_s {
@@ -399,6 +474,7 @@ struct jit_1x1_conv_call_s {
     const void *bias_data; // used in forward and backward_weights only
     const void *acc_s32;
     const void *scales;
+    const void *compensation;
 
     size_t load_dim;
     size_t bcast_dim;
@@ -413,6 +489,14 @@ struct jit_1x1_conv_call_s {
     const void *bias_dw;
 
     size_t oc_off;
+
+    /* u8s8s32x */
+    size_t oc_dim;
+    size_t os_dim;
+    size_t ic_dim;
+    size_t ic_pos_flag;
+    const void *is_data;
+    const void *oc_data;
 };
 
 /* pooling */
@@ -422,7 +506,7 @@ struct jit_pool_conf_t {
     int id, ih, iw, od, oh, ow;
     int stride_d, stride_h, stride_w;
     int kd, kh, kw;
-    int f_pad, t_pad, l_pad, b_pad, r_pad;
+    int f_pad, t_pad, l_pad, back_pad, b_pad, r_pad;
     alg_kind_t alg;
     bool is_training;
     bool pad_w_is_null;
index b3c7ba4..cbce262 100644 (file)
@@ -34,15 +34,14 @@ using namespace mkldnn::impl::utils;
 
 using namespace Xbyak;
 
-void jit_sse42_1x1_conv_kernel_f32::bcast_loop(int load_loop_blk,
-        char load_loop_tag)
+void jit_sse42_1x1_conv_kernel_f32::generate_bcast_loop(int load_loop_blk)
 {
     mov(aux1_reg_bcast_data, reg_bcast_data);
     mov(aux_reg_output_data, reg_output_data);
     mov(bcast_loop_iter, reg_bcast_loop_work);
 
-    jit_tagged_label bcast_loop("bcast_loop", load_loop_tag);
-    jit_tagged_label bcast_loop_tail("bcast_loop_tail", load_loop_tag);
+    Label bcast_loop;
+    Label bcast_loop_tail;
 
     cmp(bcast_loop_iter, jcp.ur);
     jl(bcast_loop_tail, T_NEAR);
@@ -52,7 +51,7 @@ void jit_sse42_1x1_conv_kernel_f32::bcast_loop(int load_loop_blk,
         int num_substeps = jcp.bcast_block / jcp.ur;
         assert(num_substeps > 0 && num_substeps < 10);
         for (int i = 0; i < num_substeps; i++) {
-            reduce_loop(load_loop_blk, jcp.ur, load_loop_tag, '0' + i);
+            generate_reduce_loop(load_loop_blk, jcp.ur);
             if (i < num_substeps - 1) {
                 add(aux1_reg_bcast_data, jcp.bcast_loop_bcast_substep);
                 add(aux_reg_output_data, jcp.bcast_loop_output_substep);
@@ -70,17 +69,16 @@ void jit_sse42_1x1_conv_kernel_f32::bcast_loop(int load_loop_blk,
 
     L(bcast_loop_tail);
     if (jcp.ur_tail) {
-        jit_tagged_label bcast_loop_tail_out(
-                "bcast_loop_tail_out", load_loop_tag);
+        Label bcast_loop_tail_out;
         cmp(bcast_loop_iter, 0);
         jz(bcast_loop_tail_out, T_NEAR);
-        reduce_loop(load_loop_blk, jcp.ur_tail, load_loop_tag, '1');
+        generate_reduce_loop(load_loop_blk, jcp.ur_tail);
         L(bcast_loop_tail_out);
     }
 }
 
-void jit_sse42_1x1_conv_kernel_f32::reduce_loop(int load_loop_blk, int ur,
-        char load_loop_tag, char bcast_loop_tag)
+void jit_sse42_1x1_conv_kernel_f32::generate_reduce_loop(
+        int load_loop_blk, int ur)
 {
     auto reg_load = [=](int i, int n) {
         return Xmm(2*ur * load_loop_blk + 2*i + n + 1);
@@ -149,8 +147,8 @@ void jit_sse42_1x1_conv_kernel_f32::reduce_loop(int load_loop_blk, int ur,
     };
 
     auto init = [=]() {
-        jit_tagged_label init_done("init_done", load_loop_tag, bcast_loop_tag);
-        jit_tagged_label init_zero("init_zero", load_loop_tag, bcast_loop_tag);
+        Label init_done;
+        Label init_zero;
 
         if (jcp.with_bias && one_of(jcp.prop_kind, forward_training,
                     forward_inference)) {
@@ -187,10 +185,8 @@ void jit_sse42_1x1_conv_kernel_f32::reduce_loop(int load_loop_blk, int ur,
     }; // init()
 
     auto store = [=]() {
-        jit_tagged_label store_done(
-                "store_done", load_loop_tag, bcast_loop_tag);
-        jit_tagged_label store_noadd(
-                "store_noadd", load_loop_tag, bcast_loop_tag);
+        Label store_done;
+        Label store_noadd;
 
         if (!jcp.with_sum) {
             test(reg_reduce_pos_flag, FLAG_REDUCE_FIRST);
@@ -207,15 +203,13 @@ void jit_sse42_1x1_conv_kernel_f32::reduce_loop(int load_loop_blk, int ur,
 
         L(store_noadd);
 
-        jit_tagged_label store_norelu(
-                "store_norelu", load_loop_tag, bcast_loop_tag);
+        Label store_norelu;
         test(reg_reduce_pos_flag, FLAG_REDUCE_LAST);
         jz(store_norelu, T_NEAR);
 
         int eltwise_inj_idx = 0;
         int depthwise_inj_idx = 0;
         const auto &p = attr_.post_ops_;
-
         if (p.len_ == 0 && eltwise_injectors.size() == 1) {
             eltwise_injectors[0]->compute_vector_range(1, 2 * ur * load_loop_blk + 1);
         }
@@ -287,9 +281,8 @@ void jit_sse42_1x1_conv_kernel_f32::reduce_loop(int load_loop_blk, int ur,
         } // for reduce_loop_unroll
     };
 
-    jit_tagged_label reduce_loop("reduce_loop", load_loop_tag, bcast_loop_tag);
-    jit_tagged_label reduce_loop_tail(
-            "reduce_loop_tail", load_loop_tag, bcast_loop_tag);
+    Label reduce_loop;
+    Label reduce_loop_tail;
 
     mov(aux_reg_load_data, reg_load_data);
     mov(aux_reg_bcast_data, aux1_reg_bcast_data);
@@ -314,16 +307,13 @@ void jit_sse42_1x1_conv_kernel_f32::reduce_loop(int load_loop_blk, int ur,
     store();
 } // reduce_loop()
 
-void jit_sse42_1x1_conv_kernel_f32::diff_bias_loop(int load_loop_blk,
-        char load_loop_tag)
+void jit_sse42_1x1_conv_kernel_f32::generate_diff_bias_loop(int load_loop_blk)
 {
     if (!jcp.with_bias || jcp.prop_kind != backward_weights)
         return;
 
-    jit_tagged_label diff_bias_loop("diff_bias_loop", load_loop_tag);
-    jit_tagged_label diff_bias_loop_out("diff_bias_loop_out", load_loop_tag);
-    jit_tagged_label diff_bias_init_out("diff_bias_init_out", load_loop_tag);
-    jit_tagged_label diff_bias_load("diff_bias_load", load_loop_tag);
+    Label diff_bias_loop, diff_bias_loop_out, diff_bias_init_out;
+    Label diff_bias_load;
 
     auto diff_bias_ptr = [=](int i, int n) {
         return ptr[reg_diff_bias_data + i * jcp.oc_block * sizeof(float)+ 4*n*sizeof(float)];
@@ -432,8 +422,8 @@ void jit_sse42_1x1_conv_kernel_f32::generate()
         mov(reg_output_stride, ptr[param1 + GET_OFF(output_stride)]);
     mov(reg_oc_off, ptr[param1 + GET_OFF(oc_off)]);
 
-    auto load_loop_body = [=] (int load_loop_blk, char bcast_loop_tag) {
-        bcast_loop(load_loop_blk, bcast_loop_tag);
+    auto generate_load_loop_body = [=] (int load_loop_blk) {
+        generate_bcast_loop(load_loop_blk);
         add(reg_load_data, load_loop_blk * jcp.load_loop_load_step);
         switch (jcp.prop_kind) {
         case forward_training:
@@ -461,10 +451,10 @@ void jit_sse42_1x1_conv_kernel_f32::generate()
         add(reg_oc_off, load_loop_blk * jcp.oc_block * sizeof(float));
     };
 
-    const char *load_loop_blk_8 = "load_loop_blk_8";
-    const char *load_loop_blk_16 = "load_loop_blk_16";
-    const char *load_loop_blk_24 = "load_loop_blk_24";
-    const char *load_loop_blk_end = "load_loop_blk_end";
+    Label load_loop_blk_8;
+    Label load_loop_blk_16;
+    Label load_loop_blk_24;
+    Label load_loop_blk_end;
 
     cmp(reg_load_loop_work, 8);
     jle(load_loop_blk_8, T_NEAR);
@@ -476,8 +466,8 @@ void jit_sse42_1x1_conv_kernel_f32::generate()
     jle(load_loop_blk_16, T_NEAR);
 
     L(load_loop_blk_24); {
-        diff_bias_loop(3, '3');
-        load_loop_body(3, '3');
+        generate_diff_bias_loop(3);
+        generate_load_loop_body(3);
         cmp(reg_load_loop_work, 32);
         je(load_loop_blk_16);
         cmp(reg_load_loop_work, 24);
@@ -488,8 +478,8 @@ void jit_sse42_1x1_conv_kernel_f32::generate()
     jle(load_loop_blk_8, T_NEAR);
 
     L(load_loop_blk_16); {
-        diff_bias_loop(2, '2');
-        load_loop_body(2, '2');
+        generate_diff_bias_loop(2);
+        generate_load_loop_body(2);
         cmp(reg_load_loop_work, 16);
         jge(load_loop_blk_16);
     }
@@ -497,8 +487,8 @@ void jit_sse42_1x1_conv_kernel_f32::generate()
     L(load_loop_blk_8); {
         cmp(reg_load_loop_work, 0);
         je(load_loop_blk_end, T_NEAR);
-        diff_bias_loop(1, '1');
-        load_loop_body(1, '1');
+        generate_diff_bias_loop(1);
+        generate_load_loop_body(1);
     }
 
     L(load_loop_blk_end);
@@ -558,6 +548,7 @@ status_t jit_sse42_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp,
     // TODO (Roma): this code is duplicated from the generic kernel; maybe the
     // configuration struct could do some stuff below
     const bool with_groups = weights_d.ndims() == src_d.ndims() + 1;
+    const int ndims = src_d.ndims();
 
     jcp.prop_kind = cd.prop_kind;
 
@@ -568,19 +559,19 @@ status_t jit_sse42_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp,
     jcp.oc_without_padding = jcp.oc;
     jcp.ic = src_d.dims()[1] / jcp.ngroups;
 
-    jcp.ih = src_d.dims()[2];
-    jcp.iw = src_d.dims()[3];
-    jcp.oh = dst_d.dims()[2];
-    jcp.ow = dst_d.dims()[3];
+    jcp.ih = (ndims == 3) ? 1 : src_d.dims()[2];
+    jcp.iw = src_d.dims()[ndims - 1];
+    jcp.oh = (ndims == 3) ? 1 : dst_d.dims()[2];
+    jcp.ow = dst_d.dims()[ndims - 1];
 
-    jcp.kh = weights_d.dims()[with_groups + 2];
-    jcp.kw = weights_d.dims()[with_groups + 3];
+    jcp.kh = (ndims == 3) ? 1 : weights_d.dims()[with_groups + 2];
+    jcp.kw = weights_d.dims()[with_groups + ndims - 1];
 
-    jcp.t_pad = cd.padding[0][0];
-    jcp.l_pad = cd.padding[0][1];
+    jcp.t_pad = (ndims == 3) ? 0 : cd.padding[0][0];
+    jcp.l_pad = cd.padding[0][ndims - 3];
 
-    jcp.stride_h = cd.strides[0];
-    jcp.stride_w = cd.strides[1];
+    jcp.stride_h = (ndims == 3) ? 1 : cd.strides[0];
+    jcp.stride_w = cd.strides[ndims - 3];
 
     jcp.src_fmt = src_d.format();
     jcp.with_bias = cd.bias_desc.format != memory_format::undef;
@@ -630,19 +621,19 @@ status_t jit_sse42_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp,
     jcp.os = jcp.oh * jcp.ow;
     jcp.is = jcp.ih * jcp.iw;
 
-    constexpr memory_format_t weights_formats[2][2] = {
-        { OIhw8i8o, OIhw8o8i },
-        { gOIhw8i8o, gOIhw8o8i }
-    };
-    memory_format_t weights_format
-        = weights_formats[with_groups][jcp.prop_kind == backward_data];
+    const int is_bwd_d = jcp.prop_kind == backward_data;
+    memory_format_t weights_format = with_groups
+        ? utils::pick(2 * ndims - 6 + is_bwd_d, gOIw8i8o, gOIw8o8i, gOIhw8i8o,
+            gOIhw8o8i)
+        : utils::pick(2 * ndims - 6 + is_bwd_d, OIw8i8o, OIw8o8i, OIhw8i8o,
+            OIhw8o8i);
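/* Editor's note: utils::pick(i, ...) returns the i-th (0-based) of the
 * following arguments, so e.g. ndims == 4 with is_bwd_d == 1 gives index
 * 2*4 - 6 + 1 = 3 and selects gOIhw8o8i (grouped) or OIhw8o8i. */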
 
     bool args_ok = true
         && jcp.ngroups == 1
-        && src_d.format() == nChw8c
+        && one_of(src_d.format(), nCw8c, nChw8c)
         && weights_d.format() == weights_format
         && one_of(cd.bias_desc.format, memory_format::undef, any, x)
-        && dst_d.format() == nChw8c;
+        && one_of(dst_d.format(), nCw8c, nChw8c);
     if (!args_ok) return status::unimplemented;
 
     const int simd_w = 4;
index 95515e3..f2b7edd 100644 (file)
@@ -20,6 +20,7 @@
 #include "c_types_map.hpp"
 #include "jit_generator.hpp"
 #include "jit_primitive_conf.hpp"
+#include "cpu_memory.hpp"
 #include "jit_uni_eltwise.hpp"
 #include "jit_uni_depthwise.hpp"
 
@@ -106,10 +107,9 @@ private:
     nstl::vector<jit_uni_eltwise_injector_f32<sse42>*> eltwise_injectors;
     nstl::vector<jit_uni_depthwise_injector_f32<sse42>*> depthwise_injectors;
 
-    void bcast_loop(int load_loop_blk, char load_loop_tag);
-    void reduce_loop(int load_loop_blk, int ur, char load_loop_tag,
-            char bcast_loop_tag);
-    void diff_bias_loop(int load_loop_blk, char load_loop_tag);
+    void generate_bcast_loop(int load_loop_blk);
+    void generate_reduce_loop(int load_loop_blk, int ur);
+    void generate_diff_bias_loop(int load_loop_blk);
 
     void generate();
 };
index 41f7794..3b95a10 100644 (file)
@@ -27,6 +27,11 @@ namespace mkldnn {
 namespace impl {
 namespace cpu {
 
+#define data_blk_off(f, n, c, h, w) \
+    ((ndims == 3) \
+    ? (f).blk_off(n, c, w) \
+    : (f).blk_off(n, c, h, w))
+
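/* Editor's note: a self-contained illustration of the data_blk_off dispatch
 * above, with a stub standing in for memory_desc_wrapper. For 3D (NCW)
 * tensors the `h` coordinate is dropped; for 4D (NCHW) it is forwarded. */
#include <cstdio>

struct desc_stub {
    // Stand-ins for the memory_desc_wrapper::blk_off overloads.
    size_t blk_off(int n, int c, int w) const
    { return ((size_t)n * 16 + c) * 8 + w; }
    size_t blk_off(int n, int c, int h, int w) const
    { return (((size_t)n * 16 + c) * 4 + h) * 8 + w; }
};

#define data_blk_off(f, n, c, h, w) \
    ((ndims == 3) ? (f).blk_off(n, c, w) : (f).blk_off(n, c, h, w))

int main() {
    desc_stub d;
    int ndims = 3;                                // 1D case: h is ignored
    printf("%zu\n", data_blk_off(d, 0, 1, 2, 3)); // 3-argument blk_off
    ndims = 4;                                    // 2D case
    printf("%zu\n", data_blk_off(d, 0, 1, 2, 3)); // 4-argument blk_off
    return 0;
}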
 using namespace mkldnn::impl::status;
 using namespace mkldnn::impl::memory_format;
 using namespace mkldnn::impl::utils;
@@ -42,6 +47,7 @@ void _jit_sse42_1x1_convolution_fwd_t<with_relu>::execute_forward() {
     const memory_desc_wrapper dst_d(conf_.dst_pd());
     const memory_desc_wrapper weights_d(conf_.weights_pd(0));
 
+    const int ndims = src_d.ndims();
     const auto &jcp = kernel_->jcp;
     int MB = conf_.MB();
 
@@ -97,7 +103,7 @@ void _jit_sse42_1x1_convolution_fwd_t<with_relu>::execute_forward() {
                 par_conv.load_dim = this_block_size(ocb * jcp.oc_block, jcp.oc,
                         load_step * jcp.oc_block);
 
-                const size_t dst_off = dst_d.blk_off(n, _ocb, oh, ow);
+                const size_t dst_off = data_blk_off(dst_d, n, _ocb, oh, ow);
                 par_conv.output_data = &dst[dst_off];
 
                 par_conv.bias_data = &bias[_ocb * jcp.oc_block];
@@ -111,7 +117,7 @@ void _jit_sse42_1x1_convolution_fwd_t<with_relu>::execute_forward() {
                             jcp.ic, nb_ic_blocking * jcp.ic_block);
 
                     const size_t _icb = g * nb_ic + icb;
-                    const size_t src_off = src_d.blk_off(n, _icb, ih, iw);
+                    const size_t src_off = data_blk_off(src_d, n, _icb, ih, iw);
                     par_conv.bcast_data = &src[src_off];
 
                     par_conv.load_data = &weights[conf_.with_groups()
index 6ec7f61..a98619d 100644 (file)
@@ -60,7 +60,7 @@ struct _jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t {
                         this->cdesc_().src_desc.data_type,
                         this->cdesc_().weights_desc.data_type,
                         this->cdesc_().dst_desc.data_type)
-                && utils::implication(this->with_bias(),
+                && IMPLICATION(this->with_bias(),
                         data_type::f32 == this->cdesc_().bias_desc.data_type);
             if (!ok) return status::unimplemented;
 
@@ -94,12 +94,15 @@ struct _jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t {
         virtual status_t set_default_params() override {
             using namespace memory_format;
             if (this->src_pd_.desc()->format == any)
-                CHECK(this->src_pd_.set_format(nChw8c));
+                CHECK(this->src_pd_.set_format(utils::pick(this->ndims() - 3,
+                    nCw8c, nChw8c)));
             if (this->dst_pd_.desc()->format == any)
-                CHECK(this->dst_pd_.set_format(nChw8c));
+                CHECK(this->dst_pd_.set_format(utils::pick(this->ndims() - 3,
+                    nCw8c, nChw8c)));
             if (this->weights_pd_.desc()->format == any)
                 CHECK(this->weights_pd_.set_format(this->with_groups()
-                            ? gOIhw8i8o : OIhw8i8o));
+                    ? utils::pick(this->ndims() - 3, gOIw8i8o, gOIhw8i8o)
+                    : utils::pick(this->ndims() - 3, OIw8i8o, OIhw8i8o)));
             if (this->bias_pd_.desc()->format == any)
                 CHECK(this->bias_pd_.set_format(x));
             return status::success;
index 9307cbe..32f1903 100644 (file)
@@ -53,7 +53,7 @@ void jit_sse42_conv_fwd_kernel_f32::oh_step_unroll_kw(int ur_w,
         for (int ifm2 = 0; ifm2 < ic_blk; ifm2++) {
             for (int jj = jj_start; jj < jj_end; jj++) {
                 int inp_off;
-                if (jcp.src_fmt == nchw)
+                if (one_of(jcp.src_fmt, ncw, nchw))
                     inp_off = ifm2*ih*iw + (ki*dilate_w + jj*stride_w - pad_l);
                 else
                     inp_off = (ki*dilate_w + jj*stride_w - pad_l)*ic_blk + ifm2;
@@ -81,10 +81,9 @@ void jit_sse42_conv_fwd_kernel_f32::oh_step_unroll_kw(int ur_w,
 }
 
 void jit_sse42_conv_fwd_kernel_f32::oh_step_nopad(int ur_w,
-        int pad_l, int pad_r, char pad_tag,
-        int oc_blocks, char oc_blocks_tag)
+        int pad_l, int pad_r, int oc_blocks)
 {
-    jit_tagged_label kw_label("kw", pad_tag, oc_blocks_tag);
+    Label kw_loop;
 
     int iw = jcp.iw;
     int ih = jcp.ih;
@@ -97,14 +96,14 @@ void jit_sse42_conv_fwd_kernel_f32::oh_step_nopad(int ur_w,
     int oc_blk = jcp.oc_block;
 
     xor_(ki_iter, ki_iter);
-    L(kw_label);
+    L(kw_loop);
     {
         int jj_start = 0;
         int jj_end = ur_w;
         for (int ifm2 = 0; ifm2 < ic_blk; ifm2++) {
             for (int jj = jj_start; jj < jj_end; jj++) {
                 int inp_off;
-                if (jcp.src_fmt == nchw)
+                if (one_of(jcp.src_fmt, ncw, nchw))
                     inp_off = ifm2 * ih * iw + (jj * stride_w - pad_l);
                 else
                     inp_off = (jj * stride_w - pad_l) * ic_blk + ifm2;
@@ -126,18 +125,17 @@ void jit_sse42_conv_fwd_kernel_f32::oh_step_nopad(int ur_w,
             }
         }
         add(aux_reg_kernel, sizeof(float) * oc_blk * ic_blk);
-        add(aux_reg_input, sizeof(float) * (jcp.src_fmt == nchw ?
+        add(aux_reg_input, sizeof(float) * (one_of(jcp.src_fmt, ncw, nchw) ?
             dilate_w : ic_blk * dilate_w));
 
         inc(ki_iter);
         cmp(ki_iter, kw);
-        jl(kw_label, T_NEAR);
+        jl(kw_loop, T_NEAR);
     }
 }
 
 void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w,
-        int pad_l, int pad_r, char pad_tag,
-        int oc_blocks, char oc_blocks_tag)
+        int pad_l, int pad_r, int oc_blocks)
 {
     int iw = jcp.iw;
     int kw = jcp.kw;
@@ -147,23 +145,25 @@ void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w,
     int dilate_w = jcp.dilate_w + 1;
     int ic_blk = jcp.ic_block;
     int oc_blk = jcp.oc_block;
-    const int inp_mult = jcp.src_fmt == nchw ? dilate_h : ic_blk * dilate_h;
-    const int inp_off = jcp.src_fmt == nchw ? dilate_w : ic_blk * dilate_w;
+    const int inp_mult = one_of(jcp.src_fmt, ncw, nchw)
+        ? dilate_h : ic_blk * dilate_h;
+    const int inp_off = one_of(jcp.src_fmt, ncw, nchw)
+        ? dilate_w : ic_blk * dilate_w;
 
     xor_(simd_iter, simd_iter);
 
     mov(aux_reg_input, reg_input);
     mov(aux_reg_kernel, reg_kernel);
 
-    jit_tagged_label init_simd_iter_label("simd_iter", pad_tag, oc_blocks_tag);
-    jit_tagged_label init_done_label("init", pad_tag, oc_blocks_tag);
-    jit_tagged_label init_first_label("first", pad_tag, oc_blocks_tag);
+    Label init_simd_iter_loop;
+    Label init_done;
+    Label init_first;
 
-    L(init_simd_iter_label);
+    L(init_simd_iter_loop);
 
     if (!jcp.with_sum) {
         test(reg_ci_flag, FLAG_IC_FIRST);
-        jne(init_first_label, T_NEAR);
+        jne(init_first, T_NEAR);
     }
 
     for (int ii = 0; ii < oc_blocks; ii++)
@@ -180,7 +180,7 @@ void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w,
 
     if (jcp.with_sum && jcp.with_bias) {
         test(reg_ci_flag, FLAG_IC_FIRST);
-        je(init_done_label, T_NEAR);
+        je(init_done, T_NEAR);
 
         for (int ii = 0; ii < oc_blocks; ii++)
             for (int jj = 0; jj < ur_w; jj++)
@@ -188,9 +188,9 @@ void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w,
                     xword[reg_bias + sizeof(float) * ii * oc_blk]);
     }
 
-    jmp(init_done_label);
+    jmp(init_done);
 
-    L(init_first_label);
+    L(init_first);
     if (this->jcp.with_bias) {
         for (int ii = 0; ii < oc_blocks; ii++)
             for (int jj = 0; jj < ur_w; jj++)
@@ -202,7 +202,7 @@ void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w,
                 pxor(Xmm(ur_w * ii + jj + 1), Xmm(ur_w * ii + jj + 1));
     }
 
-    L(init_done_label);
+    L(init_done);
 
     Label skip_kh_loop;
     mov(kj, reg_kh);
@@ -210,12 +210,11 @@ void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w,
         cmp(kj, 0);
         je(skip_kh_loop, T_NEAR);
     }
-    jit_tagged_label kh_label("kh", pad_tag, oc_blocks_tag);
-    L(kh_label);
+    Label kh_loop;
+    L(kh_loop);
     {
         if (jcp.kw >= 5 && pad_l == 0 && pad_r == 0) {
-            oh_step_nopad(ur_w, pad_l, pad_r, pad_tag, oc_blocks,
-                          oc_blocks_tag);
+            oh_step_nopad(ur_w, pad_l, pad_r, oc_blocks);
             sub(aux_reg_input, sizeof(float) * kw * inp_off);
             add(aux_reg_input, sizeof(float) * iw * inp_mult);
         } else {
@@ -226,16 +225,16 @@ void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w,
 
         dec(kj);
         cmp(kj, 0);
-        jg(kh_label, T_NEAR);
+        jg(kh_loop, T_NEAR);
     }
 
     L(skip_kh_loop);
 
-    jit_tagged_label done_label("done", pad_tag, oc_blocks_tag);
-    jit_tagged_label regular_store_label("store", pad_tag, oc_blocks_tag);
+    Label done;
+    Label regular_store;
 
     test(reg_ci_flag, FLAG_IC_LAST);
-    je(regular_store_label, T_NEAR);
+    je(regular_store, T_NEAR);
 
     int eltwise_inj_idx = 0;
     int depthwise_inj_idx = 0;
@@ -270,7 +269,7 @@ void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w,
         }
     }
 
-    L(regular_store_label);
+    L(regular_store);
 
     for (int ii = 0; ii < oc_blocks; ii++) {
         for (int jj = 0; jj < ur_w; jj++) {
@@ -285,7 +284,7 @@ void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w,
         }
     }
 
-    L(done_label);
+    L(done);
 
     mov(aux_reg_kernel, reg_kernel);
     mov(aux_reg_input, reg_input);
@@ -296,15 +295,14 @@ void jit_sse42_conv_fwd_kernel_f32::width_blk_step(int ur_w,
 
     inc(simd_iter);
     cmp(simd_iter, 2);
-    jl(init_simd_iter_label, T_NEAR);
+    jl(init_simd_iter_loop, T_NEAR);
 
     sub(reg_output, sizeof(float) * 8);
     sub(reg_bias,   sizeof(float) * 8);
     sub(reg_oc_off, sizeof(float) * 8);
 }
 
-inline void jit_sse42_conv_fwd_kernel_f32::solve_common(
-        int oc_blocks, char oc_blocks_tag)
+inline void jit_sse42_conv_fwd_kernel_f32::solve_common(int oc_blocks)
 {
     int ur_w = jcp.ur_w;
     int ur_w_tail = jcp.ur_w_tail;
@@ -315,7 +313,7 @@ inline void jit_sse42_conv_fwd_kernel_f32::solve_common(
     int oc_blk = jcp.oc_block;
     int dilate_w = jcp.dilate_w + 1;
     int str_w = jcp.stride_w;
-    const int inp_mult = jcp.src_fmt == nchw ? 1 : ic_blk;
+    const int inp_mult = one_of(jcp.src_fmt, ncw, nchw) ? 1 : ic_blk;
 
     int l_pad = jcp.l_pad;
     int r_pad = nstl::max(0, (int(jcp.ow) - 1) * str_w + (kw - 1) * dilate_w
@@ -327,41 +325,36 @@ inline void jit_sse42_conv_fwd_kernel_f32::solve_common(
     if (l_pad > 0) {
         n_oi--;
         if (n_oi < 0 && r_pad1 > 0)
-            width_blk_step(ur_w, l_pad, r_pad1,
-                           'l', oc_blocks, oc_blocks_tag); // "lrpad"
+            width_blk_step(ur_w, l_pad, r_pad1, oc_blocks); // "lrpad"
         else
-            width_blk_step(ur_w, l_pad, 0,
-                           'l', oc_blocks, oc_blocks_tag); // "lpad"
+            width_blk_step(ur_w, l_pad, 0, oc_blocks); // "lpad"
         add(reg_input, sizeof(float) * (ur_w * str_w - l_pad) * inp_mult);
         add(reg_output, sizeof(float) * ur_w * oc_blk);
     }
 
-    jit_tagged_label ow_loop_label("ow", oc_blocks_tag);
+    Label ow_loop;
     xor_(oi_iter, oi_iter);
 
     if (n_oi > 0) {
-        L(ow_loop_label);
+        L(ow_loop);
 
-        width_blk_step(ur_w, 0, 0,
-                       'm', oc_blocks, oc_blocks_tag); // "middle"
+        width_blk_step(ur_w, 0, 0, oc_blocks); // "middle"
         add(reg_input, sizeof(float) * ur_w * str_w * inp_mult);
         add(reg_output, sizeof(float) * ur_w * oc_blk);
 
         inc(oi_iter);
         cmp(oi_iter, n_oi);
-        jl(ow_loop_label, T_NEAR);
+        jl(ow_loop, T_NEAR);
     }
 
     if (r_pad1 > 0 && n_oi >=0) {
-        width_blk_step(ur_w, 0, r_pad1,
-                       'r', oc_blocks, oc_blocks_tag); // "rpad"
+        width_blk_step(ur_w, 0, r_pad1, oc_blocks); // "rpad"
         add(reg_input, sizeof(float) * ur_w * str_w * inp_mult);
         add(reg_output, sizeof(float) * ur_w * oc_blk);
     }
 
     if (ur_w_tail != 0)
-        width_blk_step(ur_w_tail, 0, r_pad,
-                       't', oc_blocks, oc_blocks_tag); // "tail"
+        width_blk_step(ur_w_tail, 0, r_pad, oc_blocks); // "tail"
 }
 
 void jit_sse42_conv_fwd_kernel_f32::generate()
@@ -404,23 +397,22 @@ void jit_sse42_conv_fwd_kernel_f32::generate()
     mov(reg_oc_off, ptr[param1 + GET_OFF(oc_off)]);
 
     int nb_oc_tail = jcp.nb_oc % jcp.nb_oc_blocking;
-    const char *tail_label = ".tail";
-    const char *exit_label = ".exit";
+    Label tail, exit;
 
     cmp(reg_oc_blocks, jcp.nb_oc_blocking);
-    jne(nb_oc_tail ? tail_label : exit_label, T_NEAR);
+    jne(nb_oc_tail ? tail : exit, T_NEAR);
 
-    solve_common(jcp.nb_oc_blocking, '0' + jcp.nb_oc_blocking);
-    jmp(exit_label, T_NEAR);
+    solve_common(jcp.nb_oc_blocking);
+    jmp(exit, T_NEAR);
 
     if (nb_oc_tail) {
-        L(tail_label);
+        L(tail);
         cmp(reg_oc_blocks, nb_oc_tail);
-        jne(exit_label, T_NEAR);
-        solve_common(nb_oc_tail, '0' + nb_oc_tail);
+        jne(exit, T_NEAR);
+        solve_common(nb_oc_tail);
     }
 
-    L(exit_label);
+    L(exit);
 
     this->postamble();
 
@@ -473,6 +465,8 @@ status_t jit_sse42_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp,
     jcp.prop_kind = cd.prop_kind;
 
     const bool with_groups = weights_d.ndims() == src_d.ndims() + 1;
+    const int ndims = src_d.ndims();
+    jcp.ndims = ndims;
 
     jcp.ngroups = with_groups ? weights_d.dims()[0] : 1;
     jcp.mb = src_d.dims()[0];
@@ -481,22 +475,22 @@ status_t jit_sse42_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp,
     jcp.oc_without_padding = jcp.oc;
     jcp.ic = src_d.dims()[1] / jcp.ngroups;
 
-    jcp.ih = src_d.dims()[2];
-    jcp.iw = src_d.dims()[3];
-    jcp.oh = dst_d.dims()[2];
-    jcp.ow = dst_d.dims()[3];
+    jcp.ih = (ndims == 3) ? 1 : src_d.dims()[2];
+    jcp.iw = src_d.dims()[ndims - 1];
+    jcp.oh = (ndims == 3) ? 1 : dst_d.dims()[2];
+    jcp.ow = dst_d.dims()[ndims - 1];
 
-    jcp.kh = weights_d.dims()[with_groups + 2];
-    jcp.kw = weights_d.dims()[with_groups + 3];
+    jcp.kh = (ndims == 3) ? 1 : weights_d.dims()[with_groups + 2];
+    jcp.kw = weights_d.dims()[with_groups + ndims - 1];
 
-    jcp.t_pad = cd.padding[0][0];
-    jcp.l_pad = cd.padding[0][1];
+    jcp.t_pad = (ndims == 3) ? 0 : cd.padding[0][0];
+    jcp.l_pad = cd.padding[0][ndims - 3];
 
-    jcp.stride_h = cd.strides[0];
-    jcp.stride_w = cd.strides[1];
+    jcp.stride_h = (ndims == 3) ? 1 : cd.strides[0];
+    jcp.stride_w = cd.strides[ndims - 3];
 
-    jcp.dilate_h = cd.dilates[0];
-    jcp.dilate_w = cd.dilates[1];
+    jcp.dilate_h = (ndims == 3) ? 0 : cd.dilates[0];
+    jcp.dilate_w = cd.dilates[ndims - 3];
     jcp.b_pad = (jcp.oh - 1) * jcp.stride_h + (jcp.kh - 1) * (jcp.dilate_h + 1)
             - (jcp.ih + jcp.t_pad - 1);
 
@@ -548,12 +542,13 @@ status_t jit_sse42_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp,
     const bool mimo = !flat;
 
     bool args_ok = true
-        && implication(flat, one_of(src_d.format(), nchw, nhwc)
-                && one_of(weights_d.format(), Ohwi8o, gOhwi8o))
-        && implication(mimo, src_d.format() == nChw8c
-                && one_of(weights_d.format(), OIhw8i8o, gOIhw8i8o))
+        && IMPLICATION(flat, one_of(src_d.format(), ncw, nwc, nchw, nhwc)
+                && one_of(weights_d.format(), Owi8o, gOwi8o, Ohwi8o, gOhwi8o))
+        && IMPLICATION(mimo, one_of(src_d.format(), nCw8c, nChw8c)
+                && one_of(weights_d.format(), OIw8i8o, gOIw8i8o, OIhw8i8o,
+                    gOIhw8i8o))
         && one_of(cd.bias_desc.format, memory_format::undef, any, x)
-        && dst_d.format() == nChw8c;
+        && one_of(dst_d.format(), nCw8c, nChw8c);
     if (!args_ok) return status::unimplemented;
 
     bool ok_to_pad_channels = true
@@ -576,26 +571,30 @@ status_t jit_sse42_conv_fwd_kernel_f32::init_conf(jit_conv_conf_t &jcp,
     args_ok = true
         && jcp.oc % simd_w == 0
         && jcp.l_pad <= jcp.ur_w
-        && implication(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0)
+        && IMPLICATION(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0)
                 || (jcp.stride_w == 1 && jcp.stride_h == 1))
-        && implication(mimo, jcp.ic % simd_w == 0);
+        && IMPLICATION(mimo, jcp.ic % simd_w == 0);
     if (!args_ok) return status::unimplemented;
 
     int r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w
         + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1));
 
-    if (r_pad_no_tail > jcp.ur_w) {
+    // kernel needs 1 temporary XMM register
+    const int num_avail_regs = 15;
+    if (r_pad_no_tail > jcp.ur_w * jcp.stride_w && jcp.ow / jcp.ur_w > 1) {
         /* recalculate ur_w, nb_oc_blocking and ur_w_tail */
-        jcp.ur_w = r_pad_no_tail + 1;
-        jcp.nb_oc_blocking = ((16 - 1)-jcp.ur_w)/jcp.ur_w;
+        jcp.ur_w = nstl::min(r_pad_no_tail / jcp.stride_w + jcp.ur_w_tail,
+                nstl::min(jcp.ow, num_avail_regs / 2));
+        jcp.nb_oc_blocking = (num_avail_regs - jcp.ur_w) / jcp.ur_w;
         jcp.ur_w_tail = jcp.ow % jcp.ur_w;
         /* check again ... */
         r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w
             + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1));
-        if ((r_pad_no_tail > jcp.ur_w) || (jcp.ow < jcp.ur_w))
+        if (jcp.ur_w < nstl::max(jcp.l_pad, r_pad_no_tail))
             return status::unimplemented;
     }
-    if (jcp.l_pad > jcp.ur_w) return status::unimplemented;
+    assert(jcp.nb_oc_blocking > 0);
+    assert(jcp.ur_w * (jcp.nb_oc_blocking + 1) <= num_avail_regs);
 
     jcp.ic_block = (jcp.ic % simd_w != 0) ? jcp.ic : simd_w;
     jcp.nb_ic = jcp.ic / jcp.ic_block;
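/* Editor's note: a worked example (assumed shape) of the recalculation
 * above. With 15 usable vector registers, ur_w output accumulators per
 * oc block plus ur_w input temporaries must fit the register file. */
#include <algorithm>
#include <cassert>

int main() {
    const int num_avail_regs = 15;
    // Assumed: ow = 14, stride_w = 1, ur_w_tail = 2, r_pad_no_tail = 4.
    int ow = 14, stride_w = 1, ur_w_tail = 2, r_pad_no_tail = 4;
    int ur_w = std::min(r_pad_no_tail / stride_w + ur_w_tail,
            std::min(ow, num_avail_regs / 2));           // min(6, 7) = 6
    int nb_oc_blocking = (num_avail_regs - ur_w) / ur_w; // (15 - 6) / 6 = 1
    assert(ur_w * (nb_oc_blocking + 1) <= num_avail_regs);
    return 0;
}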
index f973500..ea30028 100644 (file)
@@ -20,6 +20,7 @@
 #include "c_types_map.hpp"
 #include "jit_generator.hpp"
 #include "jit_primitive_conf.hpp"
+#include "cpu_memory.hpp"
 #include "jit_uni_eltwise.hpp"
 #include "jit_uni_depthwise.hpp"
 
@@ -86,11 +87,9 @@ private:
 
     inline void oh_step_unroll_kw(int ur_w, int pad_l, int pad_r,
             int oc_blocks);
-    inline void oh_step_nopad(int ur_w, int pad_l, int pad_r,
-            char pad_label, int oc_blocks, char oc_blocks_label);
-    inline void width_blk_step(int ur_w, int pad_l, int pad_r,
-            char pad_label, int oc_blocks, char oc_blocks_label);
-    inline void solve_common(int oc_blocks, char oc_blocks_label);
+    inline void oh_step_nopad(int ur_w, int pad_l, int pad_r, int oc_blocks);
+    inline void width_blk_step(int ur_w, int pad_l, int pad_r, int oc_blocks);
+    inline void solve_common(int oc_blocks);
 
     void generate();
 };
index c00f8ad..a37c317 100644 (file)
@@ -29,6 +29,20 @@ using namespace mkldnn::impl::status;
 using namespace mkldnn::impl::memory_format;
 using namespace mkldnn::impl::utils;
 
+#define src_blk_off(f, n, c, h, w) \
+    (conf_.ndims() == 3) \
+    ? (f).blk_off(n, c, w) \
+    : (f).blk_off(n, c, h, w)
+
+#define wht_blk_off_(f, g, ...) \
+    conf_.with_groups() \
+    ? (f).blk_off(g, __VA_ARGS__) \
+    : (f).blk_off(__VA_ARGS__)
+#define wht_blk_off(f, g, oc, ic, kh, kw) \
+        conf_.ndims() == 3 \
+        ? wht_blk_off_(f, g, oc, ic, kw) \
+        : wht_blk_off_(f, g, oc, ic, kh, kw)
+
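/* Editor's note: a standalone sketch of the wht_blk_off dispatch above,
 * with a stub in place of memory_desc_wrapper; with_groups/ndims are plain
 * variables here and the offsets are arbitrary. Names hypothetical. */
#include <cstdio>

struct wht_stub {
    size_t blk_off(int oc, int ic, int kw) const
    { return (size_t)oc * 100 + ic * 10 + kw; }
    size_t blk_off(int g, int oc, int ic, int kw) const
    { return (size_t)g * 1000 + blk_off(oc, ic, kw); }
    // The 4D (kh, kw) overloads, chosen when ndims == 4, are elided.
};

int main() {
    wht_stub w;
    const bool with_groups = true;
    const int ndims = 3;
    // Mirrors wht_blk_off(f, g, oc, ic, kh, kw) for the grouped 1D case:
    // kh is dropped, the group index g is kept.
    size_t off = (ndims == 3)
            ? (with_groups ? w.blk_off(1, 2, 3, 0) : w.blk_off(2, 3, 0))
            : 0; // 4D path omitted in this stub
    printf("%zu\n", off); // 1230
    return 0;
}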
 template <bool with_relu>
 void _jit_sse42_convolution_fwd_t<with_relu>::execute_forward() {
     auto src = reinterpret_cast<const data_t *>(this->input_memory(0));
@@ -85,17 +99,14 @@ void _jit_sse42_convolution_fwd_t<with_relu>::execute_forward() {
                     const int ih = nstl::max(ij - jcp.t_pad
                         + div_up(i_t_overflow,
                                  (jcp.dilate_h+1)) * (jcp.dilate_h + 1), 0);
-                    par_conv.src = &src[src_d.blk_off(n,
+                    par_conv.src = &src[src_blk_off(src_d, n,
                         jcp.ic == 3 ? 0 : _ic, ih, 0)];
 
-                    par_conv.dst = &dst[dst_d.blk_off(n, _oc, oh, 0)];
+                    par_conv.dst = &dst[src_blk_off(dst_d, n, _oc, oh, 0)];
 
                     const int wh = div_up(i_t_overflow, (jcp.dilate_h + 1));
-                    par_conv.filt = &weights[conf_.with_groups()
-                                        ? weights_d.blk_off(g, ocb,
-                                            jcp.ic == 3 ? 0 : icb, wh, 0)
-                                        : weights_d.blk_off(ocb,
-                                            jcp.ic == 3 ? 0 : icb, wh, 0)];
+                    par_conv.filt = &weights[wht_blk_off(weights_d, g, ocb,
+                        jcp.ic == 3 ? 0 : icb, wh, 0)];
 
                     if (icb == 0) {
                         if (bias)
index 8aca31d..1923495 100644 (file)
@@ -56,7 +56,7 @@ struct _jit_sse42_convolution_fwd_t: public cpu_primitive_t {
                         this->cdesc_().src_desc.data_type,
                         this->cdesc_().weights_desc.data_type,
                         this->cdesc_().dst_desc.data_type)
-                && utils::implication(this->with_bias(),
+                && IMPLICATION(this->with_bias(),
                         data_type::f32 == this->cdesc_().bias_desc.data_type);
             if (!ok) return status::unimplemented;
 
@@ -91,13 +91,18 @@ struct _jit_sse42_convolution_fwd_t: public cpu_primitive_t {
 
             const bool flat = this->IC() == 3 || this->IC() == 1;
             if (this->src_pd_.desc()->format == any)
-                CHECK(this->src_pd_.set_format(flat ? nchw : nChw8c));
+                CHECK(this->src_pd_.set_format(flat
+                    ? utils::pick(this->ndims() - 3, ncw, nchw)
+                    : utils::pick(this->ndims() - 3, nCw8c, nChw8c)));
             if (this->dst_pd_.desc()->format == any)
-                CHECK(this->dst_pd_.set_format(nChw8c));
+                CHECK(this->dst_pd_.set_format(utils::pick(this->ndims() - 3,
+                    nCw8c, nChw8c)));
             if (this->weights_pd_.desc()->format == any)
                 CHECK(this->weights_pd_.set_format(this->with_groups()
-                            ? (flat ? gOhwi8o : gOIhw8i8o)
-                            : (flat ? Ohwi8o : OIhw8i8o)));
+                    ? utils::pick(2 * this->ndims() - 6 + flat, gOIw8i8o,
+                        gOwi8o, gOIhw8i8o, gOhwi8o)
+                    : utils::pick(2 * this->ndims() - 6 + flat, OIw8i8o, Owi8o,
+                        OIhw8i8o, Ohwi8o)));
             if (this->bias_pd_.desc()->format == any)
                 CHECK(this->bias_pd_.set_format(x));
             return status::success;
@@ -108,7 +113,6 @@ struct _jit_sse42_convolution_fwd_t: public cpu_primitive_t {
             const output_vector &outputs)
         : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd),
           dw_conv_buffer_size_(0), dw_conv_buffer_(nullptr), padded_bias_(nullptr), dw_padded_bias_(nullptr)
-
     {
         kernel_ = new jit_sse42_conv_fwd_kernel_f32(conf_.jcp_, *conf_.attr());
         if (conf_.jcp_.with_dw_conv) {
index 6c16452..d360a14 100644 (file)
@@ -41,11 +41,16 @@ inline void rtus_prepare(conv_pd_t *self, const convolution_desc_t *&conv_d,
     const bool is_bwd_data = self->cdesc()->prop_kind
         == prop_kind::backward_data;
 
+    const int ndims = src_d->ndims;
     bool rtus_applicable = true
-        && (conv_d->strides[0] != 1 || conv_d->strides[1] != 1)
-        && utils::one_of(src_d->format,
-            memory_format::nChw8c, memory_format::nChw16c);
-    for (int d = 2; d < 4; ++d) {
+        && utils::pick(ndims - 3,
+            (conv_d->strides[0] != 1 && !one_of(conv_d->src_desc.data_type,
+                data_type::s16, data_type::s32)),
+            (conv_d->strides[0] != 1 || conv_d->strides[1] != 1))
+        && utils::one_of(src_d->format, memory_format::nCw8c,
+            memory_format::nCw16c, memory_format::nChw8c,
+            memory_format::nChw16c);
+    for (int d = 2; d < ndims; ++d) {
         /* TODO: relax these conditions (by improving reducer) */
         rtus_applicable = rtus_applicable
             && conv_d->padding[0][d - 2] == 0
@@ -55,9 +60,12 @@ inline void rtus_prepare(conv_pd_t *self, const convolution_desc_t *&conv_d,
     if (rtus_applicable) {
         self->rtus_.reduce_src_ = true;
         conv_d = &(self->rtus_.conv_d_ = *conv_d);
-        self->rtus_.conv_d_.strides[0] = self->rtus_.conv_d_.strides[1] = 1;
+        self->rtus_.conv_d_.strides[0] = 1;
+        if (ndims == 4)
+            self->rtus_.conv_d_.strides[1] = 1;
         utils::array_set(self->rtus_.conv_d_.padding[0], 0, 2);
-        utils::array_set(self->rtus_.conv_d_.padding[1], 0, 2);
+        if (ndims == 4)
+            utils::array_set(self->rtus_.conv_d_.padding[1], 0, 2);
         const int ic = src_d->dims[1];
         if (is_bwd_data) {
             src_d = &(self->rtus_.conv_d_.diff_src_desc = *dst_d);
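/* Editor's note: a scalar sketch of what the rtus driver set up above does
 * conceptually (rtus appears to stand for "reduce to unit stride"): for a
 * strided 1x1 convolution, the strided input pixels are gathered into a
 * dense per-thread workspace so the kernel can run as if stride == 1.
 * Single-channel, no-padding version; names hypothetical. */
#include <vector>

std::vector<float> rtus_gather(const std::vector<float> &src, int ih, int iw,
        int stride_h, int stride_w) {
    const int oh = (ih - 1) / stride_h + 1; // 1x1 kernel, no padding
    const int ow = (iw - 1) / stride_w + 1;
    std::vector<float> ws((size_t)oh * ow);
    for (int y = 0; y < oh; ++y)
        for (int x = 0; x < ow; ++x)
            ws[(size_t)y * ow + x]
                    = src[(size_t)y * stride_h * iw + (size_t)x * stride_w];
    return ws;
}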
@@ -158,6 +166,8 @@ struct rtus_driver_t: public jit_generator {
 
         cmp(reg_cur_iw, iw_);
         jl(skip_h_step);
+        /* for 1D convolution the loop over 'h' should be skipped */
+        if (src_step_icb_ == iw_) jmp(skip_h_step);
 
         if (src_to_ws_) {
             add(reg_cur_src, (src_step_h_ - iw_) * vlen_);
@@ -239,6 +249,7 @@ inline void init_rtus_driver(conv_t *self) {
     const auto &conf = self->conf_;
     const auto &cd = *conf.cdesc();
     const bool is_bwd_data = cd.prop_kind == prop_kind::backward_data;
+    const int ndims = conf.ndims();
 
     if (!conf.rtus_.reduce_src_) return;
 
@@ -260,16 +271,17 @@ inline void init_rtus_driver(conv_t *self) {
     self->scratch_ = (decltype(self->scratch_))malloc(
             max_threads * self->ws_per_thread_ * typesize, 64);
 
-    const int stride_h = cd.strides[0];
-    const int stride_w = cd.strides[1];
+    const int stride_h = (conf.ndims() == 3) ? 1 : cd.strides[0];
+    const int stride_w = cd.strides[ndims - 3];
 
     const auto &src_d = is_bwd_data ? *conf.diff_src_pd()->desc()
                                     : *conf.src_pd()->desc();
-    assert((isa == avx2 && src_d.format == memory_format::nChw8c)
-           || (isa == avx512_common && src_d.format == memory_format::nChw16c));
+    assert((isa == avx2 && utils::one_of(src_d.format, memory_format::nCw8c,
+        memory_format::nChw8c)) || (isa == avx512_common && utils::one_of(
+            src_d.format, memory_format::nCw16c, memory_format::nChw16c)));
 
-    const int ih = src_d.dims[2];
-    const int iw = src_d.dims[3];
+    const int ih = (ndims == 3) ? 1 : src_d.dims[2];
+    const int iw = src_d.dims[ndims - 1];
 
     const int src_step_h = stride_h * iw;
     const int src_step_icb = ih * iw;
index e3de4dd..3a667ac 100644 (file)
@@ -1125,7 +1125,7 @@ struct uni_bnorm_driver_t: public c_compatible {
 
         int SP_N_ithr = N_ithr * S_nthr + S_ithr;
         int SP_N_nthr = N_nthr * S_nthr;
-        assert(utils::implication(!mkldnn_thr_syncable(), SP_N_nthr == 1));
+        assert(IMPLICATION(!mkldnn_thr_syncable(), SP_N_nthr == 1));
 
         p.N_ithr = SP_N_ithr;
         p.N_nthr = SP_N_nthr;
index 7549f07..7dbc47a 100644 (file)
@@ -60,7 +60,7 @@ struct jit_uni_batch_normalization_fwd_t: public cpu_primitive_t {
                 && !has_zero_dim_memory()
                 && utils::one_of(ndims(), 4, 5)
                 && desc()->data_desc.data_type == f32
-                && utils::implication(use_scaleshift(),
+                && IMPLICATION(use_scaleshift(),
                         desc()->data_scaleshift_desc.data_type == f32)
                 && desc()->data_desc.format == desired_fmt
                 && (attr()->has_default_values() || this->with_relu_post_op());
@@ -129,7 +129,7 @@ struct jit_uni_batch_normalization_bwd_t: public cpu_primitive_t {
                 && utils::one_of(ndims(), 4, 5)
                 && everyone_is(f32, desc()->data_desc.data_type,
                         desc()->diff_data_desc.data_type)
-                && implication(use_scaleshift(),
+                && IMPLICATION(use_scaleshift(),
                         desc()->data_scaleshift_desc.data_type == f32)
                 && everyone_is(desired_fmt, desc()->diff_data_desc.format,
                         desc()->data_desc.format)
index c09dbe8..634e9f9 100644 (file)
@@ -458,7 +458,7 @@ status_t jit_uni_depthwise_fwd_t<isa>::pd_t::init() {
         && utils::one_of(desc()->src_desc.format, desired_blk_fmt, nchw)
         && utils::one_of(desc()->dst_desc.format, desired_blk_fmt, nchw)
         && utils::one_of(desc()->weights_desc.format, x)
-        && utils::implication(this->with_bias(), x == desc()->bias_desc.format)
+        && IMPLICATION(this->with_bias(), x == desc()->bias_desc.format)
         && attr()->has_default_values();
 
     return ok ? status::success : status::unimplemented;
index 7178e1a..0d97cce 100644 (file)
@@ -758,6 +758,616 @@ template struct jit_uni_dw_conv_bwd_data_kernel_f32<avx512_common>;
 template struct jit_uni_dw_conv_bwd_data_kernel_f32<avx2>;
 template struct jit_uni_dw_conv_bwd_data_kernel_f32<sse42>;
 
+template <cpu_isa_t isa>
+inline void jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::zero_filter() {
+    for (int r = 0; r < reg_repeats; ++r) {
+        for (int i = 0; i < jcp.kw; ++i) {
+            Vmm vmm_acc = get_acc_reg(r * jcp.kw + i);
+            uni_vpxor(vmm_acc, vmm_acc, vmm_acc);
+        }
+    }
+}
+
+template <cpu_isa_t isa>
+inline void jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::load_filter() {
+    for (int r = 0; r < reg_repeats; ++r) {
+        const int reg_set = r * jcp.kw;
+        for (int i = 0; i < jcp.kw; ++i) {
+            int off_filter = (reg_set + i) * simd_w;
+            Vmm vmm_acc = get_acc_reg(reg_set + i);
+            uni_vmovups(vmm_acc,
+                    vmmword[tmp_reg_filter + off_filter * sizeof(float)]);
+        }
+    }
+}
+
+template <cpu_isa_t isa>
+inline void jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::zero_bias() {
+    for (int r = 0; r < reg_repeats; ++r) {
+        Vmm vmm_bias = get_bias_reg(r);
+        uni_vpxor(vmm_bias, vmm_bias, vmm_bias);
+    }
+}
+
+template <cpu_isa_t isa>
+inline void jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::load_bias() {
+    for (int r = 0; r < reg_repeats; ++r) {
+        Vmm vmm_bias = get_bias_reg(r);
+        uni_vmovups(
+                vmm_bias, vmmword[reg_bias_baddr + r * simd_w * sizeof(float)]);
+    }
+}
+
+template <cpu_isa_t isa>
+inline void jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::compute_ow_step_unroll(
+        int l_pad, int r_pad, int pad_offset, int ow_block) {
+    const int pad = nstl::max(jcp.l_pad, jcp.r_pad);
+    const int iw_overlap = jcp.iw + jcp.kw - 1 - jcp.l_pad - jcp.r_pad;
+    const int unroll_w = nstl::min(jcp.ur_w, iw_overlap);
+    const int right_border = iw_overlap - ow_block;
+
+    /* preamble count: the number of cascaded LOAD + FMA operations */
+    const int input_preamble_count
+            = nstl::max(jcp.kw - jcp.stride_w - l_pad, 0);
+
+    /* LOAD initial input registers, then cascade LOADs and FMAs */
+    for (int r = 0; r < reg_repeats; ++r) {
+        for (int i = 0; i < input_preamble_count; i++) {
+            int off_input = ((i - pad_offset) * reg_repeats + r) * simd_w;
+            Vmm vmm_input = get_input_reg((i + l_pad) * reg_repeats + r);
+            uni_vmovups(vmm_input,
+                    ptr[tmp_reg_idx_input + off_input * sizeof(float)]);
+        }
+
+        for (int i = 0; i < unroll_w; ++i) {
+            int off_output = (i * reg_repeats + r) * simd_w;
+            Vmm vmm_output = get_output_reg(r);
+            uni_vmovups(vmm_output,
+                    ptr[tmp_reg_idx_output + off_output * sizeof(float)]);
+
+            int input_load_overlap = i * jcp.stride_w + input_preamble_count;
+
+            /* Cascade 'input' loads for the corresponding FMAs */
+            const int cascade_input = nstl::min(jcp.stride_w, jcp.kw);
+            for (int c = 0; c < cascade_input; ++c) {
+                int off_input
+                        = ((c + input_load_overlap - pad_offset) * reg_repeats
+                                  + r)
+                        * simd_w;
+                Vmm vmm_input = get_input_reg(
+                        ((c + input_load_overlap + l_pad) % jcp.kw)
+                                * reg_repeats
+                        + r);
+                uni_vmovups(vmm_input,
+                        ptr[tmp_reg_idx_input + off_input * sizeof(float)]);
+            }
+
+            for (int j = 0; j < jcp.kw; ++j) {
+
+                /* Don't apply FMAs that fall into the padded region */
+                if (i + j < l_pad || i + j - pad >= right_border)
+                    continue;
+                Vmm vmm_input = get_input_reg(
+                        ((i * jcp.stride_w + j) % jcp.kw) * reg_repeats + r);
+                Vmm vmm_acc = get_acc_reg(j * reg_repeats + r);
+                Vmm vmm_aux = isa == sse42 ? get_aux_reg() : vmm_input;
+                if (isa == sse42) uni_vmovups(vmm_aux, vmm_input);
+                uni_vfmadd231ps(vmm_acc, vmm_aux, vmm_output);
+            }
+        }
+    }
+}
+
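/* Editor's note: a scalar reference (one lane of the ch_block-wide vectors)
 * of what the cascaded LOAD + FMA unroll above computes: the depthwise
 * backward-weights accumulation over an unrolled output row. Padding and
 * blocking are omitted in this minimal sketch. */
static void ref_ow_step(const float *src, const float *ddst, float *dwei,
        int unroll_w, int kw, int stride_w) {
    for (int i = 0; i < unroll_w; ++i)   // output pixels (the FMA cascade)
        for (int j = 0; j < kw; ++j)     // filter taps (the accumulators)
            dwei[j] += src[i * stride_w + j] * ddst[i];
}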
+template <cpu_isa_t isa>
+inline void
+jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::compute_bias_step_unroll(
+        const int unroll_w) {
+    for (int r = 0; r < reg_repeats; ++r) {
+        for (int i = 0; i < unroll_w; ++i) {
+            Vmm vmm_bias = get_bias_reg(r);
+            int off_output = (i * reg_repeats + r) * simd_w;
+            uni_vaddps(vmm_bias, vmm_bias,
+                    vmmword[tmp_reg_idx_output + off_output * sizeof(float)]);
+        }
+    }
+}
+
+template <cpu_isa_t isa>
+inline void jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::store_filter() {
+    for (int r = 0; r < reg_repeats; ++r) {
+        const int reg_set = r * jcp.kw;
+        for (int i = 0; i < jcp.kw; ++i) {
+            int off_filter = (i + reg_set) * simd_w;
+            Vmm vmm_acc = get_acc_reg(i + reg_set);
+            uni_vmovups(vmmword[tmp_reg_filter + off_filter * sizeof(float)],
+                    vmm_acc);
+        }
+    }
+}
+
+template <cpu_isa_t isa>
+inline void jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::store_bias() {
+    for (int r = 0; r < reg_repeats; ++r) {
+        Vmm vmm_bias = get_bias_reg(r);
+        uni_vmovups(
+                vmmword[reg_bias_baddr + r * simd_w * sizeof(float)], vmm_bias);
+    }
+}
+
+template <cpu_isa_t isa>
+inline void jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::create_h_bounds_table() {
+    /* Bounds are stored on an 8-bit sized element.
+     * XXX: potential issues if bounds exceed 255.
+     */
+    const bool handle_padding = (jcp.t_pad > 0) || (jcp.b_pad > 0);
+    if (handle_padding) {
+
+        /* Calculate how many 'h_start' bounds are needed */
+        const int h_bounds_count = get_loop_bounds_count(
+                nstl::max(jcp.t_pad, jcp.b_pad), jcp.oh, jcp.oh_blk_size);
+
+        align(64);
+        L(bound_start_table);
+        /* Generate starting bounds for the 'oh' loop. This value also
+         * determines the overlap (computed as an address offset) between the
+         * output and the input for that loop iteration. */
+        for (int oh_block = 0; oh_block < h_bounds_count; ++oh_block) {
+            for (int kh = 0; kh < jcp.kh; ++kh) {
+                te_size start_bound = nstl::max(
+                        jcp.t_pad - oh_block * jcp.oh_blk_size - kh, 0);
+                write_table(start_bound);
+            }
+        }
+        /* Write the offset count for the 'input' address calculation. The
+         * offset for the input address is conditioned by the intersection of
+         * the 'h' padding with the output rows. */
+        for (int kh = 1; kh < jcp.kh; ++kh) {
+            te_size kh_accum_value = nstl::max(nstl::min(kh - jcp.t_pad, 1), 0);
+            write_table(kh_accum_value);
+        }
+        /* The last value is not used for offset calculation; write a 'nop'
+         * equivalent. */
+        write_table(0);
+
+        /* Non-padded blocks always increment 'kh' dimension */
+        for (int oh_block = 0; oh_block < h_bounds_count - 1; oh_block++) {
+            for (int kh = 0; kh < jcp.kh; ++kh) {
+                te_size kh_accum_value = 1;
+                write_table(kh_accum_value);
+            }
+        }
+
+        /* number of input elements that overlap over output */
+        int ih_overlap = jcp.oh_blk_size + jcp.kh - 1 - jcp.t_pad - jcp.b_pad;
+
+        /* End Bounds for 'oh' default to 'OH' or OH_BLOCK_SIZE, unless
+         * the 'oh_block' is within the 'bottom_padding' region. */
+        int oh_end_blk = 0;
+        for (; oh_end_blk < h_bounds_count - 1; ++oh_end_blk) {
+            for (int kh = 0; kh < jcp.kh; ++kh) {
+                te_size end_bound = nstl::min((jcp.ih / jcp.stride_h)
+                                - jcp.oh_blk_size - oh_end_blk * jcp.oh_blk_size
+                                + ih_overlap + 1 - kh,
+                        jcp.oh_blk_size);
+                write_table(end_bound);
+            }
+        }
+        /* Write bounds for the special case where 'oh_block' falls within the
+         * 'bottom_padding' region - this always executes since at least one
+         * row of bounds should exist. */
+        const int pad = nstl::max(jcp.b_pad, jcp.t_pad);
+        ih_overlap
+                = (jcp.ih / jcp.stride_h + jcp.kh - 1 - jcp.t_pad - jcp.b_pad);
+        oh_end_blk = jcp.oh - jcp.oh_blk_size;
+        for (int kh = 0; kh < jcp.kh; ++kh) {
+            te_size end_bound = nstl::min(
+                    jcp.oh_blk_size, ih_overlap - oh_end_blk + pad - kh);
+            write_table(end_bound);
+        }
+    }
+}
+
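/* Editor's note: a worked example (assumed configuration) of the start
 * bounds emitted above. With t_pad = 1, kh = 3, oh_blk_size = 3, the first
 * table row is max(t_pad - 0 * oh_blk_size - kh', 0) for kh' = 0..2. */
#include <algorithm>
#include <cstdio>

int main() {
    const int t_pad = 1, kh = 3, oh_blk_size = 3; // assumed shape
    for (int kk = 0; kk < kh; ++kk)
        printf("%d ", std::max(t_pad - 0 * oh_blk_size - kk, 0));
    printf("\n"); // prints "1 0 0": only the top filter row starts late
    return 0;
}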
+template <cpu_isa_t isa>
+inline void jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::compute_bias_loop() {
+
+    Label oh_label;
+    Label ow_blk_label;
+
+    const int oh_block_size = jcp.oh_blk_size;
+    const int ow_unroll = jcp.ur_w;
+    const int ow_block_count = jcp.ow / ow_unroll;
+    const int ch_offset = jcp.ch_block;
+
+    mov(tmp_reg_idx_output, reg_output_baddr);
+
+    xor_(iter_oh, iter_oh);
+    L(oh_label);
+    {
+
+        xor_(iter_ow_blk, iter_ow_blk);
+        L(ow_blk_label);
+        {
+
+            compute_bias_step_unroll(ow_unroll);
+
+            add(tmp_reg_idx_output, ow_unroll * ch_offset * sizeof(float));
+
+            inc(iter_ow_blk);
+            cmp(iter_ow_blk, ow_block_count);
+            jl(ow_blk_label, T_NEAR);
+        }
+
+        inc(iter_oh);
+        cmp(iter_oh, oh_block_size);
+        jl(oh_label, T_NEAR);
+    }
+}
+
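/* Editor's note: a scalar reference of the reduction implemented by
 * compute_bias_loop/compute_bias_step_unroll above: the bias gradient is the
 * sum of the output gradient over the spatial block (one lane of the
 * ch_block-wide accumulation; blocking omitted). */
static void ref_bias_reduce(const float *ddst, float *dbias, int oh, int ow) {
    for (int i = 0; i < oh * ow; ++i)
        *dbias += ddst[i];
}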
+template <cpu_isa_t isa>
+inline void jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::compute_kh_loop(
+        int l_pad, int r_pad, int pad_offset, bool first_iteration,
+        int ow_block) {
+
+    Label kh_label;
+    Label oh_label;
+    Label exit_innerloop_label;
+    Label skip_load_acc;
+
+    const int table_row_count = get_loop_bounds_count(
+            nstl::max(jcp.t_pad, jcp.b_pad), jcp.oh, jcp.oh_blk_size);
+    const int ih_table_off = 1 * table_row_count * jcp.kh * sizeof(te_size);
+    const int end_bound_table_off
+            = 2 * table_row_count * jcp.kh * sizeof(te_size);
+
+    const int ch_offset = jcp.ch_block;
+
+    const bool handle_padding = (jcp.t_pad > 0) || (jcp.b_pad > 0);
+
+    mov(tmp_reg_filter, reg_filter_baddr);
+    mov(tmp_reg_kh_input, reg_input_baddr);
+    xor_(reg_tmp_off, reg_tmp_off);
+
+    if (handle_padding) {
+        mov(reg_bound_table_addr, bound_start_table);
+
+        /* move to the row containing the indices for the current 'h' block */
+        mov(reg_tmp_off, reg_table_idx);
+        imul(reg_tmp_off, reg_tmp_off, jcp.kh * sizeof(unsigned char));
+        add(reg_bound_table_addr, reg_tmp_off);
+    }
+
+    xor_(iter_kh, iter_kh);
+    L(kh_label);
+    {
+
+        mov(tmp_reg_idx_output, reg_output_baddr);
+        mov(tmp_reg_idx_input, tmp_reg_kh_input);
+
+        if (first_iteration) {
+
+            /* apply zero filter */
+            zero_filter();
+
+            /* if zero_filter_flag is set to '1', load filter memory into
+             * reg_accum */
+            if (jcp.with_bias) {
+                mov(reg_tmp_al, reg_exec_flag);
+                and_(reg_tmp_al, FLAG_ZERO_FILTER);
+                cmp(reg_tmp_al, 0);
+            } else {
+                /* none of the other flags are active, so we can use the
+                 * register directly */
+                cmp(reg_exec_flag, 0);
+            }
+            je(skip_load_acc);
+            load_filter();
+            L(skip_load_acc);
+
+        } else {
+            load_filter();
+        }
+
+        xor_(iter_oh, iter_oh);
+
+        if (handle_padding) {
+
+            /* 'oh loop' initial bounds are stored in bound_table */
+            mov(iter_oh_lb, byte[reg_bound_table_addr]);
+
+            /* skip 'oh' row that intersects with top padding */
+            xor_(reg_tmp_off, reg_tmp_off);
+            mov(reg_tmp_off, iter_oh);
+            imul(reg_tmp_off, reg_tmp_off, jcp.ow * ch_offset * sizeof(float));
+            add(tmp_reg_idx_output, reg_tmp_off);
+
+            /* forward the input address by 'stride_h' */
+            if (jcp.stride_h > 1) {
+                xor_(reg_tmp_off, reg_tmp_off);
+                mov(reg_tmp_off, iter_oh);
+                imul(reg_tmp_off, reg_tmp_off,
+                        (jcp.stride_h - 1) * jcp.iw * ch_offset * sizeof(float));
+                add(tmp_reg_idx_input, reg_tmp_off);
+            }
+        }
+
+        L(oh_label);
+        {
+
+            compute_ow_step_unroll(l_pad, r_pad, pad_offset, ow_block);
+
+            add(tmp_reg_idx_input,
+                    jcp.stride_h * jcp.iw * ch_offset * sizeof(float));
+            add(tmp_reg_idx_output, jcp.ow * ch_offset * sizeof(float));
+
+            inc(iter_oh);
+            if (handle_padding) {
+                /* 'oh loop' end bounds are stored in bound_table (precomputed
+                 * during JIT generation) */
+                cmp(iter_oh_lb,
+                        byte[reg_bound_table_addr + end_bound_table_off]);
+            } else {
+                cmp(iter_oh, jcp.oh_blk_size);
+            }
+            jl(oh_label, T_NEAR);
+        }
+
+        store_filter();
+
+        add(tmp_reg_filter, jcp.kw * ch_offset * sizeof(float));
+
+        if (handle_padding) {
+            xor_(kh_offset, kh_offset);
+            mov(kh_offset_lb, byte[reg_bound_table_addr + ih_table_off]);
+            /* increase 'ih' row in regards to 'kh'. */
+            imul(kh_offset, kh_offset, jcp.iw * ch_offset * sizeof(float));
+            add(tmp_reg_kh_input, kh_offset);
+
+            /* increase bound_table idx for the next 'kh' value in table*/
+            add(reg_bound_table_addr, sizeof(te_size));
+        } else {
+            add(tmp_reg_kh_input, jcp.iw * ch_offset * sizeof(float));
+        }
+
+        inc(iter_kh);
+        cmp(iter_kh, jcp.kh);
+        jl(kh_label, T_NEAR);
+    }
+}
+
+template <cpu_isa_t isa>
+inline void
+jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::compute_ow_block_unroll() {
+
+    Label skip_load_bias;
+
+    /* Only apply zero_filter (xor'ing accum_reg) on the left edge */
+    bool zero_filter_1st_iter = true;
+
+    const int ch_offset = jcp.ch_block;
+
+    const int ow_block_size = jcp.ow_blk_size;
+    const int iw_block_size = jcp.ow_blk_size * jcp.stride_w;
+
+    int w_unrolled_loop_count = jcp.ow / ow_block_size;
+
+    const bool handle_padding = (jcp.l_pad > 0) || (jcp.r_pad > 0);
+
+    int pad_offset = jcp.l_pad;
+
+    int ow_block = 0;
+
+    if (jcp.with_bias) {
+
+        zero_bias();
+
+        /* if zero_bias is '1', load the bias accumulator from memory. This
+         * happens after the first iteration is executed. */
+        mov(reg_tmp_al, reg_exec_flag);
+        and_(reg_tmp_al, FLAG_ZERO_BIAS);
+        cmp(reg_tmp_al, 0);
+        je(skip_load_bias);
+        load_bias();
+        L(skip_load_bias);
+
+        compute_bias_loop();
+
+        store_bias();
+    }
+
+    /* compute left padded block */
+    if (handle_padding) {
+
+        const int r_pad = jcp.iw - ow_block_size > 0 ? 0 : jcp.r_pad;
+
+        compute_kh_loop(jcp.l_pad, r_pad, 0, zero_filter_1st_iter, ow_block);
+        zero_filter_1st_iter = false;
+
+        w_unrolled_loop_count--;
+
+        if (w_unrolled_loop_count >= 1) {
+            add(reg_output_baddr, ow_block_size * ch_offset * sizeof(float));
+            add(reg_input_baddr, iw_block_size * ch_offset * sizeof(float));
+        }
+    }
+
+    /* This block may execute under 2 different scenarios:
+     * 1) When padding is present, this executes the middle loop (if any).
+     * 2) With no padding, it writes the full loop of the micro-kernel. */
+    int middle_loop_count = handle_padding ? w_unrolled_loop_count - 1 :
+                                             w_unrolled_loop_count;
+    if (middle_loop_count >= 1) {
+        Label ow_blk_label;
+
+        /* Insert loop for 'ow' block when middle block needs to execute more
+         * than once */
+        bool do_ow_blk_loop = middle_loop_count > 1;
+        if (do_ow_blk_loop) {
+            mov(iter_ow_blk, middle_loop_count);
+            L(ow_blk_label);
+        }
+
+        compute_kh_loop(0, 0, pad_offset, zero_filter_1st_iter);
+        /* disable zero_filter for the rest of the iterations, i.e. from now
+         * on the 'filter' contents are loaded from memory */
+        mov(reg_exec_flag, FLAG_ZERO_FILTER);
+
+        if (do_ow_blk_loop || handle_padding) {
+            add(reg_output_baddr, ow_block_size * ch_offset * sizeof(float));
+            add(reg_input_baddr, iw_block_size * ch_offset * sizeof(float));
+        }
+
+        if (do_ow_blk_loop) {
+            dec(iter_ow_blk);
+            cmp(iter_ow_blk, 0);
+            jg(ow_blk_label, T_NEAR);
+        }
+
+        w_unrolled_loop_count -= middle_loop_count;
+    }
+
+    /* compute right padded block: ow_blk = LAST */
+    if (handle_padding && w_unrolled_loop_count >= 1) {
+        ow_block = jcp.ow - ow_block_size;
+        compute_kh_loop(
+                0, jcp.r_pad, pad_offset, zero_filter_1st_iter, ow_block);
+
+        w_unrolled_loop_count--;
+    }
+}
+
+template <cpu_isa_t isa>
+void jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::generate() {
+    preamble();
+
+    mov(reg_input_baddr,
+            ptr[this->param1 + offsetof(jit_dw_conv_call_s, input)]);
+    mov(reg_output_baddr,
+            ptr[this->param1 + offsetof(jit_dw_conv_call_s, output)]);
+    mov(reg_filter_baddr,
+            ptr[this->param1 + offsetof(jit_dw_conv_call_s, filter)]);
+    if (jcp.with_bias)
+        mov(reg_bias_baddr,
+                ptr[this->param1 + offsetof(jit_dw_conv_call_s, bias)]);
+    mov(reg_table_flags,
+            ptr[this->param1 + offsetof(jit_dw_conv_call_s, table_flags)]);
+
+    compute_ow_block_unroll();
+
+    this->postamble();
+
+    create_h_bounds_table();
+}
+
+template <cpu_isa_t isa>
+status_t jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::init_conf(
+        jit_conv_conf_t &jcp, const convolution_desc_t &cd,
+        const memory_desc_wrapper &src_d,
+        const memory_desc_wrapper &diff_weights_d,
+        const memory_desc_wrapper &diff_dst_d) {
+
+    if (!mayiuse(isa))
+        return status::unimplemented;
+
+    jcp.ngroups = diff_weights_d.dims()[0];
+    jcp.oc = diff_dst_d.dims()[1] / jcp.ngroups;
+    jcp.ic = src_d.dims()[1] / jcp.ngroups;
+
+    const bool with_groups = diff_weights_d.ndims() == src_d.ndims() + 1;
+
+    jcp.is_depthwise = true && with_groups && everyone_is(1, jcp.oc, jcp.ic);
+
+    if (!jcp.is_depthwise)
+        return status::unimplemented;
+
+    jcp.ch_block = isa == avx512_common ? 16 : 8;
+
+    jcp.mb = src_d.dims()[0];
+
+    jcp.ih = src_d.dims()[2];
+    jcp.iw = src_d.dims()[3];
+    jcp.oh = diff_dst_d.dims()[2];
+    jcp.ow = diff_dst_d.dims()[3];
+
+    jcp.kh = diff_weights_d.dims()[3];
+    jcp.kw = diff_weights_d.dims()[4];
+
+    jcp.stride_h = cd.strides[0];
+    jcp.stride_w = cd.strides[1];
+
+    jcp.t_pad = cd.padding[0][0];
+    /* bottom padding should equal top padding to generate the proper 'h' loop
+     * bounds. */
+    jcp.b_pad = cd.padding[1][0];
+
+    jcp.l_pad = cd.padding[0][1];
+    jcp.r_pad = cd.padding[1][1];
+
+    jcp.dilate_h = cd.dilates[0];
+    jcp.dilate_w = cd.dilates[1];
+
+    jcp.ihp = jcp.ih + jcp.t_pad + jcp.b_pad;
+    jcp.iwp = jcp.iw + jcp.l_pad + jcp.r_pad;
+
+    jcp.src_fmt = src_d.format();
+
+    jcp.with_bias = cd.diff_bias_desc.format != memory_format::undef;
+
+    auto desired_act_fmt = isa == avx512_common ? nChw16c : nChw8c;
+    auto desired_wei_fmt = isa == avx512_common ? Goihw16g : Goihw8g;
+
+    bool args_ok = true
+                   && src_d.format() == desired_act_fmt
+                   && diff_weights_d.format() == desired_wei_fmt
+                   && diff_dst_d.format() == desired_act_fmt
+                   && one_of(cd.bias_desc.format, memory_format::undef, any, x)
+                   && jcp.ngroups % jcp.ch_block == 0
+                   && jcp.dilate_h == 0
+                   && jcp.dilate_w == 0
+                   && jcp.kw <= 3
+                   && jcp.oh == (jcp.ihp - jcp.kh) / jcp.stride_h + 1
+                   && jcp.ow == (jcp.iwp - jcp.kw) / jcp.stride_w + 1;
+    if (!args_ok) return status::unimplemented;
+
+    /* Note: this IMPLICATION-check does not allow 'negative padding'
+     * execution. */
+    bool ok = true && IMPLICATION(jcp.r_pad > 0, jcp.r_pad == jcp.l_pad)
+            && IMPLICATION(jcp.b_pad > 0, jcp.b_pad == jcp.t_pad);
+    if (!ok)
+        return status::unimplemented;
+
+    jcp.nb_ch = jcp.ngroups / jcp.ch_block;
+
+    /* Values for block size to try; order gives priority */
+    constexpr int BLOCK_SIZE[] = { 14, 16, 7, 8 };
+
+    int block_size_h = 1;
+    int block_size_w = 1;
+
+    /* Try different block sizes for the convolution */
+    for (int block : BLOCK_SIZE) {
+
+        block_size_h = block / jcp.stride_h;
+        block_size_w = block / jcp.stride_w;
+
+        if ((jcp.oh % block_size_h == 0) && (jcp.ow % block_size_w == 0))
+            break;
+    }
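+    /* Hypothetical example: oh = ow = 112 with stride 1 accepts the first
+     * candidate, 14, since 112 % 14 == 0 in both dimensions. */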
+
+    if (jcp.oh % block_size_h != 0 || jcp.ow % block_size_w != 0)
+        return status::unimplemented;
+
+    jcp.oh_blk_size = block_size_h;
+
+    jcp.ur_w = jcp.ow_blk_size = block_size_w;
+
+    return status::success;
+}
+
+template struct jit_uni_dw_conv_bwd_weights_kernel_f32<avx512_common>;
+template struct jit_uni_dw_conv_bwd_weights_kernel_f32<avx2>;
+template struct jit_uni_dw_conv_bwd_weights_kernel_f32<sse42>;
+
 }
 }
 }
index 341d89c..103687b 100644 (file)
@@ -154,6 +154,162 @@ private:
     void generate();
 };
 
+template <cpu_isa_t isa>
+struct jit_uni_dw_conv_bwd_weights_kernel_f32 : public jit_generator {
+
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_dw_conv_bwd_weights_kernel_f32)
+
+    jit_uni_dw_conv_bwd_weights_kernel_f32(jit_conv_conf_t ajcp) : jcp(ajcp) {
+        this->generate();
+        jit_ker = (void (*)(jit_dw_conv_call_s *)) this->getCode();
+    }
+
+    static status_t init_conf(jit_conv_conf_t &jcp,
+            const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
+            const memory_desc_wrapper &diff_weights_d,
+            const memory_desc_wrapper &diff_dst_d);
+
+    jit_conv_conf_t jcp;
+    void (*jit_ker)(jit_dw_conv_call_s *);
+
+private:
+    using Vmm = typename utils::conditional3<isa == sse42, Xbyak::Xmm,
+            isa == avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
+    using reg64_t = const Xbyak::Reg64;
+    using te_size
+            = unsigned char; /* the 'table_entry' data size; this
+                                implementation never needs values > 255. */
+    const int simd_w = cpu_isa_traits<isa>::vlen / sizeof(float);
+    const int reg_repeats = (isa == sse42) ? 2 : 1;
+    inline void write_table(te_size data) { db(data); }
+    const Xbyak::AddressFrame &vmmword
+            = (isa == sse42) ? xword : (isa == avx2) ? yword : zword;
+
+    /* XXX: the offset between input and accumulators is 3; therefore, assume
+     * 'kw' is no larger than 3. */
+    inline Vmm get_bias_reg(int idx = 0) { return Vmm(idx); }
+    inline Vmm get_output_reg(int idx) { return Vmm(idx + 1); }
+    inline Vmm get_input_reg(int idx) { return Vmm(idx + 4 * reg_repeats + 1); }
+    inline Vmm get_acc_reg(int idx) { return Vmm(idx + 1 * reg_repeats + 1); }
+    inline Vmm get_aux_reg() { return Vmm(0); }
+
+    reg64_t tmp_reg_idx_input = r8;
+    reg64_t tmp_reg_kh_input = r9;
+    reg64_t tmp_reg_idx_output = r10;
+    reg64_t tmp_reg_filter = r11;
+
+    /* parameter passed by driver into kernel */
+    reg64_t reg_table_flags = rbx;
+    Xbyak::Reg8 reg_table_idx = bl;
+    Xbyak::Reg8 reg_exec_flag = bh;
+
+    /* holds the address for the 'bounds table' that is generated during JIT */
+    reg64_t reg_bound_table_addr = r13;
+
+    reg64_t reg_tmp_off = rax;
+    Xbyak::Reg8 reg_tmp_al = al;
+
+    reg64_t iter_oh = rdx;
+    Xbyak::Reg8 iter_oh_lb = dl;
+    reg64_t kh_offset = rdx;
+    Xbyak::Reg8 kh_offset_lb = dl;
+
+    reg64_t iter_ow_blk = rbp;
+    reg64_t iter_kh  = rsi;
+
+    /* Base addresses for convolution parameters. */
+    reg64_t reg_input_baddr = r15;
+    reg64_t reg_output_baddr = r12;
+    reg64_t reg_filter_baddr = abi_not_param1;
+    reg64_t reg_bias_baddr = r14;
+
+    Xbyak::Label bound_start_table;
+
+    /* Return the number of blocks to execute, depending on the convolution
+     * dimensions and block_size, e.g.:
+     *  {ow = 112, ow_block_size = 14} -> requires:
+     *      1 left block,
+     *      1 middle block,
+     *      1 right block;
+     * {ow = 28, ow_block_size = 14} -> requires:
+     *      1 left block,
+     *      1 right block. */
+    inline int get_loop_bounds_count(
+            const int padding, const int h_dimension, const int block_size) {
+        const int num_top_padded_blk = utils::div_up(padding, block_size);
+        const int num_tail_blk
+                = (h_dimension - num_top_padded_blk * block_size > 0) ? 1 : 0;
+        const int num_middle_blk
+                = (h_dimension
+                    - (num_top_padded_blk + num_tail_blk) * block_size
+                          > 0) ? 1 : 0;
+        return num_top_padded_blk + num_middle_blk + num_tail_blk;
+    }
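+    /* Hypothetical example: padding = 1, h_dimension = 112, block_size = 14
+     * gives div_up(1, 14) = 1 top-padded block, plus 1 middle and 1 tail
+     * block (both remaining differences stay positive), i.e. 3 bounds. */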
+
+    /* Create a table containing the values that define the kernel's loop
+     * behavior. The table exists to avoid the implementation complexity and
+     * run-time cost of computing loop bounds on the fly with respect to
+     * stride and padding. The table
+     * consists of 3 sections:
+     * 1) Initial Bounds for 'oh' loop.
+     * 2) Input address offset flag: '1' indicates an input address increment,
+     *    '0' results in no increment.
+     * 3) End-bounds for 'oh' loop.
+     *
+     * The table is written into memory as the following format:
+     * Filter_size:    |--- kh ---|
+     * Table:           __________
+     * 1st section:    |          |
+     *                 |- - - - - |
+     * 2nd section:    |          |
+     *                 |- - - - - |
+     * 3rd section:    |__________|
+     *
+     * Example for convolution: ih=112, oh=112, kh=3, ph=1
+     *   __________
+     *  | 1,  0,  0| -> upper 'oh' loop initial bounds
+     *  | 0,  0,  0| -> middle 'oh' loop initial bounds
+     *  | 0,  0,  0| -> bottom loop initial bounds
+     *  |----------|
+     *  | 0,  1,  0| -> *There is no input offset for kh = 0, i.e. the
+     *  | 1,  1,  1|    offset_flag is '0' because of padding.
+     *  | 1,  1,  1|
+     *  |----------|
+     *  |14, 14, 14| -> lower 'oh' loop end bounds
+     *  |14, 14, 14| -> (etc)
+     *  |14, 14, 13| -> *The last 'kh' loop has an upper bound of 13
+     *  |__________|    because of padding.
+     *    0,  1,  2  -> kh values
+     * */
+    inline void create_h_bounds_table();
+
+    /* Micro-kernel JIT'ing, fusing 'kw' and 'ow_block' loops into unrolled FMAs
+     */
+    inline void compute_ow_step_unroll(
+            int l_pad, int r_pad, int pad_offset, int ow_block);
+
+    /* JIT'ing the outer loops for the micro-kernel -> {kh, oh_block} */
+    inline void compute_kh_loop(int l_pad, int r_pad, int pad_offset,
+            bool first_iteration, int ow_block = 0);
+
+    /* Write 'width' micro-kernel JITs; depending on the padding and convolution
+     * size, write a micro-kernel for the left ow-block, middle ow-block(s), and
+     * right ow-block.*/
+    inline void compute_ow_block_unroll();
+
+    inline void load_filter();
+    inline void zero_filter();
+    inline void load_bias();
+    inline void zero_bias();
+    inline void compute_bias_step_unroll(const int unroll_w);
+    inline void compute_bias_loop();
+    inline void store_filter();
+    inline void store_bias();
+
+    void generate();
+};
 }
 }
 }
index 2568dc9..48c1961 100644 (file)
@@ -254,6 +254,258 @@ template void _jit_uni_dw_convolution_bwd_data_t<avx2>
 template void _jit_uni_dw_convolution_bwd_data_t<sse42>
     ::execute_backward_data();
 
+template <cpu_isa_t isa>
+_jit_uni_dw_convolution_bwd_weights_t<isa>::
+        _jit_uni_dw_convolution_bwd_weights_t(const pd_t *pd,
+                const input_vector &inputs, const output_vector &outputs)
+    : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {
+
+    const auto &jcp = conf_.jcp_;
+
+    kernel_ = new jit_uni_dw_conv_bwd_weights_kernel_f32<isa>(jcp);
+
+    const int max_threads
+            = (mkldnn_in_parallel()) ? 1 : mkldnn_get_max_threads();
+    nthr_ = max_threads;
+
+    nthr_g_ = nthr_mb_ = 1;
+
+    /* Basic heuristics for the parallel strategy:
+     * 1) Try to parallelize over the number of groups (g), where tasks are
+     * independent. Otherwise,
+     * 2) Try to split the work across g and the minibatch (mb).
+     * Parallelizing on mb requires computing a reduction for weights.
+     *
+     * NOTE: because of the 'task partitioning' scheme, the per-thread load
+     * will be unbalanced when the number of threads is high (e.g. > 16).
+     */
+    nthr_g_ = nstl::min(jcp.nb_ch, nthr_);
+    nthr_mb_ = nstl::min(nstl::max(1, nthr_ / nthr_g_), jcp.mb);
+
+    nthr_ = nthr_g_ * nthr_mb_;
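+    /* Hypothetical example: 8 threads, jcp.nb_ch = 4, jcp.mb = 16 gives
+     * nthr_g_ = min(4, 8) = 4 and nthr_mb_ = min(max(1, 8 / 4), 16) = 2, so
+     * all 8 threads are used: 4 across groups times 2 across minibatch. */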
+
+    /* Note: if the thread work is split over 'mb', a reduction has to take
+     * place. Hence, allocate a per-thread local weights buffer for the
+     * reduction. */
+    if (nthr_mb_ > 1) {
+        const size_t wei_size = jcp.ngroups * jcp.kh * jcp.kw;
+        ws_reduction_ = (data_t *)malloc(
+                (nthr_mb_ - 1) * wei_size * sizeof(data_t), 64);
+
+        if (jcp.with_bias) {
+            const size_t bias_size = jcp.ngroups;
+            bias_reduction_ = (data_t *)malloc(
+                    (nthr_mb_ - 1) * bias_size * sizeof(data_t), 64);
+        }
+
+        /* Used when executing a parallel reduction */
+        if (do_parallel_reduction()) {
+            acc_ker_ = new cpu_accumulator_1d_t<data_type::f32>();
+            simple_barrier::ctx_init(&reduction_bctx_);
+        }
+    }
+}
+
+template <cpu_isa_t isa>
+void _jit_uni_dw_convolution_bwd_weights_t<isa>::execute_backward_weights() {
+
+    auto src
+            = (data_t *)reinterpret_cast<const data_t *>(this->input_memory(0));
+    auto diff_dst
+            = (data_t *)reinterpret_cast<const data_t *>(this->input_memory(1));
+    const auto &jcp = kernel_->jcp;
+
+    /* JIT-code skips the unnecessary computations within the padded region. */
+    const int SKIP_TOP_PADDING = 0;
+
+    const size_t wei_size = jcp.ngroups * jcp.kh * jcp.kw;
+    const size_t bias_size = jcp.with_bias ? jcp.ngroups : 0;
+
+    const int oh_blk_size = jcp.oh_blk_size;
+
+    const int ch_block = jcp.ch_block;
+
+    auto set_kernel_params = [&](jit_dw_conv_call_s *conv_params,
+            const int batch, const int group, const int oh_block,
+            const unsigned char table_idx, const int negative_padding_offset,
+            const unsigned char exec_flag) {
+
+        const int ih_block = oh_block * jcp.stride_h;
+
+        conv_params->table_idx = table_idx;
+        conv_params->exec_flag = exec_flag;
+
+        size_t diff_dst_off
+                = ((batch * (jcp.ngroups / ch_block) + group) * jcp.oh + oh_block)
+                * jcp.ow;
+
+        size_t src_off = ((batch * (jcp.ngroups / ch_block) + group) * jcp.ih
+                              + ih_block - negative_padding_offset)
+                * jcp.iw;
+
+        conv_params->output = &diff_dst[diff_dst_off * ch_block];
+        conv_params->input = &src[src_off * ch_block];
+    };
+
+    parallel(nthr_, [&](const int ithr, const int nthr_) {
+        auto conv_params = jit_dw_conv_call_s();
+
+        /* assign iteration space to thread */
+        const int ithr_g = ithr % nthr_g_;
+        const int ithr_mb = (ithr / nthr_g_) % nthr_mb_;
+
+        /* split dimensions */
+        int g_start{ 0 }, g_end{ 0 };
+        balance211(jcp.nb_ch, nthr_g_, ithr_g, g_start, g_end);
+
+        int mb_start{ 0 }, mb_end{ 0 };
+        balance211(jcp.mb, nthr_mb_, ithr_mb, mb_start, mb_end);
+
+        auto diff_wei = ithr_mb == 0 ?
+                (data_t *)reinterpret_cast<data_t *>(this->memory(0)) :
+                (data_t *)ws_reduction_ + (ithr_mb - 1) * wei_size;
+
+        auto diff_bias = ithr_mb == 0 ?
+                (data_t *)reinterpret_cast<const data_t *>(this->memory(1)) :
+                (data_t *)bias_reduction_ + (ithr_mb - 1) * bias_size;
+
+        for (int g = g_start; g < g_end; ++g) {
+
+            /* This flag controls whether the kernel loads weights from
+             * memory or initializes the 'weight accumulator' registers to
+             * '0'. The latter happens at the beginning of each
+             * channel-block computation. */
+            unsigned char zero_filter_flag = ~FLAG_ZERO_FILTER;
+            unsigned char zero_bias_flag = jcp.with_bias ? ~FLAG_ZERO_BIAS : 0;
+
+            size_t diff_wei_off = g * jcp.kh * jcp.kw;
+            conv_params.filter = &diff_wei[diff_wei_off * ch_block];
+
+            if (jcp.with_bias)
+                conv_params.bias = &diff_bias[g * ch_block];
+
+            for (int mb = mb_start; mb < mb_end; ++mb) {
+
+                /* The 'table index' parameter controls the table entry for the
+                 * inner kernel execution. For more details see
+                 * jit_uni_dw_conv_kernel_f32. */
+                int table_idx = 0;
+
+                /* The OH_BLOCK loop is unrolled to separate the
+                 * computations according to the conditions that the 'h'
+                 * parameter imposes. */
+                int oh_blk = 0;
+
+                /* Top-padding case - this case always executes. */
+                set_kernel_params(&conv_params, mb, g, oh_blk, table_idx,
+                        SKIP_TOP_PADDING, zero_filter_flag & zero_bias_flag);
+                kernel_->jit_ker(&conv_params);
+
+                zero_bias_flag |= FLAG_ZERO_BIAS;
+                zero_filter_flag |= FLAG_ZERO_FILTER;
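+                /* From the second oh-block on, both flag bits are set, so
+                 * the kernel reloads the partial filter/bias sums that the
+                 * previous block stored to memory and accumulates on top. */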
+                oh_blk += oh_blk_size;
+
+                /* Middle OH_BLOCK cases. */
+                for (; oh_blk < (jcp.oh - oh_blk_size); oh_blk += oh_blk_size) {
+                    table_idx = 1;
+                    set_kernel_params(&conv_params, mb, g, oh_blk, table_idx,
+                            jcp.t_pad, zero_filter_flag & zero_bias_flag);
+                    kernel_->jit_ker(&conv_params);
+                }
+                table_idx++;
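+                /* If the middle loop never ran, table_idx moves 0 -> 1, so
+                 * the bottom block reads the second bounds-table entry;
+                 * otherwise it lands on entry 2. */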
+
+                /* Bottom block */
+                if (oh_blk < jcp.oh) {
+                    set_kernel_params(&conv_params, mb, g, oh_blk, table_idx,
+                            jcp.t_pad, zero_filter_flag & zero_bias_flag);
+                    kernel_->jit_ker(&conv_params);
+                }
+            }
+        }
+        if (do_parallel_reduction() && nthr_mb_ > 1) {
+
+            size_t reduct_start{ 0 }, reduct_end{ 0 };
+            balance211(wei_size, nthr_, ithr, reduct_start, reduct_end);
+
+            const size_t reduct_off = reduct_start;
+
+            auto *acc_data
+                    = (data_t *)reinterpret_cast<data_t *>(this->memory(0))
+                    + reduct_off;
+
+            const int acc_size = reduct_end - reduct_start;
+
+            simple_barrier::barrier(&reduction_bctx_, nthr_);
+
+            for (int thr_mb = 1; thr_mb < nthr_mb_; ++thr_mb) {
+
+                auto *src_data = (data_t *)ws_reduction_
+                        + (thr_mb - 1) * wei_size + reduct_off;
+
+                acc_ker_->accumulate(acc_data, src_data, acc_size);
+            }
+        }
+    });
+
+    /* Apply single-threaded 'mb' reduction */
+    if (nthr_mb_ > 1) {
+
+        auto diff_weights
+                = (data_t *)reinterpret_cast<data_t *>(this->memory(0));
+        auto diff_bias
+                = (data_t *)reinterpret_cast<const data_t *>(this->memory(1));
+
+        for (int thr_mb = 1; thr_mb < nthr_mb_; ++thr_mb) {
+
+            size_t mb_accum_offset = (thr_mb - 1) * wei_size;
+            size_t b_accum_offset = (thr_mb - 1) * bias_size;
+
+            for (int g = 0; g < jcp.nb_ch; ++g) {
+
+                /* Reduction on Bias */
+                if (jcp.with_bias) {
+                    PRAGMA_OMP_SIMD()
+                    for (int g_block = 0; g_block < ch_block; ++g_block) {
+                        size_t bias_offset = g * ch_block + g_block;
+                        diff_bias[bias_offset] += bias_reduction_[b_accum_offset
+                                + bias_offset];
+                    }
+                }
+                if (!do_parallel_reduction()) {
+                    for (int kh = 0; kh < jcp.kh; ++kh) {
+                        for (int kw = 0; kw < jcp.kw; ++kw) {
+
+                            size_t wei_offset = (g * jcp.kh + kh) * jcp.kw + kw;
+                            PRAGMA_OMP_SIMD()
+                            for (int g_block = 0; g_block < ch_block; ++g_block) {
+                                diff_weights[wei_offset * ch_block + g_block]
+                                        += ws_reduction_[mb_accum_offset
+                                                + wei_offset * ch_block
+                                                + g_block];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+template _jit_uni_dw_convolution_bwd_weights_t<avx512_common>::
+        _jit_uni_dw_convolution_bwd_weights_t(const pd_t *pd,
+                const input_vector &inputs, const output_vector &outputs);
+template _jit_uni_dw_convolution_bwd_weights_t<avx2>::
+        _jit_uni_dw_convolution_bwd_weights_t(const pd_t *pd,
+                const input_vector &inputs, const output_vector &outputs);
+template _jit_uni_dw_convolution_bwd_weights_t<sse42>::
+        _jit_uni_dw_convolution_bwd_weights_t(const pd_t *pd,
+                const input_vector &inputs, const output_vector &outputs);
+
+template void _jit_uni_dw_convolution_bwd_weights_t<avx512_common>::
+        execute_backward_weights();
+template void _jit_uni_dw_convolution_bwd_weights_t<avx2>::
+        execute_backward_weights();
+template void _jit_uni_dw_convolution_bwd_weights_t<sse42>::
+        execute_backward_weights();
+
 }
 }
 }
index e9e45c0..b723c1c 100644 (file)
@@ -22,6 +22,8 @@
 #include "cpu_engine.hpp"
 #include "jit_primitive_conf.hpp"
 #include "jit_uni_dw_conv_kernel_f32.hpp"
+#include "cpu_reducer.hpp"
+#include "cpu_barrier.hpp"
 
 namespace mkldnn {
 namespace impl {
@@ -54,7 +56,7 @@ struct _jit_uni_dw_convolution_fwd_t: public cpu_primitive_t {
                         this->cdesc_().src_desc.data_type,
                         this->cdesc_().weights_desc.data_type,
                         this->cdesc_().dst_desc.data_type)
-                && utils::implication(this->with_bias(),
+                && IMPLICATION(this->with_bias(),
                         data_type::f32 == this->cdesc_().bias_desc.data_type);
 
             if (!ok) return status::unimplemented;
@@ -220,6 +222,109 @@ using jit_avx2_dw_convolution_bwd_data_t =
 using jit_sse42_dw_convolution_bwd_data_t =
     _jit_uni_dw_convolution_bwd_data_t<sse42>;
 
+template <cpu_isa_t isa>
+struct _jit_uni_dw_convolution_bwd_weights_t: public cpu_primitive_t {
+    struct pd_t: public cpu_convolution_bwd_weights_pd_t {
+        pd_t(engine_t *engine,
+                const convolution_desc_t *adesc,
+                const primitive_attr_t *attr,
+                const convolution_fwd_pd_t *hint_fwd_pd)
+            : cpu_convolution_bwd_weights_pd_t(engine, adesc, attr, hint_fwd_pd)
+            , jcp_() {}
+
+        DECLARE_COMMON_PD_T(
+                JIT_IMPL_NAME_HELPER("jit_dw:", isa, ""),
+                _jit_uni_dw_convolution_bwd_weights_t<isa>);
+
+        virtual status_t init() override {
+            using namespace prop_kind;
+
+            assert(this->engine()->kind() == engine_kind::cpu);
+            bool ok = true
+                && this->set_default_params() == status::success
+                && this->desc()->prop_kind == prop_kind::backward_weights
+                && this->desc()->alg_kind == alg_kind::convolution_direct
+                && utils::everyone_is(data_type::f32,
+                        this->desc()->src_desc.data_type,
+                        this->desc()->diff_weights_desc.data_type,
+                        this->desc()->diff_dst_desc.data_type);
+
+            if (!ok) return status::unimplemented;
+
+            return jit_uni_dw_conv_bwd_weights_kernel_f32<isa>::init_conf(jcp_,
+                        *this->desc(), *this->src_pd_.desc(),
+                        *this->diff_weights_pd_.desc(), *this->diff_dst_pd_.desc());
+        }
+
+        jit_conv_conf_t jcp_;
+
+    protected:
+        virtual status_t set_default_params() override {
+
+            using namespace memory_format;
+            auto desired_act_fmt = isa == avx512_common ? nChw16c : nChw8c;
+            auto desired_wei_fmt = isa == avx512_common ? Goihw16g : Goihw8g;
+
+            if (this->src_pd_.desc()->format == any)
+                CHECK(this->src_pd_.set_format(desired_act_fmt));
+            if (this->diff_dst_pd_.desc()->format == any)
+                CHECK(this->diff_dst_pd_.set_format(desired_act_fmt));
+            if (this->diff_weights_pd_.desc()->format == any)
+                CHECK(this->diff_weights_pd_.set_format(desired_wei_fmt));
+            if (this->diff_bias_pd_.desc()->format == any)
+                CHECK(this->diff_bias_pd_.set_format(x));
+
+            return status::success;
+        }
+    };
+
+    _jit_uni_dw_convolution_bwd_weights_t(const pd_t *pd,
+            const input_vector &inputs, const output_vector &outputs);
+    ~_jit_uni_dw_convolution_bwd_weights_t() {
+        delete kernel_;
+        if (acc_ker_)
+            delete acc_ker_;
+
+        free(ws_reduction_);
+        free(bias_reduction_);
+    }
+
+    typedef typename prec_traits<data_type::f32>::type data_t;
+
+    virtual void execute(event_t *e) {
+        execute_backward_weights();
+        e->set_state(event_t::ready);
+    }
+
+private:
+    void execute_backward_weights();
+
+    pd_t conf_;
+    jit_uni_dw_conv_bwd_weights_kernel_f32<isa> *kernel_;
+
+    data_t *ws_reduction_ = nullptr;
+    data_t *bias_reduction_ = nullptr;
+
+    /* Used when executing a parallel reduction */
+    cpu_accumulator_1d_t<data_type::f32> *acc_ker_ = nullptr;
+    simple_barrier::ctx_t reduction_bctx_;
+
+    /* For parallel implementation details see the '.cpp' file in the
+     * backward-by-weights section. */
+    int nthr_, nthr_g_, nthr_mb_;
+
+    inline bool do_parallel_reduction() {
+        return false;
+    }
+};
+
+using jit_avx512_common_dw_convolution_bwd_weights_t =
+    _jit_uni_dw_convolution_bwd_weights_t<avx512_common>;
+using jit_avx2_dw_convolution_bwd_weights_t =
+    _jit_uni_dw_convolution_bwd_weights_t<avx2>;
+using jit_sse42_dw_convolution_bwd_weights_t =
+    _jit_uni_dw_convolution_bwd_weights_t<sse42>;
+
 }
 }
 }
index 21fe6d0..2896b1b 100644 (file)
@@ -873,15 +873,20 @@ struct jit_uni_kernel_fwd_f32: public jit_uni_eltwise_kernel_f32,
 
         preamble();
 
+        Label vectorized_loop_start;
+        Label reminder_loop_start;
+        Label vectorized_loop_end;
+        Label reminder_loop_end;
+
         Reg64 param = abi_param1;
         mov(reg_from, ptr[param + GET_OFF(from)]);
         mov(reg_to, ptr[param + GET_OFF(to)]);
         mov(reg_work_amount, ptr[param + GET_OFF(work_amount)]);
 
         cmp(reg_work_amount, simd_w);
-        jl("reminder_loop_start", T_NEAR);
+        jl(reminder_loop_start, T_NEAR);
 
-        L("vectorized_loop_start");
+        L(vectorized_loop_start);
 
         uni_vmovups(vmm_src, ptr[reg_from]);
         eltwise_injector->compute_vector(vmm_src.getIdx());
@@ -892,14 +897,14 @@ struct jit_uni_kernel_fwd_f32: public jit_uni_eltwise_kernel_f32,
 
         sub(reg_work_amount, simd_w);
         cmp(reg_work_amount, simd_w);
-        jge("vectorized_loop_start", T_NEAR);
+        jge(vectorized_loop_start, T_NEAR);
 
-        L("vectorized_loop_end");
+        L(vectorized_loop_end);
 
-        L("reminder_loop_start");
+        L(reminder_loop_start);
 
         cmp(reg_work_amount, 0);
-        jle("reminder_loop_end", T_NEAR);
+        jle(reminder_loop_end, T_NEAR);
 
         movss(xmm_src, ptr[reg_from]);
         eltwise_injector->compute_vector(xmm_src.getIdx());
@@ -909,9 +914,9 @@ struct jit_uni_kernel_fwd_f32: public jit_uni_eltwise_kernel_f32,
         add(reg_to, sizeof(float));
 
         dec(reg_work_amount);
-        jmp("reminder_loop_start", T_NEAR);
+        jmp(reminder_loop_start, T_NEAR);
 
-        L("reminder_loop_end");
+        L(reminder_loop_end);
 
         postamble();
 
@@ -954,9 +959,9 @@ status_t jit_uni_eltwise_fwd_t<isa>::pd_t::init() {
                 prop_kind::forward_inference)
         && utils::everyone_is(data_type::f32, desc()->data_desc.data_type)
         && !has_zero_dim_memory()
-        && utils::implication(isa > avx2, utils::one_of(desc()->alg_kind,
+        && IMPLICATION(isa > avx2, utils::one_of(desc()->alg_kind,
                 eltwise_relu, eltwise_elu))
-        && utils::implication(isa == sse42 || isa == avx2, utils::one_of(
+        && IMPLICATION(isa == sse42 || isa == avx2, utils::one_of(
                     desc()->alg_kind, eltwise_relu, eltwise_tanh, eltwise_elu,
                     eltwise_square, eltwise_abs, eltwise_sqrt, eltwise_linear,
                     eltwise_bounded_relu, eltwise_soft_relu, eltwise_logistic))
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_i8i8_pooling.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_i8i8_pooling.cpp
new file mode 100644 (file)
index 0000000..ccc1c34
--- /dev/null
@@ -0,0 +1,634 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <math.h>
+
+#include "mkldnn_types.h"
+
+#include "mkldnn_thread.hpp"
+#include "utils.hpp"
+
+#include "jit_generator.hpp"
+
+#include "jit_uni_i8i8_pooling.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using namespace Xbyak;
+
+using namespace mkldnn::impl::utils;
+using namespace mkldnn::impl::memory_format;
+using namespace mkldnn::impl::types;
+using namespace alg_kind;
+
+struct call_params_t {
+    const char *src_i8;
+    const char *dst_i8;
+    size_t kw_range;
+    size_t kh_range;
+    float idivider;
+};
+
+template <cpu_isa_t isa>
+struct jit_uni_i8i8_pool_fwd_ker_t : public jit_generator {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_i8i8_pool_fwd_ker_t)
+
+    Reg64 reg_ptr_src_i8 = r8;
+    Reg64 reg_ptr_dst_i8 = r9;
+
+    Reg64 ki = r10;
+    Reg64 kj = r11;
+    Reg64 reg_kw = r12;
+    Reg64 reg_kh = r13;
+    Reg64 c_iter = r14;
+
+    Reg64 aux_reg_src_h = rax;
+    Reg64 aux_reg_src_w = rbx;
+
+    Reg64 reg_tmp = rdx;
+    Reg64 reg_src_64 = r15;
+    Reg32 reg_src_32 = r15d;
+    Reg8 reg_src_8 = r15b;
+
+    size_t sizeof_src_dt() const { return data_type_size(jpp.src_dt); }
+    size_t sizeof_dst_dt() const { return data_type_size(jpp.dst_dt); }
+
+    using Vmm = typename utils::conditional3<isa == sse42, Xbyak::Xmm,
+            isa == avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
+
+    Xmm xmm_tmp = Xmm(0);
+    Vmm vreg_tmp = Vmm(14);
+    Vmm vreg_zeros = Vmm(15);
+
+    /* max pooling */
+    Vmm vmm_src(int jj, int ii) {
+        return Vmm(2*jj + ii);
+    }
+
+    Xmm xmm_src(int jj) {
+        return Xmm(2*jj);
+    }
+
+    Vmm vmm_dst(int jj, int ii) {
+        return Vmm(2*jj + ii + 2 * jpp.ur_c);
+    }
+
+    Xmm xmm_dst(int jj) {
+        return Xmm(2*jj + 2 * jpp.ur_c);
+    }
+
+    /* avg pooling */
+    Vmm vmm_src_s32(int jj, int ii) {
+        return Vmm(2*jj + ii);
+    }
+
+    Xmm xmm_src_s32(int jj, int ii) {
+        return Xmm(2*jj + ii);
+    }
+
+    Vmm vmm_dst_s32(int jj, int ii) {
+        return Vmm(2*jj + ii + 2 * jpp.ur_c);
+    }
+
+    Ymm ymm_dst_s32(int jj, int ii) {
+        return Ymm(2*jj + ii + 2 * jpp.ur_c);
+    }
+
+    Xmm xmm_dst_s32(int jj, int ii) {
+        return Xmm(2*jj + ii + 2 * jpp.ur_c);
+    }
+
+    Vmm vmm_dst_f32(int jj, int ii) {
+        return Vmm(2*jj + ii + 4 * jpp.ur_c);
+    }
+
+    void (*ker_)(const call_params_t *);
+    jit_pool_conf_t jpp;
+
+    void init_tmp_reg();
+
+    void load_src(int jj, int c_step);
+    void store_dst(int jj, int c_step);
+
+    void compute_avg_step(int ur_c, int c_step);
+    void compute_max_step(int ur_c, int c_step);
+    void compute_step(int ur_c, int c_step);
+
+    void compute_c_block();
+    void generate();
+
+    static status_t init_conf(jit_pool_conf_t &jpp,
+        const pooling_desc_t &pd, const memory_desc_wrapper &src_d,
+        const memory_desc_wrapper &dst_d);
+
+    jit_uni_i8i8_pool_fwd_ker_t(const jit_pool_conf_t &jpp_)
+           : jpp(jpp_) {
+        generate();
+        ker_ = reinterpret_cast<decltype(ker_)>(const_cast<uint8_t*>(
+                       getCode()));
+    }
+};
+
+template <cpu_isa_t isa>
+void jit_uni_i8i8_pool_fwd_ker_t<isa>::load_src(int jj, int c_step) {
+    using namespace data_type;
+
+    int repeats = isa == sse42 && c_step != 1 ? 2 : 1;
+    switch (jpp.alg) {
+        case pooling_max: {
+            auto offset = jj*c_step*sizeof_src_dt();
+            if (c_step == jpp.c_block) {
+                for (int ii = 0; ii < repeats; ii++)
+                    uni_vmovups(vmm_src(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]);
+            } else if (c_step == 1) {
+                if (jpp.src_dt == s32) {
+                    movsd(xmm_src(jj), ptr[aux_reg_src_w + offset]);
+                } else {
+                    mov(reg_src_8, ptr[aux_reg_src_w + offset]);
+                    movq(xmm_src(jj), reg_src_64);
+                }
+            }
+            break;
+        }
+        case pooling_avg_include_padding:
+        case pooling_avg_exclude_padding: {
+            auto offset = jj*c_step*sizeof_src_dt();
+            switch (jpp.src_dt) {
+                case s32:
+                    if (c_step == jpp.c_block) {
+                        for (int ii = 0; ii < repeats; ii++)
+                            uni_vmovups(vmm_src_s32(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]);
+                    } else if (c_step == 1) {
+                        movsd(xmm_src_s32(jj, 0), ptr[aux_reg_src_w + offset]);
+                    }
+                    break;
+                case s8:
+                    if (c_step == jpp.c_block) {
+                        for (int ii = 0; ii < repeats; ii++) {
+                            if (isa == sse42)
+                                movd(xmm_src_s32(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]);
+                            else
+                                movq(xmm_src_s32(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]);
+
+                            uni_vpmovsxbd(vmm_src_s32(jj, ii), xmm_src_s32(jj, ii));
+                        }
+                    } else if (c_step == 1) {
+                        movsx(reg_src_32, ptr[aux_reg_src_w + offset]);
+                        movq(xmm_src_s32(jj, 0), reg_src_64);
+                    }
+                    break;
+                case u8:
+                    if (c_step == jpp.c_block) {
+                        for (int ii = 0; ii < repeats; ii++) {
+                            if (isa == sse42)
+                                movd(xmm_src_s32(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]);
+                            else
+                                movq(xmm_src_s32(jj, ii), ptr[aux_reg_src_w + offset + (jpp.c_block / 2) * ii * sizeof_src_dt()]);
+
+                            uni_vpmovzxbd(vmm_src_s32(jj, ii), xmm_src_s32(jj, ii));
+                        }
+                    } else if (c_step == 1) {
+                        movzx(reg_src_32, ptr[aux_reg_src_w + offset]);
+                        movq(xmm_src_s32(jj, 0), reg_src_64);
+                    }
+                    break;
+                default: assert(!"unsupported src data type");
+            }
+            break;
+        }
+        default: assert(!"unsupported algorithm");
+    }
+}
+
+template <cpu_isa_t isa>
+void jit_uni_i8i8_pool_fwd_ker_t<isa>::store_dst(int jj, int c_step) {
+    using namespace data_type;
+
+    int repeats = isa == sse42 && c_step != 1 ? 2 : 1;
+    switch(jpp.alg) {
+        case pooling_max: {
+            auto offset = jj*c_step*sizeof_dst_dt();
+            if (c_step == jpp.c_block) {
+                for (int ii = 0; ii < repeats; ii++)
+                    uni_vmovups(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], vmm_dst(jj, ii));
+            } else if (c_step == 1) {
+                if (jpp.src_dt == s32) {
+                    movq(reg_src_64, xmm_dst(jj));
+                    mov(ptr[reg_ptr_dst_i8 + offset], reg_src_32);
+                } else {
+                    movq(reg_src_64, xmm_dst(jj));
+                    mov(ptr[reg_ptr_dst_i8 + offset], reg_src_8);
+                }
+            }
+            break;
+        }
+        case pooling_avg_include_padding:
+        case pooling_avg_exclude_padding: {
+            auto offset = jj*c_step*sizeof_dst_dt();
+            switch (jpp.dst_dt) {
+                case s32:
+                    if (c_step == jpp.c_block) {
+                        for (int ii = 0; ii < repeats; ii++)
+                            uni_vmovups(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], vmm_dst_s32(jj, ii));
+                    } else if (c_step == 1) {
+                        movq(reg_src_64, xmm_dst_s32(jj, 0));
+                        mov(ptr[reg_ptr_dst_i8 + offset], reg_src_32);
+                    }
+                    break;
+                case s8:
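+                    /* Saturating narrowing chain: s32 -> s16 (packssdw) ->
+                     * s8 (packsswb). On avx2 the packs interleave 128-bit
+                     * lanes, so vpermq with imm 0x08 gathers the valid
+                     * quadwords into the low half before the store. */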
+                    if (c_step == jpp.c_block) {
+                        for (int ii = 0; ii < repeats; ii++) {
+                            uni_vpackssdw(vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii));
+
+                            if (isa != sse42)
+                                vpermq(ymm_dst_s32(jj, ii), ymm_dst_s32(jj, ii), 0x08);
+
+                            uni_vpacksswb(xmm_dst_s32(jj, ii), xmm_dst_s32(jj, ii), xmm_dst_s32(jj, ii));
+
+                            if (isa != sse42)
+                                movq(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], xmm_dst_s32(jj, ii));
+                            else
+                                movd(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], xmm_dst_s32(jj, ii));
+                        }
+                    } else if (c_step == 1) {
+                        vpackssdw(vmm_dst_s32(jj, 0), vmm_dst_s32(jj, 0), vmm_dst_s32(jj, 0));
+                        vpacksswb(xmm_dst_s32(jj, 0), xmm_dst_s32(jj, 0), xmm_dst_s32(jj, 0));
+                        movq(reg_src_64, xmm_dst_s32(jj, 0));
+                        mov(ptr[reg_ptr_dst_i8 + offset], reg_src_8);
+                    }
+                    break;
+                case u8:
+                    if (c_step == jpp.c_block) {
+                        for (int ii = 0; ii < repeats; ii++) {
+                            uni_vpackusdw(vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii));
+
+                            if (isa != sse42)
+                                vpermq(ymm_dst_s32(jj, ii), ymm_dst_s32(jj, ii), 0x08);
+
+                            uni_vpackuswb(xmm_dst_s32(jj, ii), xmm_dst_s32(jj, ii), xmm_dst_s32(jj, ii));
+
+                            if (isa != sse42)
+                                movq(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], xmm_dst_s32(jj, ii));
+                            else
+                                movd(ptr[reg_ptr_dst_i8 + offset + (jpp.c_block / 2) * ii * sizeof_dst_dt()], xmm_dst_s32(jj, ii));
+                        }
+                    } else if (c_step == 1) {
+                        vpackusdw(vmm_dst_s32(jj, 0), vmm_dst_s32(jj, 0), vmm_dst_s32(jj, 0));
+                        vpackuswb(xmm_dst_s32(jj, 0), xmm_dst_s32(jj, 0), xmm_dst_s32(jj, 0));
+                        movq(reg_src_64, xmm_dst_s32(jj, 0));
+                        mov(ptr[reg_ptr_dst_i8 + offset], reg_src_8);
+                    }
+                    break;
+                default: assert(!"unsuppotred dst data_type");
+            }
+            break;
+        }
+        default: assert(!"unsupported pooling algorithm");
+    }
+}
+
+template <cpu_isa_t isa>
+void jit_uni_i8i8_pool_fwd_ker_t<isa>::compute_max_step(int ur_c, int c_step)
+{
+    Label l_kw, l_kh;
+
+    int iw = jpp.iw;
+    int c = jpp.c;
+
+    int repeats = isa == sse42 && c_step != 1 ? 2 : 1;
+
+    for (int jj = 0; jj < ur_c; jj++) {
+        for (int ii = 0; ii < repeats; ii++) {
+            uni_vmovups(vmm_dst(jj, ii), vreg_tmp);
+        }
+    }
+
+    mov(aux_reg_src_h, reg_ptr_src_i8);
+
+    xor_(kj, kj);
+    L(l_kh);
+    {
+        mov(aux_reg_src_w, aux_reg_src_h);
+        xor_(ki, ki);
+        L(l_kw);
+        {
+            for (int jj = 0; jj < ur_c; jj++) {
+                load_src(jj, c_step);
+
+                for (int ii = 0; ii < repeats; ii++) {
+                    if (jpp.src_dt == data_type::s32) {
+                        uni_vpmaxsd(vmm_dst(jj, ii), vmm_dst(jj, ii), vmm_src(jj, ii));
+                    } else {
+                        if (jpp.src_dt == data_type::s8)
+                            uni_vpmaxsb(vmm_dst(jj, ii), vmm_dst(jj, ii), vmm_src(jj, ii));
+                        else
+                            uni_vpmaxub(vmm_dst(jj, ii), vmm_dst(jj, ii), vmm_src(jj, ii));
+                    }
+                }
+            }
+            add(aux_reg_src_w, c * sizeof_src_dt());
+            inc(ki);
+            cmp(ki, reg_kw);
+            jl(l_kw, T_NEAR);
+        }
+        add(aux_reg_src_h, iw * c * sizeof_src_dt());
+        inc(kj);
+        cmp(kj, reg_kh);
+        jl(l_kh, T_NEAR);
+    }
+
+    for (int jj = 0; jj < ur_c; jj++)
+        store_dst(jj, c_step);
+}
+
+template <cpu_isa_t isa>
+void jit_uni_i8i8_pool_fwd_ker_t<isa>::compute_avg_step(int ur_c, int c_step)
+{
+    using namespace data_type;
+
+    Label l_kw, l_kh;
+
+    int iw = jpp.iw;
+    int c = jpp.c;
+
+    int repeats = isa == sse42 && c_step != 1 ? 2 : 1;
+
+    for (int jj = 0; jj < ur_c; jj++) {
+        for (int ii = 0; ii < repeats; ii++) {
+            uni_vpxor(vmm_src_s32(jj, ii), vmm_src_s32(jj, ii), vmm_src_s32(jj, ii));
+            uni_vpxor(vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii));
+        }
+    }
+
+    mov(aux_reg_src_h, reg_ptr_src_i8);
+
+    xor_(kj, kj);
+    L(l_kh);
+    {
+        mov(aux_reg_src_w, aux_reg_src_h);
+        xor_(ki, ki);
+        L(l_kw);
+        {
+            for (int jj = 0; jj < ur_c; jj++) {
+                load_src(jj, c_step);
+
+                for (int ii = 0; ii < repeats; ii++) {
+                    uni_vpaddd(vmm_dst_s32(jj, ii), vmm_dst_s32(jj, ii), vmm_src_s32(jj, ii));
+                }
+            }
+            add(aux_reg_src_w, c * sizeof_src_dt());
+            inc(ki);
+            cmp(ki, reg_kw);
+            jl(l_kw, T_NEAR);
+        }
+        add(aux_reg_src_h, iw * c * sizeof_src_dt());
+        inc(kj);
+        cmp(kj, reg_kh);
+        jl(l_kh, T_NEAR);
+    }
+
+    for (int jj = 0; jj < ur_c; jj++) {
+        for (int ii = 0; ii < repeats; ii++) {
+            uni_vcvtdq2ps(vmm_dst_f32(jj, ii), vmm_dst_s32(jj, ii));
+
+            if (isa == sse42)
+                mulps(vmm_dst_f32(jj, ii), vreg_tmp);
+            else
+                vfmadd132ps(vmm_dst_f32(jj, ii), vreg_zeros, vreg_tmp);
+
+            uni_vcvtps2dq(vmm_dst_s32(jj, ii), vmm_dst_f32(jj, ii));
+        }
+
+        store_dst(jj, c_step);
+    }
+}
+
+template <cpu_isa_t isa>
+void jit_uni_i8i8_pool_fwd_ker_t<isa>::compute_step(int ur_c, int c_step) {
+    switch (jpp.alg) {
+        case pooling_max:
+            compute_max_step(ur_c, c_step); break;
+        case pooling_avg_include_padding:
+        case pooling_avg_exclude_padding:
+            compute_avg_step(ur_c, c_step); break;
+        default: assert(!"unsupported pooling algorithm");
+    }
+}
+
+template <cpu_isa_t isa>
+void jit_uni_i8i8_pool_fwd_ker_t<isa>::compute_c_block() {
+    Label l_main_loop;
+    Label l_tail_loop;
+    Label exit;
+
+    int ur_c = jpp.ur_c;
+
+    xor_(c_iter, c_iter);
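+    /* A sketch of the traversal: the main loop strides over whole channel
+     * blocks (ur_c * c_block at a time); the tail loop below then covers
+     * any leftover channels one at a time with c_step == 1. */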
+
+    L(l_main_loop);
+    {
+        cmp(c_iter, jpp.c - ur_c * jpp.c_block);
+        jg(l_tail_loop, T_NEAR);
+
+        compute_step(ur_c, jpp.c_block);
+
+        add(reg_ptr_src_i8, ur_c * jpp.c_block * sizeof_src_dt());
+        add(reg_ptr_dst_i8, ur_c * jpp.c_block * sizeof_dst_dt());
+        add(c_iter, ur_c * jpp.c_block);
+        jmp(l_main_loop);
+    }
+
+    L(l_tail_loop);
+    {
+        cmp(c_iter, jpp.c - ur_c);
+        jg(exit, T_NEAR);
+
+        compute_step(ur_c, 1);
+
+        add(reg_ptr_src_i8, ur_c * sizeof_src_dt());
+        add(reg_ptr_dst_i8, ur_c * sizeof_dst_dt());
+        add(c_iter, ur_c);
+        jmp(l_tail_loop);
+    }
+
+    L(exit);
+}
+
+template <cpu_isa_t isa>
+void jit_uni_i8i8_pool_fwd_ker_t<isa>::init_tmp_reg() {
+    using namespace data_type;
+
+    switch (jpp.alg) {
+        case pooling_avg_include_padding:
+        case pooling_avg_exclude_padding:
+            mov(reg_tmp, ptr[abi_param1 + offsetof(call_params_t, idivider)]);
+            movq(xmm_tmp, reg_tmp);
+            uni_vpbroadcastd(vreg_tmp, xmm_tmp);
+            break;
+        case pooling_max:
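+            /* Seed the running maximum with the lowest representable value
+             * of the source type, broadcast across the whole vector. */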
+            switch (jpp.src_dt) {
+                case s32:
+                    mov(reg_tmp, nstl::numeric_limits<int32_t>::lowest());
+                    break;
+                case s8:
+                    mov(reg_tmp, nstl::numeric_limits<int8_t>::lowest());
+                    break;
+                case u8:
+                    mov(reg_tmp, nstl::numeric_limits<uint8_t>::lowest());
+                    break;
+                default: assert(!"unsupported src data_type");
+            }
+
+            movq(xmm_tmp, reg_tmp);
+            if (jpp.src_dt == s32) {
+                uni_vpbroadcastd(vreg_tmp, xmm_tmp);
+            } else {
+                if (isa == avx2) {
+                    vpbroadcastb(vreg_tmp, xmm_tmp);
+                } else {
+                    movups(vreg_tmp, xmm_tmp);
+                    uni_vpxor(xmm_tmp, xmm_tmp, xmm_tmp);
+                    pshufb(vreg_tmp, xmm_tmp);
+                }
+            }
+            break;
+        default: assert(!"unsupported pooling algorithm");
+    }
+}
+
+template <cpu_isa_t isa>
+void jit_uni_i8i8_pool_fwd_ker_t<isa>::generate() {
+    preamble();
+
+#   define READ_PARAM(reg, field) \
+        mov(reg, ptr[abi_param1 + offsetof(call_params_t, field)])
+    READ_PARAM(reg_ptr_src_i8, src_i8);
+    READ_PARAM(reg_ptr_dst_i8, dst_i8);
+    READ_PARAM(reg_kw, kw_range);
+    READ_PARAM(reg_kh, kh_range);
+
+#   undef READ_PARAM
+
+    init_tmp_reg();
+
+    uni_vpxor(vreg_zeros, vreg_zeros, vreg_zeros);
+
+    compute_c_block();
+
+    postamble();
+}
+
+template <cpu_isa_t isa>
+status_t jit_uni_i8i8_pool_fwd_ker_t<isa>::init_conf(jit_pool_conf_t &jpp,
+        const pooling_desc_t &pd, const memory_desc_wrapper &src_d,
+        const memory_desc_wrapper &dst_d) {
+    if (!mayiuse(isa)) {
+        return status::unimplemented;
+    }
+
+    jpp.mb = src_d.dims()[0];
+    jpp.c = src_d.dims()[1];
+    jpp.ih = src_d.dims()[2];
+    jpp.iw = src_d.dims()[3];
+    jpp.oh = dst_d.dims()[2];
+    jpp.ow = dst_d.dims()[3];
+
+    jpp.stride_h = pd.strides[0];
+    jpp.stride_w = pd.strides[1];
+    jpp.kh = pd.kernel[0];
+    jpp.kw = pd.kernel[1];
+
+    jpp.t_pad = pd.padding[0][0];
+    jpp.l_pad = pd.padding[0][1];
+
+    jpp.alg = pd.alg_kind;
+
+    jpp.src_dt = pd.src_desc.data_type;
+    jpp.dst_dt = pd.dst_desc.data_type;
+
+    jpp.c_block = jpp.alg == pooling_max
+            ? 32 / (jpp.src_dt == data_type::s32 ? 4 : 1) : 8;
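+    /* i.e. max pooling treats a 32-byte vector as the channel block: 8
+     * lanes of s32 or 32 lanes of s8/u8; the avg path accumulates in s32
+     * and is fixed at 8 channels per block. */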
+    jpp.c_tail = jpp.c % jpp.c_block;
+    jpp.nb_c = jpp.c / jpp.c_block;
+    jpp.ur_c = 1;
+    jpp.ur_c_tail = jpp.nb_c - (jpp.nb_c / jpp.ur_c) * jpp.ur_c
+            + (jpp.c_tail != 0);
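+    /* With ur_c == 1 the first two terms cancel, so ur_c_tail reduces to 1
+     * when jpp.c is not a multiple of c_block and to 0 otherwise. */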
+
+    return status::success;
+}
+
+template <cpu_isa_t isa>
+status_t jit_uni_i8i8_pooling_fwd_t<isa>::pd_t::jit_conf() {
+    return jit_uni_i8i8_pool_fwd_ker_t<isa>::init_conf(jpp_,
+       desc_, src_pd_.desc(), dst_pd_.desc());
+}
+
+template <cpu_isa_t isa>
+jit_uni_i8i8_pooling_fwd_t<isa>::jit_uni_i8i8_pooling_fwd_t(const pd_t *pd,
+          const input_vector &inputs, const output_vector &outputs)
+    : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), ker_(nullptr) {
+    ker_ = new jit_uni_i8i8_pool_fwd_ker_t<isa>(conf_.jpp_);
+}
+
+template <cpu_isa_t isa>
+jit_uni_i8i8_pooling_fwd_t<isa>::~jit_uni_i8i8_pooling_fwd_t() {
+    delete ker_;
+}
+
+template <cpu_isa_t isa>
+void jit_uni_i8i8_pooling_fwd_t<isa>::execute_forward() {
+    auto src_i8 = reinterpret_cast<const char *>(input_memory(0));
+    auto dst_i8 = reinterpret_cast<char *>(memory());
+
+    const memory_desc_wrapper src_d(conf_.src_pd());
+    const memory_desc_wrapper dst_d(conf_.dst_pd());
+
+    const auto &jpp = conf_.jpp_;
+
+    parallel_nd(jpp.mb, jpp.oh, jpp.ow,
+        [&](int n, int oh, int ow) {
+        const int ih = nstl::max(oh * jpp.stride_h - jpp.t_pad, 0);
+        const int iw = nstl::max(ow * jpp.stride_w - jpp.l_pad, 0);
+
+        const int kh_start = nstl::max(0, jpp.t_pad - oh * jpp.stride_h);
+        const int kh_end = nstl::min(jpp.kh,
+                                     jpp.ih + jpp.t_pad - oh * jpp.stride_h);
+        const int kw_start = nstl::max(0, jpp.l_pad - ow * jpp.stride_w);
+        const int kw_end = nstl::min(jpp.kw,
+                                     jpp.iw + jpp.l_pad - ow * jpp.stride_w);
+
+        auto p = call_params_t();
+        p.src_i8 = &src_i8[
+                src_d.blk_off(n, 0, ih, iw) * src_d.data_type_size()];
+        p.dst_i8 = &dst_i8[
+                dst_d.blk_off(n, 0, oh, ow) * dst_d.data_type_size()];
+        p.kw_range = (size_t) (kw_end - kw_start);
+        p.kh_range = (size_t) (kh_end - kh_start);
+        p.idivider = 1.0f / ((jpp.alg == pooling_avg_exclude_padding) ?
+                             p.kh_range * p.kw_range : jpp.kw * jpp.kh);
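+        /* Illustration (hypothetical shapes): for a 3x3 kernel with 1-pixel
+         * padding, the top-left output sees a 2x2 in-bounds window, so
+         * exclude_padding divides by 4 while include_padding always divides
+         * by kh * kw = 9. */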
+
+        ker_->ker_(&p);
+    });
+}
+
+template struct jit_uni_i8i8_pooling_fwd_t<avx2>;
+template struct jit_uni_i8i8_pooling_fwd_t<sse42>;
+
+}
+}
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_i8i8_pooling.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_i8i8_pooling.hpp
new file mode 100644 (file)
index 0000000..2e274ed
--- /dev/null
@@ -0,0 +1,98 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_JIT_UNI_I8I8_POOLING_HPP
+#define CPU_JIT_UNI_I8I8_POOLING_HPP
+
+#include "c_types_map.hpp"
+#include "cpu_pooling_pd.hpp"
+#include "cpu_engine.hpp"
+#include "jit_generator.hpp"
+#include "jit_primitive_conf.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+template <cpu_isa_t isa>
+struct jit_uni_i8i8_pool_fwd_ker_t;
+
+template <cpu_isa_t isa>
+struct jit_uni_i8i8_pooling_fwd_t : public cpu_primitive_t {
+    struct pd_t : public cpu_pooling_fwd_pd_t {
+        pd_t(engine_t *engine, const pooling_desc_t  *adesc,
+                const primitive_attr_t *attr,
+                const pooling_fwd_pd_t  *hint_fwd_pd)
+        : cpu_pooling_fwd_pd_t(engine, adesc, attr, hint_fwd_pd) {}
+
+        DECLARE_COMMON_PD_T(
+                JIT_IMPL_NAME_HELPER("jit:", isa, ""),
+                jit_uni_i8i8_pooling_fwd_t);
+
+        virtual status_t init() override {
+            assert(this->engine()->kind() == engine_kind::cpu);
+            bool ok = true
+                && desc()->src_desc.ndims == 4
+                && set_default_params() == status::success
+                && desc()->prop_kind == prop_kind::forward_inference
+                && utils::one_of(desc()->alg_kind, alg_kind::pooling_max,
+                        alg_kind::pooling_avg_include_padding,
+                        alg_kind::pooling_avg_exclude_padding)
+                && utils::one_of(src_pd()->desc()->data_type, data_type::s32,
+                        data_type::s8, data_type::u8)
+                && src_pd()->desc()->data_type == dst_pd()->desc()->data_type
+                && utils::everyone_is(memory_format::nhwc,
+                        src_pd()->desc()->format, dst_pd()->desc()->format)
+                && attr()->has_default_values();
+            if (!ok) return status::unimplemented;
+
+            return jit_conf();
+        }
+
+        jit_pool_conf_t jpp_;
+
+    protected:
+        status_t jit_conf();
+
+        virtual status_t set_default_params() override {
+            using namespace memory_format;
+            if (dst_pd_.desc()->format == any)
+                CHECK(dst_pd_.set_format(nhwc));
+            return status::success;
+        }
+    };
+
+    jit_uni_i8i8_pooling_fwd_t(const pd_t *pd,
+            const input_vector &inputs, const output_vector &outputs);
+    ~jit_uni_i8i8_pooling_fwd_t();
+
+    virtual void execute(event_t *e) {
+        execute_forward();
+        e->set_state(event_t::ready);
+    }
+
+private:
+    void execute_forward();
+    pd_t conf_;
+
+    jit_uni_i8i8_pool_fwd_ker_t<isa> *ker_;
+};
+
+}
+}
+}
+
+#endif
index ffce596..97b4835 100644 (file)
@@ -158,13 +158,6 @@ jit_uni_lrn_fwd_kernel_f32<isa>::jit_uni_lrn_fwd_kernel_f32(
     Vmm ydst = Vmm(isa == avx2 ? 11 : 11);
     Vmm ytmp = Vmm(isa == avx2 ? 12 : 12);
 
-    static const char *label[MAX_LOCAL_SIZE] = {
-        ".l00", ".l01", ".l02", ".l03", ".l04", ".l05", ".l06", ".l07",
-        ".l08", ".l09", ".l10", ".l11", ".l12", ".l13", ".l14", ".l15",
-        ".l16", ".l17", ".l18", ".l19", ".l20", ".l21", ".l22", ".l23",
-        ".l24", ".l25", ".l26", ".l27", ".l28", ".l29", ".l30", ".l31"
-    };
-
     this->preamble();
 
     mov(src, ptr[this->param1 + 0]);
@@ -189,11 +182,10 @@ jit_uni_lrn_fwd_kernel_f32<isa>::jit_uni_lrn_fwd_kernel_f32(
     }
 
     int s2 = (J.size - 1) / 2, S2 = J.size - s2 - 1;
-    const char **label_t = &label[0];
-    const char **label_b = &label[s2];
 
     for (int i = 0; i < s2; ++i)
     {
+        Label label_t;
         for (int j = 0; j < s2; ++j) {
             if (isa == avx2) {
                 within_body(-i, S2, -j, S2, J.W, ysum, ydst, ytmp, ysum2, pk);
@@ -203,7 +195,7 @@ jit_uni_lrn_fwd_kernel_f32<isa>::jit_uni_lrn_fwd_kernel_f32(
             }
         }
         mov(w, J.W - J.size + 1);
-        L(label_t[i]);
+        L(label_t);
         if (isa == avx2) {
             within_body(-i, S2, -s2, S2, J.W, ysum, ydst, ytmp, ysum2, pk);
         } else {
@@ -211,7 +203,7 @@ jit_uni_lrn_fwd_kernel_f32<isa>::jit_uni_lrn_fwd_kernel_f32(
         }
         dec(w);
         cmp(w, 0);
-        jne(label_t[i], T_NEAR);
+        jne(label_t, T_NEAR);
         for (int j = J.W - S2; j < J.W; ++j) {
             if (isa == avx2) {
                 within_body(-i, S2, -s2, J.W - 1 - j, J.W,
@@ -223,7 +215,8 @@ jit_uni_lrn_fwd_kernel_f32<isa>::jit_uni_lrn_fwd_kernel_f32(
     }
 
     mov(h, J.H - J.size + 1);
-    L(".lrn_loop_h");
+    Label lrn_loop_h;
+    L(lrn_loop_h);
     for (int j = 0; j < s2; ++j) {
         if (isa == avx2) {
             within_body(-s2, S2, -j, S2, J.W, ysum, ydst, ytmp, ysum2, pk);
@@ -232,7 +225,8 @@ jit_uni_lrn_fwd_kernel_f32<isa>::jit_uni_lrn_fwd_kernel_f32(
         }
     }
     mov(w, J.W - J.size + 1);
-    L(".lrn_loop_w");
+    Label lrn_loop_w;
+    L(lrn_loop_w);
     if (isa == avx2) {
         within_body(-s2, S2, -s2, S2, J.W, ysum, ydst, ytmp, ysum2, pk);
     } else {
@@ -240,7 +234,7 @@ jit_uni_lrn_fwd_kernel_f32<isa>::jit_uni_lrn_fwd_kernel_f32(
     }
     dec(w);
     cmp(w, 0);
-    jne(".lrn_loop_w", T_NEAR);
+    jne(lrn_loop_w, T_NEAR);
     for (int j = J.W - S2; j < J.W; ++j) {
         if (isa == avx2) {
             within_body(-s2, S2, -s2, J.W - 1 - j, J.W,
@@ -251,7 +245,7 @@ jit_uni_lrn_fwd_kernel_f32<isa>::jit_uni_lrn_fwd_kernel_f32(
     }
     dec(h);
     cmp(h, 0);
-    jne(".lrn_loop_h", T_NEAR);
+    jne(lrn_loop_h, T_NEAR);
 
     for (int i = J.H - S2; i < J.H; ++i)
     {
@@ -265,7 +259,8 @@ jit_uni_lrn_fwd_kernel_f32<isa>::jit_uni_lrn_fwd_kernel_f32(
         }
 
         mov(w, J.W - J.size + 1);
-        L(label_b[i - (J.H - S2)]);
+        Label label_b;
+        L(label_b);
         if (isa == avx2) {
             within_body(-s2, J.H - 1 - i, -s2, S2, J.W,
                 ysum, ydst, ytmp, ysum2, pk);
@@ -274,7 +269,7 @@ jit_uni_lrn_fwd_kernel_f32<isa>::jit_uni_lrn_fwd_kernel_f32(
         }
         dec(w);
         cmp(w, 0);
-        jne(label_b[i - (J.H - S2)], T_NEAR);
+        jne(label_b, T_NEAR);
 
         for (int j = J.W - S2; j < J.W; ++j) {
             if (isa == avx2) {
@@ -345,7 +340,9 @@ jit_uni_lrn_fwd_kernel_f32<avx2>::jit_uni_lrn_fwd_kernel_f32(
     }
 
     mov(hw, J.H*J.W);
-    L(".lrn_loop");
+
+    Label lrn_loop;
+    L(lrn_loop);
 
     if (J.version != -1) vmovups(xsrc_prev, ptr[src - J.H*J.W * 32 + 16]);
     vmovups(ysrc, ptr[src]);
@@ -382,7 +379,7 @@ jit_uni_lrn_fwd_kernel_f32<avx2>::jit_uni_lrn_fwd_kernel_f32(
         add(scratch, 32);
     dec(hw);
     cmp(hw, 0);
-    jne(".lrn_loop", T_NEAR);
+    jne(lrn_loop, T_NEAR);
 
     add(t, 64);
     this->postamble();
@@ -451,7 +448,8 @@ jit_uni_lrn_fwd_kernel_f32<sse42>::jit_uni_lrn_fwd_kernel_f32(
     }
 
     mov(hw, J.H*J.W);
-    L(".lrn_loop");
+    Label lrn_loop;
+    L(lrn_loop);
 
     if (J.version != -1) movups(xsrc_prev, ptr[src - J.H*J.W * 32 + 16]);
     movups(xsrc_lo, ptr[src]);
@@ -522,7 +520,7 @@ jit_uni_lrn_fwd_kernel_f32<sse42>::jit_uni_lrn_fwd_kernel_f32(
         add(scratch, 32);
     dec(hw);
     cmp(hw, 0);
-    jne(".lrn_loop", T_NEAR);
+    jne(lrn_loop, T_NEAR);
 
     add(t, 64);
     this->postamble();
@@ -585,7 +583,8 @@ jit_uni_lrn_fwd_kernel_f32<avx2>::jit_uni_lrn_fwd_kernel_f32(
     vfmadd231ps(ysum, yb, yb);
 
     mov(c, J.C / 8 - 1);
-    L(".lrn_loop");
+    Label lrn_loop;
+    L(lrn_loop);
 
     vmovups(yc, ptr[src]);
     vmovups(yd, ptr[src + 4]);
@@ -622,7 +621,7 @@ jit_uni_lrn_fwd_kernel_f32<avx2>::jit_uni_lrn_fwd_kernel_f32(
 
     dec(c);
     cmp(c, 0);
-    jne(".lrn_loop", T_NEAR);
+    jne(lrn_loop, T_NEAR);
 
     vmovups(yc, ptr[src]);
     vfmadd231ps(ysum, yc, yc);
@@ -744,7 +743,8 @@ jit_uni_lrn_fwd_kernel_f32<sse42>::jit_uni_lrn_fwd_kernel_f32(
     addps(xsum_hi, xb_hi);
 
     mov(c, J.C / 8 - 1);
-    L(".lrn_loop");
+    Label lrn_loop;
+    L(lrn_loop);
 
     movups(xc_lo, ptr[src]);
     movups(xc_hi, ptr[src + 4 * sizeof(float)]);
@@ -818,7 +818,7 @@ jit_uni_lrn_fwd_kernel_f32<sse42>::jit_uni_lrn_fwd_kernel_f32(
 
     dec(c);
     cmp(c, 0);
-    jne(".lrn_loop", T_NEAR);
+    jne(lrn_loop, T_NEAR);
 
     movups(xc_lo, ptr[src]);
     movups(xc_hi, ptr[src + 4 * sizeof(float)]);
@@ -945,6 +945,33 @@ void jit_uni_lrn_fwd_kernel_f32<avx2>::nchw_body(
 }
 
 template<>
+void jit_uni_lrn_fwd_kernel_f32<avx2>::nchw_tail_sse42(
+    int tail, Xbyak::Reg64 reg_dst, Xbyak::Xmm xtail_lo, Xbyak::Xmm xtail_hi)
+{}
+
+template<>
+void jit_uni_lrn_fwd_kernel_f32<sse42>::nchw_tail_sse42(
+    int tail, Xbyak::Reg64 reg_dst, Xbyak::Xmm xtail_lo, Xbyak::Xmm xtail_hi)
+{
+    Xbyak::Xmm xmm_tmp = xmm10;
+    movaps(xmm_tmp, xtail_lo);
+    size_t offset = 0;
+
+    if (tail > 4) {
+        movups(ptr[reg_dst], xtail_lo);
+        movaps(xmm_tmp, xtail_hi);
+        offset += 4 * sizeof(float);
+        tail -= 4;
+    }
+    movss(ptr[reg_dst + offset], xmm_tmp);
+    for (int i = 1; i < tail; i++)
+    {
+        psrldq(xmm_tmp, 4);
+        movss(ptr[reg_dst + offset + i * sizeof(float)], xmm_tmp);
+    }
+}
+
+template<>
 void jit_uni_lrn_fwd_kernel_f32<sse42>::nchw_body_sse42(
     int tail, int HW, prop_kind_t pk,
     Xbyak::Xmm xmask_lo, Xbyak::Xmm xmask_hi,
@@ -957,8 +984,6 @@ void jit_uni_lrn_fwd_kernel_f32<sse42>::nchw_body_sse42(
     Xbyak::Xmm xbase_hi = xmm7;
     Xbyak::Xmm xtmp_lo = xmm8;
     Xbyak::Xmm xtmp_hi = xmm9;
-    Xbyak::Xmm xtmp2_lo = xmm10;
-    Xbyak::Xmm xtmp2_hi = xmm11;
     Xbyak::Xmm xa_lo = xmm6;
     Xbyak::Xmm xa_hi = xmm7;
     Xbyak::Xmm xb_lo = xmm8;
@@ -990,20 +1015,7 @@ void jit_uni_lrn_fwd_kernel_f32<sse42>::nchw_body_sse42(
     if (pk != prop_kind::forward_inference)
     {
         if (tail != 0) {
-            movaps(xtmp_lo, xmask_lo);
-            movaps(xtmp_hi, xmask_hi);
-            movups(xtmp2_lo, ptr[scratch]);
-            movups(xtmp2_hi, ptr[scratch + 4 * sizeof(float)]);
-            andnps(xtmp_lo, xtmp2_lo);
-            andnps(xtmp_hi, xtmp2_hi);
-            movaps(xtmp2_lo, xbase_lo);
-            movaps(xtmp2_hi, xbase_hi);
-            andps(xtmp2_lo, xmask_lo);
-            andps(xtmp2_hi, xmask_hi);
-            orps(xtmp_lo, xtmp2_lo);
-            orps(xtmp_hi, xtmp2_hi);
-            movups(ptr[scratch], xtmp_lo);
-            movups(ptr[scratch + 4 * sizeof(float)], xtmp_hi);
+            nchw_tail_sse42(tail, scratch, xbase_lo, xbase_hi);
         }
         else {
             movups(ptr[scratch], xbase_lo);
@@ -1026,20 +1038,7 @@ void jit_uni_lrn_fwd_kernel_f32<sse42>::nchw_body_sse42(
     movaps(xdst_hi, xtmp_hi);
 
     if (tail != 0) {
-        movaps(xtmp_lo, xmask_lo);
-        movaps(xtmp_hi, xmask_hi);
-        movups(xtmp2_lo, ptr[dst]);
-        movups(xtmp2_hi, ptr[dst + 4 * sizeof(float)]);
-        andnps(xtmp_lo, xtmp2_lo);
-        andnps(xtmp_hi, xtmp2_hi);
-        movaps(xtmp2_lo, xdst_lo);
-        movaps(xtmp2_hi, xdst_hi);
-        andps(xtmp2_lo, xmask_lo);
-        andps(xtmp2_hi, xmask_hi);
-        orps(xtmp_lo, xtmp2_lo);
-        orps(xtmp_hi, xtmp2_hi);
-        movups(ptr[dst], xtmp_lo);
-        movups(ptr[dst + 4 * sizeof(float)], xtmp_hi);
+        nchw_tail_sse42(tail, dst, xdst_lo, xdst_hi);
     }
     else {
         movups(ptr[dst], xdst_lo);
@@ -1145,7 +1144,8 @@ jit_uni_lrn_fwd_kernel_f32<avx2>::jit_uni_lrn_fwd_kernel_f32(
     vfmadd231ps(ysum, yd, yd);
 
     mov(c, J.C - 2);
-    L(".lrn_loop");
+    Label lrn_loop;
+    L(lrn_loop);
 
     if (J.tail != 0)
         vmaskmovps(ye, ymask, ptr[src + J.HW * 8]);
@@ -1160,7 +1160,7 @@ jit_uni_lrn_fwd_kernel_f32<avx2>::jit_uni_lrn_fwd_kernel_f32(
         add(scratch, J.HW * 4);
     dec(c);
     cmp(c, 0);
-    jne(".lrn_loop", T_NEAR);
+    jne(lrn_loop, T_NEAR);
 
     vxorps(ye, ye, ye);
 
@@ -1290,7 +1290,8 @@ jit_uni_lrn_fwd_kernel_f32<sse42>::jit_uni_lrn_fwd_kernel_f32(
     addps(xsum_hi, xd_hi); // xsum <- xsum + xa^2+xb^2+xc^2+xd^2+xe^2
 
     mov(c, J.C - 2);
-    L(".lrn_loop");
+    Label lrn_loop;
+    L(lrn_loop);
 
     if (J.tail != 0) {
         movups(xe_lo, ptr[src + J.HW * 8]);
@@ -1313,7 +1314,7 @@ jit_uni_lrn_fwd_kernel_f32<sse42>::jit_uni_lrn_fwd_kernel_f32(
         add(scratch, J.HW * 4);
     dec(c);
     cmp(c, 0);
-    jne(".lrn_loop", T_NEAR);
+    jne(lrn_loop, T_NEAR);
 
     xorps(xe_lo, xe_lo);
     xorps(xe_hi, xe_hi);
@@ -1388,20 +1389,16 @@ jit_uni_lrn_bwd_kernel_f32<isa>::jit_uni_lrn_bwd_kernel_f32(
     bool is_first = J.version == -1 || J.version == -2;
     bool is_last = J.version == +1 || J.version == -2;
 
-    char tag = '\0';
     if (is_first || is_single) {
         vxorps(xsrc_prev, xsrc_prev, xsrc_prev);
         vmovups(ptr[t + 0], xsrc_prev);
-        tag = 'f';
     }
     if (is_last || is_single) {
         vxorps(xsrc_next, xsrc_next, xsrc_next);
         vmovups(ptr[t + 48], xsrc_next);
-        tag = 'l';
     }
     mov(hw, this->use_h_parallelizm ? J.W : J.H*J.W);
-
-    jit_tagged_label lrn_loop("lrn_loop", tag);
+    Label lrn_loop;
     L(lrn_loop);
     {
         if (!is_first && !is_single) {
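
The change running through this file replaces string labels such as ".lrn_loop" (and the old MAX_LOCAL_SIZE table of ".l00" to ".l31" names, which had to be kept unique by hand) with Xbyak::Label objects, which are unique by construction. A minimal sketch of the new pattern, assuming Xbyak is available:

    #include <xbyak/xbyak.h>

    // Each Label instance is an independent jump target, so loops emitted
    // back to back no longer need hand-numbered label strings.
    struct count_down : Xbyak::CodeGenerator {
        explicit count_down(int iters) {
            mov(rcx, iters);
            Xbyak::Label loop;     // fresh target; no ".l00" bookkeeping
            L(loop);               // bind the label here
            dec(rcx);
            cmp(rcx, 0);
            jne(loop, T_NEAR);     // near jump back to the bound position
            ret();
        }
    };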
index 8d7c5ea..6827019 100644 (file)
@@ -137,6 +137,8 @@ struct jit_uni_lrn_fwd_kernel_f32 : public jit_generator {
         Xbyak::Xmm xmask_lo, Xbyak::Xmm xmask_hi,
         Xbyak::Xmm xe_lo, Xbyak::Xmm xe_hi,
         Xbyak::Xmm xsum_lo, Xbyak::Xmm xsum_hi);
+    void nchw_tail_sse42(int tail, Xbyak::Reg64 reg_dst,
+        Xbyak::Xmm xtail_lo, Xbyak::Xmm xtail_hi);
 
     void operator()(jit_args_fwd_t *arg) { ker(arg); }
     void(*ker)(jit_args_fwd_t *);
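
The nchw_tail_sse42 helper declared above replaces the earlier mask-blend sequence (andnps/andps/orps against precomputed masks) with plain scalar stores, so the kernel no longer reads and rewrites bytes past the tail. A plain-C++ analogue of the store pattern, for illustration only:

    #include <cstring>

    // Write exactly `tail` (1..7) floats out of eight source lanes without
    // touching memory past dst[tail - 1], which is the property the masked
    // version achieved with a read-modify-write.
    void store_tail(float *dst, const float lanes[8], int tail) {
        int i = 0;
        if (tail > 4) {                        // movups of the low xmm half
            std::memcpy(dst, lanes, 4 * sizeof(float));
            i = 4;
        }
        for (; i < tail; ++i)                  // movss + psrldq per lane
            dst[i] = lanes[i];
    }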
index 40cf7c7..c6585e4 100644 (file)
@@ -69,6 +69,10 @@ status_t jit_uni_pool_kernel_f32<isa>::init_conf(jit_pool_conf_t &jpp,
     jpp.f_pad = (ndims == 5 ) ? pd.padding[0][0] : 0;
     jpp.t_pad = pd.padding[0][ndims-4];
     jpp.l_pad = pd.padding[0][ndims-3];
+    jpp.back_pad = (ndims == 5) ? pd.padding[1][0] : 0;
+    // TODO: handle non-zero back padding (currently bails out as unimplemented)
+    if (jpp.back_pad != 0)
+        return status::unimplemented;
     jpp.b_pad = pd.padding[1][ndims-4];
     jpp.r_pad = pd.padding[1][ndims-3];
 
@@ -79,7 +83,7 @@ status_t jit_uni_pool_kernel_f32<isa>::init_conf(jit_pool_conf_t &jpp,
     jpp.ind_dt = pooling_index_data_type(&pd);
 
     jpp.simple_alg = jpp.is_training
-        || utils::implication(jpp.is_backward, jpp.kd <= jpp.stride_d);
+        || IMPLICATION(jpp.is_backward, jpp.kd <= jpp.stride_d);
 
     jpp.c_block = simd_w;
 
@@ -152,7 +156,7 @@ inline void jit_uni_pool_kernel_f32<isa>::avg_step(int ur_w, int pad_l,
         push(reg_input);
         push(reg_output);
         mov(aux_reg_input_d, reg_input);
-        mov(ki, ptr[this->param1 + GET_OFF(kd_padding)]);
+        mov(ki, ptr[reg_param + GET_OFF(kd_padding)]);
         L(kd_label);
         mov(aux_reg_input, aux_reg_input_d);
     } else {
@@ -237,7 +241,7 @@ inline void jit_uni_pool_kernel_f32<isa>::max_step_fwd(int ur_w, int pad_l,
         push(reg_input);
         push(reg_output);
         mov(aux_reg_input_d, reg_input);
-        mov(ki, ptr[this->param1 + GET_OFF(kd_padding)]);
+        mov(ki, ptr[reg_param + GET_OFF(kd_padding)]);
         L(kd_label);
         mov(aux_reg_input, aux_reg_input_d);
     } else {
@@ -296,7 +300,7 @@ inline void jit_uni_pool_kernel_f32<isa>::max_step_fwd(int ur_w, int pad_l,
     {
         add(aux_reg_input_d,  sizeof(float) * jpp.ih * iw * c_block);
         if (jpp.is_training) {
-            mov(tmp_gpr, ptr[this->param1 + GET_OFF(kd_padding_shift)]);
+            mov(tmp_gpr, ptr[reg_param + GET_OFF(kd_padding_shift)]);
             movq(xmm_tmp, tmp_gpr);
             uni_vpbroadcastd(vmm_tmp, xmm_tmp);
             if (isa == avx && !mayiuse(avx2)) {
@@ -398,12 +402,14 @@ inline void jit_uni_pool_kernel_f32<isa>::max_step_bwd(int ur_w, int pad_l,
     if (jpp.simple_alg && jpp.ndims == 5) {
         push(reg_input);
         push(reg_output);
-        /*Save this->param1 as it is used in maskmovdqu*/
-        if (isa == sse42)
-            push(this->param1);
+        if (isa == sse42) {
+            // Save rdi since it is used in maskmovdqu
+            assert(dst_ptr == rdi);
+            push(dst_ptr);
+        }
         mov(aux_reg_input_d, reg_input);
-        mov(ki, ptr[this->param1 + GET_OFF(kd_padding)]);
-        mov(reg_kd_pad_shift, ptr[this->param1 + GET_OFF(kd_padding_shift)]);
+        mov(ki, ptr[reg_param + GET_OFF(kd_padding)]);
+        mov(reg_kd_pad_shift, ptr[reg_param + GET_OFF(kd_padding_shift)]);
         L(kd_label);
         mov(aux_reg_input, aux_reg_input_d);
     } else {
@@ -476,8 +482,11 @@ inline void jit_uni_pool_kernel_f32<isa>::max_step_bwd(int ur_w, int pad_l,
         dec(ki);
         cmp(ki, 0);
         jg(kd_label, T_NEAR);
-        if (isa == sse42)
-            pop(this->param1);
+        if (isa == sse42) {
+            // Restore rdi, which was saved above because maskmovdqu uses it
+            assert(dst_ptr == rdi);
+            pop(dst_ptr);
+        }
         pop(reg_output);
         pop(reg_input);
     }
@@ -489,12 +498,12 @@ void jit_uni_pool_kernel_f32<isa>::maybe_zero_diff_src() {
     Label l_skip, l_zero;
 
     auto reg_oh = tmp_gpr;
-    mov(reg_oh, ptr[this->param1 + GET_OFF(oh)]);
+    mov(reg_oh, ptr[reg_param + GET_OFF(oh)]);
     cmp(reg_oh, 0);
     jz(l_skip, T_NEAR);
 
     if (jpp.ndims == 5) {
-        mov(zero_size, ptr[this->param1 + GET_OFF(oh)]);
+        mov(zero_size, ptr[reg_param + GET_OFF(oh)]);
         mov(tmp_gpr, jpp.ih * jpp.iw * jpp.c_block * sizeof(float));
         imul(zero_size, tmp_gpr);
     }
@@ -540,13 +549,21 @@ void jit_uni_pool_kernel_f32<isa>::generate() {
 
     int vlen = cpu_isa_traits<isa>::vlen;
 
-    mov(reg_input, ptr[this->param1 + GET_OFF(src)]);
-    mov(reg_output, ptr[this->param1 + GET_OFF(dst)]);
+#if defined(_WIN32)
+    // Always mimic the Unix ABI (see the note about maskmovdqu in the header
+    // file).
+    xor_(rdi, rcx);
+    xor_(rcx, rdi);
+    xor_(rdi, rcx);
+#endif
+
+    mov(reg_input, ptr[reg_param + GET_OFF(src)]);
+    mov(reg_output, ptr[reg_param + GET_OFF(dst)]);
     if (jpp.alg == pooling_max && (jpp.is_training || jpp.is_backward))
-        mov(reg_index, ptr[this->param1 + GET_OFF(indices)]);
-    mov(reg_kh, ptr[this->param1 + GET_OFF(kh_padding)]);
-    mov(reg_k_shift, ptr[this->param1 + GET_OFF(kh_padding_shift)]);
-    mov(reg_ker_area_h, ptr[this->param1 + GET_OFF(ker_area_h)]);
+        mov(reg_index, ptr[reg_param + GET_OFF(indices)]);
+    mov(reg_kh, ptr[reg_param + GET_OFF(kh_padding)]);
+    mov(reg_k_shift, ptr[reg_param + GET_OFF(kh_padding_shift)]);
+    mov(reg_ker_area_h, ptr[reg_param + GET_OFF(ker_area_h)]);
 
     if (jpp.is_backward)
         maybe_zero_diff_src();
index e59684c..ff26d27 100644 (file)
@@ -73,19 +73,29 @@ private:
     Opmask k_index_mask = Opmask(6);
     Opmask k_store_mask = Opmask(7);
 
+    // Here be some (tame) dragons. This kernel does not follow the regular
+    // OS-agnostic ABI pattern because, when isa is sse42, it uses the
+    // maskmovdqu instruction, whose destination is hardcoded to rdi. Therefore:
+    // - all registers are hardcoded
+    // - on Windows rdi and rcx are swapped to mimic the Unix x86_64 ABI
+    //
+    // While this is only required by the backward pass, the quirk above
+    // is applied to the forward pass as well to keep things simpler.
+
     using reg64_t = const Xbyak::Reg64;
+    reg64_t reg_param      = rdi; // Always mimic the Unix ABI
     reg64_t reg_input      = r8;
     reg64_t aux_reg_input  = r9;
     reg64_t reg_index      = r10;
     reg64_t reg_output     = r12;
     reg64_t reg_kd_pad_shift = r13;
-    reg64_t dst_ptr        = abi_param1;
+    reg64_t dst_ptr        = rdi; // Must be rdi due to maskmovdqu
 
     reg64_t kj      = r14;
     reg64_t oi_iter = r15;
     reg64_t reg_kh  = rax;
     reg64_t reg_k_shift  = rbx;
-    reg64_t tmp_gpr = abi_not_param1;
+    reg64_t tmp_gpr = rcx; // Must be rcx because rdi is used above
     reg64_t reg_ker_area_h = rdx;
 
     reg64_t zero_size = r15;
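
The Windows-only preamble referenced in this comment block swaps rdi and rcx with three XOR instructions, so the rest of the kernel can assume its argument arrived in rdi, Unix-style. The exchange needs no scratch register; in plain C++ it looks like this:

    #include <cstdint>

    // Classic three-XOR exchange; no temporary needed. Only valid for two
    // distinct locations (a ^= a would zero the value).
    void xor_swap(std::uint64_t &a, std::uint64_t &b) {
        a ^= b;   // a holds a ^ b
        b ^= a;   // b holds (a ^ b) ^ b, i.e. the original a
        a ^= b;   // a holds (a ^ b) ^ (original a), i.e. the original b
    }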
index 6769dcd..520ab12 100644 (file)
@@ -143,7 +143,7 @@ struct jit_uni_pooling_bwd_t: public cpu_primitive_t {
                         diff_dst_pd()->desc()->format)
                 && everyone_is(data_type::f32, diff_src_pd()->desc()->data_type,
                         diff_dst_pd()->desc()->data_type)
-                && utils::implication(desc()->alg_kind == pooling_max,
+                && IMPLICATION(desc()->alg_kind == pooling_max,
                         hint_fwd_pd_ && hint_fwd_pd_->workspace_pd()
                         && hint_fwd_pd_->workspace_pd()->desc()->format
                                 == desired_fmt())
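
Several hunks in this patch replace utils::implication() with an IMPLICATION macro. Logical implication "p implies q" is just !p || q; the macro form (shape assumed below, the real definition lives in mkl-dnn's utils header) keeps the short-circuit visible at the call site:

    // Assumed shape of the macro used throughout this patch.
    #define IMPLICATION(cause, effect) (!(cause) || (effect))

    static_assert(IMPLICATION(false, false), "vacuously true without cause");
    static_assert(IMPLICATION(true, true), "holds when both sides hold");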
index e40a896..81677ba 100644 (file)
@@ -115,7 +115,7 @@ struct jit_uni_reorder_kernel_f32: public kernel_t, public jit_generator {
             && utils::one_of(p.beta, 0.f, 1.f) /* anything else? */
             && simple_impl_desc_init(p, nullptr)
             && mayiuse(sse42)
-            && utils::implication(!utils::everyone_is(f32, p.itype, p.otype),
+            && IMPLICATION(!utils::everyone_is(f32, p.itype, p.otype),
                     mayiuse(avx512_core));
         if (!ok) return false;
 
@@ -432,7 +432,8 @@ struct jit_uni_reorder_kernel_f32: public kernel_t, public jit_generator {
                             scale_load_type = scale_load_type_t::load;
 
                     if (scale_load_type == scale_load_type_t::bcast) {
-                        vbroadcastss(xmm_scale, s_addr(s_off[ur]));
+                        movss(xmm_scale, s_addr(s_off[ur]));
+                        shufps(xmm_scale, xmm_scale, 0x0);
                         mulps(Xmm(ur), xmm_scale);
                         continue;
                     }
@@ -443,7 +444,7 @@ struct jit_uni_reorder_kernel_f32: public kernel_t, public jit_generator {
                             scale_load_type = scale_load_type_t::gather;
 
                     if (scale_load_type == scale_load_type_t::load) {
-                        vmovups(xmm_scale, s_addr(s_off[ur]));
+                        movups(xmm_scale, s_addr(s_off[ur]));
                         mulps(Xmm(ur), xmm_scale);
                         continue;
                     }
@@ -652,7 +653,7 @@ private:
 
     Reg64 reg_ptr_in = rsi;
     Reg64 reg_ptr_out = rdx;
-    Reg64 reg_ptr_scale = rcx;
+    Reg64 reg_ptr_scale = abi_not_param1;
 
     Reg64 reg_off_in = r8;
     Reg64 reg_off_out = r9;
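
The vbroadcastss to movss + shufps change above matters because vbroadcastss is an AVX instruction while this reorder kernel only requires SSE4.2. The SSE-safe broadcast loads one scalar and replicates lane 0 with an all-zero shuffle selector; the same operation with intrinsics:

    #include <immintrin.h>

    // SSE-level broadcast of a single float to all four lanes.
    __m128 broadcast_ss(const float *p) {
        __m128 x = _mm_load_ss(p);             // {*p, 0, 0, 0}
        return _mm_shuffle_ps(x, x, 0x00);     // {*p, *p, *p, *p}
    }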
index 498e813..cb9c1d1 100644 (file)
@@ -67,6 +67,10 @@ status_t cvt_mem_desc_to_layout_desc(const memory_desc_t &md_,
     switch (md.format()) {
     case memory_format::undef:
     case memory_format::any:
+    case hwio_s8s8:
+    case hwigo_s8s8:
+    case gOIhw4i16o4i_s8s8:
+    case OIhw4i16o4i_s8s8:
     case wino_fmt:
         return invalid_arguments;
     case OIhw4i16o4i:
@@ -78,6 +82,7 @@ status_t cvt_mem_desc_to_layout_desc(const memory_desc_t &md_,
         P(2, bd.padding_dims[2], bd.strides[0][2]);
         P(3, bd.padding_dims[3], bd.strides[0][3]);
         return success;
+    case OIw8i16o2i:
     case OIhw8i16o2i:
     case OIdhw8i16o2i:
         P(0, bd.padding_dims[0] / 16, bd.strides[0][0]);
@@ -86,10 +91,12 @@ status_t cvt_mem_desc_to_layout_desc(const memory_desc_t &md_,
         P(1, 8, 16*2);
         P(1, 2, 1);
         P(2, bd.padding_dims[2], bd.strides[0][2]);
-        P(3, bd.padding_dims[3], bd.strides[0][3]);
+        if (md.format() == OIhw8i16o2i || md.format() == OIdhw8i16o2i)
+            P(3, bd.padding_dims[3], bd.strides[0][3]);
         if (md.format() == OIdhw8i16o2i)
             P(4, bd.padding_dims[4], bd.strides[0][4]);
         return success;
+    case OIw8o16i2o:
     case OIhw8o16i2o:
         P(0, bd.padding_dims[0] / 16, bd.strides[0][0]);
         P(0, 8, 16*2);
@@ -97,7 +104,8 @@ status_t cvt_mem_desc_to_layout_desc(const memory_desc_t &md_,
         P(1, bd.padding_dims[1] / 16, bd.strides[0][1]);
         P(1, 16, 2);
         P(2, bd.padding_dims[2], bd.strides[0][2]);
-        P(3, bd.padding_dims[3], bd.strides[0][3]);
+        if (md.format() == OIhw8o16i2o)
+            P(3, bd.padding_dims[3], bd.strides[0][3]);
         return success;
     case gOIhw4i16o4i:
         P(0, bd.padding_dims[0], bd.strides[0][0]);
@@ -109,6 +117,7 @@ status_t cvt_mem_desc_to_layout_desc(const memory_desc_t &md_,
         P(3, bd.padding_dims[3], bd.strides[0][3]);
         P(4, bd.padding_dims[4], bd.strides[0][4]);
         return success;
+    case gOIw8i16o2i:
     case gOIhw8i16o2i:
     case gOIdhw8i16o2i:
         P(0, bd.padding_dims[0], bd.strides[0][0]);
@@ -118,10 +127,12 @@ status_t cvt_mem_desc_to_layout_desc(const memory_desc_t &md_,
         P(2, 8, 16*2);
         P(2, 2, 1);
         P(3, bd.padding_dims[3], bd.strides[0][3]);
-        P(4, bd.padding_dims[4], bd.strides[0][4]);
-        if (md.format() == OIdhw8i16o2i)
+        if (md.format() == gOIhw8i16o2i || md.format() == gOIdhw8i16o2i)
+            P(4, bd.padding_dims[4], bd.strides[0][4]);
+        if (md.format() == gOIdhw8i16o2i)
             P(5, bd.padding_dims[5], bd.strides[0][5]);
         return success;
+    case gOIw8o16i2o:
     case gOIhw8o16i2o:
         P(0, bd.padding_dims[0], bd.strides[0][0]);
         P(1, bd.padding_dims[1] / 16, bd.strides[0][1]);
@@ -130,7 +141,8 @@ status_t cvt_mem_desc_to_layout_desc(const memory_desc_t &md_,
         P(2, bd.padding_dims[2] / 16, bd.strides[0][2]);
         P(2, 16, 2);
         P(3, bd.padding_dims[3], bd.strides[0][3]);
-        P(4, bd.padding_dims[4], bd.strides[0][4]);
+        if (md.format() == gOIhw8o16i2o)
+            P(4, bd.padding_dims[4], bd.strides[0][4]);
         return success;
     default: break;
     }
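
For orientation, each P(dim, size, stride) call above appends one (logical dimension, block size, stride) entry to the layout descriptor, so a doubly-blocked weights format flattens into an ordered list of such entries. An illustrative rendering (hypothetical types, not the real mkl-dnn structures):

    #include <vector>

    // One entry per blocking level; e.g. the inner "8i16o2i" blocking adds
    // entries along the lines of (I, 8, 32), (O, 16, 2), (I, 2, 1) on top
    // of the outer O/16 and I/16 dimensions.
    struct layout_entry { int logical_dim; long size; long stride; };
    using layout_desc_sketch = std::vector<layout_entry>;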
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.cpp
new file mode 100644 (file)
index 0000000..b3917d5
--- /dev/null
@@ -0,0 +1,507 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "c_types_map.hpp"
+#include "nstl.hpp"
+#include "type_helpers.hpp"
+#include "utils.hpp"
+#include "cpu_memory.hpp"
+
+#include "jit_uni_x8s8s32x_1x1_conv_kernel.hpp"
+
+#define GET_OFF(field) offsetof(jit_1x1_conv_call_s, field)
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using namespace mkldnn::impl::prop_kind;
+using namespace mkldnn::impl::memory_format;
+using namespace mkldnn::impl::utils;
+using namespace mkldnn::impl::types;
+
+using namespace Xbyak;
+
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_1x1_conv_fwd_kernel<isa>::cvt2ps(data_type_t type_in,
+        Vmm vmm_in, const Xbyak::Operand &op) {
+    switch (type_in) {
+    case data_type::f32:
+    case data_type::s32: vmovups(vmm_in, op); break;
+    case data_type::s8: vpmovsxbd(vmm_in, op); break;
+    case data_type::u8: vpmovzxbd(vmm_in, op); break;
+    default: assert(!"unsupported data type");
+    }
+    if (type_in != data_type::f32)
+        vcvtdq2ps(vmm_in, vmm_in);
+}
+
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_1x1_conv_fwd_kernel<isa>::loop_os(int oc_loop_blk)
+{
+    mov(aux_reg_dst_data, reg_dst_data);
+
+    Label loop_os;
+    Label loop_ow_tail;
+
+    mov(reg_ow_loop_work, jcp.ow);
+
+    L(loop_os); {
+        assert(jcp.os_block == jcp.ur);
+        cmp(reg_ow_loop_work, jcp.ow_tail);
+        je(loop_ow_tail, T_NEAR);
+
+        ic_loop(oc_loop_blk, jcp.ur);
+
+        sub(reg_ow_loop_work, jcp.ur);
+
+        add(reg_src_data, jcp.os_loop_src_step);
+        add(aux_reg_dst_data, jcp.os_loop_dst_step);
+
+        sub(reg_loop_os_iter, jcp.os_block);
+        cmp(reg_loop_os_iter, jcp.os_block);
+        jge(loop_os, T_NEAR);
+
+        L(loop_ow_tail); {
+            if (jcp.ow_tail > 0) {
+                ic_loop(oc_loop_blk, jcp.ow_tail);
+            }
+
+            add(reg_src_data, jcp.os_loop_src_tail_step);
+            add(aux_reg_dst_data, jcp.os_loop_dst_tail_step);
+
+            mov(reg_ow_loop_work, jcp.ow);
+
+            sub(reg_loop_os_iter, jcp.ow_tail);
+            cmp(reg_loop_os_iter, 0);
+            jg(loop_os, T_NEAR);
+        }
+    }
+}
+
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_1x1_conv_fwd_kernel<isa>::ic_loop(int oc_loop_blk, int ur)
+{
+    auto vreg_wei = [=](int i) {
+        return Vmm(ur * oc_loop_blk + i);
+    };
+
+    auto vreg_accum_vmm = [=](int i, int j) {
+        return Vmm(j * oc_loop_blk + i);
+    };
+
+    auto vreg_accum_xmm = [=](int i, int j) {
+        return Xmm(j * oc_loop_blk + i);
+    };
+
+    auto src_ptr = [=](int u, int j) {
+        size_t offt = j * jcp.ic * jcp.stride_w + u*jcp.ic_block;
+        return ptr[aux_reg_src_data + jcp.typesize_in * offt];
+    };
+
+    auto wei_ptr = [=](int u, int i) {
+        size_t offt = i*jcp.nb_ic*jcp.oc_block*jcp.ic_block + u*jcp.ic_block * jcp.oc_block;
+        return ptr[aux_reg_weight_data + offt * jcp.typesize_in];
+    };
+
+    auto output_ptr = [=](int i, int j) {
+        return ptr[aux_reg_dst_data + (i * jcp.oc_block + j * jcp.oc) *
+                                              jcp.typesize_out];
+    };
+
+    auto init = [&]() {
+        for (int i = 0; i < oc_loop_blk; ++i) {
+            for (int j = 0; j < ur; ++j) {
+                auto vmm_acc = vreg_accum_vmm(i, j);
+                uni_vpxor(vmm_acc, vmm_acc, vmm_acc);
+            }
+        }
+
+        for (int i = 0; i < oc_loop_blk; ++i)
+            uni_vmovdqu(vreg_wei(i), wei_ptr(0, i));
+
+        uni_vpbroadcastd(vreg_src, src_ptr(0, 0));
+    };
+
+    auto store = [=]() {
+        mov(reg_scales, ptr[this->param1 + GET_OFF(scales)]);
+        uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
+
+        for (int j = 0; j < ur; ++j)
+            for (int i = 0; i < oc_loop_blk; ++i) {
+                int b_off = i*jcp.oc_block;
+
+                if (jcp.with_bias) {
+                    switch (jcp.bia_dt) {
+                        case data_type::f32:
+                        case data_type::s32: vmovups(vmm_bias, ptr[reg_bias_data + b_off*jcp.typesize_bia]); break;
+                        case data_type::s8: vpmovsxbd(vmm_bias, ptr[reg_bias_data + b_off*jcp.typesize_bia]); break;
+                        case data_type::u8: vpmovzxbd(vmm_bias, ptr[reg_bias_data + b_off*jcp.typesize_bia]); break;
+                        default: assert(!"unsupported dst data type");
+                    }
+                    // Convert an integer bias to f32 only when a bias was
+                    // actually loaded; vmm_bias is undefined otherwise.
+                    if (jcp.bia_dt != data_type::f32)
+                        vcvtdq2ps(vmm_bias, vmm_bias);
+                }
+
+                Vmm vmm_dst = vreg_accum_vmm(i, j);
+                Xmm xmm_dst = vreg_accum_xmm(i, j);
+
+                vcvtdq2ps(vmm_dst, vmm_dst);
+
+                if (jcp.with_bias)
+                    vaddps(vmm_dst, vmm_dst, vmm_bias);
+
+                int s_off = jcp.is_oc_scale * (sizeof(float) * (i*jcp.oc_block));
+                vmulps(vmm_dst, vmm_dst, ptr[reg_scales + s_off]);
+
+                if (jcp.with_sum) {
+                    Ymm vmm_prev_dst = Ymm(12);
+                    cvt2ps(jcp.dst_dt, vmm_prev_dst, output_ptr(i, j));
+                    vaddps(vmm_dst, vmm_prev_dst);
+                }
+
+                if (maybe_relu(0))
+                    vmaxps(vmm_dst, vmm_zero, vmm_dst);
+
+                if (maybe_relu(1))
+                    vmaxps(vmm_dst, vmm_zero, vmm_dst);
+
+                if (jcp.dst_dt != data_type::f32) {
+                    if (attr_.round_mode_ == round_mode::nearest)
+                        if (isa == avx512_common) {
+                            vcvtps2dq(vmm_dst | T_rn_sae, vmm_dst);
+                        } else {
+                            vcvtps2dq(vmm_dst, vmm_dst);
+                        }
+                    else if (attr_.round_mode_ == round_mode::down) {
+                        if (isa == avx512_common) {
+                            vcvtps2dq(vmm_dst | T_rd_sae, vmm_dst);
+                        } else {
+                            vroundps(vmm_dst, vmm_dst, 1);
+                            vcvtps2dq(vmm_dst, vmm_dst);
+                        }
+                    } else
+                        assert(!"unimplemented");
+                }
+
+                switch (jcp.dst_dt) {
+                    case data_type::f32:
+                    case data_type::s32: vmovups(output_ptr(i, j), vmm_dst); break;
+                    case data_type::s8:
+                        if (isa == avx512_common) {
+                            vpmovsdb(xmm_dst, vmm_dst);
+                            vmovups(output_ptr(i, j), xmm_dst);
+                        } else if (isa == avx2) {
+                            Ymm ymm_dst = Ymm(vmm_dst.getIdx());
+
+                            vpackssdw(ymm_dst, ymm_dst, ymm_dst);
+                            vpermq(ymm_dst, ymm_dst, 0x08);
+                            vpacksswb(xmm_dst, xmm_dst, xmm_dst);
+                            vmovq(output_ptr(i, j), xmm_dst);
+                        }
+                        break;
+                    case data_type::u8:
+                        if (isa == avx512_common) {
+                            vpmovusdb(xmm_dst, vmm_dst);
+                            vmovups(output_ptr(i, j), xmm_dst);
+                        } else if (isa == avx2) {
+                            Ymm ymm_dst = Ymm(vmm_dst.getIdx());
+
+                            vpackusdw(ymm_dst, ymm_dst, ymm_dst);
+                            vpermq(ymm_dst, ymm_dst, 0x08);
+                            vpackuswb(xmm_dst, xmm_dst, xmm_dst);
+                            vmovq(output_ptr(i, j), xmm_dst);
+                        }
+                        break;
+                    default: assert(!"unknown dst_dt");
+                }
+            }
+    };
+
+    auto fma_block = [=]() {
+        for (int j = 0; j < ur; ++j) {
+            for (int i = 0; i < oc_loop_blk; i++) {
+                vpmaddubsw(vreg_sum_0, vreg_src, vreg_wei(i));
+                vpmaddwd(vreg_sum_0, vreg_sum_0, vmm_one);
+                vpaddd(vreg_accum_vmm(i, j), vreg_accum_vmm(i, j), vreg_sum_0);
+
+                if (j == ur - 1) {
+                    uni_vmovdqu(vreg_wei(i), wei_ptr(1, i));
+                }
+            }
+
+            if (j < ur - 1)
+                uni_vpbroadcastd(vreg_src, src_ptr(0, j + 1));
+        }
+
+        uni_vpbroadcastd(vreg_src, src_ptr(1, 0));
+    };
+
+    mov(aux_reg_weight_data, reg_weight_data);
+    mov(aux_reg_src_data, reg_src_data);
+
+    init();
+
+    Label ic_loop;
+    Label exit;
+
+    xor_(reg_loop_ic_iter, reg_loop_ic_iter);
+    L(ic_loop); {
+        cmp(reg_loop_ic_iter, jcp.nb_ic);
+        jge(exit, T_NEAR);
+
+        fma_block();
+
+        add(aux_reg_src_data, jcp.ic_block * jcp.typesize_in);
+        add(aux_reg_weight_data, jcp.ic_block * jcp.oc_block * jcp.typesize_in);
+        inc(reg_loop_ic_iter);
+        jmp(ic_loop, T_NEAR);
+    }
+
+    L(exit);
+
+    store();
+}
+
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_1x1_conv_fwd_kernel<isa>::generate()
+{
+    preamble();
+
+    mov(reg_scratch, 0x1);
+    movq(xmm_one, reg_scratch);
+    vpbroadcastw(vmm_one, xmm_one);
+
+    mov(reg_weight_data, ptr[param1 + GET_OFF(oc_data)]);
+    mov(reg_dst_data,    ptr[param1 + GET_OFF(output_data)]);
+    if (jcp.with_bias) {
+        mov(reg_bias_data, ptr[param1 + GET_OFF(bias_data)]);
+    }
+
+    mov(reg_oc_loop_work, ptr[param1 + GET_OFF(oc_dim)]);
+    mov(reg_src_data, ptr[param1 + GET_OFF(is_data)]);
+    mov(reg_loop_os_iter,  ptr[param1 + GET_OFF(os_dim)]);
+
+    Label oc_blocks_tail_label;
+    Label exit_label;
+
+    int oc_blocks_tail = jcp.nb_oc % jcp.nb_oc_blocking;
+
+    cmp(reg_oc_loop_work, jcp.nb_oc_blocking);
+    jne(oc_blocks_tail ? oc_blocks_tail_label : exit_label, T_NEAR);
+
+    loop_os(jcp.nb_oc_blocking); // channel main loop
+    jmp(exit_label, T_NEAR);
+
+    if (oc_blocks_tail) {
+        L(oc_blocks_tail_label);
+
+        cmp(reg_oc_loop_work, oc_blocks_tail);
+        jne(exit_label, T_NEAR);
+
+        loop_os(oc_blocks_tail); // channel tail loop
+    }
+
+    L(exit_label);
+
+    postamble();
+}
+
+template <cpu_isa_t isa>
+bool jit_uni_x8s8s32x_1x1_conv_fwd_kernel<isa>::post_ops_ok(
+        jit_1x1_conv_conf_t &jcp, const primitive_attr_t &attr) {
+    const auto &p = attr.post_ops_;
+
+    auto is_relu = [&](int idx) { return p.entry_[idx].is_relu(); };
+    auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(); };
+
+    switch (p.len_) {
+        case 0: return true; // no post_ops
+        case 1: return !jcp.with_eltwise && (is_relu(0) || is_sum(0)); // sum OR relu
+        case 2: return !jcp.with_eltwise && (is_sum(0) && is_relu(1)); // sum->relu
+        default: return false;
+    }
+}
+
+template <cpu_isa_t isa>
+bool jit_uni_x8s8s32x_1x1_conv_fwd_kernel<isa>::maybe_relu(int position) {
+    using namespace primitive_kind;
+    const auto &p = attr_.post_ops_;
+
+    if (position == 0) {
+        /* relu before sum */
+        return false
+               || jcp.with_eltwise
+               || p.contain(eltwise, 0)
+               || (jcp.dst_dt == data_type::u8 && !p.contain(sum, 0));
+    } else if (position == 1) {
+        /* relu after sum */
+        const int sum_idx = p.contain(sum, 0)
+                            ? 0 : (p.contain(sum, 1) ? 1 : -1);
+        if (sum_idx == -1)
+            return false;
+
+        return false
+               || p.contain(eltwise, sum_idx + 1)
+               || jcp.dst_dt == data_type::u8;
+    }
+
+    return false;
+}
+
+template <cpu_isa_t isa>
+status_t jit_uni_x8s8s32x_1x1_conv_fwd_kernel<isa>::init_conf(jit_1x1_conv_conf_t &jcp,
+        const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
+        const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d,
+        const memory_desc_wrapper &bias_pd, const primitive_attr_t &attr,
+        bool with_relu, float relu_negative_slope)
+{
+    if (!mayiuse(isa)) return status::unimplemented;
+
+    const bool with_groups = weights_d.ndims() == src_d.ndims() + 1;
+
+    jcp.prop_kind = cd.prop_kind;
+
+    jcp.ngroups = with_groups ? weights_d.dims()[0] : 1;
+    jcp.mb = src_d.dims()[0];
+
+    jcp.oc = dst_d.dims()[1] / jcp.ngroups;
+    jcp.ic = src_d.dims()[1] / jcp.ngroups;
+
+    jcp.ih = src_d.dims()[2];
+    jcp.iw = src_d.dims()[3];
+    jcp.oh = dst_d.dims()[2];
+    jcp.ow = dst_d.dims()[3];
+
+    jcp.kh = weights_d.dims()[with_groups + 2];
+    jcp.kw = weights_d.dims()[with_groups + 3];
+
+    jcp.t_pad = cd.padding[0][0];
+    jcp.l_pad = cd.padding[0][1];
+
+    jcp.stride_h = cd.strides[0];
+    jcp.stride_w = cd.strides[1];
+
+    jcp.with_bias = cd.bias_desc.format != memory_format::undef;
+    jcp.bia_dt = jcp.with_bias ? cd.bias_desc.data_type : data_type::undef;
+    jcp.dst_dt = cd.dst_desc.data_type;
+
+    jcp.src_fmt = src_d.format();
+    jcp.with_eltwise = with_relu;
+    jcp.eltwise_alpha = relu_negative_slope;
+
+    jcp.os = jcp.oh * jcp.ow;
+    jcp.is = jcp.ih * jcp.iw;
+
+    auto desired_wei_fmt = OhIw8o4i;
+    auto desired_gr_wei_fmt = gOhIw8o4i;
+
+    int simd_w = isa == avx512_common ? 16 : 8;
+
+    bool args_ok = true
+        && jcp.ngroups == 1
+        && src_d.format() == nhwc
+        && one_of(weights_d.format(), desired_wei_fmt, desired_gr_wei_fmt)
+        && one_of(cd.bias_desc.format, memory_format::undef, any, x)
+        && dst_d.format() == nhwc
+        && jcp.oc % simd_w == 0 && jcp.ic % simd_w == 0
+        && jcp.t_pad == 0 && jcp.l_pad == 0
+        && jcp.kh == 1 && jcp.kw == 1
+        && jcp.stride_h == 1 && jcp.stride_w == 1;
+
+    if (!args_ok) return status::unimplemented;
+
+    jcp.ic_block = 4;
+    jcp.oc_block = simd_w;
+
+    jcp.ur = 2;
+    jcp.ow_tail = jcp.ow % jcp.ur;
+
+    int oc_blocking{ 0 };
+    int oc_blocking_max{ 0 };
+    int os_blocking{ 0 };
+    int os_blocking_max{ 0 };
+    int ic_blocking{ 0 };
+
+    jcp.ic_dim = jcp.ic;
+    jcp.oc_dim = jcp.oc;
+    jcp.is_dim = jcp.is;
+    jcp.os_block = jcp.ur;
+
+    jcp.typesize_in = types::data_type_size(src_d.data_type());
+    jcp.typesize_out = types::data_type_size(dst_d.data_type());
+    jcp.typesize_acc = sizeof(int32_t);
+    jcp.typesize_bia = jcp.with_bias
+                       ? types::data_type_size(bias_pd.data_type())
+                       : 0;
+
+    const auto &oscales = attr.output_scales_;
+    jcp.is_oc_scale = oscales.mask_ == 1 << 1;
+
+    const auto &p = attr.post_ops_;
+    jcp.with_sum = p.find(primitive_kind::sum) != -1;
+
+    assert(IMPLICATION(!jcp.is_oc_scale, oscales.mask_ == 0));
+
+    jcp.ic_loop_src_step = jcp.ic_block * jcp.ic_loop_unroll * jcp.typesize_in;
+    jcp.ic_loop_wei_step = jcp.ic_block * jcp.ic_loop_unroll * jcp.oc_block * jcp.typesize_in;
+
+    jcp.os_loop_dst_step = jcp.ur * jcp.oc * jcp.typesize_out;
+    jcp.os_loop_acc_step = jcp.ur * jcp.oc_block * jcp.typesize_acc;
+    jcp.os_loop_src_step = jcp.stride_w * jcp.ur * jcp.ic * jcp.typesize_in;
+    jcp.os_loop_dst_tail_step = jcp.ow_tail * jcp.oc * jcp.typesize_out;
+    jcp.os_loop_acc_tail_step = jcp.ow_tail * jcp.oc_block * jcp.typesize_acc;
+    jcp.os_loop_src_tail_step = jcp.stride_w * jcp.ow_tail * jcp.ic * jcp.typesize_in
+             + ((jcp.stride_h-1)*jcp.iw*jcp.ic*jcp.typesize_in);
+
+    oc_blocking     = 4 * jcp.oc_block;
+    oc_blocking_max = 4 * jcp.oc_block;
+    os_blocking     = 48; // affects oc balancing across threads
+    os_blocking_max = 320;
+    ic_blocking     = 4*128; // affects L1$ utilization
+
+    assert(oc_blocking);
+    assert(oc_blocking_max);
+    assert(os_blocking);
+    assert(os_blocking_max);
+    assert(ic_blocking);
+
+    assert(jcp.os_block % jcp.ur == 0);
+    jcp.ur_tail = jcp.is_dim % jcp.ur;
+
+    jcp.nb_oh_blocking     = nstl::max(1, os_blocking     / jcp.ow);
+    jcp.nb_oh_blocking_max = nstl::max(1, os_blocking_max / jcp.ow);
+    jcp.nb_oc_blocking     = oc_blocking / jcp.oc_block;
+    jcp.nb_oc_blocking_max = oc_blocking_max / jcp.oc_block;
+    jcp.nb_ic_blocking     = ic_blocking / jcp.ic_block;
+
+    jcp.nb_oc = div_up(jcp.oc_dim, jcp.oc_block);
+
+    jcp.nb_ic = jcp.ic / jcp.ic_block;
+
+    return status::success;
+}
+
+template struct jit_uni_x8s8s32x_1x1_conv_fwd_kernel<avx2>;
+template struct jit_uni_x8s8s32x_1x1_conv_fwd_kernel<sse42>;
+
+}
+}
+}
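
post_ops_ok() above pins down the only attribute chains this kernel accepts: an empty chain, a single sum or ReLU, or sum followed by ReLU. The same acceptance rule as a self-contained sketch (hypothetical types):

    #include <vector>

    enum class post_op { sum, relu };

    bool post_ops_ok_sketch(bool with_eltwise, const std::vector<post_op> &p) {
        switch (p.size()) {
        case 0: return true;                       // no post ops
        case 1: return !with_eltwise
                       && (p[0] == post_op::sum || p[0] == post_op::relu);
        case 2: return !with_eltwise
                       && p[0] == post_op::sum && p[1] == post_op::relu;
        default: return false;
        }
    }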
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_conv_kernel.hpp
new file mode 100644 (file)
index 0000000..d082231
--- /dev/null
@@ -0,0 +1,98 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef JIT_UNI_X8S8S32X_1x1_CONV_KERNEL_HPP
+#define JIT_UNI_X8S8S32X_1x1_CONV_KERNEL_HPP
+
+#include "c_types_map.hpp"
+#include "type_helpers.hpp"
+#include "jit_generator.hpp"
+#include "jit_primitive_conf.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using Xbyak::Reg64;
+using Xbyak::Ymm;
+using Xbyak::Xmm;
+
+template <cpu_isa_t isa>
+struct jit_uni_x8s8s32x_1x1_conv_fwd_kernel: public jit_generator {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_x8s8s32x_1x1_conv_fwd_kernel)
+
+    jit_uni_x8s8s32x_1x1_conv_fwd_kernel(jit_1x1_conv_conf_t ajcp,
+        const primitive_attr_t &attr): jcp(ajcp), attr_(attr)
+    {
+        this->generate();
+        jit_ker = (void (*)(jit_1x1_conv_call_s *))this->getCode();
+    }
+
+    static bool post_ops_ok(jit_1x1_conv_conf_t &jcp,
+                            const primitive_attr_t &attr);
+    static status_t init_conf(jit_1x1_conv_conf_t &jcp,
+                              const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
+                              const memory_desc_wrapper &weights_d,
+                              const memory_desc_wrapper &dst_d,
+                              const memory_desc_wrapper &bias_pd,
+                              const primitive_attr_t &attr,
+                              bool with_relu = false, float relu_negative_slope = 0.f);
+
+    jit_1x1_conv_conf_t jcp;
+    const primitive_attr_t &attr_;
+    void (*jit_ker)(jit_1x1_conv_call_s *);
+
+private:
+    using Vmm = typename utils::conditional3<isa == sse42, Xbyak::Xmm,
+            isa == avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
+
+    Reg64 reg_weight_data = rsi;
+    Reg64 reg_src_data = abi_not_param1;
+    Reg64 reg_dst_data = rbx;
+    Reg64 reg_bias_data = r12;
+
+    Reg64 reg_scales = rdx;
+    Reg64 aux_reg_src_data = rdx;
+    Reg64 aux_reg_weight_data = rax;
+    Reg64 aux_reg_dst_data = rbp;
+    Reg64 reg_oc_loop_work = r9;
+    Reg64 reg_ow_loop_work = r10;
+    Reg64 reg_loop_os_iter = r14;
+    Reg64 reg_loop_ic_iter = r15;
+
+    Reg64 reg_scratch = r14;
+
+    Vmm vreg_sum_0 = Vmm(15);
+    Vmm vreg_src = Vmm(14);
+    Vmm vmm_bias = Vmm(15);
+    Vmm vmm_zero = Vmm(14);
+    Vmm vmm_one = Vmm(13);
+    Xmm xmm_one = Xmm(13);
+
+    void loop_os(int oc_loop_blk);
+    void ic_loop(int oc_loop_blk, int ur);
+
+    void generate();
+
+    bool maybe_relu(int position);
+    void cvt2ps(data_type_t type_in, Vmm vmm_in, const Xbyak::Operand &op);
+};
+
+}
+}
+}
+
+#endif
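
The header above picks its vector register type with utils::conditional3, a three-way compile-time selector (Xmm for SSE4.2, Ymm for AVX2, Zmm otherwise). Its assumed shape, expressible with a nested std::conditional:

    #include <type_traits>

    // Assumed shape of utils::conditional3: T1 when c1, else T2 when c2,
    // else T3, all resolved at compile time.
    template <bool c1, typename T1, bool c2, typename T2, typename T3>
    struct conditional3 { using type = T1; };

    template <typename T1, bool c2, typename T2, typename T3>
    struct conditional3<false, T1, c2, T2, T3> {
        using type = typename std::conditional<c2, T2, T3>::type;
    };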
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_convolution.cpp
new file mode 100644 (file)
index 0000000..1eddc79
--- /dev/null
@@ -0,0 +1,147 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "mkldnn_types.h"
+#include "c_types_map.hpp"
+#include "jit_uni_x8s8s32x_1x1_convolution.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using namespace mkldnn::impl::status;
+using namespace mkldnn::impl::memory_format;
+using namespace mkldnn::impl::utils;
+
+template <cpu_isa_t isa, bool with_relu, data_type_t src_type, data_type_t dst_type>
+void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<isa, with_relu, src_type, dst_type>::execute_forward() {
+    auto src = reinterpret_cast<const src_data_t *>(this->input_memory(0));
+    auto weights = reinterpret_cast<const wei_data_t *>(this->input_memory(1));
+    auto bias = reinterpret_cast<const char *>(this->input_memory(2));
+    auto dst = reinterpret_cast<dst_data_t *>(this->memory());
+
+    const memory_desc_wrapper src_d(conf_.src_pd());
+    const memory_desc_wrapper weights_d(conf_.weights_pd(0));
+    const memory_desc_wrapper dst_d(conf_.dst_pd());
+    const memory_desc_wrapper bias_d(conf_.weights_pd(1));
+
+    const auto &jcp = kernel_->jcp;
+
+    int ocb_work = utils::div_up(jcp.nb_oc, jcp.nb_oc_blocking);
+    int ohb_work = utils::div_up(jcp.oh, jcp.nb_oh_blocking);
+    const int work_amount = jcp.mb * jcp.ngroups * ocb_work * ohb_work;
+
+    const int stride_h = conf_.cdesc()->strides[0];
+    const int stride_w = conf_.cdesc()->strides[1];
+    const int pad_t = conf_.cdesc()->padding[0][0];
+    const int pad_l = conf_.cdesc()->padding[0][1];
+
+    const size_t bia_dt_size = conf_.with_bias()
+        ? types::data_type_size(conf_.cdesc()->bias_desc.data_type) : 0;
+
+    const auto &oscales = conf_.attr()->output_scales_;
+
+    auto ker = [&](const int ithr, const int nthr) {
+        jit_1x1_conv_call_s p = {};
+        p.acc_s32 = ws_ + ithr * ws_per_thread_;
+
+        const int oh_block = jcp.ow;
+
+        int start{0}, end{0};
+        balance211(work_amount, nthr, ithr, start, end);
+
+        int n{0}, g{0}, ocb{0}, ohb{0};
+        nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, ohb,
+                         ohb_work, ocb, ocb_work);
+
+        for (int iwork = start; iwork < end; ++iwork) {
+            int oc_ = ocb * jcp.nb_oc_blocking;
+            int oc_num = jcp.nb_oc_blocking;
+
+            int oh_ = ohb * jcp.nb_oh_blocking;
+            int oh_num = jcp.nb_oh_blocking;
+
+            int oh_step = nstl::min(oh_ + oh_num, jcp.oh) - oh_;
+
+            const int os = oh_ * oh_block;
+            const int oh = os / jcp.ow;
+            const int ow = os % jcp.ow;
+
+            const int ih = nstl::max(oh * stride_h - pad_t, 0);
+            const int iw = nstl::max(ow * stride_w - pad_l, 0);
+
+            p.os_dim = this_block_size(os, jcp.os, oh_step * oh_block);
+            p.oc_dim = nstl::min(oc_ + oc_num, jcp.nb_oc) - oc_;
+
+            const size_t dst_off = dst_d.blk_off(n, oc_*jcp.oc_block, oh, ow);
+            p.output_data = &dst[dst_off];
+
+            if (bias)
+                p.bias_data = &bias[bias_d.blk_off(oc_ * jcp.oc_block * bia_dt_size)];
+
+            p.scales = &oscales.scales_[jcp.is_oc_scale * oc_ * jcp.oc_block];
+            p.oc_data = &weights[conf_.with_groups() ? weights_d.blk_off(g, oc_, 0) : weights_d.blk_off(oc_, 0)];
+            p.is_data = src + src_d.blk_off(n, 0, ih, iw);
+
+            kernel_->jit_ker(&p);
+
+            nd_iterator_step(n, jcp.mb, g, jcp.ngroups, ohb,
+                             ohb_work, ocb, ocb_work);
+        }
+    };
+
+    parallel(0, ker);
+}
+
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, true, data_type::u8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, true, data_type::u8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, true, data_type::u8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, true, data_type::u8, data_type::f32>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, false, data_type::u8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, false, data_type::u8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, false, data_type::u8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, false, data_type::u8, data_type::f32>::execute_forward();
+
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, true, data_type::s8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, true, data_type::s8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, true, data_type::s8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, true, data_type::s8, data_type::f32>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, false, data_type::s8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, false, data_type::s8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, false, data_type::s8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, false, data_type::s8, data_type::f32>::execute_forward();
+
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, true, data_type::u8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, true, data_type::u8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, true, data_type::u8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, true, data_type::u8, data_type::f32>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, false, data_type::u8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, false, data_type::u8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, false, data_type::u8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, false, data_type::u8, data_type::f32>::execute_forward();
+
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, true, data_type::s8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, true, data_type::s8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, true, data_type::s8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, true, data_type::s8, data_type::f32>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, false, data_type::s8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, false, data_type::s8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, false, data_type::s8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, false, data_type::s8, data_type::f32>::execute_forward();
+
+}
+}
+}
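
execute_forward() above splits work_amount items across threads with balance211, whose contract (assumed here) is that every thread receives a contiguous [start, end) range and chunk sizes differ by at most one. A sketch of that behaviour:

    // Assumed behaviour of balance211: near-equal contiguous partitions,
    // with the first (n % nthr) threads receiving one extra item.
    void balance211_sketch(int n, int nthr, int ithr, int &start, int &end) {
        const int base = n / nthr;
        const int rem = n % nthr;
        start = ithr * base + (ithr < rem ? ithr : rem);
        end = start + base + (ithr < rem ? 1 : 0);
    }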
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_1x1_convolution.hpp
new file mode 100644 (file)
index 0000000..5ae3b8f
--- /dev/null
@@ -0,0 +1,140 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_JIT_UNI_X8S8S32X_1x1_CONVOLUTION_HPP
+#define CPU_JIT_UNI_X8S8S32X_1x1_CONVOLUTION_HPP
+
+#include "c_types_map.hpp"
+#include "cpu_convolution_pd.hpp"
+#include "cpu_engine.hpp"
+#include "cpu_reducer.hpp"
+#include "jit_uni_x8s8s32x_1x1_conv_kernel.hpp"
+#include "mkldnn_thread.hpp"
+#include "utils.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+template <cpu_isa_t isa, bool with_relu, impl::data_type_t src_type, impl::data_type_t dst_type>
+struct _jit_uni_x8s8s32x_1x1_convolution_fwd_t: public cpu_primitive_t {
+    struct pd_t: public _cpu_convolution_fwd_pd_t<with_relu> {
+        pd_t(engine_t *engine, const typename pd_t::base_desc_t *adesc,
+                const primitive_attr_t *attr,
+                const typename pd_t::base_class *hint_fwd_pd)
+            : _cpu_convolution_fwd_pd_t<with_relu>(engine, adesc, attr,
+                hint_fwd_pd)
+            , jcp_({}) {}
+
+        DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_1x1:", isa, ""),
+            _jit_uni_x8s8s32x_1x1_convolution_fwd_t<isa, with_relu, src_type, dst_type>);
+
+        virtual status_t init() override {
+            using namespace prop_kind;
+            assert(this->engine()->kind() == engine_kind::cpu);
+            bool ok = true
+                && this->set_default_params() == status::success
+                && utils::one_of(this->cdesc_().prop_kind, forward_training,
+                        forward_inference)
+                && this->cdesc_().alg_kind == alg_kind::convolution_direct
+                && this->cdesc_().src_desc.data_type == data_type::u8
+                && this->cdesc_().dst_desc.data_type == dst_type
+                && this->cdesc_().weights_desc.data_type == data_type::s8
+                && IMPLICATION(this->with_bias(), utils::one_of(
+                   this->cdesc_().bias_desc.data_type, data_type::f32,
+                   data_type::s32, data_type::s8, data_type::u8))
+                && this->cdesc_().accum_data_type == data_type::s32;
+            if (!ok) return status::unimplemented;
+
+            return jit_uni_x8s8s32x_1x1_conv_fwd_kernel<isa>::init_conf(jcp_,
+                        this->cdesc_(),
+                        this->src_pd_.desc(), *this->weights_pd_.desc(),
+                        *this->dst_pd_.desc(), *this->bias_pd_.desc(),
+                        *this->attr(), with_relu, this->negative_slope());
+        }
+
+        jit_1x1_conv_conf_t jcp_;
+
+    protected:
+        virtual status_t set_default_params() override {
+            using namespace memory_format;
+            auto desired_act_fmt = nhwc;
+
+            auto desired_wei_fmt = OhIw8o4i;
+            auto desired_gr_wei_fmt = gOhIw8o4i;
+
+            if (this->src_pd_.desc()->format == any)
+                CHECK(this->src_pd_.set_format(desired_act_fmt));
+            if (this->dst_pd_.desc()->format == any)
+                CHECK(this->dst_pd_.set_format(desired_act_fmt));
+            if (this->weights_pd_.desc()->format == any)
+                CHECK(this->weights_pd_.set_format(this->with_groups() ? desired_gr_wei_fmt : desired_wei_fmt));
+            if (this->bias_pd_.desc()->format == any)
+                CHECK(this->bias_pd_.set_format(x));
+            return status::success;
+        }
+    };
+
+    _jit_uni_x8s8s32x_1x1_convolution_fwd_t(const pd_t *pd, const
+                                            input_vector &inputs,
+                                            const output_vector &outputs)
+        : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd)
+        , kernel_(nullptr), ws_(nullptr)
+    {
+        kernel_ = new jit_uni_x8s8s32x_1x1_conv_fwd_kernel<isa>(conf_.jcp_, *conf_.attr());
+        const int nthreads = mkldnn_get_max_threads();
+        ws_per_thread_ = conf_.jcp_.ow * conf_.jcp_.nb_oh_blocking_max * conf_.jcp_.oc_block;
+        ws_ = (acc_data_t*)malloc(nthreads * ws_per_thread_ * sizeof(acc_data_t), 64);
+    }
+    ~_jit_uni_x8s8s32x_1x1_convolution_fwd_t() {
+        delete kernel_;
+        free(ws_);
+    }
+
+    typedef typename prec_traits<data_type::u8>::type src_data_t;
+    typedef typename prec_traits<data_type::s8>::type wei_data_t;
+    typedef typename prec_traits<dst_type>::type dst_data_t;
+    typedef typename prec_traits<data_type::s32>::type acc_data_t;
+
+    virtual void execute(event_t *e) {
+        execute_forward();
+        e->set_state(event_t::ready);
+    }
+
+private:
+    void execute_forward();
+    pd_t conf_;
+    jit_uni_x8s8s32x_1x1_conv_fwd_kernel<isa> *kernel_;
+
+    /* reduction to unit stride */
+    size_t ws_per_thread_;
+    acc_data_t *ws_;
+};
+
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_avx2_x8s8s32x_1x1_convolution_fwd_t = _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, false, src_type, dst_type>;
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_sse42_x8s8s32x_1x1_convolution_fwd_t = _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, false, src_type, dst_type>;
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_avx2_x8s8s32x_1x1_convolution_relu_t = _jit_uni_x8s8s32x_1x1_convolution_fwd_t<avx2, true, src_type, dst_type>;
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_sse42_x8s8s32x_1x1_convolution_relu_t = _jit_uni_x8s8s32x_1x1_convolution_fwd_t<sse42, true, src_type, dst_type>;
+
+}
+}
+}
+
+#endif
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_conv_kernel.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_conv_kernel.cpp
new file mode 100644 (file)
index 0000000..b94295b
--- /dev/null
@@ -0,0 +1,865 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "c_types_map.hpp"
+#include "nstl.hpp"
+#include "type_helpers.hpp"
+#include "utils.hpp"
+#include "cpu_memory.hpp"
+
+#include "jit_uni_x8s8s32x_conv_kernel.hpp"
+
+#define GET_OFF(field) offsetof(jit_conv_call_s, field)
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using namespace mkldnn::impl::prop_kind;
+using namespace mkldnn::impl::memory_format;
+using namespace mkldnn::impl::utils;
+
+using namespace Xbyak;
+
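+/* Decides whether a zero-clamp (ReLU) is required before (position 0) or
+ * after (position 1) the sum post-op; u8 destinations are clamped as well,
+ * so negative intermediate values never reach the unsigned store. */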
+template <cpu_isa_t isa>
+bool jit_uni_x8s8s32x_conv_fwd_kernel<isa>::maybe_relu(int position) {
+    using namespace primitive_kind;
+    const auto &p = attr_.post_ops_;
+
+    if (position == 0) {
+        /* relu before sum */
+        return false
+               || jcp.with_eltwise
+               || p.contain(eltwise, 0)
+               || (jcp.dst_dt == data_type::u8 && !p.contain(sum, 0));
+    } else if (position == 1) {
+        /* relu after sum */
+        const int sum_idx = p.contain(sum, 0)
+                            ? 0 : (p.contain(sum, 1) ? 1 : -1);
+        if (sum_idx == -1)
+            return false;
+
+        return false
+               || p.contain(eltwise, sum_idx + 1)
+               || jcp.dst_dt == data_type::u8;
+    }
+
+    return false;
+}
+
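+/* Loads a value of type_in from `op` into vmm_in (a single scalar or a full
+ * vector) and converts it to f32. */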
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_conv_fwd_kernel<isa>::cvt2ps(data_type_t type_in, Vmm vmm_in,
+        const Xbyak::Operand &op, bool scalar_load) {
+    Xmm xmm_in = Xmm(vmm_in.getIdx());
+
+    switch (type_in) {
+        case data_type::f32:
+        case data_type::s32:
+            if (scalar_load) {
+                movsd(xmm_in, op);
+            } else {
+                uni_vmovups(vmm_in, op);
+            }
+            break;
+        case data_type::s8:
+            if (scalar_load) {
+                movsx(reg_tmp_32, op);
+                movq(xmm_in, reg_tmp_64);
+            } else {
+                uni_vpmovsxbd(vmm_in, op);
+            }
+            break;
+        case data_type::u8:
+            if (scalar_load) {
+                movzx(reg_tmp_32, op);
+                movq(xmm_in, reg_tmp_64);
+            } else {
+                uni_vpmovzxbd(vmm_in, op);
+            }
+            break;
+        default: assert(!"unsupported data type");
+    }
+
+    if (type_in != data_type::f32)
+        uni_vcvtdq2ps(vmm_in, vmm_in);
+}
+
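+/* Converts vmm_dst to jcp.dst_dt, using the saturating pack instructions for
+ * the 8-bit types, and stores it to `op`; scalar_store writes only the
+ * lowest element. */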
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_conv_fwd_kernel<isa>::store_dst(const Xbyak::Address &op, Vmm vmm_dst, bool scalar_store) {
+    Ymm ymm_dst = Ymm(vmm_dst.getIdx());
+    Xmm xmm_dst = Xmm(vmm_dst.getIdx());
+
+    switch (jcp.dst_dt) {
+        case data_type::f32:
+        case data_type::s32:
+            if (scalar_store) {
+                movq(reg_tmp_64, xmm_dst);
+                mov(op, reg_tmp_32);
+            } else {
+                uni_vmovups(op, vmm_dst);
+            }
+            break;
+        case data_type::s8:
+            uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
+
+            if (isa != sse42 && !scalar_store)
+                vpermq(ymm_dst, ymm_dst, 0x08);
+
+            uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
+
+            if (scalar_store) {
+                movq(reg_tmp_64, xmm_dst);
+                mov(op, reg_tmp_8);
+            } else {
+                if (isa != sse42)
+                    vmovq(op, xmm_dst);
+                else
+                    movd(op, xmm_dst);
+            }
+            break;
+        case data_type::u8:
+            uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
+
+            if (isa != sse42 && !scalar_store)
+                vpermq(ymm_dst, ymm_dst, 0x08);
+
+            uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
+
+            if (scalar_store) {
+                movq(reg_tmp_64, xmm_dst);
+                mov(op, reg_tmp_8);
+            } else {
+                if (isa != sse42)
+                    vmovq(op, xmm_dst);
+                else
+                    movd(op, xmm_dst);
+            }
+
+            break;
+        default:
+            assert(!"unknown dst_dt");
+    }
+}
+
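+/* Innermost multiply-accumulate over one kernel row: for each kernel column
+ * it broadcasts ic_block (4) input bytes per output pixel, multiplies them
+ * with the s8 weights via pmaddubsw/pmaddwd and accumulates into the s32
+ * registers. Padded pixels are zeroed first; for signed input they still get
+ * the +128 shift (vmm_shift holds -128, so vpsubb adds it), which keeps the
+ * precomputed compensation term exact. */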
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_conv_fwd_kernel<isa>::apply_filter(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step,
+        int tail_size, bool h_padded) {
+    int kw = jcp.kw;
+    int kh = jcp.kh;
+    int nb_ic = jcp.nb_ic;
+    int stride_w = jcp.stride_w;
+    int dilate_w = jcp.dilate_w + 1;
+    int ic_blk = jcp.ic_block;
+    int oc_blk = jcp.oc_block;
+
+    int repeats = isa == sse42 && oc_step > (oc_blk / 2) ? 2 : 1;
+
+    for (int ki = 0; ki < kw; ki++) {
+        int jj_start = nstl::max(0, div_up(pad_l - ki * dilate_w, stride_w));
+        int jj_end = ur_w - nstl::max(0, div_up(ki*dilate_w+pad_r-(kw-1)*dilate_w, stride_w));
+
+        int _start = (jcp.signed_input) ? 0 : jj_start;
+        int _end = (jcp.signed_input) ? ur_w : jj_end;
+
+        for (int r = 0; r < repeats; r++) {
+            for (int jj = _start; jj < _end; jj++) {
+                int inp_off = (ki * dilate_w + jj * stride_w - pad_l) * jcp.ic * jcp.ngroups;
+                if (tail_size > 0) {
+                    if (h_padded || jj < jj_start || jj >= jj_end) {
+                        uni_vpxor(get_src_reg(jj), get_src_reg(jj), get_src_reg(jj));
+                        uni_vpsubb(get_src_reg(jj), get_src_reg(jj), vmm_shift);
+                        uni_vandps(get_src_reg(jj), get_src_reg(jj), vmm_mask);
+                        uni_vpbroadcastd(get_src_reg(jj), Xmm(get_src_reg(jj).getIdx()));
+                    } else {
+                        uni_vpbroadcastd(get_src_reg(jj), ptr[aux1_reg_input + jcp.typesize_in * inp_off]);
+
+                        if (jcp.signed_input) {
+                            uni_vpsubb(get_src_reg(jj), get_src_reg(jj), vmm_shift);
+                        }
+
+                        uni_vandps(get_src_reg(jj), get_src_reg(jj), vmm_mask);
+                        uni_vpbroadcastd(get_src_reg(jj), Xmm(get_src_reg(jj).getIdx()));
+                    }
+                } else {
+                    if (h_padded || jj < jj_start || jj >= jj_end) {
+                        uni_vpxor(get_src_reg(jj), get_src_reg(jj), get_src_reg(jj));
+                    } else {
+                        uni_vpbroadcastd(get_src_reg(jj), ptr[aux1_reg_input + jcp.typesize_in * inp_off]);
+                    }
+
+                    if (jcp.signed_input)
+                        uni_vpsubb(get_src_reg(jj), get_src_reg(jj), vmm_shift);
+                }
+            }
+
+            for (int ii = 0; ii < oc_blocks; ii++) {
+                int ker_off = ii * nb_ic * kh * kw * ic_blk * oc_blk + ki * ic_blk * oc_blk + r * ic_blk * (oc_blk / 2);
+                uni_vmovups(get_ker_reg(0), ptr[aux1_reg_kernel + jcp.typesize_in * ker_off]);
+
+                for (int jj = _start; jj < _end; jj++) {
+                    Vmm vmm_src = get_src_reg(jj);
+                    if (isa == sse42) {
+                        uni_vmovups(get_tmp_reg(0), vmm_src);
+                        uni_vpmaddubsw(get_tmp_reg(0), get_tmp_reg(0), get_ker_reg(0));
+                    } else {
+                        uni_vpmaddubsw(get_tmp_reg(0), vmm_src, get_ker_reg(0));
+                    }
+                    uni_vpmaddwd(get_tmp_reg(0), get_tmp_reg(0), vmm_one);
+                    uni_vpaddd(get_acc_reg(r*jcp.ur_w*jcp.nb_oc_blocking + ur_w * ii + jj),
+                               get_acc_reg(r*jcp.ur_w*jcp.nb_oc_blocking + ur_w * ii + jj), get_tmp_reg(0));
+                }
+            }
+        }
+    }
+}
+
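+/* Runs apply_filter over the input channels: full ic_block chunks in a loop,
+ * then one masked pass for the ic tail. */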
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_conv_fwd_kernel<isa>::oh_step_unroll_kw(int ur_w,
+        int pad_l, int pad_r, int oc_blocks, int oc_step, bool h_padded) {
+    int kw = jcp.kw;
+    int ic_blk = jcp.ic_block;
+    int oc_blk = jcp.oc_block;
+
+    Label ic_main_loop;
+    Label ic_tail;
+    Label exit;
+
+    mov(aux1_reg_input, aux_reg_input);
+    mov(aux1_reg_kernel, aux_reg_kernel);
+
+    mov(reg_ic_iter, jcp.ic);
+
+    L(ic_main_loop); {
+        cmp(reg_ic_iter, ic_blk);
+        jl(ic_tail, T_NEAR);
+
+        apply_filter(ur_w, pad_l, pad_r, oc_blocks, oc_step, 0, h_padded);
+
+        add(aux1_reg_input, ic_blk * jcp.typesize_in);
+        add(aux1_reg_kernel, kw * ic_blk * oc_blk * jcp.typesize_in);
+        sub(reg_ic_iter, ic_blk);
+        jmp(ic_main_loop, T_NEAR);
+    }
+
+    L(ic_tail);
+    int ic_tail_size = jcp.ic % jcp.ic_block;
+
+    if (ic_tail_size > 0)
+        apply_filter(ur_w, pad_l, pad_r, oc_blocks, oc_step, ic_tail_size, h_padded);
+
+    L(exit);
+}
+
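+/* Loop over the kernel rows. For signed input the rows that fall into the
+ * top/bottom padding are emitted separately in h_padded mode, so the input
+ * shift is accounted for in those rows as well. */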
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_conv_fwd_kernel<isa>::kh_loop(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step) {
+    int iw = jcp.iw;
+    int kw = jcp.kw;
+    int dilate_h = jcp.dilate_h + 1;
+    const int inp_mult = jcp.ic * dilate_h * jcp.ngroups;
+
+    Label t_overflow_label, no_t_overflow_label,
+          b_overflow_label, no_b_overflow_label;
+
+    mov(aux_reg_input, reg_input);
+    mov(aux_reg_kernel, reg_kernel);
+
+    mov(imm_addr64, l_table);
+    uni_vmovups(vmm_one,   ptr[imm_addr64 + 0 * vlen]);
+    uni_vmovups(vmm_shift, ptr[imm_addr64 + 1 * vlen]);
+    uni_vmovups(vmm_mask, ptr[imm_addr64 + 4 * vlen]);
+
+    if (jcp.signed_input) {
+        mov(reg_overflow,  ptr[param1 + GET_OFF(t_overflow)]);
+        cmp(reg_overflow, 0);
+        je(no_t_overflow_label, T_NEAR);
+        L(t_overflow_label); {
+            oh_step_unroll_kw(ur_w, pad_l, pad_r, oc_blocks, oc_step, true);
+
+            add(aux_reg_kernel, jcp.typesize_in * kw * jcp.oc_block * rnd_up(jcp.ic, jcp.ic_block));
+            dec(reg_overflow);
+            cmp(reg_overflow, 0);
+            jg(t_overflow_label, T_NEAR);
+        }
+        L(no_t_overflow_label);
+    }
+
+    Label skip_kh_loop;
+    mov(reg_kj, ptr[this->param1 + GET_OFF(kh_padding)]);
+    if (jcp.signed_input
+            || (jcp.kh - 1) * (jcp.dilate_h + 1) < nstl::max(jcp.t_pad, jcp.b_pad)) {
+        cmp(reg_kj, 0);
+        je(skip_kh_loop, T_NEAR);
+    }
+
+    Label kh_label;
+    L(kh_label);
+    {
+        oh_step_unroll_kw(ur_w, pad_l, pad_r, oc_blocks, oc_step, false);
+
+        add(aux_reg_kernel, jcp.typesize_in * kw * jcp.oc_block * rnd_up(jcp.ic, jcp.ic_block));
+        add(aux_reg_input, jcp.typesize_in * iw * inp_mult);
+
+        dec(reg_kj);
+        cmp(reg_kj, 0);
+        jg(kh_label, T_NEAR);
+    }
+
+    L(skip_kh_loop);
+
+    if (jcp.signed_input) {
+        mov(reg_overflow,  ptr[param1 + GET_OFF(b_overflow)]);
+        cmp(reg_overflow, 0);
+        je(no_b_overflow_label, T_NEAR);
+        L(b_overflow_label); {
+            oh_step_unroll_kw(ur_w, pad_l, pad_r, oc_blocks, oc_step, true);
+
+            add(aux_reg_kernel, jcp.typesize_in * kw * jcp.oc_block * rnd_up(jcp.ic, jcp.ic_block));
+            dec(reg_overflow);
+            cmp(reg_overflow, 0);
+            jg(b_overflow_label, T_NEAR);
+        }
+        L(no_b_overflow_label);
+    }
+}
+
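+/* Computes one block of ur_w output pixels: zeroes the accumulators, runs
+ * the kh/kw loops, then applies the post-op chain (compensation, bias,
+ * output scales, optional sum and ReLU, rounding) and stores in
+ * jcp.dst_dt. */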
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_conv_fwd_kernel<isa>::width_blk_step(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step)
+{
+    int repeats = isa == sse42 && oc_step > (jcp.oc_block / 2) ? 2 : 1;
+
+    for (int r = 0; r < repeats; r++)
+        for (int ii = 0; ii < oc_blocks; ii++)
+            for (int jj = 0; jj < ur_w; jj++)
+                uni_vpxor(get_acc_reg(r*jcp.ur_w*jcp.nb_oc_blocking + ur_w * ii + jj),
+                          get_acc_reg(r*jcp.ur_w*jcp.nb_oc_blocking + ur_w * ii + jj),
+                          get_acc_reg(r*jcp.ur_w*jcp.nb_oc_blocking + ur_w * ii + jj));
+
+    kh_loop(ur_w, pad_l, pad_r, oc_blocks, oc_step);
+
+    pop(reg_scales_base);
+
+    mov(imm_addr64, l_table);
+    uni_vmovups(vmm_bias_alpha, ptr[imm_addr64 + 2 * vlen]);
+
+    const auto &p = attr_.post_ops_;
+    const int sum_idx = p.find(primitive_kind::sum);
+    const float p_sum_scale = (sum_idx != -1) ? p.entry_[sum_idx].sum.scale : 1.f;
+
+    for (int r = 0; r < repeats; r++) {
+        int tail_size = isa == avx2 ? oc_step : nstl::min(jcp.oc_block / 2, oc_step - r * jcp.oc_block / 2);
+        bool is_scalar_store = isa == avx2 ? tail_size < jcp.oc_block : tail_size < jcp.oc_block / 2;
+
+        if (is_scalar_store) {
+            for (int jj = 0; jj < ur_w; jj++) {
+                Vmm vmm_dst = get_acc_reg(r * jcp.ur_w * jcp.nb_oc_blocking + jj);
+                uni_vcvtdq2ps(vmm_dst, vmm_dst);
+                uni_vmovups(vmm_reminder_dst, vmm_dst);
+
+                for (int oc = 0; oc < tail_size; oc++) {
+                    uni_vmovups(vmm_dst, vmm_reminder_dst);
+
+                    if (jcp.with_bias) {
+                        int b_off = r * (jcp.oc_block / 2) + oc;
+                        cvt2ps(jcp.bia_dt, vmm_bias, ptr[reg_bias_base + b_off * jcp.typesize_bia], true);
+
+                        if (jcp.signed_input)
+                            uni_vmulps(vmm_bias, vmm_bias, vmm_bias_alpha);
+                    }
+                    if (jcp.signed_input) {
+                        int c_off = r * (jcp.oc_block / 2) + oc;
+                        cvt2ps(data_type::s32, vmm_comp, ptr[reg_compensation_base + c_off * sizeof(int32_t)], true);
+                    }
+
+                    if (jcp.signed_input)
+                        uni_vaddps(vmm_dst, vmm_dst, vmm_comp);
+                    if (jcp.with_bias)
+                        uni_vaddps(vmm_dst, vmm_dst, vmm_bias);
+
+                    int s_off = jcp.is_oc_scale * (r * (jcp.oc_block / 2) + oc);
+                    cvt2ps(data_type::f32, vmm_scale, ptr[reg_scales_base + s_off * sizeof(float)], true);
+                    uni_vmulps(vmm_dst, vmm_dst, vmm_scale);
+
+                    int o_off = jj * jcp.oc * jcp.ngroups + r * (jcp.oc_block / 2) + oc;
+                    if (jcp.with_sum) {
+                        uni_vpxor(vmm_prev_dst, vmm_prev_dst, vmm_prev_dst);
+                        cvt2ps(jcp.dst_dt, vmm_prev_dst, ptr[reg_output + o_off * jcp.typesize_out], true);
+
+                        if (p_sum_scale == 1.f) {
+                            uni_vaddps(vmm_dst, vmm_dst, vmm_prev_dst);
+                        } else {
+                            uni_vfmadd231ps(vmm_dst, vmm_prev_dst, ptr[imm_addr64 + 3 * vlen]);
+                        }
+                    }
+
+                    if (maybe_relu(0)) {
+                        uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
+                        uni_vmaxps(vmm_dst, vmm_dst, vmm_zero);
+                    }
+
+                    if (maybe_relu(1)) {
+                        uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
+                        uni_vmaxps(vmm_dst, vmm_dst, vmm_zero);
+                    }
+
+                    if (jcp.dst_dt != data_type::f32) {
+                        if (attr_.round_mode_ == round_mode::nearest)
+                            uni_vcvtps2dq(vmm_dst, vmm_dst);
+                        else if (attr_.round_mode_ == round_mode::down) {
+                            uni_vroundps(vmm_dst, vmm_dst, 1);
+                            uni_vcvtps2dq(vmm_dst, vmm_dst);
+                        } else
+                            assert(!"unimplemented");
+                    }
+
+                    store_dst(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, true);
+
+                    if (isa == avx2) {
+                        vperm2i128(ymm_tmp, ymm_reminder_dst, ymm_reminder_dst, 0x01);
+                        vpalignr(ymm_reminder_dst, ymm_tmp, ymm_reminder_dst, jcp.typesize_out);
+                    } else {
+                        psrldq(vmm_reminder_dst, jcp.typesize_out);
+                    }
+                }
+            }
+        } else {
+            for (int ii = 0; ii < oc_blocks; ii++) {
+                if (jcp.with_bias) {
+                    int b_off = ii * jcp.oc_block + r * (jcp.oc_block / 2);
+                    cvt2ps(jcp.bia_dt, vmm_bias, ptr[reg_bias_base + b_off * jcp.typesize_bia], false);
+
+                    if (jcp.signed_input)
+                        uni_vmulps(vmm_bias, vmm_bias, vmm_bias_alpha);
+                }
+
+                for (int jj = 0; jj < ur_w; jj++) {
+                    Vmm vmm_dst = get_acc_reg(r * jcp.ur_w * jcp.nb_oc_blocking + ur_w * ii + jj);
+                    uni_vcvtdq2ps(vmm_dst, vmm_dst);
+
+                    if (jcp.signed_input) {
+                        int c_off = ii * jcp.oc_block + r * (jcp.oc_block / 2);
+                        cvt2ps(data_type::s32, vmm_comp, ptr[reg_compensation_base + c_off * sizeof(int32_t)], false);
+                    }
+
+                    if (jcp.signed_input)
+                        uni_vaddps(vmm_dst, vmm_dst, vmm_comp);
+                    if (jcp.with_bias)
+                        uni_vaddps(vmm_dst, vmm_dst, vmm_bias);
+
+                    int s_off = jcp.is_oc_scale * (ii * jcp.oc_block + r * (jcp.oc_block / 2));
+                    cvt2ps(data_type::f32, vmm_scale, ptr[reg_scales_base + s_off * sizeof(float)], false);
+                    uni_vmulps(vmm_dst, vmm_dst, vmm_scale);
+
+                    int o_off = ii * jcp.oc_block + jj * jcp.oc * jcp.ngroups + r * (jcp.oc_block / 2);
+                    if (jcp.with_sum) {
+                        cvt2ps(jcp.dst_dt, vmm_prev_dst, ptr[reg_output + o_off * jcp.typesize_out], false);
+
+                        if (p_sum_scale == 1.f) {
+                            uni_vaddps(vmm_dst, vmm_dst, vmm_prev_dst);
+                        } else {
+                            uni_vfmadd231ps(vmm_dst, vmm_prev_dst, ptr[imm_addr64 + 3 * vlen]);
+                        }
+                    }
+
+                    if (maybe_relu(0)) {
+                        uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
+                        uni_vmaxps(vmm_dst, vmm_dst, vmm_zero);
+                    }
+
+                    if (maybe_relu(1)) {
+                        uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
+                        uni_vmaxps(vmm_dst, vmm_dst, vmm_zero);
+                    }
+
+                    if (jcp.dst_dt != data_type::f32) {
+                        if (attr_.round_mode_ == round_mode::nearest)
+                            uni_vcvtps2dq(vmm_dst, vmm_dst);
+                        else if (attr_.round_mode_ == round_mode::down) {
+                            uni_vroundps(vmm_dst, vmm_dst, 1);
+                            uni_vcvtps2dq(vmm_dst, vmm_dst);
+                        } else
+                            assert(!"unimplemented");
+                    }
+
+                    store_dst(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, false);
+                }
+            }
+        }
+    }
+
+    push(reg_scales_base);
+}
+
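+/* Splits the output row into left-padded, middle, right-padded and tail
+ * blocks and emits one width_blk_step per block. */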
+template <cpu_isa_t isa>
+inline void jit_uni_x8s8s32x_conv_fwd_kernel<isa>::solve_common(int oc_blocks, int oc_step)
+{
+    int ur_w = jcp.ur_w;
+    int ur_w_tail = jcp.ur_w_tail;
+    int n_oi = jcp.ow / ur_w;
+    int iw = jcp.iw;
+    int kw = jcp.kw;
+    int dilate_w = jcp.dilate_w + 1;
+    int str_w = jcp.stride_w;
+    const int inp_mult = jcp.ic * jcp.ngroups;
+
+    int l_pad = jcp.l_pad;
+    int r_pad = nstl::max(0, (int(jcp.ow) - 1) * str_w + (kw - 1) * dilate_w
+            - (iw + l_pad - 1));
+    int r_pad1 = (ur_w * n_oi - 1) * str_w + (kw - 1) * dilate_w
+            - (iw + l_pad - 1);
+    if (r_pad1 > 0) n_oi--;
+
+    mov(reg_input, reg_input_base);
+    mov(reg_output, reg_output_base);
+    mov(reg_kernel, reg_kernel_base);
+
+    push(reg_input_base);
+    push(reg_output_base);
+    push(reg_kernel_base);
+    push(reg_scales_base);
+
+    if (l_pad > 0) {
+        n_oi--;
+        if (n_oi < 0 && r_pad1 > 0)
+            width_blk_step(ur_w, l_pad, r_pad1, oc_blocks, oc_step); // "lrpad"
+        else
+            width_blk_step(ur_w, l_pad, 0, oc_blocks, oc_step); // "lpad"
+        add(reg_input, jcp.typesize_in * (ur_w * str_w - l_pad) * inp_mult);
+        add(reg_output, jcp.typesize_out * ur_w * jcp.oc * jcp.ngroups);
+    }
+
+    Label ow_loop_label;
+    xor_(reg_oi_iter, reg_oi_iter);
+
+    if (n_oi > 0) {
+        L(ow_loop_label);
+
+        width_blk_step(ur_w, 0, 0, oc_blocks, oc_step); // "middle"
+        add(reg_input, jcp.typesize_in * ur_w * str_w * inp_mult);
+        add(reg_output, jcp.typesize_out * ur_w * jcp.oc * jcp.ngroups);
+
+        inc(reg_oi_iter);
+        cmp(reg_oi_iter, n_oi);
+        jl(ow_loop_label, T_NEAR);
+    }
+
+    if (r_pad1 > 0 && n_oi >= 0) {
+        width_blk_step(ur_w, 0, r_pad1, oc_blocks, oc_step); // "rpad"
+        add(reg_input, jcp.typesize_in * ur_w * str_w * inp_mult);
+        add(reg_output, jcp.typesize_out * ur_w * jcp.oc * jcp.ngroups);
+    }
+
+    if (ur_w_tail != 0)
+        width_blk_step(ur_w_tail, 0, r_pad, oc_blocks, oc_step); // "tail"
+
+    pop(reg_scales_base);
+    pop(reg_kernel_base);
+    pop(reg_output_base);
+    pop(reg_input_base);
+}
+
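+/* Kernel entry: either all jcp.nb_oc_blocking oc blocks are processed at
+ * once (fast path), or single oc blocks are looped over, with a final oc
+ * tail block. */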
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_conv_fwd_kernel<isa>::generate()
+{
+    this->preamble();
+
+    mov(reg_kernel_base, ptr[this->param1 + GET_OFF(filt)]);
+    mov(reg_input_base, ptr[this->param1 + GET_OFF(src)]);
+    mov(reg_output_base, ptr[this->param1 + GET_OFF(dst)]);
+    mov(reg_oc, ptr[this->param1 + GET_OFF(oc_work)]);
+    if (jcp.with_bias)
+        mov(reg_bias_base, ptr[this->param1 + GET_OFF(bias)]);
+    mov(reg_scales_base, ptr[this->param1 + GET_OFF(scales)]);
+    if (jcp.signed_input)
+        mov(reg_compensation_base, ptr[param1 + GET_OFF(compensation)]);
+
+    Label main_loop_label;
+    Label tail_label;
+    Label exit_label;
+
+    cmp(reg_oc, jcp.nb_oc_blocking * jcp.oc_block);
+    jne(main_loop_label, T_NEAR);
+
+    solve_common(jcp.nb_oc_blocking, jcp.oc_block);
+
+    sub(reg_oc, jcp.nb_oc_blocking * jcp.oc_block);
+
+    jmp(exit_label, T_NEAR);
+
+    L(main_loop_label); {
+        cmp(reg_oc, jcp.oc_block);
+        jl(tail_label, T_NEAR);
+
+        solve_common(1, jcp.oc_block);
+
+        sub(reg_oc, jcp.oc_block);
+        add(reg_kernel_base, jcp.oc_block * jcp.nb_ic * jcp.kh * jcp.kw * jcp.ic_block * jcp.typesize_in);
+        add(reg_output_base, jcp.oc_block * jcp.typesize_out);
+        add(reg_bias_base, jcp.oc_block * jcp.typesize_bia);
+        add(reg_scales_base, jcp.is_oc_scale * jcp.oc_block * sizeof(float));
+        add(reg_compensation_base, jcp.oc_block * sizeof(int32_t));
+
+        jmp(main_loop_label, T_NEAR);
+    }
+
+    L(tail_label);
+
+    solve_common(1, jcp.oc % jcp.oc_block);
+
+    L(exit_label);
+
+    this->postamble();
+
+    prepare_table();
+}
+
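+/* Data table referenced through l_table: a vector of 1s for pmaddwd, the
+ * -128 input shift, wei_adj_scale, the sum post-op scale and the ic-tail
+ * byte mask. The offsets must match the vlen-strided loads in kh_loop and
+ * width_blk_step. */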
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_conv_fwd_kernel<isa>::prepare_table() {
+    const auto &p = attr_.post_ops_;
+    const int sum_idx = p.find(primitive_kind::sum);
+    const float p_sum_scale = (sum_idx != -1) ? p.entry_[sum_idx].sum.scale : 1.f;
+
+    const uint16_t cvals_one[] = {
+        0x0001,
+    };
+
+    const int8_t cvals_shift[] = {
+        -128,
+    };
+
+    const int32_t cvals_scale[] = {
+        float2int(jcp.wei_adj_scale)
+    };
+
+    const int32_t cvals_sum_scale[] = {
+        float2int(p_sum_scale)
+    };
+
+    align(64);
+    L(l_table);
+    for (size_t i = 0; i < sizeof(cvals_one) / sizeof(cvals_one[0]); ++i) {
+        for (size_t d = 0; d < vlen / sizeof(uint16_t); ++d) {
+            dw(cvals_one[i]);
+        }
+    }
+
+    for (size_t i = 0; i < sizeof(cvals_shift) / sizeof(cvals_shift[0]); ++i) {
+        for (size_t d = 0; d < vlen / sizeof(int8_t); ++d) {
+            db(cvals_shift[i]);
+        }
+    }
+
+    for (size_t i = 0; i < sizeof(cvals_scale) / sizeof(cvals_scale[0]); ++i) {
+        for (size_t d = 0; d < vlen / sizeof(int32_t); ++d) {
+            dd(cvals_scale[i]);
+        }
+    }
+
+    for (size_t i = 0; i < sizeof(cvals_sum_scale) / sizeof(cvals_sum_scale[0]); ++i) {
+        for (size_t d = 0; d < vlen / sizeof(int32_t); ++d) {
+            dd(cvals_sum_scale[i]);
+        }
+    }
+
+    for (size_t i = 0; i < sizeof(cvals_shift) / sizeof(cvals_shift[0]); ++i) {
+        for (size_t d = 0; d < vlen / sizeof(int8_t); ++d) {
+            if ((int)d < jcp.ic % jcp.ic_block)
+                db(255);
+            else
+                db(0);
+        }
+    }
+}
+
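+/* Accepts only sum/ReLU post-op chains: sum, relu, sum+relu, relu+sum and
+ * relu+sum+relu. */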
+template <cpu_isa_t isa>
+bool jit_uni_x8s8s32x_conv_fwd_kernel<isa>::post_ops_ok(
+        jit_conv_conf_t &jcp, const primitive_attr_t &attr) {
+    using namespace primitive_kind;
+    const auto &p = attr.post_ops_;
+
+    auto is_relu = [&](int idx) {
+        return p.entry_[idx].kind == eltwise
+               && p.entry_[idx].eltwise.scale == 1.
+               && p.entry_[idx].eltwise.alg == alg_kind::eltwise_relu
+               && p.entry_[idx].eltwise.alpha == 0.;
+    };
+
+    switch (p.len_) {
+        case 0: return true;
+        case 1: return true
+                       && IMPLICATION(jcp.with_eltwise, p.contain(sum, 0))
+                       && IMPLICATION(!jcp.with_eltwise, is_relu(0) || p.contain(sum, 0));
+        case 2: return true
+                       && IMPLICATION(jcp.with_eltwise, p.contain(sum, 0) && is_relu(1))
+                       && IMPLICATION(!jcp.with_eltwise, false
+                                                         || (p.contain(sum, 0) && is_relu(1))
+                                                         || (p.contain(sum, 1) && is_relu(0)));
+        case 3: return true
+                       && jcp.with_eltwise == false
+                       && (is_relu(0) && p.contain(sum, 1) && is_relu(2));
+        default: return false;
+    }
+
+    return false;
+}
+
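+/* Fills jcp from the descriptors, pins the memory formats (nhwc activations,
+ * (g)OhIw8o4i weights with the s8s8 variants for signed input) and derives
+ * the unrolling/blocking parameters. */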
+template <cpu_isa_t isa>
+status_t jit_uni_x8s8s32x_conv_fwd_kernel<isa>::init_conf(jit_conv_conf_t &jcp,
+        const convolution_desc_t &cd, cpu_memory_t::pd_t &src_pd,
+        cpu_memory_t::pd_t &weights_pd, cpu_memory_t::pd_t &dst_pd,
+        cpu_memory_t::pd_t &bias_pd,
+        const primitive_attr_t &attr, bool with_relu, float relu_negative_slope)
+{
+    if (!mayiuse(isa)) return status::unimplemented;
+
+    const memory_desc_wrapper src_d(&src_pd);
+    const memory_desc_wrapper weights_d(&weights_pd);
+    const memory_desc_wrapper dst_d(&dst_pd);
+    const memory_desc_wrapper bias_d(&bias_pd);
+
+    jcp.prop_kind = cd.prop_kind;
+
+    const bool with_groups = weights_d.ndims() == src_d.ndims() + 1;
+
+    jcp.ngroups = with_groups ? weights_d.dims()[0] : 1;
+    jcp.mb = src_d.dims()[0];
+
+    jcp.oc = dst_d.dims()[1] / jcp.ngroups;
+    jcp.ic = src_d.dims()[1] / jcp.ngroups;
+
+    jcp.ih = src_d.dims()[2];
+    jcp.iw = src_d.dims()[3];
+    jcp.oh = dst_d.dims()[2];
+    jcp.ow = dst_d.dims()[3];
+
+    jcp.kh = weights_d.dims()[with_groups + 2];
+    jcp.kw = weights_d.dims()[with_groups + 3];
+
+    jcp.t_pad = cd.padding[0][0];
+    jcp.l_pad = cd.padding[0][1];
+
+    jcp.stride_h = cd.strides[0];
+    jcp.stride_w = cd.strides[1];
+
+    jcp.dilate_h = cd.dilates[0];
+    jcp.dilate_w = cd.dilates[1];
+
+    jcp.src_fmt = src_d.format();
+    jcp.with_bias = cd.bias_desc.format != memory_format::undef;
+    jcp.with_eltwise = with_relu;
+    jcp.eltwise_alpha = relu_negative_slope;
+
+    jcp.signed_input = src_d.data_type() == data_type::s8;
+
+    const int simd_w = 8;
+
+    jcp.ic_block = 4;
+    jcp.nb_ic = div_up(jcp.ic, jcp.ic_block);
+
+    jcp.oc_block = simd_w;
+    jcp.oc_padded = rnd_up(jcp.oc, jcp.oc_block);
+    jcp.nb_oc = div_up(jcp.oc, jcp.oc_block);
+
+    if (!post_ops_ok(jcp, attr))
+        return status::unimplemented;
+
+    const auto &p = attr.post_ops_;
+    jcp.with_sum = p.find(primitive_kind::sum) != -1;
+    if (!jcp.with_eltwise) {
+        jcp.with_eltwise = p.find(primitive_kind::eltwise) != -1;
+        jcp.eltwise_alpha = 0.f;
+    }
+
+    auto desired_act_fmt = nhwc;
+    auto desired_wei_fmt = with_groups ? (jcp.signed_input) ? gOhIw8o4i_s8s8 : gOhIw8o4i
+                                       : (jcp.signed_input) ?  OhIw8o4i_s8s8 :  OhIw8o4i;
+
+    if (src_d.format() == any)
+        CHECK(src_pd.set_format(desired_act_fmt));
+    if (src_d.format() != desired_act_fmt)
+        return status::unimplemented;
+
+    if (dst_d.format() == any)
+        CHECK(dst_pd.set_format(desired_act_fmt));
+    if (dst_d.format() != desired_act_fmt)
+        return status::unimplemented;
+
+    if (weights_d.format() == any)
+        CHECK(weights_pd.set_format(desired_wei_fmt));
+    if (weights_d.format() != desired_wei_fmt)
+        return status::unimplemented;
+
+    if (jcp.with_bias) {
+        if (bias_d.format() == any)
+            CHECK(bias_pd.set_format(x));
+        if (bias_d.format() != x)
+            return status::unimplemented;
+    }
+
+    jcp.bia_dt = jcp.with_bias ? cd.bias_desc.data_type : data_type::undef;
+    jcp.dst_dt = cd.dst_desc.data_type;
+
+    jcp.typesize_in = types::data_type_size(src_d.data_type());
+    jcp.typesize_out = types::data_type_size(dst_d.data_type());
+    jcp.typesize_acc = sizeof(int32_t);
+    jcp.typesize_bia = jcp.with_bias
+                       ? types::data_type_size(bias_d.data_type())
+                       : 0;
+
+    const auto &oscales = attr.output_scales_;
+    jcp.is_oc_scale = oscales.mask_ == 1 << 1;
+
+    assert(IMPLICATION(!jcp.is_oc_scale, oscales.mask_ == 0));
+
+    jcp.ur_h = 1; /* no code-unrolling by h so far */
+    jcp.ur_w = isa == avx2 ? 3 : 2;
+    jcp.nb_oc_blocking = 2;
+    if (jcp.nb_oc % jcp.nb_oc_blocking != 0) jcp.nb_oc_blocking = 1;
+
+    if (jcp.ow < jcp.ur_w) jcp.ur_w = jcp.ow;
+    jcp.ur_w_tail = jcp.ow % jcp.ur_w;
+
+    bool args_ok = true
+        && jcp.l_pad <= jcp.ur_w
+        && IMPLICATION(jcp.kw > 7, (jcp.t_pad == 0 && jcp.l_pad == 0)
+                || (jcp.stride_w == 1 && jcp.stride_h == 1));
+    if (!args_ok) return status::unimplemented;
+
+    int r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w
+        + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1));
+
+    if (r_pad_no_tail > jcp.ur_w) {
+        /* recalculate ur_w, nb_oc_blocking and ur_w_tail */
+        jcp.ur_w = r_pad_no_tail + 1;
+        jcp.ur_w_tail = jcp.ow % jcp.ur_w;
+        /* check again ... */
+        r_pad_no_tail = nstl::max(0, (jcp.ow - jcp.ur_w_tail - 1) * jcp.stride_w
+            + (jcp.kw - 1) * (jcp.dilate_w + 1) - (jcp.iw + jcp.l_pad - 1));
+        if ((r_pad_no_tail > jcp.ur_w) || (jcp.ow < jcp.ur_w))
+            return status::unimplemented;
+    }
+    if (jcp.l_pad > jcp.ur_w) return status::unimplemented;
+
+    jcp.wei_adj_scale = (jcp.signed_input) ? (1.0f / 2.0f) : 1.0f;
+
+    return status::success;
+}
+
+template struct jit_uni_x8s8s32x_conv_fwd_kernel<avx2>;
+template struct jit_uni_x8s8s32x_conv_fwd_kernel<sse42>;
+
+}
+}
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_conv_kernel.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_conv_kernel.hpp
new file mode 100644 (file)
index 0000000..110fa3a
--- /dev/null
@@ -0,0 +1,132 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef JIT_UNI_X8S8S32X_CONV_KERNEL_HPP
+#define JIT_UNI_X8S8S32X_CONV_KERNEL_HPP
+
+#include "c_types_map.hpp"
+#include "jit_generator.hpp"
+#include "jit_primitive_conf.hpp"
+#include "cpu_memory.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+template <cpu_isa_t isa>
+struct jit_uni_x8s8s32x_conv_fwd_kernel: public jit_generator {
+    jit_uni_x8s8s32x_conv_fwd_kernel(jit_conv_conf_t ajcp,
+            const primitive_attr_t &attr): jcp(ajcp), attr_(attr)
+    {
+        this->generate();
+        jit_ker = (void (*)(jit_conv_call_s *))this->getCode();
+    }
+
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_x8s8s32x_conv_fwd_kernel)
+
+    static bool post_ops_ok(jit_conv_conf_t &jcp,
+            const primitive_attr_t &attr);
+    static status_t init_conf(jit_conv_conf_t &jcp,
+            const convolution_desc_t &cd,
+            cpu_memory_t::pd_t &src_pd,
+            cpu_memory_t::pd_t &weights_pd,
+            cpu_memory_t::pd_t &dst_pd,
+            cpu_memory_t::pd_t &bias_pd,
+            const primitive_attr_t &attr,
+            bool with_relu = false,
+            float relu_negative_slope = 0.);
+
+    jit_conv_conf_t jcp;
+    const primitive_attr_t &attr_;
+    void (*jit_ker)(jit_conv_call_s *);
+
+private:
+    using Vmm = typename utils::conditional3<isa == sse42, Xbyak::Xmm,
+            isa == avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
+    const int vlen = cpu_isa_traits<isa>::vlen;
+    using Ymm = const Xbyak::Ymm;
+    using reg64_t = const Xbyak::Reg64;
+    using reg32_t = const Xbyak::Reg32;
+    using reg8_t = const Xbyak::Reg8;
+
+    reg64_t reg_scales_base = r13;
+    reg64_t reg_bias_base = rbp;
+    reg64_t reg_input_base = r8;
+    reg64_t reg_output_base = r9;
+    reg64_t reg_kernel_base = rbx;
+
+    reg64_t reg_input = rax;
+    reg64_t aux_reg_input = r8;
+    reg64_t aux1_reg_input = r13;
+    reg64_t reg_kernel = rdx;
+    reg64_t aux_reg_kernel = r9;
+    reg64_t aux1_reg_kernel = rbx;
+    reg64_t reg_output = rsi;
+
+    reg64_t reg_kj = r10;
+    reg64_t reg_overflow = r10;
+    reg64_t reg_oi_iter = r11;
+    reg64_t reg_ic_iter = r15;
+    reg64_t reg_compensation_base = abi_not_param1;
+    reg64_t reg_oc = r12;
+    reg64_t imm_addr64 = rbx;
+
+    reg8_t reg_tmp_8 = r14b;
+    reg32_t reg_tmp_32 = r14d;
+    reg64_t reg_tmp_64 = r14;
+
+    Vmm vmm_zero = Vmm(14);
+    Vmm vmm_one = Vmm(15);
+    Vmm vmm_bias_alpha = Vmm(13);
+    Vmm vmm_shift = Vmm(14);
+    Vmm vmm_mask = Vmm(13);
+    Vmm vmm_bias = Vmm(15);
+    Vmm vmm_reminder_dst = Vmm(11);
+    Ymm ymm_reminder_dst = Ymm(11);
+    Ymm ymm_tmp = Ymm(10);
+    Vmm vmm_scale = Vmm(12);
+    Vmm vmm_comp = Vmm(12);
+    Vmm vmm_prev_dst = Vmm(12);
+
+    inline Vmm get_src_reg(int idx) { return Vmm(idx + 8); }
+    inline Vmm get_ker_reg(int idx) { return Vmm(idx + 11); }
+    inline Vmm get_tmp_reg(int idx) { return Vmm(idx + 12); }
+    inline Vmm get_acc_reg(int idx) { return Vmm(idx + 0); }
+
+    inline void cvt2ps(data_type_t type_in, Vmm ymm_in, const Xbyak::Operand &op, bool scalar_load);
+    inline void store_dst(const Xbyak::Address &op, Vmm vmm_dst, bool scalar_store);
+
+    inline void apply_filter(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step,
+                             int tail_size, bool h_padded);
+    inline void oh_step_unroll_kw(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step, bool h_padded);
+    inline void kh_loop(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step);
+    inline void width_blk_step(int ur_w, int pad_l, int pad_r, int oc_blocks, int oc_step);
+    inline void solve_common(int oc_blocks, int oc_step);
+
+    bool maybe_relu(int position);
+
+    void generate();
+
+    void prepare_table();
+
+    Xbyak::Label l_table;
+};
+
+}
+}
+}
+
+#endif
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_convolution.cpp
new file mode 100644 (file)
index 0000000..d574361
--- /dev/null
@@ -0,0 +1,151 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "mkldnn_types.h"
+#include "c_types_map.hpp"
+#include "jit_uni_x8s8s32x_convolution.hpp"
+#include "utils.hpp"
+#include "mkldnn_thread.hpp"
+#include "type_helpers.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using namespace mkldnn::impl::status;
+using namespace mkldnn::impl::memory_format;
+using namespace mkldnn::impl::utils;
+
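+/* Forward pass driver: the work amount (mb x groups x oc-block groups x oh)
+ * is balanced across threads; each iteration fills a jit_conv_call_s with
+ * the per-row overflow/padding values and invokes the generated kernel. */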
+template <cpu_isa_t isa, bool with_relu, impl::data_type_t src_type, data_type_t dst_type>
+void _jit_uni_x8s8s32x_convolution_fwd_t<isa, with_relu, src_type, dst_type>::execute_forward() {
+    auto src = reinterpret_cast<const src_data_t *>(this->input_memory(0));
+    auto weights = reinterpret_cast<const wei_data_t *>(this->input_memory(1));
+    auto bias = reinterpret_cast<const char *>(this->input_memory(2));
+    auto dst = reinterpret_cast<dst_data_t *>(this->memory());
+
+    const memory_desc_wrapper src_d(conf_.src_pd());
+    const memory_desc_wrapper dst_d(conf_.dst_pd());
+    const memory_desc_wrapper weights_d(conf_.weights_pd(0));
+    const memory_desc_wrapper bias_d(conf_.weights_pd(1));
+
+    const auto &jcp = kernel_->jcp;
+
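+    /* For signed input the s32 compensation terms are laid out directly
+     * after the (padded) weights, so they are recovered through the weights
+     * pointer. */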
+    size_t offset = (size_t)jcp.ngroups * rnd_up(jcp.oc, jcp.oc_block) * rnd_up(jcp.ic, jcp.ic_block) * jcp.kh * jcp.kw;
+    auto w = const_cast<wei_data_t *>(weights);
+    int32_t *compensation = (jcp.signed_input) ? reinterpret_cast<int32_t *>(&w[offset]) : nullptr;
+
+    const size_t bia_dt_size = conf_.with_bias() ? types::data_type_size(conf_.cdesc()->bias_desc.data_type) : 0;
+    float* scales = conf_.attr()->output_scales_.scales_;
+
+    int ocb_work = div_up(jcp.nb_oc, jcp.nb_oc_blocking);
+    const size_t work_amount = jcp.mb * jcp.ngroups * ocb_work * jcp.oh;
+
+    auto ker = [&](const int ithr, const int nthr) {
+        size_t start{0}, end{0};
+        balance211(work_amount, nthr, ithr, start, end);
+
+        size_t n{0}, g{0}, ocbb{0}, oh{0};
+        nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, ocbb, ocb_work,
+                         oh, jcp.oh);
+        for (size_t iwork = start; iwork < end; ++iwork) {
+            int ocb = ocbb * jcp.nb_oc_blocking;
+            int ocb_num = jcp.nb_oc_blocking;
+
+            jit_conv_call_s par_conv = {};
+
+            const int ij = oh * jcp.stride_h;
+            const int i_t_overflow = nstl::min(jcp.kh, div_up(nstl::max(0, jcp.t_pad - ij), (jcp.dilate_h+1)));
+            const int i_b_overflow = nstl::min(jcp.kh, div_up(nstl::max(jcp.ih, ij + (jcp.kh-1) * (jcp.dilate_h+1) -
+                                               jcp.t_pad+1) - jcp.ih, (jcp.dilate_h + 1)));
+
+            const size_t _oc = g * jcp.nb_oc + ocb;
+            const size_t _ic = g * jcp.nb_ic;
+
+            const int ih = nstl::max(ij - jcp.t_pad + i_t_overflow * (jcp.dilate_h + 1), 0);
+            par_conv.src = &src[src_d.blk_off(n, _ic*jcp.ic_block, ih, 0)];
+
+            size_t dst_off = dst_d.blk_off(n, _oc*jcp.oc_block, oh, 0);
+            par_conv.dst = &dst[dst_off];
+
+            const int wh = (!jcp.signed_input) ? i_t_overflow : 0;
+            par_conv.filt = &weights[conf_.with_groups()
+                                ? weights_d.blk_off(g, ocb, 0, wh, 0)
+                                : weights_d.blk_off(ocb, 0, wh, 0)];
+
+            if (bias)
+                par_conv.bias = &bias[bias_d.blk_off(_oc * jcp.oc_block*bia_dt_size)];
+
+            par_conv.oc_work =
+                    nstl::min((ocb + ocb_num) * jcp.oc_block, jcp.oc) - ocb*jcp.oc_block;
+
+            par_conv.kw_padding = 0;
+            const int kh_padding = jcp.kh - i_t_overflow - i_b_overflow;
+            par_conv.kh_padding = nstl::max(0, kh_padding);
+
+            par_conv.scales = (jcp.signed_input) ? &local_scales_[jcp.is_oc_scale * _oc * jcp.oc_block]
+                                                 : &scales[jcp.is_oc_scale * _oc * jcp.oc_block];
+
+            par_conv.compensation = (jcp.signed_input) ? compensation + _oc * jcp.oc_block : nullptr;
+            par_conv.t_overflow = i_t_overflow;
+            par_conv.b_overflow = i_b_overflow;
+
+            kernel_->jit_ker(&par_conv);
+            nd_iterator_step(n, jcp.mb, g, jcp.ngroups, ocbb, ocb_work, oh, jcp.oh);
+        }
+    };
+
+    parallel(0, ker);
+}
+
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, true, data_type::u8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, true, data_type::u8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, true, data_type::u8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, true, data_type::u8, data_type::f32>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, false, data_type::u8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, false, data_type::u8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, false, data_type::u8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, false, data_type::u8, data_type::f32>::execute_forward();
+
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, true, data_type::s8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, true, data_type::s8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, true, data_type::s8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, true, data_type::s8, data_type::f32>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, false, data_type::s8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, false, data_type::s8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, false, data_type::s8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<avx2, false, data_type::s8, data_type::f32>::execute_forward();
+
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, true, data_type::u8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, true, data_type::u8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, true, data_type::u8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, true, data_type::u8, data_type::f32>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, false, data_type::u8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, false, data_type::u8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, false, data_type::u8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, false, data_type::u8, data_type::f32>::execute_forward();
+
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, true, data_type::s8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, true, data_type::s8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, true, data_type::s8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, true, data_type::s8, data_type::f32>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, false, data_type::s8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, false, data_type::s8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, false, data_type::s8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_convolution_fwd_t<sse42, false, data_type::s8, data_type::f32>::execute_forward();
+
+}
+}
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_convolution.hpp
new file mode 100644 (file)
index 0000000..efd1185
--- /dev/null
@@ -0,0 +1,127 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_JIT_UNI_X8S8S32X_CONVOLUTION_HPP
+#define CPU_JIT_UNI_X8S8S32X_CONVOLUTION_HPP
+
+#include "c_types_map.hpp"
+#include "cpu_convolution_pd.hpp"
+#include "cpu_engine.hpp"
+#include "cpu_reducer.hpp"
+#include "jit_primitive_conf.hpp"
+#include "jit_uni_x8s8s32x_conv_kernel.hpp"
+#include "jit_generator.hpp"
+#include "mkldnn_thread.hpp"
+
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+template <cpu_isa_t isa, bool with_relu, impl::data_type_t src_type, impl::data_type_t dst_type>
+struct _jit_uni_x8s8s32x_convolution_fwd_t: public cpu_primitive_t {
+    struct pd_t: public _cpu_convolution_fwd_pd_t<with_relu> {
+        pd_t(engine_t *engine,
+                const typename pd_t::base_desc_t *adesc,
+                const primitive_attr_t *attr,
+                const typename pd_t::base_class *hint_fwd_pd)
+            : _cpu_convolution_fwd_pd_t<with_relu>(engine, adesc, attr,
+                    hint_fwd_pd)
+            , jcp_({}) {}
+
+        DECLARE_COMMON_PD_T(
+                JIT_IMPL_NAME_HELPER("jit:", isa, ""),
+                _jit_uni_x8s8s32x_convolution_fwd_t<isa, with_relu, src_type, dst_type>);
+
+        virtual status_t init() override {
+            using namespace prop_kind;
+            assert(this->engine()->kind() == engine_kind::cpu);
+            bool ok = true
+                && utils::one_of(this->cdesc_().prop_kind, forward_training,
+                        forward_inference)
+                && this->cdesc_().alg_kind == alg_kind::convolution_direct
+                && IMPLICATION(this->with_bias(), utils::one_of(
+                    this->cdesc_().bias_desc.data_type, data_type::f32,
+                    data_type::s32, data_type::s8, data_type::u8))
+                && this->cdesc_().accum_data_type == data_type::s32
+                && this->cdesc_().src_desc.data_type == src_type
+                && this->cdesc_().dst_desc.data_type == dst_type;
+            if (!ok) return status::unimplemented;
+
+            return jit_uni_x8s8s32x_conv_fwd_kernel<isa>::init_conf(jcp_, this->cdesc_(),
+                    this->src_pd_, this->weights_pd_,
+                    this->dst_pd_, this->bias_pd_, *this->attr(),
+                    with_relu, this->negative_slope());
+        }
+
+        jit_conv_conf_t jcp_;
+    };
+
+    _jit_uni_x8s8s32x_convolution_fwd_t(const pd_t *pd, const input_vector &inputs,
+            const output_vector &outputs)
+        : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), local_scales_(nullptr) {
+        kernel_ = new jit_uni_x8s8s32x_conv_fwd_kernel<isa>(conf_.jcp_, *conf_.attr());
+
+        if (conf_.jcp_.signed_input) {
+            size_t scales_size = (conf_.attr()->output_scales_.count_ == 1)
+                                 ? 8
+                                 : conf_.attr()->output_scales_.count_;
+            local_scales_ = (float *)malloc(sizeof(float) * scales_size, 64);
+            for (size_t i = 0; i < scales_size; i++) {
+                local_scales_[i] = conf_.attr()->output_scales_.scales_[i] *
+                                   (1.f / conf_.jcp_.wei_adj_scale);
+            }
+        }
+    }
+
+    ~_jit_uni_x8s8s32x_convolution_fwd_t() {
+        delete kernel_;
+        if (local_scales_) free(local_scales_);
+    }
+
+    typedef typename prec_traits<data_type::u8>::type src_data_t;
+    typedef typename prec_traits<data_type::s8>::type wei_data_t;
+    typedef typename prec_traits<dst_type>::type dst_data_t;
+
+    virtual void execute(event_t *e) {
+        execute_forward();
+        e->set_state(event_t::ready);
+    }
+
+private:
+    void execute_forward();
+    pd_t conf_;
+    jit_uni_x8s8s32x_conv_fwd_kernel<isa> *kernel_;
+    float *local_scales_;
+};
+
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_avx2_x8s8s32x_convolution_fwd_t = _jit_uni_x8s8s32x_convolution_fwd_t<avx2, false, src_type, dst_type>;
+
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_avx2_x8s8s32x_convolution_relu_t = _jit_uni_x8s8s32x_convolution_fwd_t<avx2, true, src_type, dst_type>;
+
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_sse42_x8s8s32x_convolution_fwd_t = _jit_uni_x8s8s32x_convolution_fwd_t<sse42, false, src_type, dst_type>;
+
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_sse42_x8s8s32x_convolution_relu_t = _jit_uni_x8s8s32x_convolution_fwd_t<sse42, true, src_type, dst_type>;
+
+}
+}
+}
+
+#endif
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.cpp
new file mode 100644 (file)
index 0000000..c02bd80
--- /dev/null
@@ -0,0 +1,662 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "c_types_map.hpp"
+#include "nstl.hpp"
+#include "type_helpers.hpp"
+#include "utils.hpp"
+#include "cpu_memory.hpp"
+
+#include "jit_uni_x8s8s32x_dw_conv_kernel.hpp"
+
+#define GET_OFF(field) offsetof(jit_conv_call_s, field)
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using namespace mkldnn::impl::prop_kind;
+using namespace mkldnn::impl::memory_format;
+using namespace mkldnn::impl::utils;
+
+using namespace Xbyak;
+
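+/* Despite the name this only zeroes the s32 accumulators: one register per
+ * (channel block, output pixel) pair, twice on sse42, which covers half a
+ * channel block per pass. */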
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::load_src(int ur_ch_blocks, int ch_step, int ur_w) {
+    int repeats = isa == sse42 && ch_step > (jcp.ch_block / 2) ? 2 : 1;
+    for (int i = 0; i < repeats; i++) {
+        for (int ch = 0; ch < ur_ch_blocks; ch++) {
+            for (int ow = 0; ow < ur_w; ow++) {
+                Vmm vmm_acc = get_acc_reg(i*ur_ch_blocks*ur_w + ch*ur_w + ow);
+
+                uni_vpxor(vmm_acc, vmm_acc, vmm_acc);
+            }
+        }
+    }
+}
+
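+/* Generic kh/kw loop of the depthwise kernel: widens ch_block s8 weights and
+ * u8 inputs to s32 and multiply-accumulates per channel (a scalar path is
+ * used when ch_step == 1). */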
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::apply_filter(int ur_ch_blocks, int ch_step, int ur_w) {
+    int ch_blk = jcp.ch_block;
+    int dilate_h = jcp.dilate_h + 1;
+    int dilate_w = jcp.dilate_w + 1;
+    int stride_w = jcp.stride_w;
+
+    Label iter_exit_label;
+
+    cmp(reg_kh, 0);
+    je(iter_exit_label, T_NEAR);
+    cmp(reg_kw, 0);
+    je(iter_exit_label, T_NEAR);
+
+    mov(iter_kh, reg_kh);
+    Label kh_label;
+    L(kh_label); {
+        mov(iter_kw, reg_kw);
+        mov(aux1_reg_input, aux_reg_input);
+        mov(aux1_reg_kernel, aux_reg_kernel);
+
+        Label kw_label;
+        L(kw_label); {
+            int repeats = isa == sse42 && ch_step > (jcp.ch_block / 2) ? 2 : 1;
+            for (int i = 0; i < repeats; i++) {
+                for (int ch = 0; ch < ur_ch_blocks; ch++) {
+                    int ker_off = ch*jcp.kh*jcp.kw*ch_blk + i*(ch_blk / 2);
+                    Vmm vmm_ker = get_ker_reg(0);
+                    Xmm xmm_ker = Xmm(vmm_ker.getIdx());
+
+                    if (ch_step == 1) {
+                        movsx(reg_tmp_32, ptr[aux1_reg_kernel + ker_off*jcp.typesize_in]);
+                        movq(xmm_ker, reg_tmp_64);
+                    } else {
+                        uni_vpmovsxbd(vmm_ker, ptr[aux1_reg_kernel + ker_off*jcp.typesize_in]);
+                    }
+
+                    for (int ow = 0; ow < ur_w; ow++) {
+                        int inp_off = ch*ch_blk + ow*stride_w*jcp.oc + i*(ch_blk / 2);
+                        Vmm vmm_src = get_src_reg(0);
+                        Xmm xmm_src = Xmm(vmm_src.getIdx());
+
+                        if (ch_step == 1) {
+                            movzx(reg_tmp_32, ptr[aux1_reg_input + inp_off * jcp.typesize_in]);
+                            movq(xmm_src, reg_tmp_64);
+                        } else {
+                            uni_vpmovzxbd(vmm_src, ptr[aux1_reg_input + inp_off * jcp.typesize_in]);
+                        }
+
+                        Vmm vmm_acc = get_acc_reg(i*ur_ch_blocks*ur_w + ch*ur_w + ow);
+                        uni_vpmulld(vmm_src, vmm_src, vmm_ker);
+                        uni_vpaddd(vmm_acc, vmm_acc, vmm_src);
+                    }
+                }
+            }
+            add(aux1_reg_kernel, ch_blk*jcp.typesize_in);
+            add(aux1_reg_input, jcp.oc*dilate_w*jcp.typesize_in);
+
+            dec(iter_kw);
+            cmp(iter_kw, 0);
+            jg(kw_label, T_NEAR);
+        }
+        add(aux_reg_kernel, jcp.kw*ch_blk*jcp.typesize_in);
+        add(aux_reg_input, jcp.iw*jcp.oc*dilate_h*jcp.typesize_in);
+
+        dec(iter_kh);
+        cmp(iter_kh, 0);
+        jg(kh_label, T_NEAR);
+    }
+
+    L(iter_exit_label);
+}
+
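+/* Same accumulation as apply_filter, but with the kernel-width loop fully
+ * unrolled at compile time. */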
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::apply_filter_unrolled(int ur_ch_blocks, int ch_step, int ur_w) {
+    int ch_blk = jcp.ch_block;
+    int dilate_h = jcp.dilate_h + 1;
+    int dilate_w = jcp.dilate_w + 1;
+    int stride_w = jcp.stride_w;
+
+    Label iter_exit_label;
+
+    cmp(reg_kh, 0);
+    je(iter_exit_label, T_NEAR);
+
+    mov(iter_kh, reg_kh);
+    Label kh_label;
+    L(kh_label); {
+        int repeats = isa == sse42 && ch_step > (jcp.ch_block / 2) ? 2 : 1;
+        for (int i = 0; i < repeats; i++) {
+            for (int ch = 0; ch < ur_ch_blocks; ch++) {
+                for (int kw = 0; kw < jcp.kw; kw++) {
+                    int ker_off = ch*jcp.kh*jcp.kw*ch_blk + kw*ch_blk + i*(ch_blk / 2);
+                    Vmm vmm_ker = get_ker_reg(0);
+                    Xmm xmm_ker = Xmm(vmm_ker.getIdx());
+
+                    if (ch_step == 1) {
+                        movsx(reg_tmp_32, ptr[aux_reg_kernel + ker_off*jcp.typesize_in]);
+                        movq(xmm_ker, reg_tmp_64);
+                    } else {
+                        uni_vpmovsxbd(vmm_ker, ptr[aux_reg_kernel + ker_off*jcp.typesize_in]);
+                    }
+
+                    for (int ow = 0; ow < ur_w; ow++) {
+                        int inp_off = ch*ch_blk + ow*stride_w*jcp.oc + kw*jcp.oc*dilate_w + i*(ch_blk / 2);
+                        Vmm vmm_src = get_src_reg(0);
+                        Xmm xmm_src = Xmm(vmm_src.getIdx());
+
+                        if (ch_step == 1) {
+                            movzx(reg_tmp_32, ptr[aux_reg_input + inp_off * jcp.typesize_in]);
+                            movq(xmm_src, reg_tmp_64);
+                        } else {
+                            uni_vpmovzxbd(vmm_src, ptr[aux_reg_input + inp_off * jcp.typesize_in]);
+                        }
+
+                        Vmm vmm_acc = get_acc_reg(i*ur_ch_blocks*ur_w + ch*ur_w + ow);
+                        uni_vpmulld(vmm_src, vmm_src, vmm_ker);
+                        uni_vpaddd(vmm_acc, vmm_acc, vmm_src);
+                    }
+                }
+            }
+        }
+
+        add(aux_reg_kernel, jcp.kw*ch_blk*jcp.typesize_in);
+        add(aux_reg_input, jcp.iw*jcp.oc*dilate_h*jcp.typesize_in);
+
+        dec(iter_kh);
+        cmp(iter_kh, 0);
+        jg(kh_label, T_NEAR);
+    }
+
+    L(iter_exit_label);
+}
+
+template <cpu_isa_t isa>
+bool jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::maybe_relu(int position) {
+    using namespace primitive_kind;
+    const auto &p = attr_.post_ops_;
+
+    if (position == 0) {
+        /* relu before sum */
+        return false
+               || jcp.with_eltwise
+               || p.contain(eltwise, 0)
+               || (jcp.dst_dt == data_type::u8 && !p.contain(sum, 0));
+    } else if (position == 1) {
+        /* relu after sum */
+        const int sum_idx = p.contain(sum, 0)
+                            ? 0 : (p.contain(sum, 1) ? 1 : -1);
+        if (sum_idx == -1)
+            return false;
+
+        return false
+               || p.contain(eltwise, sum_idx + 1)
+               || jcp.dst_dt == data_type::u8;
+    }
+
+    return false;
+}
+
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::store_dst(const Xbyak::Address &op, Vmm vmm_dst, bool scalar_store) {
+    Ymm ymm_dst = Ymm(vmm_dst.getIdx());
+    Xmm xmm_dst = Xmm(vmm_dst.getIdx());
+
+    switch (jcp.dst_dt) {
+        case data_type::f32:
+        case data_type::s32:
+            if (scalar_store) {
+                movq(reg_tmp_64, xmm_dst);
+                mov(op, reg_tmp_32);
+            } else {
+                uni_vmovups(op, vmm_dst);
+            }
+            break;
+        case data_type::s8:
+            uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
+
+            if (isa != sse42 && !scalar_store)
+                vpermq(ymm_dst, ymm_dst, 0x08);
+
+            uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
+
+            if (scalar_store) {
+                movq(reg_tmp_64, xmm_dst);
+                mov(op, reg_tmp_8);
+            } else {
+                if (isa != sse42)
+                    vmovq(op, xmm_dst);
+                else
+                    movd(op, xmm_dst);
+            }
+            break;
+        case data_type::u8:
+            uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
+
+            if (isa != sse42 && !scalar_store)
+                vpermq(ymm_dst, ymm_dst, 0x08);
+
+            uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
+
+            if (scalar_store) {
+                movq(reg_tmp_64, xmm_dst);
+                mov(op, reg_tmp_8);
+            } else {
+                if (isa != sse42)
+                    vmovq(op, xmm_dst);
+                else
+                    movd(op, xmm_dst);
+            }
+
+            break;
+        default:
+            assert(!"unknown dst_dt");
+    }
+}
+
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::cvt2ps(data_type_t type_in, Vmm vmm_in,
+        const Xbyak::Operand &op, bool scalar_load) {
+    Xmm xmm_in = Xmm(vmm_in.getIdx());
+
+    switch (type_in) {
+        case data_type::f32:
+        case data_type::s32:
+            if (scalar_load) {
+                movsd(xmm_in, op);
+            } else {
+                uni_vmovups(vmm_in, op);
+            }
+            break;
+        case data_type::s8:
+            if (scalar_load) {
+                movsx(reg_tmp_32, op);
+                movq(xmm_in, reg_tmp_64);
+            } else {
+                uni_vpmovsxbd(vmm_in, op);
+            }
+            break;
+        case data_type::u8:
+            if (scalar_load) {
+                movzx(reg_tmp_32, op);
+                movq(xmm_in, reg_tmp_64);
+            } else {
+                uni_vpmovzxbd(vmm_in, op);
+            }
+            break;
+        default: assert(!"unsupported data type");
+    }
+
+    if (type_in != data_type::f32)
+        uni_vcvtdq2ps(vmm_in, vmm_in);
+}
+
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::store_dst(int ur_ch_blocks, int ch_step, int ur_w) {
+    int repeats = isa == sse42 && ch_step > (jcp.ch_block / 2) ? 2 : 1;
+
+    pop(reg_scales_base);
+
+    uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
+    for (int r = 0; r < repeats; r++) {
+        if (ch_step < jcp.ch_block) {
+            for (int jj = 0; jj < ur_w; jj++) {
+                Vmm vmm_dst = get_acc_reg(r * ur_w * ur_ch_blocks + jj);
+                uni_vcvtdq2ps(vmm_dst, vmm_dst);
+
+                if (jcp.with_bias) {
+                    int b_off = r * (jcp.ch_block / 2);
+                    cvt2ps(jcp.bia_dt, vmm_bias, ptr[reg_bias_base + b_off * jcp.typesize_bia], true);
+                    uni_vaddps(vmm_dst, vmm_dst, vmm_bias);
+                }
+
+                int s_off = jcp.is_oc_scale * (r * (jcp.ch_block / 2));
+                cvt2ps(mkldnn_f32, vmm_scale, ptr[reg_scales_base + s_off * sizeof(float)], true);
+                uni_vmulps(vmm_dst, vmm_dst, vmm_scale);
+
+                int o_off = jj * jcp.oc + r * (jcp.ch_block / 2);
+                if (jcp.with_sum) {
+                    uni_vpxor(vmm_prev_dst, vmm_prev_dst, vmm_prev_dst);
+                    cvt2ps(jcp.dst_dt, vmm_prev_dst, ptr[reg_output + o_off * jcp.typesize_out], true);
+                    uni_vaddps(vmm_dst, vmm_dst, vmm_prev_dst);
+                }
+
+                if (maybe_relu(0))
+                    uni_vmaxps(vmm_dst, vmm_dst, vmm_zero);
+
+                if (maybe_relu(1))
+                    uni_vmaxps(vmm_dst, vmm_dst, vmm_zero);
+
+                if (jcp.dst_dt != data_type::f32) {
+                    if (attr_.round_mode_ == round_mode::nearest)
+                        uni_vcvtps2dq(vmm_dst, vmm_dst);
+                    else if (attr_.round_mode_ == round_mode::down) {
+                        uni_vroundps(vmm_dst, vmm_dst, 1);
+                        uni_vcvtps2dq(vmm_dst, vmm_dst);
+                    } else
+                        assert(!"unimplemented");
+                }
+
+                store_dst(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, true);
+            }
+        } else {
+            for (int ii = 0; ii < ur_ch_blocks; ii++) {
+                if (jcp.with_bias) {
+                    int b_off = ii * jcp.ch_block + r * (jcp.ch_block / 2);
+                    cvt2ps(jcp.bia_dt, vmm_bias, ptr[reg_bias_base + b_off * jcp.typesize_bia], false);
+                }
+
+                for (int jj = 0; jj < ur_w; jj++) {
+                    Vmm vmm_dst = get_acc_reg(r * ur_ch_blocks*ur_w + ur_w * ii + jj);
+                    uni_vcvtdq2ps(vmm_dst, vmm_dst);
+
+                    if (jcp.with_bias)
+                        uni_vaddps(vmm_dst, vmm_dst, vmm_bias);
+
+                    int s_off = jcp.is_oc_scale * (ii * jcp.ch_block + r * (jcp.ch_block / 2));
+                    cvt2ps(mkldnn_f32, vmm_scale, ptr[reg_scales_base + s_off * sizeof(float)], false);
+                    uni_vmulps(vmm_dst, vmm_dst, vmm_scale);
+
+                    int o_off = ii * jcp.ch_block + jj * jcp.oc + r * (jcp.ch_block / 2);
+                    if (jcp.with_sum) {
+                        cvt2ps(jcp.dst_dt, vmm_prev_dst, ptr[reg_output + o_off * jcp.typesize_out], false);
+                        uni_vaddps(vmm_dst, vmm_dst, vmm_prev_dst);
+                    }
+
+                    if (maybe_relu(0))
+                        uni_vmaxps(vmm_dst, vmm_dst, vmm_zero);
+
+                    if (maybe_relu(1))
+                        uni_vmaxps(vmm_dst, vmm_dst, vmm_zero);
+
+                    if (jcp.dst_dt != data_type::f32) {
+                        if (attr_.round_mode_ == round_mode::nearest)
+                            uni_vcvtps2dq(vmm_dst, vmm_dst);
+                        else if (attr_.round_mode_ == round_mode::down) {
+                            uni_vroundps(vmm_dst, vmm_dst, 1);
+                            uni_vcvtps2dq(vmm_dst, vmm_dst);
+                        } else
+                            assert(!"unimplemented");
+                    }
+
+                    store_dst(ptr[reg_output + o_off * jcp.typesize_out], vmm_dst, false);
+                }
+            }
+        }
+    }
+
+    push(reg_scales_base);
+}
+
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::loop_body(int ur_ch_blocks, int ch_step) {
+    Label unrolled_w_label;
+    Label tail_w_label;
+    Label exit_label;
+
+    mov(reg_ur_w, ptr[this->param1 + GET_OFF(ur_w)]);
+    mov(reg_input, reg_input_base);
+    mov(reg_output, reg_output_base);
+    mov(reg_kernel, reg_kernel_base);
+
+    push(reg_input_base);
+    push(reg_output_base);
+    push(reg_kernel_base);
+    push(reg_ch_work);
+    push(reg_scales_base);
+
+    L(unrolled_w_label); {
+        int ur_w = jcp.ur_w;
+
+        cmp(reg_ur_w, ur_w);
+        jl(tail_w_label, T_NEAR);
+
+        mov(aux_reg_input, reg_input);
+        mov(aux_reg_kernel, reg_kernel);
+
+        load_src(ur_ch_blocks, ch_step, ur_w);
+        apply_filter_unrolled(ur_ch_blocks, ch_step, ur_w);
+        store_dst(ur_ch_blocks, ch_step, ur_w);
+
+        add(reg_input, jcp.typesize_in * ur_w * jcp.ic * jcp.stride_w);
+        add(reg_output, jcp.typesize_out * ur_w * jcp.oc);
+
+        sub(reg_ur_w, ur_w);
+        jmp(unrolled_w_label);
+    }
+
+    L(tail_w_label); {
+        int ur_w = 1;
+
+        cmp(reg_ur_w, ur_w);
+        jl(exit_label, T_NEAR);
+
+        mov(aux_reg_input, reg_input);
+        mov(aux_reg_kernel, reg_kernel);
+
+        load_src(ur_ch_blocks, ch_step, ur_w);
+        apply_filter(ur_ch_blocks, ch_step, ur_w);
+        store_dst(ur_ch_blocks, ch_step, ur_w);
+
+        add(reg_input, jcp.typesize_in * ur_w * jcp.ic * jcp.stride_w);
+        add(reg_output, jcp.typesize_out * ur_w * jcp.oc);
+
+        sub(reg_ur_w, ur_w);
+        jmp(tail_w_label);
+    }
+
+    L(exit_label);
+
+    pop(reg_scales_base);
+    pop(reg_ch_work);
+    pop(reg_kernel_base);
+    pop(reg_output_base);
+    pop(reg_input_base);
+}
+
+template <cpu_isa_t isa>
+void jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::generate() {
+    this->preamble();
+
+    mov(reg_input_base, ptr[this->param1 + GET_OFF(src)]);
+    mov(reg_output_base, ptr[this->param1 + GET_OFF(dst)]);
+    mov(reg_kernel_base, ptr[this->param1 + GET_OFF(filt)]);
+    if (jcp.with_bias)
+        mov(reg_bias_base, ptr[this->param1 + GET_OFF(bias)]);
+    mov(reg_scales_base, ptr[this->param1 + GET_OFF(scales)]);
+    mov(reg_kh, ptr[this->param1 + GET_OFF(kh_padding)]);
+    mov(reg_kw, ptr[this->param1 + GET_OFF(kw_padding)]);
+    mov(reg_ch_work, ptr[this->param1 + GET_OFF(ch_work)]);
+
+    Label main_loop_label;
+    Label tail_loop_label;
+    Label exit_label;
+
+    cmp(reg_ch_work, jcp.nb_ch_blocking * jcp.ch_block);
+    jne(main_loop_label, T_NEAR);
+
+    loop_body(jcp.nb_ch_blocking, jcp.nb_ch_blocking * jcp.ch_block);
+
+    sub(reg_ch_work, jcp.nb_ch_blocking * jcp.ch_block);
+
+    jmp(exit_label, T_NEAR);
+
+    L(main_loop_label); {
+        cmp(reg_ch_work, jcp.ch_block);
+        jl(tail_loop_label, T_NEAR);
+
+        loop_body(1, jcp.ch_block);
+
+        sub(reg_ch_work, jcp.ch_block);
+        add(reg_input_base, jcp.ch_block * jcp.typesize_in);
+        add(reg_output_base, jcp.ch_block * jcp.typesize_out);
+        add(reg_kernel_base, jcp.ch_block * jcp.kh * jcp.kw * jcp.typesize_in);
+        add(reg_bias_base, jcp.ch_block * jcp.typesize_bia);
+        add(reg_scales_base, jcp.is_oc_scale * jcp.ch_block * sizeof(float));
+
+        jmp(main_loop_label, T_NEAR);
+    }
+
+    L(tail_loop_label); {
+        cmp(reg_ch_work, 1);
+        jl(exit_label, T_NEAR);
+
+        loop_body(1, 1);
+
+        sub(reg_ch_work, 1);
+        add(reg_input_base, 1 * jcp.typesize_in);
+        add(reg_output_base, 1 * jcp.typesize_out);
+        add(reg_kernel_base, 1 * jcp.typesize_in);
+        add(reg_bias_base, 1 * jcp.typesize_bia);
+        add(reg_scales_base, jcp.is_oc_scale * 1 * sizeof(float));
+
+        jmp(tail_loop_label, T_NEAR);
+    }
+
+    L(exit_label);
+
+    this->postamble();
+}
+
+template <cpu_isa_t isa>
+bool jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::post_ops_ok(
+        jit_conv_conf_t &jcp, const primitive_attr_t &attr) {
+    const auto &p = attr.post_ops_;
+
+    auto is_relu = [&](int idx) { return p.entry_[idx].is_relu(); };
+    auto is_sum = [&](int idx) { return p.entry_[idx].is_sum(); };
+
+    switch (p.len_) {
+    case 0: return true; // no post_ops
+    case 1: return !jcp.with_eltwise && (is_relu(0) || is_sum(0)); // sum OR relu
+    case 2: return !jcp.with_eltwise && (is_sum(0) && is_relu(1)); // sum->relu
+    default: return false;
+    }
+
+    return false;
+}
+
+template <cpu_isa_t isa>
+status_t jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::init_conf(jit_conv_conf_t &jcp,
+        const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
+        const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d,
+        const memory_desc_wrapper &bias_pd, const primitive_attr_t &attr,
+        bool with_relu, float relu_negative_slope)
+{
+    if (!mayiuse(isa)) return status::unimplemented;
+
+    if (!(src_d.data_type() == data_type::u8 &&
+          weights_d.data_type() == data_type::s8 &&
+          one_of(dst_d.data_type(), data_type::f32, data_type::s32, data_type::s8, data_type::u8)))
+        return status::unimplemented;
+
+    jcp.prop_kind = cd.prop_kind;
+
+    const bool with_groups = weights_d.ndims() == src_d.ndims() + 1;
+    if (!with_groups) return status::unimplemented;
+
+    jcp.ngroups = weights_d.dims()[0];
+    jcp.mb = src_d.dims()[0];
+
+    jcp.oc = dst_d.dims()[1];
+    jcp.ic = src_d.dims()[1];
+
+    jcp.ih = src_d.dims()[2];
+    jcp.iw = src_d.dims()[3];
+    jcp.oh = dst_d.dims()[2];
+    jcp.ow = dst_d.dims()[3];
+
+    jcp.kh = weights_d.dims()[3];
+    jcp.kw = weights_d.dims()[4];
+
+    jcp.t_pad = cd.padding[0][0];
+    jcp.l_pad = cd.padding[0][1];
+    jcp.b_pad = cd.padding[1][0];
+    jcp.r_pad = cd.padding[1][1];
+
+    jcp.stride_h = cd.strides[0];
+    jcp.stride_w = cd.strides[1];
+
+    jcp.dilate_h = cd.dilates[0];
+    jcp.dilate_w = cd.dilates[1];
+
+    jcp.src_fmt = src_d.format();
+    jcp.with_bias = cd.bias_desc.format != memory_format::undef;
+    jcp.with_eltwise = with_relu;
+    jcp.eltwise_alpha = relu_negative_slope;
+
+    jcp.signed_input = src_d.data_type() == data_type::s8;
+
+    if (jcp.signed_input)
+        return status::unimplemented;
+
+    const int simd_w = isa == avx512_common ? 16 : 8;
+    jcp.ch_block = simd_w;
+    jcp.nb_ch = div_up(jcp.oc, jcp.ch_block);
+
+    if (!post_ops_ok(jcp, attr))
+        return status::unimplemented;
+
+    const auto &p = attr.post_ops_;
+    jcp.with_sum = p.find(primitive_kind::sum) != -1;
+    if (!jcp.with_eltwise) {
+        int eltwise_ind = p.find(primitive_kind::eltwise);
+        if (eltwise_ind != -1) {
+            jcp.with_eltwise  = true;
+            jcp.eltwise_alpha = p.entry_[eltwise_ind].eltwise.alpha;
+        }
+    }
+
+    auto desired_act_fmt = nhwc;
+    auto desired_wei_fmt = isa == avx512_common ? Goihw16g : Goihw8g;
+
+    bool args_ok = true
+        && jcp.oc == jcp.ngroups
+        && jcp.ic == jcp.ngroups
+        && src_d.format() == desired_act_fmt
+        && weights_d.format() == desired_wei_fmt
+        && one_of(cd.bias_desc.format, memory_format::undef, any, x)
+        && dst_d.format() == desired_act_fmt;
+    if (!args_ok) return status::unimplemented;
+
+    jcp.bia_dt = jcp.with_bias ? cd.bias_desc.data_type : data_type::undef;
+    jcp.dst_dt = cd.dst_desc.data_type;
+
+    jcp.typesize_in = types::data_type_size(src_d.data_type());
+    jcp.typesize_out = types::data_type_size(dst_d.data_type());
+    jcp.typesize_acc = sizeof(int32_t);
+    jcp.typesize_bia = jcp.with_bias
+                       ? types::data_type_size(bias_pd.data_type())
+                       : 0;
+
+    const auto &oscales = attr.output_scales_;
+    jcp.is_oc_scale = oscales.mask_ == 1 << 1;
+
+    assert(IMPLICATION(!jcp.is_oc_scale, oscales.mask_ == 0));
+
+    jcp.ur_w = isa == avx512_common ? 6 : isa == avx2 ? 4 : 3;
+
+    jcp.nb_ch_blocking = isa == avx512_common ? 4 : isa == avx2 ? 3 : 2;
+    if (jcp.nb_ch < jcp.nb_ch_blocking)
+        jcp.nb_ch_blocking = jcp.nb_ch;
+
+    return status::success;
+}
+
+template struct jit_uni_x8s8s32x_dw_conv_fwd_kernel<avx2>;
+template struct jit_uni_x8s8s32x_dw_conv_fwd_kernel<sse42>;
+
+}
+}
+}
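For reference, a minimal scalar sketch (not part of the patch) of the per-output-point accumulation that apply_filter() emits: an unsigned 8-bit activation times a signed 8-bit weight, summed into a 32-bit accumulator per channel. It assumes unit dilation and no padding; the function and parameter names are illustrative, with strides given in elements of the NHWC layout.

    #include <cstdint>

    // One depthwise output point for one channel: acc += u8(src) * s8(ker).
    // src points at the top-left input pixel of the window for this channel;
    // row_stride = iw * oc elements, col_stride = oc elements (NHWC).
    int32_t dw_conv_point(const uint8_t *src, const int8_t *ker,
                          int kh, int kw, int row_stride, int col_stride) {
        int32_t acc = 0;
        for (int y = 0; y < kh; ++y)
            for (int x = 0; x < kw; ++x)
                acc += (int32_t)src[y * row_stride + x * col_stride]
                        * (int32_t)ker[y * kw + x];
        return acc;
    }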
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_conv_kernel.hpp
new file mode 100644 (file)
index 0000000..9c9b41f
--- /dev/null
@@ -0,0 +1,113 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef JIT_UNI_X8S8S32X_DW_CONV_KERNEL_F32_HPP
+#define JIT_UNI_X8S8S32X_DW_CONV_KERNEL_F32_HPP
+
+#include "c_types_map.hpp"
+#include "jit_generator.hpp"
+#include "jit_primitive_conf.hpp"
+#include "type_helpers.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+template <cpu_isa_t isa>
+struct jit_uni_x8s8s32x_dw_conv_fwd_kernel: public jit_generator {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_x8s8s32x_dw_conv_fwd_kernel)
+
+    jit_uni_x8s8s32x_dw_conv_fwd_kernel(jit_conv_conf_t ajcp,
+            const primitive_attr_t &attr): jcp(ajcp), attr_(attr) {
+        this->generate();
+        jit_ker = (void (*)(jit_conv_call_s *))this->getCode();
+    }
+
+    static bool post_ops_ok(jit_conv_conf_t &jcp,
+            const primitive_attr_t &attr);
+    static status_t init_conf(jit_conv_conf_t &jcp,
+            const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
+            const memory_desc_wrapper &weights_d,
+            const memory_desc_wrapper &dst_d,
+            const memory_desc_wrapper &bias_pd,
+            const primitive_attr_t &attr,
+            bool with_relu = false, float relu_negative_slope = 0.f);
+
+    jit_conv_conf_t jcp;
+    const primitive_attr_t &attr_;
+    void (*jit_ker)(jit_conv_call_s *);
+
+private:
+    using Vmm = typename utils::conditional3<isa == sse42, Xbyak::Xmm,
+        isa == avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
+    using Ymm = const Xbyak::Ymm;
+    using reg64_t = const Xbyak::Reg64;
+    using reg32_t = const Xbyak::Reg32;
+    using reg8_t = const Xbyak::Reg8;
+    const int vlen = cpu_isa_traits<isa>::vlen;
+
+    reg64_t reg_input_base = r10;
+    reg64_t reg_output_base = r9;
+    reg64_t reg_kernel_base = r11;
+    reg64_t reg_ch_work = r13;
+    reg64_t reg_bias_base = abi_not_param1;
+    reg64_t reg_scales_base = rdx;
+
+    reg64_t reg_input = r8;
+    reg64_t reg_kernel = r12;
+    reg64_t aux_reg_input = r9;
+    reg64_t aux1_reg_input = r10;
+    reg64_t aux_reg_kernel = r13;
+    reg64_t aux1_reg_kernel = r11;
+    reg64_t reg_output = r14;
+
+    reg64_t reg_kh = rax;
+    reg64_t reg_kw = rbx;
+    reg64_t iter_kh = rdx;
+    reg64_t iter_kw = rsi;
+    reg64_t reg_ur_w = rbp;
+
+    reg32_t reg_tmp_32 = r15d;
+    reg64_t reg_tmp_64 = r15;
+    reg8_t reg_tmp_8 = r15b;
+
+    Vmm vmm_zero = Vmm(0);
+    Vmm vmm_bias = Vmm(3);
+    Vmm vmm_scale = Vmm(2);
+    Vmm vmm_prev_dst = Vmm(2);
+
+    inline Vmm get_ker_reg(int idx) { return Vmm(idx + 0); }
+    inline Vmm get_src_reg(int idx) { return Vmm(idx + 1); }
+    inline Vmm get_acc_reg(int idx) { return Vmm(idx + 4); }
+
+    inline void cvt2ps(data_type_t type_in, Vmm vmm_in, const Xbyak::Operand &op, bool scalar_load);
+    inline void store_dst(const Xbyak::Address &op, Vmm vmm_dst, bool scalar_store);
+
+    inline void load_src(int ur_ch_blocks, int ch_step, int ur_w);
+    inline void apply_filter(int ur_ch_blocks, int ch_step, int ur_w);
+    inline void apply_filter_unrolled(int ur_ch_blocks, int ch_step, int ur_w);
+    inline bool maybe_relu(int position);
+    inline void store_dst(int ur_ch_blocks, int ch_step, int ur_w);
+    inline void loop_body(int ur_ch_blocks, int ch_step);
+
+    void generate();
+};
+
+}
+}
+}
+
+#endif
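The Vmm alias above picks the SIMD register type from the ISA through utils::conditional3. A stand-alone C++14 sketch of the same three-way selection with std::conditional_t, using stand-in types (assumption: this mirrors what conditional3 does):

    #include <type_traits>

    struct Xmm {}; struct Ymm {}; struct Zmm {};   // stand-ins for the Xbyak types
    enum cpu_isa_t { sse42, avx2, avx512_common };

    template <cpu_isa_t isa>
    using Vmm = std::conditional_t<isa == sse42, Xmm,
                std::conditional_t<isa == avx2, Ymm, Zmm>>;

    static_assert(std::is_same<Vmm<sse42>, Xmm>::value, "128-bit on SSE4.2");
    static_assert(std::is_same<Vmm<avx2>,  Ymm>::value, "256-bit on AVX2");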
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_convolution.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_convolution.cpp
new file mode 100644 (file)
index 0000000..bc31a38
--- /dev/null
@@ -0,0 +1,172 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "mkldnn_types.h"
+#include "c_types_map.hpp"
+#include "jit_uni_x8s8s32x_dw_convolution.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using namespace mkldnn::impl::status;
+using namespace mkldnn::impl::memory_format;
+using namespace mkldnn::impl::utils;
+
+template <cpu_isa_t isa, bool with_relu, data_type_t src_type, data_type_t dst_type>
+void _jit_uni_x8s8s32x_dw_convolution_fwd_t<isa, with_relu, src_type, dst_type>::execute_forward() {
+    auto src = reinterpret_cast<const src_data_t*>(this->input_memory(0));
+    auto weights = reinterpret_cast<const wei_data_t*>(this->input_memory(1));
+    auto bias = reinterpret_cast<const char*>(this->input_memory(2));
+    auto dst = reinterpret_cast<dst_data_t*>(this->memory());
+
+    const memory_desc_wrapper src_d(conf_.src_pd());
+    const memory_desc_wrapper dst_d(conf_.dst_pd());
+    const memory_desc_wrapper weights_d(conf_.weights_pd(0));
+    const memory_desc_wrapper bias_d(conf_.weights_pd(1));
+
+    const auto &jcp = kernel_->jcp;
+
+    int dil_h = jcp.dilate_h + 1;
+    int dil_w = jcp.dilate_w + 1;
+    int str_h = jcp.stride_h;
+    int str_w = jcp.stride_w;
+
+    const size_t bia_dt_size = conf_.with_bias()
+        ? types::data_type_size(conf_.cdesc()->bias_desc.data_type) : 0;
+
+    const auto &oscales = conf_.attr()->output_scales_;
+
+    int MB = jcp.mb;
+    int chb_work = utils::div_up(jcp.nb_ch, jcp.nb_ch_blocking);
+    const size_t work_amount = MB * chb_work * jcp.oh;
+
+    auto kernel_params = [&](int ur_w_step, int ow, int oh, int ih, int kh,
+            int kh_padding, int ch, int ch_num, int n) {
+        jit_conv_call_s par_conv = {};
+
+        const int i_l_overflow = nstl::max(0, (jcp.l_pad - ow * str_w));
+        const int i_r_overflow = nstl::max(jcp.iw, (ow * str_w
+            + (jcp.kw - 1)*dil_w - jcp.l_pad + 1)) - jcp.iw;
+
+        const int iw = nstl::max((ow*str_w - jcp.l_pad
+            + div_up(i_l_overflow, dil_w)*dil_w), 0);
+        const int kw = div_up(i_l_overflow, dil_w);
+
+        const int kw_padding = jcp.kw - div_up(i_l_overflow, dil_w)
+            - div_up(i_r_overflow, dil_w);
+
+        int src_off = src_d.blk_off(n, ch*jcp.ch_block, ih, iw);
+        int dst_off = dst_d.blk_off(n, ch*jcp.ch_block, oh, ow);
+
+        par_conv.src = &src[src_off];
+        par_conv.dst = &dst[dst_off];
+
+        par_conv.filt = &weights[weights_d.blk_off(ch, 0, 0, kh, kw)];
+        if (bias) par_conv.bias = &bias[bias_d.blk_off(ch*jcp.ch_block*bia_dt_size)];
+
+        par_conv.kh_padding = (size_t)nstl::max(0, kh_padding);
+        par_conv.kw_padding = (size_t)nstl::max(0, kw_padding);
+
+        par_conv.ur_w = (size_t)ur_w_step;
+
+        par_conv.ch_work = nstl::min((ch + ch_num) * jcp.ch_block, jcp.oc) - ch*jcp.ch_block;
+
+        par_conv.scales = &oscales.scales_[jcp.is_oc_scale * ch * jcp.ch_block];
+
+        return par_conv;
+    };
+
+    auto ker = [&](const int ithr, const int nthr) {
+        size_t start{0}, end{0};
+        balance211(work_amount, nthr, ithr, start, end);
+
+        size_t n{0}, chb{0}, oh{0};
+        nd_iterator_init(start, n, MB, chb, chb_work, oh, jcp.oh);
+        for (size_t iwork = start; iwork < end; ++iwork) {
+            int ch = chb * jcp.nb_ch_blocking;
+            int ch_num = jcp.nb_ch_blocking;
+
+            const int i_t_overflow = nstl::max(0, (int)(jcp.t_pad - oh*str_h));
+            const int i_b_overflow = nstl::max(jcp.ih,
+                (int)(oh*str_h + (jcp.kh - 1)*dil_h - jcp.t_pad + 1)) - jcp.ih;
+
+            const int ih = nstl::max((int)(oh*str_h - jcp.t_pad
+                + div_up(i_t_overflow, dil_h)*dil_h), 0);
+            const int kh = div_up(i_t_overflow, dil_h);
+            const int kh_padding = jcp.kh - div_up(i_t_overflow, dil_h)
+                - div_up(i_b_overflow, dil_h);
+
+            // left border
+            int ow = 0;
+            int l_border = nstl::min(div_up(jcp.l_pad, str_w), jcp.ow);
+            int ur_w_step = 1;
+            for (; ow < l_border; ow++) {
+                jit_conv_call_s par_conv = kernel_params(ur_w_step, ow, oh, ih,
+                                            kh, kh_padding, ch, ch_num, n);
+
+                kernel_->jit_ker(&par_conv);
+            }
+
+            // main loop
+            ur_w_step = (jcp.iw - (jcp.kw - 1)*dil_w + jcp.l_pad - 1)
+                / jcp.stride_w - ow + 1;
+            if (ur_w_step > 0) {
+                jit_conv_call_s par_conv = kernel_params(ur_w_step, ow, oh, ih,
+                                            kh, kh_padding, ch, ch_num, n);
+
+                kernel_->jit_ker(&par_conv);
+
+                ow += ur_w_step;
+            }
+
+            // right border
+            ur_w_step = 1;
+            for (; ow < jcp.ow; ow++) {
+                jit_conv_call_s par_conv = kernel_params(ur_w_step, ow, oh, ih,
+                                            kh, kh_padding, ch, ch_num, n);
+
+                kernel_->jit_ker(&par_conv);
+            }
+
+            nd_iterator_step(n, MB, chb, chb_work, oh, jcp.oh);
+        }
+    };
+
+    parallel(0, ker);
+}
+
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, true, data_type::u8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, true, data_type::u8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, true, data_type::u8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, true, data_type::u8, data_type::f32>::execute_forward();
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, false, data_type::u8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, false, data_type::u8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, false, data_type::u8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, false, data_type::u8, data_type::f32>::execute_forward();
+
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, true, data_type::u8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, true, data_type::u8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, true, data_type::u8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, true, data_type::u8, data_type::f32>::execute_forward();
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, false, data_type::u8, data_type::u8>::execute_forward();
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, false, data_type::u8, data_type::s8>::execute_forward();
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, false, data_type::u8, data_type::s32>::execute_forward();
+template void _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, false, data_type::u8, data_type::f32>::execute_forward();
+
+}
+}
+}
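A worked sketch of the output-row split performed in execute_forward(): unit steps over the left and right borders, where padding clips the effective filter, and a single ur_w_step run in between. The sizes are assumptions chosen for illustration; with iw = 16, kw = 3, l_pad = 1 and unit stride/dilation the split is 1 + 14 + 1 output columns.

    #include <algorithm>
    #include <cstdio>

    int main() {
        int iw = 16, kw = 3, l_pad = 1, str_w = 1, dil_w = 1, ow_total = 16;
        int l_border = std::min((l_pad + str_w - 1) / str_w, ow_total); // div_up(l_pad, str_w)
        int ow = l_border;                        // border columns run one at a time
        int ur_w_step = (iw - (kw - 1) * dil_w + l_pad - 1) / str_w - ow + 1;
        std::printf("left=%d main=%d right=%d\n", l_border, ur_w_step,
                    ow_total - (ow + ur_w_step));  // prints left=1 main=14 right=1
        return 0;
    }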
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_convolution.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/jit_uni_x8s8s32x_dw_convolution.hpp
new file mode 100644 (file)
index 0000000..17d70c1
--- /dev/null
@@ -0,0 +1,121 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_JIT_UNI_X8S8S32X_DW_CONVOLUTION_HPP
+#define CPU_JIT_UNI_X8S8S32X_DW_CONVOLUTION_HPP
+
+#include "c_types_map.hpp"
+#include "cpu_convolution_pd.hpp"
+#include "cpu_engine.hpp"
+#include "jit_primitive_conf.hpp"
+#include "jit_generator.hpp"
+#include "jit_uni_x8s8s32x_dw_conv_kernel.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+template <cpu_isa_t isa, bool with_relu, impl::data_type_t src_type, impl::data_type_t dst_type>
+struct _jit_uni_x8s8s32x_dw_convolution_fwd_t: public cpu_primitive_t {
+    struct pd_t: public _cpu_convolution_fwd_pd_t<with_relu> {
+        pd_t(engine_t *engine, const typename pd_t::base_desc_t *adesc,
+                const primitive_attr_t *attr,
+                const typename pd_t::base_class *hint_fwd_pd)
+            : _cpu_convolution_fwd_pd_t<with_relu>(engine, adesc, attr,
+                hint_fwd_pd)
+            , jcp_({}) {}
+
+        DECLARE_COMMON_PD_T(
+                JIT_IMPL_NAME_HELPER("jit_dw:", isa, ""),
+                _jit_uni_x8s8s32x_dw_convolution_fwd_t<isa, with_relu, src_type, dst_type>);
+
+        virtual status_t init() override {
+            using namespace prop_kind;
+            assert(this->engine()->kind() == engine_kind::cpu);
+            bool ok = true
+                && this->set_default_params() == status::success
+                && utils::one_of(this->cdesc_().prop_kind, forward_training,
+                        forward_inference)
+                && this->cdesc_().alg_kind == alg_kind::convolution_direct
+                && this->cdesc_().dst_desc.data_type == dst_type
+                && IMPLICATION(this->with_bias(), utils::one_of(
+                    this->cdesc_().bias_desc.data_type, data_type::f32,
+                    data_type::s32, data_type::s8, data_type::u8))
+                && this->cdesc_().accum_data_type == data_type::s32;
+            if (!ok) return status::unimplemented;
+
+            return jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>::init_conf(jcp_,
+                        this->cdesc_(),
+                        this->src_pd_.desc(), *this->weights_pd_.desc(),
+                        *this->dst_pd_.desc(), *this->bias_pd_.desc(),
+                        *this->attr(), with_relu, this->negative_slope());
+        }
+
+        jit_conv_conf_t jcp_;
+
+    protected:
+        virtual status_t set_default_params() override {
+            using namespace memory_format;
+            auto desired_act_fmt = nhwc;
+            auto desired_wei_fmt = isa == avx512_common ? Goihw16g : Goihw8g;
+
+            if (this->src_pd_.desc()->format == any)
+                CHECK(this->src_pd_.set_format(desired_act_fmt));
+            if (this->dst_pd_.desc()->format == any)
+                CHECK(this->dst_pd_.set_format(desired_act_fmt));
+            if (this->weights_pd_.desc()->format == any)
+                CHECK(this->weights_pd_.set_format(desired_wei_fmt));
+            if (this->bias_pd_.desc()->format == any)
+                CHECK(this->bias_pd_.set_format(x));
+            return status::success;
+        }
+    };
+
+    _jit_uni_x8s8s32x_dw_convolution_fwd_t(const pd_t *pd, const input_vector &inputs,
+                                    const output_vector &outputs)
+            : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd)
+    { kernel_ = new jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa>(conf_.jcp_, *conf_.attr()); }
+    ~_jit_uni_x8s8s32x_dw_convolution_fwd_t() { delete kernel_; }
+
+    typedef typename prec_traits<data_type::u8>::type src_data_t;
+    typedef typename prec_traits<data_type::s8>::type wei_data_t;
+    typedef typename prec_traits<dst_type>::type dst_data_t;
+
+    virtual void execute(event_t *e) {
+        execute_forward();
+        e->set_state(event_t::ready);
+    }
+
+private:
+    void execute_forward();
+    pd_t conf_;
+    jit_uni_x8s8s32x_dw_conv_fwd_kernel<isa> *kernel_;
+};
+
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_avx2_x8s8s32x_dw_convolution_fwd_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, false, src_type, dst_type>;
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_sse42_x8s8s32x_dw_convolution_fwd_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, false, src_type, dst_type>;
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_avx2_x8s8s32x_dw_convolution_relu_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t<avx2, true, src_type, dst_type>;
+template <impl::data_type_t src_type, impl::data_type_t dst_type>
+using jit_sse42_x8s8s32x_dw_convolution_relu_t = _jit_uni_x8s8s32x_dw_convolution_fwd_t<sse42, true, src_type, dst_type>;
+
+}
+}
+}
+
+#endif
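The src_data_t/wei_data_t/dst_data_t typedefs above resolve through prec_traits. A minimal stand-alone sketch of that enum-to-type mapping (only the shape of the real trait, which lives elsewhere in mkl-dnn):

    #include <cstdint>

    enum data_type_t { f32, s32, s8, u8 };

    template <data_type_t> struct prec_traits;     // maps enum -> C type
    template <> struct prec_traits<f32> { typedef float    type; };
    template <> struct prec_traits<s32> { typedef int32_t  type; };
    template <> struct prec_traits<s8>  { typedef int8_t   type; };
    template <> struct prec_traits<u8>  { typedef uint8_t  type; };

    typedef prec_traits<u8>::type src_data_t;      // uint8_t
    typedef prec_traits<s8>::type wei_data_t;      // int8_t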
index e291e7d..e9da692 100644 (file)
@@ -63,6 +63,7 @@ void nchw_pooling_fwd_t<data_type>::execute_forward() {
     const int padF = conf_.padFront();
     const int padT = conf_.padT();
     const int padL = conf_.padL();
+    const int padBack = conf_.padBack();
     const int padB = conf_.padB();
     const int padR = conf_.padR();
 
@@ -86,6 +87,7 @@ void nchw_pooling_fwd_t<data_type>::execute_forward() {
     };
 
     auto ker_max = [=](data_t *d, int mb, int c, int od, int oh, int ow) {
+        bool is_initialized = false;
         for (int kd = 0; kd < KD; ++kd) {
             for (int kh = 0; kh < KH; ++kh) {
                 for (int kw = 0; kw < KW; ++kw) {
@@ -104,9 +106,14 @@ void nchw_pooling_fwd_t<data_type>::execute_forward() {
                         + (size_t)IW * ih
                         + (size_t)iw;
                     auto s = src[src_offset];
-                    if (s > d[0]) {
+                    if (!is_initialized) {
                         d[0] = s;
                         set_ws(mb, c, od, oh, ow, kd*KH*KW + kh*KW + kw);
+                        is_initialized = true;
+                    } else if (d[0] < s) {
+                        d[0] = s;
+                        set_ws(mb, c, od, oh, ow,
+                                kd*KH*KW + kh*KW + kw);
                     }
                 }
             }
@@ -114,23 +121,26 @@ void nchw_pooling_fwd_t<data_type>::execute_forward() {
     };
 
     auto ker_avg = [=](data_t *d, int mb, int c, int od, int oh, int ow) {
+        auto id_start = od*SD - padF;
         auto ih_start = oh*SH - padT;
         auto iw_start = ow*SW - padL;
-        auto id_start = od*SD - padF;
-        auto id_end = nstl::min(od*SD - padF + KD, ID);
+        auto id_end = nstl::min(od*SD - padF + KD, ID + padBack);
         auto ih_end = nstl::min(oh*SH - padT + KH, IH + padB);
         auto iw_end = nstl::min(ow*SW - padL + KW, IW + padR);
 
         // case alg == pooling_avg_include_padding
         auto num_summands = (id_end - id_start)*(ih_end - ih_start)*(iw_end - iw_start);
 
+        id_start = nstl::max(id_start, 0);
         ih_start = nstl::max(ih_start, 0);
         iw_start = nstl::max(iw_start, 0);
+        id_end = nstl::min(id_end, ID);
         ih_end = nstl::min(ih_end, IH);
         iw_end = nstl::min(iw_end, IW);
 
         if (alg == pooling_avg_exclude_padding)
-            num_summands = (ih_end - ih_start)*(iw_end - iw_start);
+            num_summands = (id_end - id_start)*(ih_end - ih_start)*(iw_end - iw_start);
+        if (num_summands == 0) return;
 
         for (int id = id_start; id < id_end; ++id) {
             for (int ih = ih_start; ih < ih_end; ++ih) {
@@ -160,7 +170,7 @@ void nchw_pooling_fwd_t<data_type>::execute_forward() {
                 + (size_t)OW * oh
                 + (size_t)ow;
             data_t *d = &dst[dst_offset];
-            d[0] = nstl::numeric_limits<data_t>::lowest();
+            d[0] = (data_t)0;
             set_ws(mb, c, od, oh, ow, 0);
             ker_max(d, mb, c, od, oh, ow);
         });
@@ -174,7 +184,7 @@ void nchw_pooling_fwd_t<data_type>::execute_forward() {
                 + (size_t)OW * oh
                 + (size_t)ow;
             data_t *d = &dst[dst_offset];
-            d[0] = 0;
+            d[0] = (data_t)0;
             ker_avg(d, mb, c, od, oh, ow);
         });
     }
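A scalar sketch of the corrected max-pooling reduction above: the first in-window element seeds both the running maximum and the workspace index, so no numeric_limits::lowest() sentinel is needed, and the index update stays tied to the value update. Function and parameter names are illustrative.

    #include <cstddef>

    // win: flattened pooling window; returns the max, writes its index to *ws_idx.
    float window_max(const float *win, size_t len, size_t *ws_idx) {
        float d = 0.f;
        bool is_initialized = false;
        for (size_t k = 0; k < len; ++k) {
            if (!is_initialized) {
                d = win[k]; *ws_idx = k; is_initialized = true;
            } else if (d < win[k]) {
                d = win[k]; *ws_idx = k;   // index moves only when the max moves
            }
        }
        return d;
    }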
index 3980e08..d755538 100644 (file)
@@ -23,8 +23,8 @@
 #include "ncsp_batch_normalization.hpp"
 #include "type_helpers.hpp"
 
-// clang6 generates incorrect code with OMP_SIMD in some particular cases
-#if (defined __clang_major__) && (__clang_major__ == 6)
+// clang 6 and above generate incorrect code with OMP_SIMD in some particular cases
+#if (defined __clang_major__) && (__clang_major__ >= 6)
 #define SAFE_TO_USE_OMP_SIMD 0
 #else
 #define SAFE_TO_USE_OMP_SIMD 1
@@ -124,6 +124,12 @@ void ncsp_batch_normalization_fwd_t::execute_forward() {
         int SP_N_nthr = N_nthr * S_nthr;
         for (int it = 0; it < iters; ++it) {
             if (it == iters - 1 && iters > 1) {
+                // On the last iteration the access pattern to ws_reduce
+                // might change (due to re-balance on C). So sync the
+                // threads if they are not synced by the algorithm.
+                if (SP_N_nthr == 1 && mkldnn_thr_syncable())
+                    mkldnn_thr_barrier();
+
                 S_s = S_e = C_blk_s = C_blk_e = N_s = N_e = 0;
                 spatial_thr_allowed = bnorm_utils::thread_balance(do_blocking,
                         spatial_thr_allowed, ithr, nthr, N, last_iter_blks, SP,
@@ -134,6 +140,12 @@ void ncsp_batch_normalization_fwd_t::execute_forward() {
                 SP_N_nthr = N_nthr * S_nthr;
             }
             size_t C_off = it * C_blks_per_iter;
+            // On the last iteration the access pattern to ws_reduce
+            // might change (due to re-balance on C). Since sync is not always
+            // possible (in case of TBB) use different parts of ws for each
+            // iteration if threads are not synced by the algorithm.
+            size_t ws_iter_off = (mkldnn_thr_syncable() ? 0 : 1) * C_off;
+
             if (calculate_stats) {
                 data_t *mean_blk = mean + C_off;
                 data_t *variance_blk = variance + C_off;
@@ -145,7 +157,8 @@ void ncsp_batch_normalization_fwd_t::execute_forward() {
                         for (int sp = S_s; sp < S_e; ++sp) {
                             sum += src[off + n * C * SP + sp];
                         }
-                    ws_reduce[SP_N_ithr * C_blks_per_iter + c] = sum;
+                    ws_reduce[ws_iter_off + SP_N_ithr * C_blks_per_iter + c]
+                        = sum;
                 }
 
                 if (SP_N_nthr > 1) mkldnn_thr_barrier();
@@ -153,7 +166,8 @@ void ncsp_batch_normalization_fwd_t::execute_forward() {
                 for (int c = C_blk_gl_s; c < C_blk_gl_e; c++) {
                     mean_blk[c] = 0.;
                     for (int n = 0; n < SP_N_nthr; n++)
-                        mean_blk[c] += ws_reduce[n * C_blks_per_iter + c];
+                        mean_blk[c] += ws_reduce[ws_iter_off
+                                + n * C_blks_per_iter + c];
                     mean_blk[c] /= (N * SP);
                 }
 
@@ -169,7 +183,8 @@ void ncsp_batch_normalization_fwd_t::execute_forward() {
                                     - mean[off];
                             sum += m * m;
                         }
-                    ws_reduce[SP_N_ithr * C_blks_per_iter + c] = sum;
+                    ws_reduce[ws_iter_off + SP_N_ithr * C_blks_per_iter + c]
+                        = sum;
                 }
 
                 if (SP_N_nthr > 1) mkldnn_thr_barrier();
@@ -177,7 +192,8 @@ void ncsp_batch_normalization_fwd_t::execute_forward() {
                 for (int c = C_blk_gl_s; c < C_blk_gl_e; c++) {
                     variance_blk[c] = 0.;
                     for (int n = 0; n < SP_N_nthr; n++)
-                        variance_blk[c] += ws_reduce[n * C_blks_per_iter + c];
+                        variance_blk[c] += ws_reduce[ws_iter_off
+                                + n * C_blks_per_iter + c];
                     variance_blk[c] /= (N * SP);
                 }
 
@@ -282,6 +298,12 @@ void ncsp_batch_normalization_bwd_t::execute_backward() {
 
         for (int it = 0; it < iters; ++it) {
             if (it == iters - 1 && iters > 1) {
+                // On the last iteration the access pattern to ws_reduce
+                // might change (due to re-balance on C). So sync the
+                // threads if they are not synced by the algorithm.
+                if (SP_N_nthr == 1 && mkldnn_thr_syncable())
+                    mkldnn_thr_barrier();
+
                 C_blk_s = C_blk_e = N_s = N_e = 0;
                 spatial_thr_allowed = bnorm_utils::thread_balance(do_blocking,
                         spatial_thr_allowed, ithr, nthr, N, last_iter_blks, SP,
@@ -292,6 +314,12 @@ void ncsp_batch_normalization_bwd_t::execute_backward() {
                 SP_N_nthr = N_nthr * S_nthr;
             }
             size_t C_off = it * C_blks_per_iter;
+            // On the last iteration the access pattern to ws_reduce
+            // might change (due to re-balance on C). Since sync is not always
+            // possible (in case of TBB) use different parts of ws for each
+            // iteration if threads are not synced by the algorithm.
+            size_t ws_iter_off = (mkldnn_thr_syncable() ? 0 : 1) * 2 * C_off;
+
             data_t *diff_gamma_blk = diff_scaleshift + C_off;
             data_t *diff_beta_blk = diff_scaleshift + C + C_off;
             for (int c = C_blk_s; c < C_blk_e; c++) {
@@ -310,10 +338,10 @@ void ncsp_batch_normalization_bwd_t::execute_backward() {
                         diff_gamma += (src[d_off] - v_mean) * dd;
                         diff_beta += dd;
                     }
-                ws_reduce[SP_N_ithr * C_blks_per_iter + c] = diff_gamma;
-                ws_reduce[SP_N_nthr * C_blks_per_iter + SP_N_ithr * C_blks_per_iter
-                        + c]
-                        = diff_beta;
+                ws_reduce[ws_iter_off + SP_N_ithr * C_blks_per_iter + c]
+                    = diff_gamma;
+                ws_reduce[ws_iter_off + SP_N_nthr * C_blks_per_iter
+                        + SP_N_ithr * C_blks_per_iter + c] = diff_beta;
             }
 
             if (SP_N_nthr > 1) mkldnn_thr_barrier();
@@ -324,9 +352,11 @@ void ncsp_batch_normalization_bwd_t::execute_backward() {
                 diff_gamma_blk[c] = 0.;
                 diff_beta_blk[c] = 0.;
                 for (int n = 0; n < SP_N_nthr; n++) {
-                    diff_gamma_blk[c] += ws_reduce[n * C_blks_per_iter + c];
-                    diff_beta_blk[c] += ws_reduce[SP_N_nthr * C_blks_per_iter
+                    diff_gamma_blk[c] += ws_reduce[ws_iter_off
                             + n * C_blks_per_iter + c];
+                    diff_beta_blk[c] += ws_reduce[ws_iter_off
+                            + SP_N_nthr * C_blks_per_iter + n * C_blks_per_iter
+                            + c];
                 }
                 diff_gamma_blk[c] *= sqrt_variance;
             }
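A sketch of the ws_reduce indexing introduced above: when threads cannot be barrier-synced (mkldnn_thr_syncable() returns false, e.g. under TBB), each C-blocks iteration writes to its own slice of the scratchpad instead of reusing slot 0, so the re-balanced last iteration cannot race with earlier ones. The helper below is hypothetical and only restates the offset arithmetic.

    #include <cstddef>

    size_t ws_index(bool thr_syncable, size_t it, size_t C_blks_per_iter,
                    size_t SP_N_ithr, size_t c) {
        size_t C_off = it * C_blks_per_iter;
        // 0 when a barrier is available; otherwise one fresh slice per iteration
        size_t ws_iter_off = (thr_syncable ? 0 : 1) * C_off;
        return ws_iter_off + SP_N_ithr * C_blks_per_iter + c;
    }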
index d3bcc5f..ddf6df6 100644 (file)
@@ -47,7 +47,7 @@ struct ncsp_batch_normalization_fwd_t : public cpu_primitive_t {
                 && is_fwd()
                 && !has_zero_dim_memory()
                 && desc()->data_desc.data_type == f32
-                && utils::implication(use_scaleshift(),
+                && IMPLICATION(use_scaleshift(),
                         desc()->data_scaleshift_desc.data_type == f32)
                 && utils::one_of(data_pd_.desc()->format, memory_format::nchw,
                         memory_format::ncdhw, memory_format::nc)
@@ -107,7 +107,7 @@ struct ncsp_batch_normalization_bwd_t : public cpu_primitive_t {
                 && is_bwd()
                 && !has_zero_dim_memory()
                 && desc()->data_desc.data_type == f32
-                && utils::implication(use_scaleshift(),
+                && IMPLICATION(use_scaleshift(),
                         desc()->data_scaleshift_desc.data_type == f32)
                 && utils::one_of(data_pd_.desc()->format, memory_format::nchw,
                         memory_format::ncdhw, memory_format::nc)
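The utils::implication() to IMPLICATION swap here (and in the sibling files below) is mechanical; both encode logical implication. A minimal sketch, assuming the macro is defined as in mkl-dnn's utils:

    #define IMPLICATION(cause, effect) (!(cause) || !!(effect))

    static_assert(IMPLICATION(false, false), "vacuously true");
    static_assert(IMPLICATION(true,  true),  "holds");
    static_assert(!IMPLICATION(true, false), "the only failing case");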
index a138cb7..96eb50b 100644 (file)
@@ -22,8 +22,8 @@
 #include "nspc_batch_normalization.hpp"
 #include "type_helpers.hpp"
 
-// clang6 generates incorrect code with OMP_SIMD in some particular cases
-#if (defined __clang_major__) && (__clang_major__ == 6)
+// clang 6 and above generate incorrect code with OMP_SIMD in some particular cases
+#if (defined __clang_major__) && (__clang_major__ >= 6)
 #define SAFE_TO_USE_OMP_SIMD 0
 #else
 #define SAFE_TO_USE_OMP_SIMD 1
index e050aeb..168caf9 100644 (file)
@@ -50,7 +50,7 @@ struct nspc_batch_normalization_fwd_t : public cpu_primitive_t {
                 && is_fwd()
                 && !has_zero_dim_memory()
                 && desc()->data_desc.data_type == f32
-                && utils::implication(use_scaleshift(),
+                && IMPLICATION(use_scaleshift(),
                         desc()->data_scaleshift_desc.data_type == f32)
                 && utils::one_of(data_pd_.desc()->format, memory_format::nhwc)
                 && (attr()->has_default_values() || this->with_relu_post_op());
@@ -111,7 +111,7 @@ struct nspc_batch_normalization_bwd_t : public cpu_primitive_t {
                 && is_bwd()
                 && !has_zero_dim_memory()
                 && desc()->data_desc.data_type == f32
-                && utils::implication(use_scaleshift(),
+                && IMPLICATION(use_scaleshift(),
                         desc()->data_scaleshift_desc.data_type == f32)
                 && utils::one_of(data_pd_.desc()->format, memory_format::nhwc)
                 && (attr()->has_default_values() || this->with_relu_post_op());
index d121c82..33b5fe0 100644 (file)
@@ -77,34 +77,41 @@ void _ref_convolution_fwd_t<with_relu, src_type, wei_type, dst_type, acc_type>
 
     auto ker = [=](acc_data_t &d, int g, int mb, int oc, int od, int oh,
             int ow) {
-        for (int ic = 0; ic < IC; ++ic) {
-            for (int kd = 0; kd < KD; ++kd)
-            for (int kh = 0; kh < KH; ++kh)
-            for (int kw = 0; kw < KW; ++kw) {
-                const int id = od * KSD - padFront + kd * (1 + KDD);
-                const int ih = oh * KSH - padT + kh * (1 + KDH);
-                const int iw = ow * KSW - padL + kw * (1 + KDW);
-
-                if (id < 0 || id >= ID) continue;
-                if (ih < 0 || ih >= IH) continue;
-                if (iw < 0 || iw >= IW) continue;
-
-                if (ndims == 5)
+        for (int ic = 0; ic < IC; ++ic)
+        for (int kd = 0; kd < KD; ++kd)
+        for (int kh = 0; kh < KH; ++kh)
+        for (int kw = 0; kw < KW; ++kw) {
+            const int id = od * KSD - padFront + kd * (1 + KDD);
+            const int ih = oh * KSH - padT + kh * (1 + KDH);
+            const int iw = ow * KSW - padL + kw * (1 + KDW);
+
+            if (id < 0 || id >= ID) continue;
+            if (ih < 0 || ih >= IH) continue;
+            if (iw < 0 || iw >= IW) continue;
+
+            if (ndims == 5)
                 d += (acc_data_t)src[src_d.off(mb, g*IC + ic, id, ih, iw)]
                     * (with_groups
-                        ? weights[weights_d.off(g, oc, ic, kd, kh, kw)]
-                        : weights[weights_d.off(oc, ic, kd, kh, kw)]);
-                else
+                    ? weights[weights_d.off(g, oc, ic, kd, kh, kw)]
+                    : weights[weights_d.off(oc, ic, kd, kh, kw)]);
+            else if (ndims == 4)
                 d += (acc_data_t)src[src_d.off(mb, g*IC + ic, ih, iw)]
                     * (with_groups
-                        ? weights[weights_d.off(g, oc, ic, kh, kw)]
-                        : weights[weights_d.off(oc, ic, kh, kw)]);
-            }
-        }
+                    ? weights[weights_d.off(g, oc, ic, kh, kw)]
+                    : weights[weights_d.off(oc, ic, kh, kw)]);
+            else if (ndims == 3)
+                d += (acc_data_t)src[src_d.off(mb, g*IC + ic, iw)]
+                    * (with_groups
+                    ? weights[weights_d.off(g, oc, ic, kw)]
+                    : weights[weights_d.off(oc, ic, kw)]);
+            else
+                assert(false);
+
+        }
     };
-    auto get_bias = [=, &bias](size_t off) -> acc_data_t {
+    auto get_bias = [=, &bias](size_t off) -> float {
 #       define CASE(dt) case dt: \
-            return (acc_data_t)(*((const prec_traits<dt>::type *)bias + off))
+            return (float)(*((const prec_traits<dt>::type *)bias + off))
         switch (conf_.cdesc()->bias_desc.data_type) {
         CASE(data_type::s8);
         CASE(data_type::u8);
@@ -117,19 +124,33 @@ void _ref_convolution_fwd_t<with_relu, src_type, wei_type, dst_type, acc_type>
     };
     parallel_nd(G, MB, OC, OD, OH, OW,
         [&](int g, int mb, int oc, int od, int oh, int ow) {
-        acc_data_t a = bias
-            ? get_bias(bias_d.off(g*OC + oc))
-            : (acc_data_t)0;
+        acc_data_t a = 0;
         ker(a, g, mb, oc, od, oh, ow);
-        if (with_relu && a < (acc_data_t)0)
-            a = (acc_data_t)((float)a * nslope);
+
+        float a_fp = (float)a;
+
+        if (bias)
+            a_fp += get_bias(bias_d.off(g*OC + oc));
+
+        if (with_relu && a_fp < 0)
+            a_fp *= nslope;
+
+        if (data_traits<dst_data_t>::data_type != data_type::f32) {
+            switch (conf_.attr()->round_mode_) {
+                case round_mode::down:    a_fp = floorf(a_fp); break;
+                case round_mode::nearest: a_fp = nearbyintf(a_fp); break;
+            }
+        }
+
         if (ndims == 5)
-        dst[dst_d.off(mb, g*OC + oc, od, oh, ow)]
-        = saturate<dst_data_t>(a);
+            dst[dst_d.off(mb, g*OC + oc, od, oh, ow)] = saturate<dst_data_t>(a_fp);
+        else if (ndims == 4)
+            dst[dst_d.off(mb, g*OC + oc, oh, ow)] = saturate<dst_data_t>(a_fp);
+        else if (ndims == 3)
+            dst[dst_d.off(mb, g*OC + oc, ow)] = saturate<dst_data_t>(a_fp);
         else
-        dst[dst_d.off(mb, g*OC + oc, oh, ow)]
-        = saturate<dst_data_t>(a);
-    });
+            assert(false);
+    });
 }
 
 template <data_type_t diff_src_type, data_type_t wei_type,
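A worked sketch of the rounding-plus-saturation path added to the forward kernel above, for an s8 destination: round in fp32 according to the attribute's round_mode, then clamp to the destination range as saturate<dst_data_t> does. The helper name and explicit bounds are illustrative.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    int8_t to_s8(float a_fp, bool round_down) {
        a_fp = round_down ? floorf(a_fp) : nearbyintf(a_fp);   // down / nearest
        a_fp = std::min(127.f, std::max(-128.f, a_fp));        // saturate to s8
        return (int8_t)a_fp;
    }
    // to_s8(131.6f, false) == 127; to_s8(-3.5f, true) == -4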
@@ -180,39 +201,42 @@ void ref_convolution_bwd_data_t<diff_src_type, wei_type, diff_dst_type,
 
     auto ker = [=](acc_data_t &d, int g, int mb, int ic, int id, int ih,
             int iw) {
-        for (int oc = 0; oc < OC; ++oc) {
-            for (int kd = 0; kd < KD; ++kd) {
-                for (int kh = 0; kh < KH; ++kh) {
-                    for (int kw = 0; kw < KW; ++kw) {
-                        if (iw + padL < kw * (1 + KDW)
-                            || ih + padT < kh * (1 + KDH)
-                            || id + padFront < kd * (1 + KDD))
-                            continue;
-                        int ow = iw - kw * (1 + KDW) + padL;
-                        int oh = ih - kh * (1 + KDH) + padT;
-                        int od = id - kd * (1 + KDD) + padFront;
-                        if (ow % KSW != 0 || oh % KSH != 0 || od % KSD != 0 )
-                            continue;
-
-                        ow /= KSW;
-                        oh /= KSH;
-                        od /= KSD;
-
-                        if (od < OD && oh < OH && ow < OW) {
-                            if (ndims == 5)
-                            d += (acc_data_t)diff_dst[diff_dst_d.off(mb, g*OC
-                                + oc, od, oh, ow)] * (with_groups
-                                ? weights[weights_d.off(g, oc, ic, kd, kh, kw)]
-                                : weights[weights_d.off(oc, ic, kd, kh, kw)]);
-                            else
-                            d += (acc_data_t)diff_dst[diff_dst_d.off(mb, g*OC
-                                + oc, oh, ow)] * (with_groups
-                                ? weights[weights_d.off(g, oc, ic, kh, kw)]
-                                : weights[weights_d.off(oc, ic, kh, kw)]);
-
-                        }
-                    }
-                }
+        for (int oc = 0; oc < OC; ++oc)
+        for (int kd = 0; kd < KD; ++kd)
+        for (int kh = 0; kh < KH; ++kh)
+        for (int kw = 0; kw < KW; ++kw) {
+            if (iw + padL < kw * (1 + KDW)
+                || ih + padT < kh * (1 + KDH)
+                || id + padFront < kd * (1 + KDD))
+                continue;
+            int ow = iw - kw * (1 + KDW) + padL;
+            int oh = ih - kh * (1 + KDH) + padT;
+            int od = id - kd * (1 + KDD) + padFront;
+            if (ow % KSW != 0 || oh % KSH != 0 || od % KSD != 0)
+                continue;
+
+            ow /= KSW;
+            oh /= KSH;
+            od /= KSD;
+
+            if (od < OD && oh < OH && ow < OW) {
+                if (ndims == 5)
+                    d += (acc_data_t)diff_dst[diff_dst_d.off(mb, g*OC
+                        + oc, od, oh, ow)] * (with_groups
+                        ? weights[weights_d.off(g, oc, ic, kd, kh, kw)]
+                        : weights[weights_d.off(oc, ic, kd, kh, kw)]);
+                else if (ndims == 4)
+                    d += (acc_data_t)diff_dst[diff_dst_d.off(mb, g*OC
+                        + oc, oh, ow)] * (with_groups
+                        ? weights[weights_d.off(g, oc, ic, kh, kw)]
+                        : weights[weights_d.off(oc, ic, kh, kw)]);
+                else if (ndims == 3)
+                    d += (acc_data_t)diff_dst[diff_dst_d.off(mb, g*OC
+                        + oc, ow)] * (with_groups
+                        ? weights[weights_d.off(g, oc, ic, kw)]
+                        : weights[weights_d.off(oc, ic, kw)]);
+                else
+                    assert(false);
             }
         }
     };
@@ -233,7 +257,9 @@ void ref_convolution_bwd_data_t<diff_src_type, wei_type, diff_dst_type,
         [&](int g, int mb, int ic, int id, int ih, int iw) {
         auto ds_idx = (ndims == 5)
             ? diff_src_d.off(mb, g*IC + ic, id, ih, iw)
-            : diff_src_d.off(mb, g*IC + ic, ih, iw);
+            : (ndims == 4)
+            ? diff_src_d.off(mb, g*IC + ic, ih, iw)
+            : diff_src_d.off(mb, g*IC + ic, iw);
         acc_data_t a = bias
             ? get_bias(bias_d.off(g*IC + ic))
             : (acc_data_t)0;
@@ -289,49 +315,50 @@ void ref_convolution_bwd_weights_t<src_type, diff_wei_type, diff_dst_type,
     const int ndims = conf_.cdesc()->src_desc.ndims;
 
 auto ker = [=](acc_data_t &d, int g, int oc, int ic, int kd, int kh, int kw) {
-        for (int mb = 0; mb < MB; ++mb) {
-            for (int od = 0; od < OD; ++od) {
-                for (int oh = 0; oh < OH; ++oh) {
-                    for (int ow = 0; ow < OW; ++ow) {
-                        if (ow*KSW + kw * (1 + KDW) < padL
-                            || oh*KSH + kh * (1 + KDH) < padT
-                            || od*KSD + kd * (1 + KDD) < padFront
-                            || ow*KSW + kw * (1 + KDW) >= IW + padL
-                            || oh*KSH + kh * (1 + KDH) >= IH + padT
-                            || od*KSD + kd * (1 + KDD) >= ID + padFront)
-                            continue;
-
-                        int id = od*KSD - padFront + kd * (1 + KDD);
-                        int ih = oh*KSH - padT + kh * (1 + KDH);
-                        int iw = ow*KSW - padL + kw * (1 + KDW);
-                        if (ndims == 5)
-                        d += (acc_data_t)diff_dst[diff_dst_d.off(
-                            mb, g*OC + oc, od, oh, ow)]
-                            * src[src_d.off(mb, g*IC + ic, id, ih, iw)];
-                        else
-                        d += (acc_data_t)diff_dst[diff_dst_d.off(
-                            mb, g*OC + oc, oh, ow)]
-                            * src[src_d.off(mb, g*IC + ic, ih, iw)];
-                    }
-                }
-            }
+        for (int mb = 0; mb < MB; ++mb)
+        for (int od = 0; od < OD; ++od)
+        for (int oh = 0; oh < OH; ++oh)
+        for (int ow = 0; ow < OW; ++ow) {
+            if (ow*KSW + kw * (1 + KDW) < padL
+                || oh*KSH + kh * (1 + KDH) < padT
+                || od*KSD + kd * (1 + KDD) < padFront
+                || ow*KSW + kw * (1 + KDW) >= IW + padL
+                || oh*KSH + kh * (1 + KDH) >= IH + padT
+                || od*KSD + kd * (1 + KDD) >= ID + padFront)
+                continue;
+
+            int id = od*KSD - padFront + kd * (1 + KDD);
+            int ih = oh*KSH - padT + kh * (1 + KDH);
+            int iw = ow*KSW - padL + kw * (1 + KDW);
+            if (ndims == 5)
+                d += (acc_data_t)diff_dst[diff_dst_d.off(mb, g*OC + oc, od,
+                    oh, ow)] * src[src_d.off(mb, g*IC + ic, id, ih, iw)];
+            else if (ndims == 4)
+                d += (acc_data_t)diff_dst[diff_dst_d.off(mb, g*OC + oc, oh, ow)]
+                    * src[src_d.off(mb, g*IC + ic, ih, iw)];
+            else if (ndims == 3)
+                d += (acc_data_t)diff_dst[diff_dst_d.off(mb, g*OC + oc, ow)]
+                    * src[src_d.off(mb, g*IC + ic, iw)];
+            else
+                assert(false);
         }
     };
 
     auto ker_bias = [=](acc_data_t &d, int g, int oc) {
-        for (int mb = 0; mb < MB; ++mb) {
-            for (int od = 0; od < OD; ++od) {
-                for (int oh = 0; oh < OH; ++oh) {
-                    for (int ow = 0; ow < OW; ++ow) {
-                        if (ndims == 5)
-                        d += (acc_data_t)diff_dst[diff_dst_d.off(
-                            mb, g*OC + oc, od, oh, ow)];
-                        else
-                        d += (acc_data_t)diff_dst[diff_dst_d.off(
-                            mb, g*OC + oc, oh, ow)];
-                    }
-                }
-            }
+        for (int mb = 0; mb < MB; ++mb)
+        for (int od = 0; od < OD; ++od)
+        for (int oh = 0; oh < OH; ++oh)
+        for (int ow = 0; ow < OW; ++ow) {
+            if (ndims == 5)
+                d += (acc_data_t)diff_dst[diff_dst_d.off(mb, g*OC + oc, od, oh,
+                     ow)];
+            else if (ndims == 4)
+                d += (acc_data_t)diff_dst[diff_dst_d.off(mb, g*OC + oc, oh,
+                     ow)];
+            else if (ndims == 3)
+                d += (acc_data_t)diff_dst[diff_dst_d.off(mb, g*OC + oc, ow)];
+            else
+                assert(false);
         }
     };
 
@@ -343,27 +370,30 @@ auto ker = [=](acc_data_t &d, int g, int oc, int ic, int kd, int kh, int kw) {
                 = saturate<diff_wei_data_t>(db);
         }
 
-        for (int ic = 0; ic < IC; ++ic) {
-            for (int kd = 0; kd < KD; ++kd) {
-                for (int kh = 0; kh < KH; ++kh) {
-                    for (int kw = 0; kw < KW; ++kw) {
-                        acc_data_t dw = 0;
-                        ker(dw, g, oc, ic, kd, kh, kw);
-
-                        if (ndims == 5)
-                        {
-                        auto idx = with_groups
-                            ? diff_weights_d.off(g, oc, ic, kd, kh, kw)
-                            : diff_weights_d.off(oc, ic, kd, kh, kw);
-                        diff_weights[idx] = saturate<diff_wei_data_t>(dw);
-                        } else {
-                        auto idx = with_groups
-                            ? diff_weights_d.off(g, oc, ic, kh, kw)
-                            : diff_weights_d.off(oc, ic, kh, kw);
-                        diff_weights[idx] = saturate<diff_wei_data_t>(dw);
-                        }
-                    }
-                }
+        for (int ic = 0; ic < IC; ++ic)
+        for (int kd = 0; kd < KD; ++kd)
+        for (int kh = 0; kh < KH; ++kh)
+        for (int kw = 0; kw < KW; ++kw) {
+            acc_data_t dw = 0;
+            ker(dw, g, oc, ic, kd, kh, kw);
+
+            if (ndims == 5) {
+                auto idx = with_groups
+                    ? diff_weights_d.off(g, oc, ic, kd, kh, kw)
+                    : diff_weights_d.off(oc, ic, kd, kh, kw);
+                diff_weights[idx] = saturate<diff_wei_data_t>(dw);
+            } else if (ndims == 4) {
+                auto idx = with_groups
+                    ? diff_weights_d.off(g, oc, ic, kh, kw)
+                    : diff_weights_d.off(oc, ic, kh, kw);
+                diff_weights[idx] = saturate<diff_wei_data_t>(dw);
+            } else if (ndims == 3) {
+                auto idx = with_groups
+                    ? diff_weights_d.off(g, oc, ic, kw)
+                    : diff_weights_d.off(oc, ic, kw);
+                diff_weights[idx] = saturate<diff_wei_data_t>(dw);
+            } else {
+                assert(false);
             }
         }
     });
index bea46cb..3153e4d 100644 (file)
@@ -58,11 +58,11 @@ struct _ref_convolution_fwd_t: public cpu_primitive_t {
                 && this->cdesc_().weights_desc.data_type == wei_type
                 && this->cdesc_().accum_data_type == acc_type
                 && this->cdesc_().dst_desc.data_type == dst_type
-                && utils::implication(this->with_bias(), true
-                        && utils::implication(src_type == u8,
+                && IMPLICATION(this->with_bias(), true
+                        && IMPLICATION(src_type == u8,
                             utils::one_of(this->cdesc_().bias_desc.data_type,
                                 f32, s32, s8, u8))
-                        && utils::implication(src_type == f32,
+                        && IMPLICATION(src_type == f32,
                             this->cdesc_().bias_desc.data_type == f32))
                 && this->attr()->has_default_values();
             return ok ? status::success : status::unimplemented;
@@ -189,7 +189,7 @@ struct ref_convolution_bwd_weights_t: public cpu_primitive_t {
                 && this->desc()->diff_weights_desc.data_type == diff_wei_type
                 && this->desc()->diff_dst_desc.data_type == diff_dst_type
                 && this->desc()->accum_data_type == acc_type
-                && utils::implication(this->with_bias(),
+                && IMPLICATION(this->with_bias(),
                         this->desc()->diff_bias_desc.data_type
                         == diff_wei_type)
                 && this->attr()->has_default_values();
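
Throughout this commit, utils::implication(...) calls are replaced by the IMPLICATION macro. Assuming it keeps the usual semantics of logical implication, it behaves like the sketch below (the exact expansion lives in the library's utils header and may differ in parenthesization):

    // Hedged sketch: p -> q is equivalent to !p || q.
    #define IMPLICATION(cause, effect) (!(cause) || (effect))

    // Example: "if the primitive has a bias, the bias type must be f32"
    // reads IMPLICATION(with_bias, bias_dt == f32) and holds trivially
    // whenever with_bias is false.
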
index 4bd984d..6890c1c 100644 (file)
@@ -134,10 +134,18 @@ struct ref_deconvolution_fwd_t: public cpu_primitive_t {
                 const deconvolution_desc_t *adesc,
                 const primitive_attr_t *attr,
                 const deconvolution_fwd_pd_t *hint_fwd_pd)
-            : cpu_deconvolution_fwd_pd_t(engine, adesc, attr,
-                    hint_fwd_pd)
+            : cpu_deconvolution_fwd_pd_t(engine, adesc, attr, hint_fwd_pd)
+            , conv_pd_(nullptr)
         {}
 
+        pd_t(const pd_t &other)
+            : cpu_deconvolution_fwd_pd_t(other)
+            , conv_pd_(other.conv_pd_->clone())
+            , conv_supports_bias_(other.conv_supports_bias_)
+        {}
+
+        ~pd_t() { delete conv_pd_; }
+
         DECLARE_DECONVOLUTION_PD_T("ref:any", ref_deconvolution_fwd_t);
 
         status_t init_convolution(){
@@ -165,7 +173,7 @@ struct ref_deconvolution_fwd_t: public cpu_primitive_t {
                     /* only weights in non-double-blocked format are supported */
                     && (wei_fmt == blocked && !is_format_double_blocked(wei_fmt))
                     /* deconv reference code can process only f32 bias */
-                    && utils::implication(with_bias(),
+                    && IMPLICATION(with_bias(),
                             conv_supports_bias_ || output_f32);
                 if (ok)
                     return success;
@@ -210,7 +218,9 @@ struct ref_deconvolution_fwd_t: public cpu_primitive_t {
 
     ref_deconvolution_fwd_t(const pd_t *pd, const input_vector &inputs,
             const output_vector &outputs)
-        : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {}
+        : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), conv_p_(nullptr) {}
+
+    ~ref_deconvolution_fwd_t() { delete this->conv_p_; }
 
     virtual void execute(event_t *e) {
         switch (conf_.desc()->prop_kind) {
@@ -258,8 +268,15 @@ struct ref_deconvolution_bwd_data_t: public cpu_primitive_t {
                 const primitive_attr_t *attr,
                 const deconvolution_fwd_pd_t *hint_fwd_pd)
             : cpu_deconvolution_bwd_data_pd_t(engine, adesc, attr, hint_fwd_pd)
+            , conv_pd_(nullptr)
         {}
 
+        pd_t(const pd_t &other)
+            : cpu_deconvolution_bwd_data_pd_t(other)
+            , conv_pd_(other.conv_pd_->clone()) {}
+
+        ~pd_t() { delete conv_pd_; }
+
         DECLARE_DECONVOLUTION_PD_T("ref:any", ref_deconvolution_bwd_data_t);
 
         status_t init_convolution(){
@@ -321,7 +338,8 @@ struct ref_deconvolution_bwd_data_t: public cpu_primitive_t {
     };
     ref_deconvolution_bwd_data_t(const pd_t *pd, const input_vector &inputs,
             const output_vector &outputs)
-        : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {}
+        : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), conv_p_(nullptr) {}
+    ~ref_deconvolution_bwd_data_t() { delete this->conv_p_; }
 
     virtual void execute(event_t *e) {
         switch (conf_.desc()->prop_kind) {
@@ -346,8 +364,15 @@ struct ref_deconvolution_bwd_weights_t: public cpu_primitive_t {
                 const primitive_attr_t *attr,
                 const deconvolution_fwd_pd_t *hint_fwd_pd)
             : cpu_deconvolution_bwd_weights_pd_t(engine, adesc, attr, hint_fwd_pd)
+            , conv_pd_(nullptr)
         {}
 
+        pd_t(const pd_t &other)
+            : cpu_deconvolution_bwd_weights_pd_t(other)
+            , conv_pd_(other.conv_pd_->clone()) {}
+
+        ~pd_t() { delete conv_pd_; }
+
         DECLARE_DECONVOLUTION_PD_T("ref:any", ref_deconvolution_bwd_weights_t);
 
         status_t init_convolution(){
@@ -411,7 +436,9 @@ struct ref_deconvolution_bwd_weights_t: public cpu_primitive_t {
 
     ref_deconvolution_bwd_weights_t(const pd_t *pd, const input_vector &inputs,
             const output_vector &outputs)
-        : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd) {}
+        : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd), conv_p_(nullptr) {}
+
+    ~ref_deconvolution_bwd_weights_t() { delete this->conv_p_; }
 
     typedef typename prec_traits<data_type::f32>::type data_t;
 
index 9676ae9..b5d334a 100644 (file)
@@ -67,15 +67,18 @@ void ref_depthwise_fwd_t<data_type>::execute_forward() {
 
     const int MB = conf_.MB();
     const int C = conf_.C();
+    const int D = conf_.D();
     const int H = conf_.H();
     const int W = conf_.W();
     const auto alg_kind = conf_.desc()->alg_kind;
 
-    parallel_nd(MB, C, H, W,
-        [&](int n, int c, int h, int w) {
+    parallel_nd(MB, C, D, H, W,
+        [&](int n, int c, int d, int h, int w) {
         size_t data_off = data_d.ndims() == 4
                         ? data_d.off(n, c, h, w)
-                        : data_d.off(n, c);
+                        : data_d.ndims() == 5
+                            ? data_d.off(n, c, d, h, w)
+                            : data_d.off(n, c);
 
         data_t s_val = src[data_off];
         data_t w_val = weights[weights_d.off(c)];
index 35381aa..bd90dc1 100644 (file)
@@ -77,7 +77,7 @@ struct ref_eltwise_fwd_t: public cpu_primitive_t {
                 && one_of(desc()->prop_kind, forward_training,
                         forward_inference)
                 && everyone_is(data_type, desc()->data_desc.data_type)
-                && implication(use_generic, one_of(src_d.ndims(), 4, 5))
+                && IMPLICATION(use_generic, one_of(src_d.ndims(), 4, 5))
                 && attr()->has_default_values();
             if (!ok) return status::unimplemented;
 
index fca131e..afb21a1 100644 (file)
@@ -53,12 +53,12 @@ struct ref_inner_product_fwd_t: public cpu_primitive_t {
                 && desc()->weights_desc.data_type == wei_type
                 && desc()->accum_data_type == acc_type
                 && desc()->dst_desc.data_type == dst_type
-                && utils::implication(with_bias(),
+                && IMPLICATION(with_bias(),
                             utils::one_of(desc()->bias_desc.data_type,
                                 f32, s32, s8, u8))
                 && attr()->output_scales_.has_default_values()
                 && attr()->post_ops_.len_ <= 1
-                && utils::implication(attr()->post_ops_.len_ == 1,
+                && IMPLICATION(attr()->post_ops_.len_ == 1,
                         attr()->post_ops_.entry_[0].is_relu(true, false));
             return ok ? status::success : status::unimplemented;
         }
@@ -167,7 +167,7 @@ struct ref_inner_product_bwd_weights_t: public cpu_primitive_t {
                         this->desc()->src_desc.data_type,
                         this->desc()->diff_dst_desc.data_type,
                         this->desc()->diff_weights_desc.data_type)
-                && utils::implication(this->with_bias(),
+                && IMPLICATION(this->with_bias(),
                         data_type == this->desc()->diff_bias_desc.data_type)
                 && attr()->has_default_values();
             return ok ? status::success : status::unimplemented;
index ab92297..38b81dd 100644 (file)
@@ -29,11 +29,18 @@ namespace cpu {
 
 static inline float fast_negative_powf(float omega, float beta) {
     float Y;
+/*
+ * Y = omega^(-3/4)
+ * = 1.0f / sqrtf(omega) * sqrtf(1.0f / sqrtf(omega))
+ * = sqrtf(1.0f / sqrtf(omega)) * 1.0f / sqrtf(omega)
+ * = sqrtf(1.0f / sqrtf(omega)) / sqrtf(omega)
+ * = sqrtf(1.0f / sqrtf(omega) / omega)
+ * = sqrtf(1.0f / (sqrtf(omega) * omega))
+ */
     if (beta == 0.75f) {
-        Y = 1.0f / sqrtf(omega);
-        Y *= sqrtf(Y);
+        Y = sqrtf(1.0f / (sqrtf(omega) * omega));
     } else {
-        Y = 1.0f /powf(omega, beta);
+        Y = 1.0f / powf(omega, beta);
     }
     return Y;
 };
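
As a quick numerical check of the identity derived in the new comment (standalone, not library code):

    #include <cassert>
    #include <cmath>

    // omega^(-3/4) == sqrt(1 / (sqrt(omega) * omega)); two sqrts and a
    // division in place of a general powf.
    int main() {
        for (float omega = 0.5f; omega < 64.0f; omega *= 1.7f) {
            float ref = 1.0f / std::pow(omega, 0.75f);
            float fast = std::sqrt(1.0f / (std::sqrt(omega) * omega));
            assert(std::fabs(fast - ref) <= 1e-5f * ref);
        }
        return 0;
    }
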
index db580ce..ad89ed7 100644 (file)
@@ -69,8 +69,9 @@ struct ref_lrn_fwd_t: public cpu_primitive_t {
         case nChw8c: execute_forward<nChw8c>(); break;
         case nchw: execute_forward<nchw>(); break;
         case nhwc: execute_forward<nhwc>(); break;
-        case any: execute_forward<mkldnn_any>(); break;
-        default: break;
+        // XXX: fix compatibility with 0.14
+        // mkldnn_any is used to call ref code for arbitrary format
+        default: execute_forward<mkldnn_any>();
         }
         e->set_state(event_t::ready);
     }
@@ -117,8 +118,9 @@ struct ref_lrn_bwd_t: public cpu_primitive_t {
         case nChw8c: execute_backward<nChw8c>(); break;
         case nchw: execute_backward<nchw>(); break;
         case nhwc: execute_backward<nhwc>(); break;
-        case any: execute_backward<mkldnn_any>(); break;
-        default: break;
+        // XXX: fix compatibility with 0.14
+        // mkldnn_any is used to call ref code for arbitrary format
+        default: execute_backward<mkldnn_any>();
         }
         e->set_state(event_t::ready);
     }
index a8da7bf..4ee010d 100644 (file)
@@ -58,14 +58,15 @@ void ref_pooling_fwd_t<data_type, acc_type>::execute_forward() {
     const int padF = conf_.padFront();
     const int padT = conf_.padT();
     const int padL = conf_.padL();
+    const int padBack = conf_.padBack();
     const int padB = conf_.padB();
     const int padR = conf_.padR();
 
     const bool is_3d = conf_.desc()->src_desc.ndims == 5;
 
-    auto apply_offset = [=](int index, int offset) {
-        return (index > offset) ? index - offset : 0;
-    };
+//    auto apply_offset = [=](int index, int offset) {
+//        return (index > offset) ? index - offset : 0;
+//    };
 
     auto set_ws = [=](int mb, int oc, int od, int oh, int ow, int value) {
         if (ws) {
@@ -81,6 +82,7 @@ void ref_pooling_fwd_t<data_type, acc_type>::execute_forward() {
     };
 
     auto ker_max = [=](data_t *d, int mb, int oc, int oh, int ow) {
+        bool is_initialized = false;
         for (int kh = 0; kh < KH; ++kh) {
             for (int kw = 0; kw < KW; ++kw) {
                 const int ih = oh * SH - padT + kh;
@@ -90,9 +92,14 @@ void ref_pooling_fwd_t<data_type, acc_type>::execute_forward() {
                 if (iw < 0 || iw >= IW) continue;
 
                 auto s = src[src_d.off(mb, oc, ih, iw)];
-                if (s > d[0]) {
+                if (!is_initialized) {
                     d[0] = s;
                     set_ws(mb, oc, 1, oh, ow, kh*KW + kw);
+                    is_initialized = true;
+                } else if (d[0] < s) {
+                    d[0] = s;
+                    set_ws(mb, oc, 1, oh, ow, kh*KW + kw);
                 }
             }
         }
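
The is_initialized flag introduced above seeds the running maximum (and its workspace index) from the first in-bounds element of the window instead of a numeric_limits sentinel, so the value written before the loop no longer matters (it becomes (data_t)0 further below). A reduced sketch of the idiom:

    #include <cassert>

    // Max over a window, seeded from the first valid element; 'valid'
    // models the in-bounds checks, values are illustrative.
    int window_max(const int *v, const bool *valid, int n, int *arg) {
        bool seen = false;
        int best = 0;
        for (int k = 0; k < n; ++k) {
            if (!valid[k]) continue;
            if (!seen || v[k] > best) { best = v[k]; *arg = k; seen = true; }
        }
        return seen ? best : 0; // all-padded window: defined fallback
    }

    int main() {
        const int v[] = { 3, 7, 5 };
        const bool m[] = { true, true, true };
        int arg = -1;
        assert(window_max(v, m, 3, &arg) == 7 && arg == 1);
        return 0;
    }
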
@@ -114,6 +121,7 @@ void ref_pooling_fwd_t<data_type, acc_type>::execute_forward() {
 
         if (alg == pooling_avg_exclude_padding)
             num_summands = (ih_end - ih_start)*(iw_end - iw_start);
+        if (num_summands == 0) return;
 
         acc_data_t dst = 0;
         for (int ih = ih_start; ih < ih_end; ++ih) {
@@ -126,6 +134,7 @@ void ref_pooling_fwd_t<data_type, acc_type>::execute_forward() {
     };
 
     auto ker_max_3d = [=](data_t *d, int mb, int oc, int od, int oh, int ow) {
+        bool is_initialized = false;
         for (int kd = 0; kd < KD; ++kd) {
             for (int kh = 0; kh < KH; ++kh) {
                 for (int kw = 0; kw < KW; ++kw) {
@@ -138,9 +147,14 @@ void ref_pooling_fwd_t<data_type, acc_type>::execute_forward() {
                     if (iw < 0 || iw >= IW) continue;
 
                     auto s = src[src_d.off(mb, oc, id, ih, iw)];
-                    if (s > d[0]) {
+                    if (!is_initialized) {
                         d[0] = s;
-                        set_ws(mb, oc, od, oh, ow, kd * KH * KW + kh*KW + kw);
+                        set_ws(mb, oc, od, oh, ow, kd * KH * KW + kh * KW + kw);
+                        is_initialized = true;
+                    } else if (d[0] < s) {
+                        d[0] = s;
+                        set_ws(mb, oc, od, oh, ow, kd * KH * KW + kh * KW + kw);
                     }
                 }
             }
@@ -148,15 +162,26 @@ void ref_pooling_fwd_t<data_type, acc_type>::execute_forward() {
     };
 
     auto ker_avg_3d = [=](data_t *d, int mb, int oc, int od, int oh, int ow) {
-        auto id_start = apply_offset(od*SD, padF);
-        auto ih_start = apply_offset(oh*SH, padT);
-        auto iw_start = apply_offset(ow*SW, padL);
-        auto id_end = nstl::min(od*SD - padF + KD, ID);
-        auto ih_end = nstl::min(oh*SH - padT + KH, IH);
-        auto iw_end = nstl::min(ow*SW - padL + KW, IW);
+        auto id_start = od*SD - padF;
+        auto ih_start = oh*SH - padT;
+        auto iw_start = ow*SW - padL;
+        auto id_end = nstl::min(od*SD - padF + KD, ID + padBack);
+        auto ih_end = nstl::min(oh*SH - padT + KH, IH + padB);
+        auto iw_end = nstl::min(ow*SW - padL + KW, IW + padR);
 
-        auto num_summands = (alg == pooling_avg_include_padding) ? KW*KH*KD
-            : (ih_end - ih_start)*(iw_end - iw_start)*(id_end - id_start);
+        // case alg == pooling_avg_include_padding
+        auto num_summands = (ih_end - ih_start)*(iw_end - iw_start)*(id_end - id_start);
+
+        id_start = nstl::max(id_start, 0);
+        ih_start = nstl::max(ih_start, 0);
+        iw_start = nstl::max(iw_start, 0);
+        id_end = nstl::min(id_end, ID);
+        ih_end = nstl::min(ih_end, IH);
+        iw_end = nstl::min(iw_end, IW);
+
+        if (alg == pooling_avg_exclude_padding)
+            num_summands = (ih_end - ih_start)*(iw_end - iw_start)*(id_end - id_start);
+        if (num_summands == 0) return;
 
         acc_data_t dst = 0;
         for (int id = id_start; id < id_end; ++id) {
@@ -182,7 +207,7 @@ void ref_pooling_fwd_t<data_type, acc_type>::execute_forward() {
             data_t *d = is_3d
                 ? &dst[dst_d.off(mb, oc, od, oh, ow)]
                 : &dst[dst_d.off(mb, oc, oh, ow)];
-                d[0] = nstl::numeric_limits<data_t>::lowest();
+                d[0] = (data_t)0;
                 set_ws(mb, oc, od, oh, ow, 0);
                 if (is_3d) ker_max_3d(d, mb, oc, od, oh, ow);
                 else ker_max(d, mb, oc, oh, ow);
@@ -193,7 +218,7 @@ void ref_pooling_fwd_t<data_type, acc_type>::execute_forward() {
             data_t *d = is_3d
                 ? &dst[dst_d.off(mb, oc, od, oh, ow)]
                 : &dst[dst_d.off(mb, oc, oh, ow)];
-            d[0] = 0;
+            d[0] = (data_t)0;
             if (is_3d) ker_avg_3d(d, mb, oc, od, oh, ow);
             else ker_avg(d, mb, oc, oh, ow);
         });
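
The reworked ker_avg_3d computes its divisor in two stages: num_summands is first taken over the window clipped to the padded volume (include-padding semantics), then, after the bounds are re-clipped to real data, the exclude-padding branch recounts. For a 3-wide window at the left edge with one pad column, include-padding divides by 3 and exclude-padding by 2; the num_summands == 0 early return guards windows that lie entirely in padding. A 1-D analogue, assuming symmetric padding (the kernel tracks padF/padBack etc. per side):

    int avg_divisor(int o, int S, int K, int pad, int I, bool include_padding) {
        int start = o * S - pad;              // window start, may be negative
        int end = start + K;
        if (end > I + pad) end = I + pad;     // clip to the padded line
        int n = end - start;                  // include-padding count
        if (start < 0) start = 0;             // re-clip to real data
        if (end > I) end = I;
        if (!include_padding) n = end - start;
        return n;                             // caller must skip n == 0
    }
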
index 5dbedc9..b2be03b 100644 (file)
@@ -106,7 +106,7 @@ struct ref_pooling_bwd_t: public cpu_primitive_t {
                         pooling_avg_exclude_padding)
                 && utils::everyone_is(data_type, diff_dst_pd()->desc()->data_type,
                         diff_src_pd()->desc()->data_type)
-                && utils::implication(desc()->alg_kind == pooling_max,
+                && IMPLICATION(desc()->alg_kind == pooling_max,
                         hint_fwd_pd_ && hint_fwd_pd_->workspace_pd()
                         && hint_fwd_pd_->workspace_pd()->engine()->kind()
                                 == engine_kind::cpu)
index c1f0612..122b424 100644 (file)
@@ -28,6 +28,7 @@
   only the cell execution function should be impacted
 
  */
+
 #include "c_types_map.hpp"
 #include "math_utils.hpp"
 #include "mkldnn_thread.hpp"
@@ -48,6 +49,13 @@ using namespace alg_kind;
 
 #define AOC array_offset_calculator
 
+inline float one_m_square(float x) {
+    return (1.0f - x) * (1.0f + x);
+}
+inline float x_m_square(float x) {
+    return (1.0f - x) * x;
+}
+
 template <>
 float activation<alg_kind::eltwise_relu, prop_kind::forward>(
         float dd, float s, float alpha, float cliping) {
@@ -69,7 +77,19 @@ float activation<alg_kind::eltwise_tanh, prop_kind::forward>(
 template <>
 float activation<alg_kind::eltwise_tanh, prop_kind::backward>(
         float dd, float s, float alpha, float cliping) {
-    return tanh_bwd<float>(dd, s);
+    return dd * one_m_square(s);
+}
+
+template <>
+float activation<alg_kind::eltwise_logistic, prop_kind::forward>(
+        float dd, float s, float alpha, float cliping) {
+    return logistic_fwd<float>(s);
+}
+
+template <>
+float activation<alg_kind::eltwise_logistic, prop_kind::backward>(
+        float dd, float s, float alpha, float cliping) {
+    return dd * x_m_square(s);
 }
 
 //************************* Cell execution *************************//
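
The helpers above exploit that both activations have derivatives expressible in the already-stored forward output y: tanh'(x) = 1 - y^2 (one_m_square) and logistic'(x) = y(1 - y) (x_m_square), so the backward pass needs no saved input; the factored (1 - y)(1 + y) form presumably also loses less precision than 1 - y*y for |y| near 1. A quick self-check against finite differences:

    #include <cassert>
    #include <cmath>

    float one_m_square(float y) { return (1.0f - y) * (1.0f + y); } // tanh'
    float x_m_square(float y) { return (1.0f - y) * y; }       // logistic'

    int main() {
        const float x = 0.3f, h = 1e-3f;
        float t = std::tanh(x);
        float s = 1.0f / (1.0f + std::exp(-x));
        float dt = (std::tanh(x + h) - std::tanh(x - h)) / (2.0f * h);
        float ds = (1.0f / (1.0f + std::exp(-(x + h)))
                  - 1.0f / (1.0f + std::exp(-(x - h)))) / (2.0f * h);
        assert(std::fabs(one_m_square(t) - dt) < 1e-3f);
        assert(std::fabs(x_m_square(s) - ds) < 1e-3f);
        return 0;
    }
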
@@ -77,41 +97,41 @@ float activation<alg_kind::eltwise_tanh, prop_kind::backward>(
 /// particularly?
 template <>
 elemwise_sig(_ref_rnn_common_t<prop_kind::forward>::rnn_elemwise) {
-    AOC<float, 3> ws_gates(ws_gates_, batch, n_gates, dic);
+    AOC<float, 2> ws_gates(ws_gates_, batch, conf_.GC());
     AOC<const float, 2> bias(bias_, n_gates, dic);
-    AOC<float, 3> states_t_l(states_t_l_, n_states, batch, wic);
+    AOC<float, 4> states_t_l(states_t_l_, n_states, iter_stride, batch, wic);
     parallel_nd(batch, [&](int i) {
         for (int j = 0; j < dic; j++) {
-            const float h
-                    = activation_func(0, ws_gates(i, 0, j) + bias(0, j), 0, 0);
-            ws_gates(i, 0, j) = states_t_l(0, i, j) = h;
+            const float h =
+                activation_func(0, ws_gates(i, j) + bias(0, j), 0, 0);
+            ws_gates(i, j) = states_t_l(0, 0, i, j) = h;
         }
     });
 }
 
 template <>
 elemwise_sig(_ref_rnn_common_t<prop_kind::backward>::rnn_elemwise) {
-    AOC<float, 3> ws_gates(ws_gates_, batch, n_gates, dic);
-    AOC<float, 3> diff_states_tp1_l(
-            diff_states_tp1_l_, n_states + 1, batch, wic);
-    AOC<float, 3> diff_states_t_lp1(
-            diff_states_t_lp1_, n_states + 1, batch, wic);
+    AOC<float, 2> ws_gates(ws_gates_, batch, conf_.GC());
+    AOC<float, 4> diff_states_tp1_l(
+            diff_states_tp1_l_, n_states + 1, iter_stride, batch, wic);
+    AOC<float, 4> diff_states_t_lp1(
+            diff_states_t_lp1_, n_states + 1, iter_stride, batch, wic);
     parallel_nd(batch, [&](int i) {
         for (int j = 0; j < dic; ++j) {
-            const float dH = diff_states_t_lp1(n_states, i, j)
-                    + diff_states_tp1_l(0, i, j);
-            auto g = ws_gates(i, 0, j);
-            ws_gates(i, 0, j) = activation_func(dH, g, 0, 0);
+            const float dH = diff_states_t_lp1(n_states, 0, i, j)
+                + diff_states_tp1_l(0, 0, i, j);
+            auto g = ws_gates(i, j);
+            ws_gates(i, j) = activation_func(dH, g, 0, 0);
         }
     });
 }
 
 template <>
 elemwise_sig(_ref_rnn_common_t<prop_kind::forward>::lstm_elemwise) {
-    AOC<float, 3> ws_gates(ws_gates_, batch, n_gates, dic);
+    AOC<float, 2> ws_gates(ws_gates_, batch, conf_.GC());
     AOC<const float, 2> bias(bias_, n_gates, dic);
-    AOC<float, 3> states_t_l(states_t_l_, n_states, batch, wic);
-    AOC<float, 3> states_tm1_l(states_tm1_l_, n_states, batch, wic);
+    AOC<float, 4> states_t_l(states_t_l_, n_states, iter_stride, batch, wic);
+    AOC<float, 4> states_tm1_l(states_tm1_l_, n_states, iter_stride, batch, wic);
 
     parallel_nd(batch, [&](int i) {
 // WA. Loss of correctness in case of simd loop unrolling with icc 18
@@ -119,66 +139,65 @@ elemwise_sig(_ref_rnn_common_t<prop_kind::forward>::lstm_elemwise) {
         PRAGMA_OMP_SIMD()
 #endif
         for (int j = 0; j < dic; j++) {
-            ws_gates(i, 0, j) = logistic_fwd(ws_gates(i, 0, j) + bias(0, j));
-            ws_gates(i, 1, j) = logistic_fwd(ws_gates(i, 1, j) + bias(1, j));
-            ws_gates(i, 2, j) = logistic_fwd(ws_gates(i, 2, j) + bias(2, j));
-            ws_gates(i, 3, j) = tanh_fwd(ws_gates(i, 3, j) + bias(3, j));
-
-            float tmp = ws_gates(i, 0, j) * states_tm1_l(1, i, j)
-                    + ws_gates(i, 1, j) * ws_gates(i, 3, j);
-            states_t_l(0, i, j) = ws_gates(i, 2, j) * tanh_fwd(tmp);
-            states_t_l(1, i, j) = tmp;
+            ws_gates(i, 0 * dic + j) = logistic_fwd(ws_gates(i, 0 * dic + j) + bias(0, j));
+            ws_gates(i, 1 * dic + j) = logistic_fwd(ws_gates(i, 1 * dic + j) + bias(1, j));
+            ws_gates(i, 2 * dic + j) = tanh_fwd(ws_gates(i, 2 * dic + j) + bias(2, j));
+            ws_gates(i, 3 * dic + j) = logistic_fwd(ws_gates(i, 3 * dic + j) + bias(3, j));
+
+            float tmp = ws_gates(i, 1 * dic + j) * states_tm1_l(1, 0, i, j)
+                    + ws_gates(i, 0 * dic + j) * ws_gates(i, 2 * dic + j);
+            states_t_l(0, 0, i, j) = ws_gates(i, 3 * dic + j) * tanh_fwd(tmp);
+            states_t_l(1, 0, i, j) = tmp;
         }
     });
 }
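
With the reindexing above, ws_gates is a flat batch x conf_.GC() array whose four dic-wide slices hold, in order, the input gate i, forget gate f, candidate c~ (the one passed through tanh) and output gate o, and the update is the textbook LSTM cell: c_t = f (*) c_{t-1} + i (*) c~, then h_t = o (*) tanh(c_t). One scalar lane of that update, gate pre-activations already biased (illustrative, not library code):

    #include <cmath>

    struct lstm_lane_t { float c, h; };

    // Slot layout per the kernel above: 0 = i, 1 = f, 2 = c~, 3 = o.
    lstm_lane_t lstm_step(float gi, float gf, float gc, float go, float c_prev) {
        auto logistic = [](float x) { return 1.0f / (1.0f + std::exp(-x)); };
        float i = logistic(gi), f = logistic(gf), o = logistic(go);
        float c_tilde = std::tanh(gc);
        float c = f * c_prev + i * c_tilde; // 'tmp' in the kernel
        return { c, o * std::tanh(c) };
    }
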
 
 template <>
 elemwise_sig(_ref_rnn_common_t<prop_kind::backward>::lstm_elemwise) {
-    AOC<float, 3> ws_gates(ws_gates_, batch, n_gates, dic);
+    AOC<float, 2> ws_gates(ws_gates_, batch, conf_.GC());
     AOC<const float, 2> bias(bias_, n_gates, dic);
-    AOC<float, 3> states_t_l(states_t_l_, n_states, batch, wic);
-    AOC<float, 3> states_tm1_l(states_tm1_l_, n_states, batch, wic);
-    AOC<float, 3> diff_states_t_l(diff_states_t_l_, n_states + 1, batch, wic);
-    AOC<float, 3> diff_states_tp1_l(
-            diff_states_tp1_l_, n_states + 1, batch, wic);
-    AOC<float, 3> diff_states_t_lp1(
-            diff_states_t_lp1_, n_states + 1, batch, wic);
-
-    auto one_m_square = [](float a) -> float { return 1.0f - a * a; };
+    AOC<float, 4> states_t_l(states_t_l_, n_states, iter_stride, batch, wic);
+    AOC<float, 4> states_tm1_l(states_tm1_l_, n_states, iter_stride, batch, wic);
+    AOC<float, 4> diff_states_t_l(diff_states_t_l_, n_states + 1, iter_stride, batch, wic);
+    AOC<float, 4> diff_states_tp1_l(
+            diff_states_tp1_l_, n_states + 1, iter_stride, batch, wic);
+    AOC<float, 4> diff_states_t_lp1(
+            diff_states_t_lp1_, n_states + 1, iter_stride, batch, wic);
 
     parallel_nd(batch, [&](int i) {
         PRAGMA_OMP_SIMD()
         for (int j = 0; j < dic; j++) {
-            float Ct = states_t_l(1, i, j);
+            float Ct = states_t_l(1, 0, i, j);
             /// @todo save it in the workspace in fwd pass or recompute it to
             /// save bw
             float tanhCt = tanh_fwd(Ct);
             // we have 2 incoming diffs on Ht
-            float dHt = diff_states_tp1_l(0, i, j)
-                    + diff_states_t_lp1(n_states, i, j);
-            float dCt = diff_states_tp1_l(1, i, j)
-                    + one_m_square(tanhCt) * ws_gates(i, 2, j) * dHt;
-
-            float dG0 = states_tm1_l(1, i, j)
-                    * logistic_bwd(dCt, ws_gates(i, 0, j));
-            float dG1
-                    = ws_gates(i, 3, j) * logistic_bwd(dCt, ws_gates(i, 1, j));
-            float dG2 = logistic_bwd(tanhCt * dHt, ws_gates(i, 2, j));
-            float dG3 = ws_gates(i, 1, j) * tanh_bwd(dCt, ws_gates(i, 3, j));
-
-            diff_states_t_l(1, i, j) = dCt * ws_gates(i, 0, j);
-
-            ws_gates(i, 0, j) = dG0;
-            ws_gates(i, 1, j) = dG1;
-            ws_gates(i, 2, j) = dG2;
-            ws_gates(i, 3, j) = dG3;
+            float dHt = diff_states_tp1_l(0, 0, i, j)
+                    + diff_states_t_lp1(n_states, 0, i, j);
+            float dCt = diff_states_tp1_l(1, 0, i, j)
+                    + one_m_square(tanhCt) * ws_gates(i, 3 * dic + j) * dHt;
+
+            float dG1 = states_tm1_l(1, 0, i, j) * dCt
+                    * x_m_square(ws_gates(i, 1 * dic + j));
+            float dG0 = ws_gates(i, 2 * dic + j) * dCt
+                    * x_m_square(ws_gates(i, 0 * dic + j));
+            float dG3 = tanhCt * dHt * x_m_square(ws_gates(i, 3 * dic + j));
+            float dG2 = ws_gates(i, 0 * dic + j) * dCt
+                    * one_m_square(ws_gates(i, 2 * dic + j));
+
+            diff_states_t_l(1, 0, i, j) = dCt * ws_gates(i, 1 * dic + j);
+
+            ws_gates(i, 0 * dic + j) = dG0;
+            ws_gates(i, 1 * dic + j) = dG1;
+            ws_gates(i, 2 * dic + j) = dG2;
+            ws_gates(i, 3 * dic + j) = dG3;
         }
     });
 }
 
 template <prop_kind_t aprop>
 gemm_sig(_ref_rnn_common_t<aprop>::packed_gemm) {
-#if USE_MKL_PACKED_GEMM
+#if (USE_MKL_PACKED_GEMM)
     cblas_sgemm_compute(CblasColMajor, CblasPacked,
             is_B_trans ? CblasTrans : CblasNoTrans, m, n, k, a_, strideA_m, b_,
             is_B_trans ? strideB_n : strideB_k, beta, c_, strideC_m);
@@ -200,19 +219,20 @@ gemm_sig(_ref_rnn_common_t<aprop>::gemm) {
     float alpha = 1.f;
     extended_sgemm("N", is_B_trans ? "T" : "N", &m, &n, &k, &alpha,
             a_, &strideA_m, b_, is_B_trans ? &strideB_n : &strideB_k, &beta,
-            c_, &strideC_m);
+            c_, &strideC_m, nullptr, use_jit_sgemm_);
 }
 
 template <prop_kind_t aprop>
-void _ref_rnn_common_t<aprop>::gates_reduction(int n_gates, int dic, int batch,
+void _ref_rnn_common_t<aprop>::gates_reduction(int n_gates, int dic, int wic, int batch,
         const float *ws_gates_, float *diff_bias_) {
     auto body = [&](int i, int k) {
         for (int j = 0; j < batch; j++)
-            diff_bias_[i * dic + k] += ws_gates_[(j * n_gates + i) * dic + k];
+            diff_bias_[i * dic + k]
+                    += ws_gates_[j * conf_.GC() + i * dic + k];
     };
 
     // @todo block k on simd-width
-#if (_OPENMP >= 201307) \
+#if MKLDNN_THR == MKLDNN_THR_OMP && _OPENMP >= 201307 \
     /* icc 17.0 has a problem with simd collapse */ \
     && !((defined __INTEL_COMPILER) && (__INTEL_COMPILER == 1700))
 #pragma omp parallel for simd collapse(2)
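
With the padded layout, gates_reduction accumulates the bias gradient as diff_bias[g][k] = sum over the batch of ws_gates[b][g*dic + k], where consecutive batch rows are conf_.GC() floats apart and GC() may exceed n_gates*dic. A plain reference loop for what the parallel version computes (GC assumed to be the padded leading dimension):

    // No parallelism, no pragmas: the reduction itself.
    void gates_reduction_ref(int n_gates, int dic, int GC, int batch,
            const float *ws_gates, float *diff_bias) {
        for (int g = 0; g < n_gates; ++g)
            for (int k = 0; k < dic; ++k)
                for (int b = 0; b < batch; ++b)
                    diff_bias[g * dic + k] += ws_gates[b * GC + g * dic + k];
    }
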
@@ -227,82 +247,92 @@ void _ref_rnn_common_t<aprop>::gates_reduction(int n_gates, int dic, int batch,
 ///  to pass argument for empty function is too big
 template <>
 cell_execution_sig(_ref_rnn_common_t<prop_kind::forward>::cell_execution) {
-    (this->*gemm_input_func)(n_gates * dic, batch, slc, n_gates * dic, slc,
-            batch, wic, n_gates * dic, batch, w_input_[0], states_t_lm1_,
-            ws_gates_, false, 0.0f);
-    (this->*gemm_state_func)(n_gates * dic, batch, sic, n_gates * dic, sic,
-            batch, wic, n_gates * dic, batch, w_state_[0], states_tm1_l_,
+    if (!merge_gemm_layer) {
+        (this->*gemm_input_func)(n_gates * dic, batch, slc, conf_.WL_GLD(), slc,
+                batch, wic, conf_.GC(), batch, w_input_[0], states_t_lm1_,
+                ws_gates_, false, 0.0f);
+    }
+    (this->*gemm_state_func)(n_gates * dic, batch, sic, conf_.WI_GLD(), sic,
+            batch, wic, conf_.GC(), batch, w_state_[0], states_tm1_l_,
             ws_gates_, false, 1.0f);
-    (this->*elemwise_func)(dic, wic, batch, n_states, n_gates, ws_gates_,
+    (this->*elemwise_func)(dic, wic, batch, n_states, iter_stride, n_gates, ws_gates_,
             states_t_l_, states_t_lm1_, states_tm1_l_, diff_states_t_l_,
             diff_states_t_lp1_, diff_states_tp1_l_, bias_, ws_grid_, ws_cell_);
 }
 
 template <>
 cell_execution_sig(_ref_rnn_common_t<prop_kind::backward>::cell_execution) {
-    (this->*elemwise_func)(dic, wic, batch, n_states, n_gates, ws_gates_,
+    (this->*elemwise_func)(dic, wic, batch, n_states, iter_stride, n_gates, ws_gates_,
             states_t_l_, states_t_lm1_, states_tm1_l_, diff_states_t_l_,
             diff_states_t_lp1_, diff_states_tp1_l_, bias_, ws_grid_, ws_cell_);
 
     /// bwd by data on the cell
-    (this->*gemm_state_func)(sic, batch, n_gates * dic, sic, n_gates * dic,
-            batch, n_gates * dic, wic, batch, w_state_[0], ws_gates_,
-            diff_states_t_l_, false, 0.0f);
-    (this->*gemm_input_func)(slc, batch, n_gates * dic, slc, n_gates * dic,
-            batch, n_gates * dic, wic, batch, w_input_[0], ws_gates_,
-            diff_states_t_l_ + n_states * (batch * wic), false, 0.0f);
-
-    /// bwd by weights on the cell
-    gemm(n_gates * dic, slc, batch, n_gates * dic, batch, wic, batch,
-            n_gates * dic, slc, ws_gates_, states_t_lm1_, diff_w_input_, true,
-            1.0f);
-    gemm(n_gates * dic, sic, batch, n_gates * dic, batch, wic, batch,
-            n_gates * dic, sic, ws_gates_, states_tm1_l_, diff_w_state_, true,
-            1.0f);
+    (this->*gemm_state_func)(sic, batch, n_gates * dic, conf_.WI_GLD(),
+            n_gates * dic, batch, conf_.GC(), wic, batch, w_state_[0],
+            ws_gates_, diff_states_t_l_, false, 0.0f);
+
+    if (!merge_gemm_layer) {
+        (this->*gemm_input_func)(slc, batch, n_gates * dic, conf_.WL_GLD(),
+                n_gates * dic, batch, conf_.GC(), wic, batch, w_input_[0],
+                ws_gates_,
+                diff_states_t_l_ + n_states * iter_stride * (batch * wic),
+                false, 0.0f);
+
+        /// bwd by weights on the cell
+        gemm(n_gates * dic, slc, batch, conf_.GC(), batch, wic, batch,
+                conf_.DWL_GLD(), slc, ws_gates_, states_t_lm1_, diff_w_input_,
+                true, 1.0f);
+    }
 
+    if (!merge_gemm_iter)
+        gemm(n_gates * dic, sic, batch, conf_.GC(), batch, wic, batch,
+                conf_.DWI_GLD(), sic, ws_gates_, states_tm1_l_, diff_w_state_,
+                true, 1.0f);
     /// bwd by bias we just accumulate diffs from the gates
-    gates_reduction(n_gates, dic, batch, ws_gates_, diff_bias_);
+    gates_reduction(n_gates, dic, wic, batch, ws_gates_, diff_bias_);
 }
 
 template <>
 cell_execution_sig(_ref_rnn_common_t<prop_kind::forward>::cell_execution_gru) {
-    AOC<float, 3> ws_gates(ws_gates_, batch, n_gates, dic);
+    AOC<float, 2> ws_gates(ws_gates_, batch, conf_.GC());
     AOC<const float, 2> bias(bias_, n_gates, dic);
     AOC<float, 2> states_t_l(states_t_l_, batch, wic);
     AOC<float, 2> states_tm1_l(states_tm1_l_, batch, wic);
 
     // 1. gemm Wx[0-2],x
-    (this->*gemm_input_func)(n_gates * dic, batch, slc, n_gates * dic, slc,
-            batch, wic, n_gates * dic, batch, w_input_[0], states_t_lm1_,
-            ws_gates_, false, 0.0f);
+    if (!merge_gemm_layer) {
+        (this->*gemm_input_func)(n_gates * dic, batch, slc, conf_.WL_GLD(), slc,
+                batch, wic, conf_.GC(), batch, w_input_[0], states_t_lm1_,
+                ws_gates_, false, 0.0f);
+    }
 
     // 2. gemm Wh[0-1],h
-    (this->*gemm_state_func)((n_gates - 1)*dic, batch, sic, n_gates * dic, sic,
-            batch, wic, n_gates * dic, batch, w_state_[0], states_tm1_l_,
+    (this->*gemm_state_func)((n_gates - 1) * dic, batch, sic, conf_.WI_GLD(),
+            sic, batch, wic, conf_.GC(), batch, w_state_[0], states_tm1_l_,
             ws_gates_, false, 1.0f);
 
     // 3. activation zt and rt + elemwise multiplication rt,ht-1
     parallel_nd(batch, [&](int i) {
         PRAGMA_OMP_SIMD()
         for (int j = 0; j < dic; j++) {
-            ws_gates(i, 0, j) = logistic_fwd(ws_gates(i, 0, j) + bias(0, j));
-            ws_gates(i, 1, j) = logistic_fwd(ws_gates(i, 1, j) + bias(1, j));
-            states_t_l(i, j) = states_tm1_l(i, j) * ws_gates(i, 1, j);
+            ws_gates(i, 0 * dic + j) = logistic_fwd(ws_gates(i, 0 * dic + j) + bias(0, j));
+            ws_gates(i, 1 * dic + j) = logistic_fwd(ws_gates(i, 1 * dic + j) + bias(1, j));
+            states_t_l(i, j) = states_tm1_l(i, j) * ws_gates(i, 1 * dic + j);
         }
     });
 
     // 4. gemm Wh[2],h~t
-    (this->*gemm_state_func)(dic, batch, sic, n_gates * dic, sic,
-            batch, wic, n_gates * dic, batch, w_state_[1], states_t_l_,
-            &(ws_gates(0, 2, 0)), false, 1.0f);
+    (this->*gemm_state_func)(dic, batch, sic, conf_.WI_GLD(), sic, batch, wic,
+            conf_.GC(), batch, w_state_[1], states_t_l_,
+            &(ws_gates(0, 2 * dic)), false, 1.0f);
 
     // 5. activation h~t + calculate ht
     parallel_nd(batch, [&](int i) {
         PRAGMA_OMP_SIMD()
         for (int j = 0; j < dic; j++) {
-            ws_gates(i, 2, j) = tanh_fwd(ws_gates(i, 2, j) + bias(2, j));
-            states_t_l(i, j) = states_tm1_l(i, j) * ws_gates(i, 0, j) +
-                (1.0f - ws_gates(i, 0, j)) * ws_gates(i, 2, j);
+            ws_gates(i, 2 * dic + j) = tanh_fwd(ws_gates(i, 2 * dic + j) + bias(2, j));
+            states_t_l(i, j) = states_tm1_l(i, j) * ws_gates(i, 0 * dic + j) +
+                (1.0f - ws_gates(i, 0 * dic + j)) * ws_gates(i, 2 * dic + j);
         }
     });
 }
@@ -310,24 +340,24 @@ cell_execution_sig(_ref_rnn_common_t<prop_kind::forward>::cell_execution_gru) {
 template <>
 elemwise_sig(_ref_rnn_common_t<prop_kind::forward>::gru_lbr_elemwise) {
     bool is_training = conf_.is_training();
-    AOC<float, 3> ws_gates(ws_gates_, batch, n_gates, dic);
+    AOC<float, 2> ws_gates(ws_gates_, batch, conf_.GC());
     AOC<float, 2> ws_Wh_b(ws_grid_, batch, dic);
     AOC<const float, 2> bias(bias_, n_gates + 1, dic);
     AOC<float, 2> states_t_l(states_t_l_, batch, wic);
     AOC<float, 2> states_tm1_l(states_tm1_l_, batch, wic);
-    AOC<float, 3> ws_gemm_state(ws_cell_, batch, n_gates, dic);
+    AOC<float, 2> ws_gemm_state(ws_cell_, batch, conf_.GC());
     parallel_nd(batch, [&](int i) {
         PRAGMA_OMP_SIMD()
         for (int j = 0; j < dic; j++) {
-            float Wh_b = ws_gemm_state(i, 2, j) + bias(3, j);
-            ws_gates(i, 0, j) = logistic_fwd(ws_gates(i, 0, j) +
-                ws_gemm_state(i, 0, j) + bias(0, j));
-            ws_gates(i, 1, j) = logistic_fwd(ws_gates(i, 1, j) +
-                ws_gemm_state(i, 1, j) + bias(1, j));
-            ws_gates(i, 2, j) = tanh_fwd(ws_gates(i, 2, j) +
-                ws_gates(i, 1, j) * Wh_b + bias(2, j));
-            states_t_l(i, j) = states_tm1_l(i, j) * ws_gates(i, 0, j) +
-                (1.0f - ws_gates(i, 0, j)) * ws_gates(i, 2, j);
+            float Wh_b = ws_gemm_state(i, 2 * dic + j) + bias(3, j);
+            ws_gates(i, 0 * dic + j) = logistic_fwd(ws_gates(i, 0 * dic + j) +
+                ws_gemm_state(i, j) + bias(0, j));
+            ws_gates(i, 1 * dic + j) = logistic_fwd(ws_gates(i, 1 * dic + j) +
+                ws_gemm_state(i, dic + j) + bias(1, j));
+            ws_gates(i, 2 * dic + j) = tanh_fwd(ws_gates(i, 2 * dic + j) +
+                ws_gates(i, 1 * dic + j) * Wh_b + bias(2, j));
+            states_t_l(i, j) = states_tm1_l(i, j) * ws_gates(i, 0 * dic + j) +
+                (1.0f - ws_gates(i, 0 * dic + j)) * ws_gates(i, 2 * dic + j);
             if (is_training) ws_Wh_b(i, j) = Wh_b;
         }
     });
@@ -335,27 +365,29 @@ elemwise_sig(_ref_rnn_common_t<prop_kind::forward>::gru_lbr_elemwise) {
 
 template <>
 cell_execution_sig(_ref_rnn_common_t<prop_kind::forward>::cell_execution_gru_lbr) {
-    (this->*gemm_input_func)(n_gates * dic, batch, slc, n_gates * dic, slc,
-            batch, wic, n_gates * dic, batch, w_input_[0], states_t_lm1_,
-            ws_gates_, false, 0.0f);
-    (this->*gemm_state_func)(n_gates * dic, batch, sic, n_gates * dic, sic,
-            batch, wic, n_gates * dic, batch, w_state_[0], states_tm1_l_,
-            ws_cell_, false, 0.0f);
-    (this->*elemwise_func)(dic, wic, batch, n_states, n_gates, ws_gates_,
+    if (!merge_gemm_layer) {
+        (this->*gemm_input_func)(n_gates * dic, batch, slc, conf_.WL_GLD(), slc,
+                batch, wic, conf_.GC(), batch, w_input_[0], states_t_lm1_,
+                ws_gates_, false, 0.0f);
+    }
+    (this->*gemm_state_func)(n_gates * dic, batch, sic, conf_.WI_GLD(), sic,
+            batch, wic, conf_.GC(), batch, w_state_[0], states_tm1_l_, ws_cell_,
+            false, 0.0f);
+    (this->*elemwise_func)(dic, wic, batch, n_states, iter_stride, n_gates, ws_gates_,
             states_t_l_, states_t_lm1_, states_tm1_l_, diff_states_t_l_,
             diff_states_t_lp1_, diff_states_tp1_l_, bias_, ws_grid_, ws_cell_);
 }
 
 template <>
 elemwise_sig(_ref_rnn_common_t<prop_kind::backward>::gru_lbr_elemwise) {
-    AOC<float, 3> ws_gates(ws_gates_, batch, n_gates, dic);
+    AOC<float, 2> ws_gates(ws_gates_, batch, conf_.GC());
     AOC<const float, 2> states_tm1_l(states_tm1_l_, batch, wic);
-    AOC<float, 3> diff_states_t_l(diff_states_t_l_, n_states + 1, batch, wic);//dht-1 dxt
-    AOC<float, 3> diff_states_tp1_l(
-            diff_states_tp1_l_, n_states + 1, batch, wic);
-    AOC<float, 3> diff_states_t_lp1(
-            diff_states_t_lp1_, n_states + 1, batch, wic);
-    AOC<float, 3> ws_gates_r(ws_cell_, batch, n_gates, dic);
+    AOC<float, 4> diff_states_t_l(diff_states_t_l_, n_states + 1, iter_stride, batch, wic); // dht-1 dxt
+    AOC<float, 4> diff_states_tp1_l(
+            diff_states_tp1_l_, n_states + 1, iter_stride, batch, wic);
+    AOC<float, 4> diff_states_t_lp1(
+            diff_states_t_lp1_, n_states + 1, iter_stride, batch, wic);
+    AOC<float, 2> ws_gates_r(ws_cell_, batch, conf_.GC());
     AOC<float, 2> ws_Wh_b(ws_grid_, batch, dic);
 
     // 1. calculate dG1 dG2 dG3
@@ -366,20 +398,20 @@ elemwise_sig(_ref_rnn_common_t<prop_kind::backward>::gru_lbr_elemwise) {
         PRAGMA_OMP_SIMD()
         for (int j = 0; j < dic; j++) {
             float h = states_tm1_l(i, j);
-            float dHt = diff_states_tp1_l(0, i, j)
-                    + diff_states_t_lp1(n_states, i, j);
-            float dG0 = (h - ws_gates(i, 2, j))
-                    * logistic_bwd(dHt, ws_gates(i, 0, j));
-            float dG2 = (1.0f - ws_gates(i, 0, j)) * dHt;
-            float dG1
-                    = ws_Wh_b(i, j) * logistic_bwd(dG2, ws_gates(i, 1, j));
-            dG2 *= tanh_bwd(1.0f, ws_gates(i, 2, j));
-
-            diff_states_t_l(0, i, j) = dHt * ws_gates(i, 0, j);
-            ws_gates(i, 2, j) = dG2;
-            ws_gates_r(i, 2, j) = dG2 * ws_gates(i, 1, j);
-            ws_gates(i, 0, j) = ws_gates_r(i, 0, j) = dG0;
-            ws_gates(i, 1, j) = ws_gates_r(i, 1, j) = dG1;
+            float dHt = diff_states_tp1_l(0, 0, i, j)
+                    + diff_states_t_lp1(n_states, 0, i, j);
+            float dG0 = (h - ws_gates(i, 2 * dic + j)) * dHt
+                    * x_m_square(ws_gates(i, 0 * dic + j));
+            float dG2 = (1.0f - ws_gates(i, 0 * dic + j))
+                    * one_m_square(ws_gates(i, 2 * dic + j)) * dHt;
+            float dG1 = ws_Wh_b(i, j) * dG2
+                    * x_m_square(ws_gates(i, 1 * dic + j));
+
+            diff_states_t_l(0, 0, i, j) = dHt * ws_gates(i, 0 * dic + j);
+            ws_gates(i, 2 * dic + j) = dG2;
+            ws_gates_r(i, 2 * dic + j) = dG2 * ws_gates(i, 1 * dic + j);
+            ws_gates(i, 0 * dic + j) = ws_gates_r(i, 0 * dic + j) = dG0;
+            ws_gates(i, 1 * dic + j) = ws_gates_r(i, 1 * dic + j) = dG1;
         }
     });
 }
@@ -387,52 +419,57 @@ elemwise_sig(_ref_rnn_common_t<prop_kind::backward>::gru_lbr_elemwise) {
 template <>
 cell_execution_sig(_ref_rnn_common_t<prop_kind::backward>::cell_execution_gru_lbr) {
     AOC<float, 2> diff_bias(diff_bias_, n_gates + 1, dic);
-    AOC<float, 3> ws_gates_r(ws_cell_, batch, n_gates, dic);
+    AOC<float, 2> ws_gates_r(ws_cell_, batch, conf_.GC());
 
-    (this->*elemwise_func)(dic, wic, batch, n_states, n_gates, ws_gates_,
+    (this->*elemwise_func)(dic, wic, batch, n_states, iter_stride, n_gates, ws_gates_,
             states_t_l_, states_t_lm1_, states_tm1_l_, diff_states_t_l_,
             diff_states_t_lp1_, diff_states_tp1_l_, bias_, ws_grid_, ws_cell_);
 
-    //  dx = dG * Wx^t
-    (this->*gemm_input_func)(slc, batch, n_gates * dic, slc, n_gates * dic,
-            batch, n_gates * dic, wic, batch, w_input_[0], ws_gates_,
-            diff_states_t_l_ + n_states * (batch * wic), false, 0.0f);
+    if (!merge_gemm_layer) {
+        // dx = dG * Wx^t
+        (this->*gemm_input_func)(slc, batch, n_gates * dic, conf_.WL_GLD(),
+                n_gates * dic, batch, conf_.GC(), wic, batch, w_input_[0],
+                ws_gates_,
+                diff_states_t_l_ + n_states * iter_stride * (batch * wic),
+                false, 0.0f);
+        // dWx += dG^t * x
+        gemm(n_gates * dic, slc, batch, conf_.GC(), batch, wic, batch,
+                conf_.DWL_GLD(), slc, ws_gates_, states_t_lm1_, diff_w_input_,
+                true, 1.0f);
+    }
     // dh +=  dGr * Wh^t
-    (this->*gemm_state_func)(sic, batch, n_gates * dic, sic, n_gates * dic,
-            batch, n_gates * dic, wic, batch, w_state_[0], ws_cell_,
+    (this->*gemm_state_func)(sic, batch, n_gates * dic, conf_.WI_GLD(),
+            n_gates * dic, batch, conf_.GC(), wic, batch, w_state_[0], ws_cell_,
             diff_states_t_l_, false, 1.0f);
-    // dWx +=  dG^t * x
-    gemm(n_gates * dic, slc, batch, n_gates * dic, batch, wic, batch,
-            n_gates * dic, slc, ws_gates_, states_t_lm1_, diff_w_input_, true,
-            1.0f);
+
     // dWh += dGr^t * h
-    gemm(n_gates * dic, sic, batch, n_gates * dic, batch, wic, batch,
-            n_gates * dic, sic, ws_cell_, states_tm1_l_, diff_w_state_, true,
+    gemm(n_gates * dic, sic, batch, conf_.GC(), batch, wic, batch,
+            conf_.DWL_GLD(), sic, ws_cell_, states_tm1_l_, diff_w_state_, true,
             1.0f);
 
     // db1-3 += e * dG
     // db4 += e * (r * dG2)
-    gates_reduction(n_gates, dic, batch, ws_gates_, diff_bias_);
+    gates_reduction(n_gates, dic, wic, batch, ws_gates_, diff_bias_);
 
     parallel_nd(dic, [&](int j) {
         for (int i = 0; i < batch; i++) {
-            diff_bias_[3 * dic + j] += ws_gates_r(i, 2, j);
+            diff_bias_[3 * dic + j] += ws_gates_r(i, 2 * dic + j);
         }
     });
 }
 
 template <>
 cell_execution_sig(_ref_rnn_common_t<prop_kind::backward>::cell_execution_gru) {
-    AOC<float, 3> ws_gates(ws_gates_, batch, n_gates, dic);
+    AOC<float, 2> ws_gates(ws_gates_, batch, conf_.GC());
     AOC<const float, 2> states_tm1_l(states_tm1_l_, batch, wic);
-    AOC<float, 3> diff_states_t_l(diff_states_t_l_, n_states + 1, batch, wic);//dht-1 dxt
-    AOC<float, 3> diff_w_state(diff_w_state_, dic, n_gates, sic);
-    AOC<float, 3> diff_states_tp1_l(
-            diff_states_tp1_l_, n_states + 1, batch, wic);
-    AOC<float, 3> diff_states_t_lp1(
-            diff_states_t_lp1_, n_states + 1, batch, wic);
+    AOC<float, 4> diff_states_t_l(diff_states_t_l_, n_states + 1, iter_stride, batch, wic); // dht-1 dxt
+    AOC<float, 2> diff_w_state(diff_w_state_, sic, conf_.GC());
+    AOC<float, 4> diff_states_tp1_l(
+            diff_states_tp1_l_, n_states + 1, iter_stride, batch, wic);
+    AOC<float, 4> diff_states_t_lp1(
+            diff_states_t_lp1_, n_states + 1, iter_stride, batch, wic);
     //use state memory for intermediate computations
-    float *dhG1_ = diff_states_t_l_ + n_states * batch * wic;
+    float *dhG1_ = &(diff_states_t_l(n_states, 0, 0, 0));
     float *hG1_ = dhG1_;
     AOC<float, 2> dhG1(dhG1_, batch, wic);
     AOC<float, 2> hG1(hG1_, batch, wic);
@@ -445,23 +482,23 @@ cell_execution_sig(_ref_rnn_common_t<prop_kind::backward>::cell_execution_gru) {
         PRAGMA_OMP_SIMD()
         for (int j = 0; j < dic; j++) {
             float h = states_tm1_l(i, j);
-            float dHt = diff_states_tp1_l(0, i, j)
-                    + diff_states_t_lp1(n_states, i, j);
-            float dG2 = (1.0f - ws_gates(i, 0, j))
-                    * tanh_bwd(dHt, ws_gates(i, 2, j));
-            float dG0 = (h - ws_gates(i, 2, j))
-                    * logistic_bwd(dHt, ws_gates(i, 0, j));
-
-            diff_states_t_l(0, i, j) = dHt * ws_gates(i, 0, j);
-            ws_gates(i, 0, j) = dG0;
-            ws_gates(i, 2, j) = dG2;
+            float dHt = diff_states_tp1_l(0, 0, i, j)
+                    + diff_states_t_lp1(n_states, 0, i, j);
+            float dG2 = (1.0f - ws_gates(i, 0 * dic + j)) * dHt
+                    * one_m_square(ws_gates(i, 2 * dic + j));
+            float dG0 = (h - ws_gates(i, 2 * dic + j)) * dHt
+                    * x_m_square(ws_gates(i, 0 * dic + j));
+
+            diff_states_t_l(0, 0, i, j) = dHt * ws_gates(i, 0 * dic + j);
+            ws_gates(i, 0 * dic + j) = dG0;
+            ws_gates(i, 2 * dic + j) = dG2;
         }
     });
 
     //2. calculate intermediate d(hG1)
     //d(hG1) = dG2 * W2h^t
-    (this->*gemm_state_func)(sic, batch, dic, sic, n_gates * dic,
-            batch, n_gates * dic, wic, batch, w_state_[1], &(ws_gates(0, 2, 0)),
+    (this->*gemm_state_func)(sic, batch, dic, conf_.WI_GLD(), n_gates * dic,
+            batch, conf_.GC(), wic, batch, w_state_[1], &(ws_gates(0, 2 * dic)),
             dhG1_, false, 0.0f);
 
     //3. calculate dG1^ and part of dht-1
@@ -472,82 +509,88 @@ cell_execution_sig(_ref_rnn_common_t<prop_kind::backward>::cell_execution_gru) {
         PRAGMA_OMP_SIMD()
         for (int j = 0; j < dic; j++) {
             float h = states_tm1_l(i, j);
-            float G1 =  ws_gates(i, 1, j);
-            diff_states_t_l(0, i, j) += dhG1(i, j) * G1;
-            ws_gates(i, 1, j) = dhG1(i, j) * logistic_bwd(h, G1);
+            float G1 =  ws_gates(i, 1 * dic + j);
+            diff_states_t_l(0, 0, i, j) += dhG1(i, j) * G1;
+            ws_gates(i, 1 * dic + j) = dhG1(i, j) * h * x_m_square(G1);
             hG1(i, j) = G1 * h;
         }
     });
 
     //4. calculate diff weights
-    //dWx += [dG0 dG1 dG2] * [x]
     //dWh1 += dG1 * h, dWh2 += dG2 * h, dWh3 += dG3 * (G1(*)h)
-    gemm(n_gates * dic, slc, batch, n_gates * dic, batch, wic, batch,
-            n_gates * dic, slc, ws_gates_, states_t_lm1_, diff_w_input_, true,
+    gemm((n_gates - 1) * dic, sic, batch, conf_.GC(), batch, wic, batch,
+            conf_.DWI_GLD(), sic, ws_gates_, states_tm1_l_, diff_w_state_, true,
             1.0f);
-    gemm((n_gates - 1) * dic, sic, batch, n_gates * dic, batch, wic, batch,
-            n_gates * dic, sic, ws_gates_, states_tm1_l_, diff_w_state_, true,
+    gemm(dic, sic, batch, conf_.GC(), batch, wic, batch, conf_.DWI_GLD(), sic,
+            &(ws_gates(0, 2 * dic)), hG1_, &(diff_w_state(0, 2 * dic)), true,
             1.0f);
-    gemm(dic, sic, batch, n_gates * dic, batch, wic, batch, n_gates * dic, sic,
-            &(ws_gates(0, 2, 0)), hG1_, &(diff_w_state(0, 2, 0)), true, 1.0f);
 
     //5. calculate diff states
-    //dx = dG2 * W2x + dG1 * W1x + dG0 * W0x
     //dht-1 += dG1 * W1h + dG0 * W0h
-    (this->*gemm_state_func)(sic, batch, (n_gates - 1) * dic, sic, n_gates * dic,
-            batch, n_gates * dic, wic, batch, w_state_[0], ws_gates_,
-            diff_states_t_l_, false, 1.0f);
-    (this->*gemm_input_func)(slc, batch, n_gates * dic, slc, n_gates * dic,
-            batch, n_gates * dic, wic, batch, w_input_[0], ws_gates_,
-            diff_states_t_l_ + n_states * batch * wic, false, 0.0f);
+    (this->*gemm_state_func)(sic, batch, (n_gates - 1) * dic, conf_.WI_GLD(),
+            n_gates * dic, batch, conf_.GC(), wic, batch, w_state_[0],
+            ws_gates_, diff_states_t_l_, false, 1.0f);
+
+    if (!merge_gemm_layer) {
+        //dWx += [dG0 dG1 dG2] * [x]
+        gemm(n_gates * dic, slc, batch, conf_.GC(), batch, wic, batch,
+                conf_.DWL_GLD(), slc, ws_gates_, states_t_lm1_, diff_w_input_,
+                true, 1.0f);
+        //dx = dG2 * W2x + dG1 * W1x + dG0 * W0x
+        (this->*gemm_input_func)(slc, batch, n_gates * dic, conf_.WL_GLD(),
+                n_gates * dic, batch, conf_.GC(), wic, batch, w_input_[0],
+                ws_gates_, &(diff_states_t_l(n_states, 0, 0, 0)), false, 0.0f);
+    }
 
     //6. calculate diff bias
-    gates_reduction(n_gates, dic, batch, ws_gates_, diff_bias_);
+    gates_reduction(n_gates, dic, wic, batch, ws_gates_, diff_bias_);
 }
 
 //*************** Grid computations strategy: linear ***************//
 template <prop_kind_t aprop>
 grid_execution_sig(_ref_rnn_common_t<aprop>::linear_execution) {
-    AOC<float, 4> ws_states(ws_states_, n_layer + 1, n_direction, n_iter + 1,
-            n_states * batch * wic);
-    AOC<float, 4> ws_diff_states(ws_diff_states_, n_layer + 1, n_direction,
-            n_iter + 1, (n_states + 1) * batch * wic);
+    AOC<float, 5> ws_states(ws_states_, n_layer + 1, n_direction, n_states, n_iter + 1,
+            batch * wic);
+    AOC<float, 5> ws_diff_states(ws_diff_states_, n_layer + 1, n_direction, (n_states + 1),
+            n_iter + 1, batch * wic);
     AOC<float, 4> ws_gates(
-            ws_gates_, n_layer, n_direction, n_iter, n_gates * batch * dic);
+            ws_gates_, n_layer, n_direction, n_iter, batch * conf_.GC());
     AOC<float *, 3> weights_input(weights_input_, n_layer, n_direction,
             n_parts_wei_i);
     AOC<float *, 3> weights_states(weights_states_, n_layer, n_direction,
             n_parts_wei_st);
     AOC<const float, 3> bias(bias_, n_layer, n_direction, n_bias * dic);
     AOC<float, 3> diff_weights_layer(
-            diff_weights_layer_, n_layer, n_direction, slc * n_gates * dic);
+            diff_weights_layer_, n_layer, n_direction, slc * conf_.DWL_GLD());
     AOC<float, 3> diff_weights_iter(
-            diff_weights_iter_, n_layer, n_direction, sic * n_gates * dic);
+            diff_weights_iter_, n_layer, n_direction, sic * conf_.DWI_GLD());
     AOC<float, 3> diff_bias(diff_bias_, n_layer, n_direction, n_bias * dic);
     AOC<float, 4> ws_grid(ws_grid_, n_layer, n_direction, n_iter, ws_per_cell);
 
     // We run the grid of computation
     for (int dir = 0; dir < n_direction; dir++) {
         for (int j = 0; j < n_layer; j++) {
+            int lay = (aprop == prop_kind::forward) ? j : n_layer - j - 1;
+            if ((aprop == prop_kind::forward) && merge_gemm_layer) {
+                /* Assumption: merge_gemm_layer happens only on forward */
+                (this->*gemm_input_func)(n_gates * dic, batch * n_iter, slc,
+                        conf_.WL_GLD(), slc, batch * n_iter, wic, conf_.GC(),
+                        batch * n_iter, weights_input(lay, dir, 0),
+                        &(ws_states(lay, dir, 0, 1, 0)),
+                        &(ws_gates(lay, dir, 0, 0)), false, 0.0f);
+            }
             for (int i = 0; i < n_iter; i++) {
-                int lay, iter;
-                if (aprop == prop_kind::forward) {
-                    lay = j;
-                    iter = i;
-                } else { // backward
-                    lay = n_layer - j - 1;
-                    iter = n_iter - i - 1;
-                }
-                (this->*cell_func)(dic, slc, sic, wic, batch, n_gates, n_states,
-                        &(ws_states(lay + 1, dir, iter + 1, 0)),
-                        &(ws_diff_states(lay, dir, iter, 0)),
+                int iter = (aprop == prop_kind::forward) ? i : n_iter - i - 1;
+                (this->*cell_func)(dic, slc, sic, wic, batch, n_gates, n_states, n_iter + 1,
+                        &(ws_states(lay + 1, dir, 0, iter + 1, 0)),
+                        &(ws_diff_states(lay, dir, 0, iter, 0)),
                         &(weights_input(lay, dir, 0)),
                         &(weights_states(lay, dir, 0)),
                         &(bias(lay, dir, 0)),
-                        &(ws_states(lay, dir, iter + 1, 0)),
-                        &(ws_states(lay + 1, dir, iter, 0)),
-                        &(ws_diff_states(lay + 1, dir, iter, 0)),
-                        &(ws_diff_states(lay, dir, iter + 1, 0)),
+                        &(ws_states(lay, dir, 0, iter + 1, 0)),
+                        &(ws_states(lay + 1, dir, 0, iter, 0)),
+                        &(ws_diff_states(lay + 1, dir, 0, iter, 0)),
+                        &(ws_diff_states(lay, dir, 0, iter + 1, 0)),
                         &(diff_weights_layer(lay, dir, 0)),
                         &(diff_weights_iter(lay, dir, 0)),
                         &(diff_bias(lay, dir, 0)),
@@ -555,172 +598,30 @@ grid_execution_sig(_ref_rnn_common_t<aprop>::linear_execution) {
                         &(ws_grid(lay, dir, iter, 0)),
                         ws_cell_);
             }
+            if ((aprop == prop_kind::backward) && merge_gemm_layer) {
+                (this->*gemm_input_func)(slc, batch * n_iter, n_gates * dic,
+                        conf_.WL_GLD(), n_gates * dic, batch * n_iter,
+                        conf_.GC(), wic, batch * n_iter,
+                        weights_input(lay, dir, 0), &(ws_gates(lay, dir, 0, 0)),
+                        &(ws_diff_states(lay, dir, n_states, 0, 0)), false,
+                        0.0f);
+                gemm(n_gates * dic, slc, batch * n_iter, conf_.GC(),
+                        batch * n_iter, wic, batch * n_iter, conf_.DWL_GLD(),
+                        slc, &(ws_gates(lay, dir, 0, 0)),
+                        &(ws_states(lay, dir, 0, 1, 0)),
+                        &(diff_weights_layer(lay, dir, 0)), true, 1.0f);
+            }
+            if ((aprop == prop_kind::backward) && merge_gemm_iter) {
+                gemm(n_gates * dic, sic, batch * n_iter, conf_.GC(),
+                        batch * n_iter, wic, batch * n_iter, conf_.DWI_GLD(),
+                        sic, &(ws_gates(lay, dir, 0, 0)),
+                        &(ws_states(lay + 1, dir, 0, 0, 0)),
+                        &(diff_weights_iter(lay, dir, 0)), true, 1.0f);
+            }
         }
     }
 }
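
The merge_gemm_layer branches above fold the per-iteration layer GEMMs into a single call over batch * n_iter columns, which is only valid because the reordered workspace makes the iteration dimension contiguous. A schematic comparison, assuming a column-major gemm_(M, N, K, A, lda, B, ldb, C, ldc) helper and the pointers w_layer, x, gates (all hypothetical names, not the wrapper actually used above):

    // Unmerged: one GEMM per time step over the layer input.
    for (int it = 0; it < n_iter; it++)
        gemm_(n_gates * dic, batch, slc,
                w_layer, ld_w, x + it * batch * wic, wic,
                gates + it * batch * GC, GC);
    // Merged: a single GEMM over all time steps at once; fewer, larger
    // matrix products generally use the cores and caches better.
    gemm_(n_gates * dic, batch * n_iter, slc,
            w_layer, ld_w, x, wic, gates, GC);
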
 
-#if 0
-//************* Grid computations strategy: wavefront **************//
-
-/*
-  // To cover n_iter > n_layer and n_iter < n_layer
-  min_dim = min(n_layer, n_iter)
-  max_dim = max(n_layer, n_iter)
-  and we assume that i refers to the max_dim dimension and j to the min_dim dimension
-
-  We compute the the wavefront using 3 loop nests, each nest having 2 loops:
-  - one for the head of the form loop on n_layer, and loop on n_elem in wave
-      for (int i = 0; i < min_dim - 1; i++)
-          for(int j = 0; j < i+1; j++)
-  - one for the body:
-      for (int i = 0; i < max_dim - min_dim + 1; i++)
-          for(int j = 0; j < min_dim; j++)
-  - one for the tail
-      for (int i = min_dim; i > 0 ; i--)
-          for(int j = 0; j < i; j++)
-  Here, we define classes for each of the wavefront direction to compute
-  the coordinates of the recurrent cells when running a wavefront execution
- */
-
-typedef enum wavefront_loop_index_ {
-    head,
-    body,
-    tail
-} wavefront_loop_index;
-
-struct wavefront_indexer {
-    wavefront_indexer(int dim)
-        : dim_(dim){};
-    virtual int get(wavefront_loop_index idx,int i, int j) const;
-protected:
-    int dim_;
-};
-
-// bottom to top or left to right maxdim
-struct wi_b2t_l2r_maxdim : wavefront_indexer {
-    int get(wavefront_loop_index idx, int i, int j) const override {
-        switch(idx){
-        case head: return i - j;
-        case body: return i - j;
-        case tail: return dim_ - 1 - j;
-        default: return -1;
-        }
-    }
-};
-
-// bottom to top or left to right mindim
-struct wi_b2t_l2r_mindim : wavefront_indexer {
-    int get(wavefront_loop_index idx, int i , int j) const override {
-        switch(idx){
-        case head: return j;
-        case body: return j;
-        case tail: return dim_ - i + j;
-        default: return -1;
-        }
-    }
-};
-
-template<typename original_indexer>
-struct reversed_indexer : wavefront_indexer {
-    reversed_indexer(int dim) : wavefront_indexer(dim),
-                                wd(original_indexer(dim)){}
-
-    int get(wavefront_loop_index idx, int i, int j) const override {
-        switch(idx){
-        case head: return dim_ - 1 - wd.head(i,j);
-        case body: return dim_ - 1 - wd.body(i,j);
-        case tail: return dim_ - 1 - wd.tail(i,j);
-        default: return -1;
-        }
-    }
-
-private:
-    original_indexer wd;
-};
-
-// top to bottom or right left maxdim and mindim
-using wi_t2b_r2l_maxdim = reversed_indexer<wi_b2t_l2r_maxdim>;
-using wi_t2b_r2l_mindim = reversed_indexer<wi_b2t_l2r_mindim>;
-
-template<prop_kind_t aprop>
-grid_execution_sig(_ref_rnn_common_t<aprop>::wavefront_execution){// (int dic, int slc,
-                         // int sic, int batch,
-                         // int n_layer, int n_direction, int n_iter,
-                         // int n_gates, int n_states,
-                         // const float **weights_input_, //[n_gates*dic][slc],
-                         // const float **weights_states_, //[n_gates*dic][dic],
-                         // const float *bias_, //[n_gates][dic],
-                         // float *ws_, //[n_layer+1][n_direction][n_iter+1][n_states][batch][dic],
-                         // float *gates_){ //[n_layer][n_direction][n_iter][batch][n_gates][dic]) {
-
-    AOC<float, 4> ws(ws_, n_layer + 1, n_direction, n_iter + 1, n_states * batch * wic);
-    AOC<float, 4> gates(gates_, n_layer, n_direction, n_iter, n_gates * batch * dic);
-    AOC<float*, 2> weights_input(weights_input_, n_layer, n_direction);
-    AOC<float*, 2> weights_states(weights_states_, n_layer, n_direction);
-    AOC<const float, 2> bias(bias_, n_layer, n_gates * dic);
-    // Setup the indexers: we have to check directions and if max_dim or min_dim
-    bool is_niter_maxdim = n_iter >= n_layer;
-    wavefront_indexer wi_maxdim = (is_niter_maxdim)
-        ? (((exec_dir == b2t_l2r) || (exec_dir == t2b_l2r)) //niter is maxdim, we look for l2r
-           ? (wavefront_indexer) wi_b2t_l2r_maxdim(n_iter)
-           : (wavefront_indexer) wi_t2b_r2l_maxdim(n_iter))
-        : (((exec_dir == b2t_l2r) || (exec_dir == b2t_r2l)) //nlayer is maxdim, we look for b2t
-           ? (wavefront_indexer) wi_b2t_l2r_maxdim(n_layer)
-           : (wavefront_indexer) wi_t2b_r2l_maxdim(n_layer));
-
-    wavefront_indexer wi_mindim = (!is_niter_maxdim)
-        ? (((exec_dir == b2t_l2r) || (exec_dir == t2b_l2r)) //niter is mindim, we look for l2r
-           ? (wavefront_indexer) wi_b2t_l2r_mindim(n_iter)
-           : (wavefront_indexer) wi_t2b_r2l_mindim(n_iter))
-        : (((exec_dir == b2t_l2r) || (exec_dir == b2t_r2l)) //nlayer is mindim, we look for b2t
-           ? (wavefront_indexer) wi_b2t_l2r_mindim(n_layer)
-           : (wavefront_indexer) wi_t2b_r2l_mindim(n_layer));
-
-    // auto get_offset = [=](wavefront_loop_index idx, int i, int j){
-    //     int dim_min = wi_mindim.get(idx, i,j);
-    //     int dim_max = wi_maxdim.get(idx, i,j);
-    //     int offset = (is_niter_maxdim)
-    //     ? dim_min*n_iter + dim_max
-    //     : dim_max*n_iter + dim_min;
-    // };
-
-#define get_lay_n_iter(idx, i, j)               \
-    do {                                        \
-        int dim_min = wi_mindim.get(idx, i, j); \
-        int dim_max = wi_maxdim.get(idx, i, j); \
-        if (is_niter_maxdim) {                  \
-            lay = dim_min;                      \
-            iter = dim_max;                     \
-        } else {                                \
-            lay = dim_max;                      \
-            iter = dim_min;                     \
-        }                                       \
-    } while (0)
-
-    int min_dim = is_niter_maxdim ? n_layer : n_iter;
-    int max_dim = is_niter_maxdim ? n_iter :n_layer;
-    int lay, iter;
-    for (int i = 0; i < min_dim - 1; i++)
-        for(int j = 0; j < i+1; j++){
-            get_lay_n_iter(head,i,j);
-            cell_execution(dic, slc, sic, batch,
-                 n_gates, n_states,
-                 &(ws(lay, iter, 0)), weights_input(lay - 1, 0),
-                 weights_states(lay - 1, 0), &(bias(lay-1, 0)),
-                 &(ws(lay - 1, iter, 0)), &(ws(lay, iter - 1, 0)), &(gates(lay-1, iter-1, 0)));
-        }
-    for (int i = min_dim - 1; i < max_dim; i++)
-        for(int j = 0; j < min_dim; j++){
-            get_lay_n_iter(body,i,j);
-        }
-    for (int i = min_dim - 1; i > 0 ; i--)
-        for(int j = 0; j < i; j++){
-            get_lay_n_iter(tail,i,j);
-        }
-
-#undef get_lay_n_iter
-}
-#endif
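
For reference, the head/body/tail decomposition sketched in the removed comment visits every (layer, iteration) cell exactly once, one anti-diagonal at a time. A standalone sketch with the indexer classes inlined (visit order only; the directional variants just mirror the indices):

    #include <algorithm>
    #include <cstdio>

    // Enumerate all min_dim * max_dim cells in wavefront order.
    void wavefront_order(int n_layer, int n_iter) {
        const int min_dim = std::min(n_layer, n_iter);
        const int max_dim = std::max(n_layer, n_iter);
        auto visit = [](int max_idx, int min_idx) {
            std::printf("(%d,%d) ", max_idx, min_idx);
        };
        for (int i = 0; i < min_dim - 1; i++)       // head: the wave grows
            for (int j = 0; j < i + 1; j++)
                visit(i - j, j);
        for (int i = min_dim - 1; i < max_dim; i++) // body: full-width wave
            for (int j = 0; j < min_dim; j++)
                visit(i - j, j);
        for (int i = min_dim - 1; i > 0; i--)       // tail: the wave shrinks
            for (int j = 0; j < i; j++)
                visit(max_dim - 1 - j, min_dim - i + j);
    }
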
 //********* GRID computations strategy: utility functions **********//
 
 template <>
@@ -730,7 +631,7 @@ void _ref_rnn_common_t<prop_kind::forward>::copy_init_layer(bool lr, bool rl,
         float *ws_diff_states_, const float *xt_,
         const float *diff_dst_layer_) {
     AOC<float, 5> ws_states(
-            ws_states_, n_direction, n_iter + 1, n_states, batch, wic);
+            ws_states_, n_direction, n_states, n_iter + 1, batch, wic);
     auto xt_d = memory_desc_wrapper(conf_.src_pd(0));
 
     parallel_nd(n_iter, [&](int it) {
@@ -738,11 +639,11 @@ void _ref_rnn_common_t<prop_kind::forward>::copy_init_layer(bool lr, bool rl,
         if (lr)
             for (int b = 0; b < batch; b++)
                 for (int c = 0; c < slc; c++)
-                    ws_states(0, it + 1, 0, b, c) = *(xxt + b * slc + c);
+                    ws_states(0, 0, it + 1, b, c) = *(xxt + b * slc + c);
         if (rl)
             for (int b = 0; b < batch; b++)
                 for (int c = 0; c < slc; c++)
-                    ws_states(n_direction - 1, n_iter - it, 0, b, c)
+                    ws_states(n_direction - 1, 0, n_iter - it, b, c)
                             = *(xxt + b * slc + c);
     });
 }
@@ -754,7 +655,7 @@ void _ref_rnn_common_t<prop_kind::backward>::copy_init_layer(bool lr, bool rl,
         float *ws_diff_states_, const float *xt_,
         const float *diff_dst_layer_) {
     AOC<float, 6> ws_diff_states(ws_diff_states_, n_layer + 1, n_direction,
-            n_iter + 1, (n_states + 1), batch, wic);
+            (n_states + 1), n_iter + 1, batch, wic);
     auto diff_dst_layer_d = memory_desc_wrapper(conf_.diff_dst_pd(0));
 
     switch (conf_.direction()) {
@@ -762,10 +663,10 @@ void _ref_rnn_common_t<prop_kind::backward>::copy_init_layer(bool lr, bool rl,
         parallel_nd(n_iter, batch, [&](int it, int b) {
             auto diff_dst_layer_x
             = diff_dst_layer_ + diff_dst_layer_d.blk_off(it, b);
-            for (int s = 0; s < dlc; s++) {
-                ws_diff_states(n_layer, 0, it, n_states, b, s)
+            for (int s = 0; s < dic; s++) {
+                ws_diff_states(n_layer, 0, n_states, it, b, s)
                     = diff_dst_layer_x[s];
-                ws_diff_states(n_layer, 1, it, n_states, b, s)
+                ws_diff_states(n_layer, 1, n_states, n_iter - it - 1, b, s)
                     = diff_dst_layer_x[dic + s];
             }
         });
@@ -775,23 +676,36 @@ void _ref_rnn_common_t<prop_kind::backward>::copy_init_layer(bool lr, bool rl,
             auto diff_dst_layer_x
             = diff_dst_layer_ + diff_dst_layer_d.blk_off(it, b);
             for (int s = 0; s < dic; s++) {
-                ws_diff_states(n_layer, 0, it, n_states, b, s)
+                ws_diff_states(n_layer, 0, n_states, it, b, s)
                     = diff_dst_layer_x[s];
-                ws_diff_states(n_layer, 1, it, n_states, b, s)
+                ws_diff_states(n_layer, 1, n_states, n_iter - it - 1, b, s)
                     = diff_dst_layer_x[s];
             }
         });
         break;
-    default: // assumes default is always unidirectional
+    case mkldnn_unidirectional_left2right:
         parallel_nd(n_iter, batch, [&](int it, int b) {
             auto diff_dst_layer_x
                     = diff_dst_layer_ + diff_dst_layer_d.blk_off(it, b);
             for (int s = 0; s < dic; s++) {
-                ws_diff_states(n_layer, 0, it, n_states, b, s)
+                ws_diff_states(n_layer, 0, n_states, it, b, s)
                         = diff_dst_layer_x[s];
             }
         });
         break;
+    case mkldnn_unidirectional_right2left:
+        parallel_nd(n_iter, batch, [&](int it, int b) {
+            auto diff_dst_layer_x
+                    = diff_dst_layer_ + diff_dst_layer_d.blk_off(n_iter - it - 1, b);
+            for (int s = 0; s < dic; s++) {
+                ws_diff_states(n_layer, 0, n_states, it, b, s)
+                        = diff_dst_layer_x[s];
+            }
+        });
+        break;
+    default:
+        assert(!"Unsupported direction");
+        break;
     }
 }
 
@@ -800,14 +714,14 @@ void _ref_rnn_common_t<prop_kind::forward>::copy_init_iter(int n_layer,
         int n_direction, int n_states, int batch, int sic, int dic, int wic,
         int n_iter, float *ws_states_, float *ws_diff_states_,
         const float *firstit_states_, const float *diff_dst_iter_) {
-    AOC<float, 6> ws_states(ws_states_, n_layer + 1, n_direction, n_iter + 1,
-            n_states, batch, wic);
+    AOC<float, 6> ws_states(ws_states_, n_layer + 1, n_direction, n_states,
+            n_iter + 1, batch, wic);
     auto firstit_states_d = memory_desc_wrapper(conf_.src_pd(1));
     if (firstit_states_) {
         parallel_nd(n_layer, n_direction, [&](int lay, int dir) {
             for (int state = 0; state < n_states; state++)
                 for (int b = 0; b < batch; ++b) {
-                    array_copy(&(ws_states(lay + 1, dir, 0, state, b, 0)),
+                    array_copy(&(ws_states(lay + 1, dir, state, 0, b, 0)),
                         firstit_states_ + firstit_states_d.blk_off(
                         lay, dir, state, b), sic);
                 }
@@ -817,7 +731,7 @@ void _ref_rnn_common_t<prop_kind::forward>::copy_init_iter(int n_layer,
             for (int state = 0; state < n_states; state++)
                 for (int i = 0; i < batch; i++)
                     for (int j = 0; j < sic; j++)
-                        ws_states(lay + 1, dir, 0, state, i, j) = 0.0f;
+                        ws_states(lay + 1, dir, state, 0, i, j) = 0.0f;
         });
     }
 }
@@ -828,12 +742,12 @@ void _ref_rnn_common_t<prop_kind::backward>::copy_init_iter(int n_layer,
         int n_iter, float *ws_states_, float *ws_diff_states_,
         const float *firstit_states_, const float *diff_dst_iter_) {
     AOC<float, 6> ws_diff_states(ws_diff_states_, n_layer + 1, n_direction,
-            n_iter + 1, n_states + 1, batch, wic);
+            n_states + 1, n_iter + 1, batch, wic);
     auto diff_dst_iter_d = memory_desc_wrapper(conf_.diff_dst_pd(1));
     if (diff_dst_iter_) {
         parallel_nd(n_layer, n_direction, n_states, batch,
             [&](int lay, int dir, int state, int b) {
-            array_copy(&(ws_diff_states(lay, dir, n_iter, state, b, 0)),
+            array_copy(&(ws_diff_states(lay, dir, state, n_iter, b, 0)),
                 diff_dst_iter_ + diff_dst_iter_d.blk_off(lay, dir, state, b),
                 dic);
         });
@@ -841,7 +755,7 @@ void _ref_rnn_common_t<prop_kind::backward>::copy_init_iter(int n_layer,
         parallel_nd(n_layer, n_direction, n_states, batch,
             [&](int lay, int dir, int state, int i) {
             for (int j = 0; j < dic; j++)
-                ws_diff_states(lay, dir, n_iter, state, i, j) = 0.0f;
+                ws_diff_states(lay, dir, state, n_iter, i, j) = 0.0f;
         });
     }
 }
@@ -855,14 +769,14 @@ void _ref_rnn_common_t<prop_kind::forward>::copy_res_layer(bool lr, bool rl,
         const float *ws_diff_states_) {
     auto dst_layer_d = memory_desc_wrapper(conf_.dst_pd(0));
     AOC<const float, 6> ws_states(ws_states_, n_layer + 1, n_direction,
-            n_iter + 1, n_states, batch, wic);
+            n_states, n_iter + 1, batch, wic);
 
     parallel_nd(n_iter, batch, [&](int it, int b) {
         int dir = 0;
         if (lr) {
             for (int s = 0; s < dic; s++)
                 dst_layer_[dst_layer_d.blk_off(it, b, dir * dic + s)]
-                        = ws_states(n_layer, dir, it + 1, 0, b, s);
+                        = ws_states(n_layer, dir, 0, it + 1, b, s);
             dir = 1;
         }
         if (rl) {
@@ -870,11 +784,11 @@ void _ref_rnn_common_t<prop_kind::forward>::copy_res_layer(bool lr, bool rl,
                 switch (direction) {
                 case mkldnn_bidirectional_sum:
                     dst_layer_[dst_layer_d.blk_off(it, b, s)] += ws_states(
-                            n_layer, dir, n_iter - it, 0, b, s);
+                            n_layer, dir, 0, n_iter - it, b, s);
                     break;
                 default:
                     dst_layer_[dst_layer_d.blk_off(it, b, dir * dic + s)]
-                            = ws_states(n_layer, dir, n_iter - it, 0, b, s);
+                            = ws_states(n_layer, dir, 0, n_iter - it, b, s);
                 }
         }
     });
@@ -889,7 +803,7 @@ void _ref_rnn_common_t<prop_kind::backward>::copy_res_layer(bool lr, bool rl,
         const float *ws_diff_states_) {
     auto diff_src_layer_d = memory_desc_wrapper(conf_.diff_src_pd(0));
     AOC<const float, 6> ws_diff_states(ws_diff_states_, n_layer + 1,
-            n_direction, n_iter + 1, n_states + 1, batch, wic);
+            n_direction, n_states + 1, n_iter + 1, batch, wic);
 
     parallel_nd(n_iter, batch, [&](int it, int b) {
         int dir = 0;
@@ -901,10 +815,10 @@ void _ref_rnn_common_t<prop_kind::backward>::copy_res_layer(bool lr, bool rl,
                                       n_iter - 1 - it :
                                       it,
                               b, dir * slc + s);
-            float res = ws_diff_states(0, 0, it, n_states, b, s);
+            float res = ws_diff_states(0, 0, n_states, it, b, s);
             if (n_direction - 1)
                 res += ws_diff_states(
-                        0, 1, n_iter - 1 - it, n_states, b, s);
+                        0, 1, n_states, n_iter - 1 - it, b, s);
             dst_addr[0] = res;
         }
     });
@@ -917,13 +831,13 @@ void _ref_rnn_common_t<prop_kind::forward>::copy_res_iter(int n_layer,
         const float *ws_states_, const float *ws_diff_states_) {
     auto dst_iter_d = memory_desc_wrapper(conf_.dst_pd(1));
     AOC<const float, 6> ws_states(ws_states_, n_layer + 1, n_direction,
-            n_iter + 1, n_states, batch, wic);
+            n_states, n_iter + 1, batch, wic);
     if (dst_iter_) {
         parallel_nd(n_layer, n_direction, n_states, batch,
             [&](int lay, int dir, int state, int b) {
             for (int s = 0; s < dic; s++) {
                 dst_iter_[dst_iter_d.blk_off(lay, dir, state, b, s)]
-                        = ws_states(lay + 1, dir, n_iter, state, b, s);
+                        = ws_states(lay + 1, dir, state, n_iter, b, s);
             }
         });
     }
@@ -936,14 +850,14 @@ void _ref_rnn_common_t<prop_kind::backward>::copy_res_iter(int n_layer,
         const float *ws_states_, const float *ws_diff_states_) {
     auto diff_src_iter_d = memory_desc_wrapper(conf_.diff_src_pd(1));
     AOC<const float, 6> ws_diff_states(ws_diff_states_, n_layer + 1,
-            n_direction, n_iter + 1, n_states + 1, batch, wic);
+            n_direction, n_states + 1, n_iter + 1, batch, wic);
     if (diff_src_iter_) {
         parallel_nd(n_layer, n_direction, n_states, batch,
             [&](int lay, int dir, int state, int b) {
             for (int s = 0; s < sic; s++) {
                 diff_src_iter_[diff_src_iter_d.blk_off(
                         lay, dir, state, b, s)]
-                        = ws_diff_states(lay, dir, 0, state, b, s);
+                        = ws_diff_states(lay, dir, state, 0, b, s);
             }
         });
     }
@@ -951,25 +865,25 @@ void _ref_rnn_common_t<prop_kind::backward>::copy_res_iter(int n_layer,
 
 template <prop_kind_t aprop>
 packing_sig(_ref_rnn_common_t<aprop>::pack_weights) {
-#if USE_MKL_PACKED_GEMM
+#if (USE_MKL_PACKED_GEMM)
     AOC<const float, 5> w(
             w_, n_layer, n_direction, IC_size, n_gates, OC_size);
     AOC<float *, 3> weights(weights_, n_layer, n_direction, n_parts);
-    int m = 0, n = 0, k = 0, ldA = 0;
+    int m = 0, n = 0, k = 0;
     auto transA = CblasNoTrans;
     bool is_fwd = aprop == prop_kind::forward;
     if (is_fwd) {
         m = n_gates * OC_size;
         n = batch;
         k = IC_size;
+        // TODO: do a transposition if ldgoi
         transA = CblasNoTrans;
-        ldA = m;
     } else {
         m = IC_size;
         n = batch;
         k = n_gates * OC_size;
-        transA = CblasTrans;
-        ldA = k;
+        // TODO: do a transposition if ldigo
+        transA = CblasNoTrans;
     }
     for (int i = 0; i < n_layer; i++) {
         for (int d = 0; d < n_direction; d++) {
@@ -979,7 +893,7 @@ packing_sig(_ref_rnn_common_t<aprop>::pack_weights) {
                 int g = (p > 0) ? gates_per_part[p - 1] : 0;
                 weights(i, d, p) = cblas_sgemm_alloc(CblasAMatrix, m_p, n, k_p);
                 cblas_sgemm_pack(CblasColMajor, CblasAMatrix, transA, m_p, n,
-                        k_p, 1.0f, &(w(i, d, 0, g, 0)), ldA, weights(i, d, p));
+                        k_p, 1.0f, &(w(i, d, 0, g, 0)), m, weights(i, d, p));
             }
         }
     }
@@ -1001,22 +915,66 @@ packing_sig(_ref_rnn_common_t<aprop>::pack_weights) {
 
 template <prop_kind_t aprop>
 packing_sig(_ref_rnn_common_t<aprop>::no_pack_weights) {
-    AOC<const float, 5> w(
-            w_, n_layer, n_direction, OC_size, n_gates, IC_size);
+    AOC<const float, 3> w(
+            w_, n_layer, n_direction, IC_size * n_gates * OC_size);
     AOC<float *, 3> weights(weights_, n_layer, n_direction, n_parts);
-    for (int i = 0; i < n_layer; i++) {
-        for (int d = 0; d < n_direction; d++) {
-            weights(i, d, 0) = (float *)&(w(i, d, 0, 0, 0));
-            for (int p = 1; p < n_parts; p++) {
-                weights(i, d, p) = (float *)&(w(i, d, 0, gates_per_part[p-1], 0));
+    int m = 0, n = 0, ldA = 0;
+
+    bool is_fwd = aprop == prop_kind::forward;
+    if (is_fwd) {
+        m = n_gates * OC_size;
+        n = IC_size;
+        ldA = conf_.GC();
+    } else {
+        m = IC_size;
+        n = n_gates * OC_size;
+        ldA = conf_.WIC();
+    }
+
+    if (!do_copy) {
+        for (int i = 0; i < n_layer; i++)
+            for (int d = 0; d < n_direction; d++) {
+                weights(i, d, 0) = (float *) &(w(i, d, 0));
+                for (int p = 1; p < n_parts; p++) {
+                    size_t offset = is_fwd
+                        ? gates_per_part[p - 1] * OC_size
+                        : gates_per_part[p - 1] * OC_size * IC_size;
+                    weights(i, d, p) = (float *) &w(i, d, offset);
+                }
             }
-        }
+        return;
     }
+
+    /* We always assume
+       - column major
+       - alpha = 1.0f
+    */
+    auto copy_matrix = [](char trans, int nrows, int ncols,
+            const float *src, const int ld_src, float *dst, const int ld_dst){
+        for (int i = 0; i < ncols; i++)
+            for (int j = 0; j < nrows; j++)
+                dst[i * ld_dst + j] = src[i * ld_src + j];
+    };
+
+    AOC<float, 3> tmp(scratch_mem, n_layer, n_direction, ldA * n);
+    mkldnn::impl::parallel_nd(n_layer, n_direction, [&](int i, int d) {
+            auto src_mat = &(w(i, d, 0));
+            auto dst_mat = &(tmp(i, d, 0));
+            copy_matrix('N', m, n, src_mat, m, dst_mat, ldA);
+            weights(i, d, 0) = &tmp(i, d, 0);
+            for (int p = 1; p < n_parts; p++) {
+                size_t offset = is_fwd
+                    ? gates_per_part[p - 1] * OC_size
+                    : gates_per_part[p - 1] * OC_size * conf_.WIC();
+                weights(i, d, p) = &tmp(i, d, offset);
+            }
+        });
 }
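
The do_copy path exists because the weights' natural leading dimension may differ from the padded leading dimension (ldA) the GEMMs expect; copy_matrix just re-lays each matrix with the wider stride. A tiny worked illustration with made-up sizes:

    // Illustrative only: re-lay a 3x2 column-major matrix (ld_src = 3) into
    // a buffer with padded leading dimension ld_dst = 4; the extra slot per
    // column is padding the subsequent GEMMs stride over.
    float src[6] = {1, 2, 3, 4, 5, 6}; // columns {1,2,3} and {4,5,6}
    float dst[8] = {0};
    for (int col = 0; col < 2; col++)
        for (int row = 0; row < 3; row++)
            dst[col * 4 + row] = src[col * 3 + row];
    // dst is now {1, 2, 3, 0, 4, 5, 6, 0}
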
 
+
 template <prop_kind_t aprop>
 free_packed_sig(_ref_rnn_common_t<aprop>::free_packed_weights) {
-#if USE_MKL_PACKED_GEMM
+#if (USE_MKL_PACKED_GEMM)
     AOC<float *, 3> weights(weights_, n_layer, n_direction, n_parts);
     for (int i = 0; i < n_layer; i++)
         for (int j = 0; j < n_direction; j++)
@@ -1033,10 +991,7 @@ free_packed_sig(_ref_rnn_common_t<aprop>::free_packed_weights) {
 
 template <prop_kind_t aprop>
 free_packed_sig(_ref_rnn_common_t<aprop>::free_no_packed_weights) {
-    UNUSED(n_layer);
-    UNUSED(n_direction);
-    UNUSED(n_parts);
-    UNUSED(weights_);
+    // In this case, only the scratchpad is used, so no free is necessary
 }
 
 //********************* Execution function *********************//
@@ -1055,7 +1010,8 @@ void _ref_rnn_common_t<aprop>::execute_() {
     int sic = conf_.SIC();
     int dic = conf_.DIC();
     int dlc = conf_.DLC();
-    int wic = nstl::max(slc, nstl::max(sic, dic));
+    int wic = conf_.WIC();
+
     bool is_orig_gru = conf_.cell_kind()
         == alg_kind::vanilla_gru;
     int n_parts_wei_st = is_orig_gru ? 2 : 1, n_parts_wei_i = 1;
@@ -1096,25 +1052,21 @@ void _ref_rnn_common_t<aprop>::execute_() {
             nullptr :
             reinterpret_cast<const float *>(this->input_memory(input_idx++));
 
+    // fetching buffers from the workspace
     // if no workspace was provided we use the scratchpad
-    if (use_scratchpad_for_ws_) {
-        ws_gates_ = ((float *)scratchpad_->get());
-        ws_states_ = ((float *)scratchpad_->get()) + ws_states_offset_;
-        ws_diff_states_
-                = ((float *)scratchpad_->get()) + ws_diff_states_offset_;
-        ws_grid_ = ((float *)scratchpad_->get()) + ws_grid_comp_offset_;
-        ws_cell_ = ((float *)scratchpad_->get()) + ws_cell_comp_offset_;
-    } else {
-        float *ws_ptr = is_fwd ?
-                reinterpret_cast<float *>(this->memory(output_idx++)) :
-                const_cast<float *>(reinterpret_cast<const float *>(
-                        this->input_memory(input_idx++)));
-        ws_gates_ = ws_ptr + ws_gates_offset_;
-        ws_states_ = ws_ptr + ws_states_offset_;
-        ws_diff_states_ = ws_ptr + ws_diff_states_offset_;
-        ws_grid_ = ws_ptr + ws_grid_comp_offset_;
-        ws_cell_ = use_scratchpad_ ? ((float *)scratchpad_->get()) : nullptr;
-    }
+    float *scratch_ptr = ((float *)scratchpad_->get());
+    float *ws_ptr = nullptr;
+    if (use_workspace_)
+        ws_ptr = is_fwd ?
+            reinterpret_cast<float *>(this->memory(output_idx++)) :
+            const_cast<float *>(reinterpret_cast<const float *>(
+                    this->input_memory(input_idx++)));
+    float *base_ptr = use_workspace_ ? ws_ptr : scratch_ptr;
+    ws_gates_ = base_ptr + ws_gates_offset_;
+    ws_states_ = base_ptr + ws_states_offset_;
+    ws_diff_states_ = base_ptr + ws_diff_states_offset_;
+    ws_grid_ = base_ptr + ws_grid_comp_offset_;
+    ws_cell_ = base_ptr + ws_cell_comp_offset_;
 
     auto diff_src_layer = is_fwd ?
             nullptr :
@@ -1132,21 +1084,43 @@ void _ref_rnn_common_t<aprop>::execute_() {
             nullptr :
             reinterpret_cast<float *>(this->memory(output_idx++));
 
-    // initialize diff_states to 0
-    if (aprop == prop_kind::backward)
+    // Fetching extra buffers from scratchpad
+    ws_weights_layer_ = scratch_ptr + ws_weights_layer_offset_;
+    ws_weights_iter_ = scratch_ptr + ws_weights_iter_offset_;
+    ws_diff_weights_layer_ = scratch_ptr + ws_diff_weights_layer_offset_;
+    ws_diff_weights_iter_ = scratch_ptr + ws_diff_weights_iter_offset_;
+
+    // initialize diff_states to 0
+    if (aprop == prop_kind::backward) {
         array_set(ws_diff_states_, 0.0f, conf_.ws_diff_states_size());
+        // TODO: add a variable to check if good_ld_copy is necessary
+        if (copy_diff_weights_layer_) {
+            parallel_nd(conf_.ws_diff_weights_layer_size(), [&](size_t i) {
+                ws_diff_weights_layer_[i] = 0.;
+            });
+        } else
+            ws_diff_weights_layer_ = diff_weights_layer;
+        if (copy_diff_weights_iter_) {
+            parallel_nd(conf_.ws_diff_weights_iter_size(), [&](size_t i) {
+                ws_diff_weights_iter_[i] = 0.;
+            });
+        } else
+            ws_diff_weights_iter_ = diff_weights_iter;
+    }
 
     // TODO: implement without copies
     bool is_lr = !one_of(exec_dir, b2t_r2l, t2b_r2l);
     bool is_rl = !one_of(exec_dir, b2t_l2r, t2b_l2r);
-
     // we pack the weights if we are using the packed API
     (this->*weights_state_pack_func)(n_layer, n_direction, n_weights_state,
             n_gates, batch, dic, sic, ptr_wei_state_, n_parts_wei_st,
-            (is_orig_gru ? parts_wei_st_gru : &parts_wei_st), w_state);
+            (is_orig_gru ? parts_wei_st_gru : &parts_wei_st), w_state,
+            ws_weights_iter_, copy_weights_iter_);
     (this->*weights_input_pack_func)(n_layer, n_direction, n_weights_input,
             n_gates, batch, dic, slc, ptr_wei_input_, n_parts_wei_i,
-            &parts_wei_i, w_input);
+            &parts_wei_i, w_input,
+            ws_weights_layer_, copy_weights_layer_);
 
     // we first need to copy the initial states and input into ws
     copy_init_layer(is_lr, is_rl, n_layer, n_direction, n_iter, batch, slc, dic,
@@ -1160,7 +1134,7 @@ void _ref_rnn_common_t<aprop>::execute_() {
             n_iter, n_gates, n_states, n_bias, ptr_wei_input_, n_parts_wei_i,
             ptr_wei_state_, n_parts_wei_st, (float *)bias, ws_states_,
             ws_diff_states_, ws_gates_, ws_cell_, ws_grid_, ws_per_cell,
-            diff_weights_layer, diff_weights_iter, diff_bias);
+            ws_diff_weights_layer_, ws_diff_weights_iter_, diff_bias);
 
     // Finally we copy the results to the result buffers
     copy_res_layer(is_lr, is_rl, n_layer, n_direction, n_iter, batch,
@@ -1169,6 +1143,39 @@ void _ref_rnn_common_t<aprop>::execute_() {
     copy_res_iter(n_layer, n_direction, n_states, batch, sic, dic, wic, n_iter,
             dst_last_iter, diff_src_iter, ws_states_, ws_diff_states_);
 
+    // copy back the diff weights if backward
+    if (aprop == prop_kind::backward) {
+        // TODO: write an impl of matcopy in MKL-DNN
+        // TODO: support ldgoi using the trans parameters
+        AOC<float, 3> diff_weights_layer_aoc(diff_weights_layer, n_layer,
+                n_direction, slc * n_gates * dic);
+        AOC<float, 3> diff_weights_iter_aoc(diff_weights_iter, n_layer,
+                n_direction, sic * n_gates * dic);
+        AOC<float, 3> ws_diff_weights_layer_aoc(ws_diff_weights_layer_,
+                n_layer, n_direction, slc * conf_.GC());
+        AOC<float, 3> ws_diff_weights_iter_aoc(ws_diff_weights_iter_,
+                n_layer, n_direction, sic * conf_.GC());
+
+        /*
+           - assumes column-major, non-transposed matrices
+           - computes B = A + B
+        */
+        auto inplace_matadd = [=](const int nrows, const int ncols,
+                const float *A, const int ldA, float *B, const int ldB) {
+            for (int i = 0; i < ncols; i++)
+                for (int j = 0; j < nrows; j++)
+                    B[i * ldB + j] += A[i * ldA + j];
+        };
+        mkldnn::impl::parallel_nd(n_layer, n_direction, [&](int i, int d) {
+            auto wei_lay = &(diff_weights_layer_aoc(i, d, 0));
+            auto wei_it = &(diff_weights_iter_aoc(i, d, 0));
+            auto ws_wei_lay = &(ws_diff_weights_layer_aoc(i, d, 0));
+            auto ws_wei_it = &(ws_diff_weights_iter_aoc(i, d, 0));
+            if (copy_diff_weights_layer_)
+                inplace_matadd(n_gates * dic, slc, ws_wei_lay, conf_.GC(),
+                        wei_lay, n_gates * dic);
+            if (copy_diff_weights_iter_)
+                inplace_matadd(n_gates * dic, sic, ws_wei_it, conf_.GC(),
+                        wei_it, n_gates * dic);
+        });
+    }
+
     // We free the packed weights if they were packed internally
     (this->*weights_state_free_packed_func)(n_layer, n_direction,
             n_parts_wei_st, ptr_wei_state_);
@@ -1178,6 +1185,8 @@ void _ref_rnn_common_t<aprop>::execute_() {
 
 template struct _ref_rnn_common_t<prop_kind::forward>;
 template struct _ref_rnn_common_t<prop_kind::backward>;
+
+#undef AOC
 }
 }
 }
index 87f48e3..703aa18 100644 (file)
@@ -22,6 +22,7 @@
 #include "c_types_map.hpp"
 #include "cpu_engine.hpp"
 #include "cpu_rnn_pd.hpp"
+#include "cpu_isa_traits.hpp"
 #include "scratchpad.hpp"
 #include "type_helpers.hpp"
 #include "utils.hpp"
@@ -33,7 +34,7 @@ namespace impl {
 namespace cpu {
 
 #define elemwise_sig(f)                                                 \
-    void f(int dic, int wic, int batch, int n_states, int n_gates,      \
+    void f(int dic, int wic, int batch, int n_states, int iter_stride, int n_gates, \
             float *ws_gates_, float *states_t_l_, float *states_t_lm1_, \
             float *states_tm1_l_, float *diff_states_t_l_,              \
             float *diff_states_t_lp1_, float *diff_states_tp1_l_,       \
@@ -41,7 +42,7 @@ namespace cpu {
 
 #define cell_execution_sig(f)                                                 \
     void f(int dic, int slc, int sic, int wic, int batch, int n_gates,        \
-            int n_states, float *states_t_l_, float *diff_states_t_l_,        \
+            int n_states, int iter_stride, float *states_t_l_, float *diff_states_t_l_, \
             float **w_input_, float **w_state_, const float *bias_,           \
             float *states_t_lm1_, float *states_tm1_l_,                       \
             float *diff_states_t_lp1_, float *diff_states_tp1_l_,             \
@@ -66,7 +67,8 @@ namespace cpu {
 #define packing_sig(f)                                               \
     void f(int n_layer, int n_direction, int n_weights, int n_gates, \
             int batch, int OC_size, int IC_size, float **weights_,   \
-            int n_parts, int *gates_per_part, const float *w_)
+            int n_parts, int *gates_per_part, const float *w_,       \
+            float * scratch_mem, bool do_copy)
 
 #define free_packed_sig(f) void f(int n_layer, int n_direction, int n_parts, \
             float **weights_)
@@ -118,10 +120,10 @@ struct _ref_rnn_common_t : public cpu_primitive_t {
                     && one_of(cell_kind, alg_kind::vanilla_rnn,
                                alg_kind::vanilla_lstm, alg_kind::vanilla_gru,
                                alg_kind::gru_linear_before_reset)
-                    && implication(aprop == prop_kind::forward,
+                    && IMPLICATION(aprop == prop_kind::forward,
                                one_of(this->desc()->prop_kind, forward_training,
                                        forward_inference))
-                    && implication(aprop == backward,
+                    && IMPLICATION(aprop == backward,
                                one_of(this->desc()->prop_kind, backward))
                     && this->set_default_params() == status::success;
             if (!ok)
@@ -197,6 +199,11 @@ struct _ref_rnn_common_t : public cpu_primitive_t {
         default: assert(false);
         }
 
+        merge_gemm_layer = ((aprop == prop_kind::forward) && (conf_.MB() < 128))
+            || (aprop == prop_kind::backward);
+        merge_gemm_iter = (aprop == prop_kind::backward)
+                && (!utils::one_of(conf_.cell_kind(), alg_kind::vanilla_gru,
+                            alg_kind::gru_linear_before_reset));
         auto set_pack_funcs = [](bool packed_gemm, gemm_t &g, bool pack_w,
                 packing_t &p, free_packed_t &f) {
             g = packed_gemm ? &class_name::packed_gemm : &class_name::gemm;
@@ -205,17 +212,20 @@ struct _ref_rnn_common_t : public cpu_primitive_t {
             f = pack_w ? &class_name::free_packed_weights :
                              &class_name::free_no_packed_weights;
         };
+#ifdef USE_MKL_PACKED_GEMM
+        const bool weights_pack_cond =
+            (conf_.T() > 1) && (conf_.MB() == 32) &&
+            (conf_.SIC() == 512) && (conf_.SLC() == 512) && (conf_.DIC() == 512);
+#else
+        const bool weights_pack_cond = false;
+#endif
 
-        const bool weights_pack_cond = USE_MKL_PACKED_GEMM && conf_.T() > 1;
-        const bool is_weights_state_packed = USE_MKL_PACKED_GEMM
-                && conf_.desc()->weights_iter_desc.format == packed_format;
+        const bool is_weights_state_packed =
+                conf_.desc()->weights_iter_desc.format == packed_format;
         set_pack_funcs(weights_pack_cond || is_weights_state_packed,
                 gemm_state_func, weights_pack_cond && !is_weights_state_packed,
                 weights_state_pack_func, weights_state_free_packed_func);
 
-        const bool is_weights_input_packed = USE_MKL_PACKED_GEMM
-                && conf_.desc()->weights_layer_desc.format == packed_format;
-
+        const bool is_weights_input_packed =
+                conf_.desc()->weights_layer_desc.format == packed_format;
         set_pack_funcs(weights_pack_cond || is_weights_input_packed,
                 gemm_input_func, weights_pack_cond && !is_weights_input_packed,
                 weights_input_pack_func, weights_input_free_packed_func);
@@ -235,6 +245,9 @@ struct _ref_rnn_common_t : public cpu_primitive_t {
             case alg_kind::eltwise_tanh:
                 activation_func = &activation<alg_kind::eltwise_tanh, aprop>;
                 break;
+            case alg_kind::eltwise_logistic:
+                activation_func = &activation<alg_kind::eltwise_logistic, aprop>;
+                break;
             default: break;
             }
             break;
@@ -262,10 +275,6 @@ struct _ref_rnn_common_t : public cpu_primitive_t {
         /// wavefront
         grid_computation = &class_name::linear_execution;
 
-        conf_.set_offsets(
-                ws_gates_offset_, ws_states_offset_, ws_diff_states_offset_,
-                ws_grid_comp_offset_, ws_cell_comp_offset_);
-
         // we need to allocate memory for:
         // - the states to compute a pass.
         // - the intermediate results from the gates.
@@ -288,11 +297,31 @@ struct _ref_rnn_common_t : public cpu_primitive_t {
         //   = TODO: allocate only n_layer_wav * batch * n_gates * dic for
         //   wavefront execution (inference)
 
-        use_scratchpad_for_ws_ = (conf_.desc()->prop_kind == prop_kind::forward_inference);
-        use_scratchpad_ = use_scratchpad_for_ws_ || conf_.is_lbr();
-        if (use_scratchpad_)
-            scratchpad_ =
-                create_scratchpad(conf_.get_scratchpad_size() * sizeof(float));
+        use_jit_sgemm_ = ((aprop == prop_kind::forward_inference)
+            || (conf_.is_training() && conf_.DIC() < 500))
+            && !mayiuse(avx512_mic);
+
+        copy_weights_layer_ = (conf_.WL_LD() != conf_.WL_GLD());
+        copy_weights_iter_ = (conf_.WI_LD() != conf_.WI_GLD());
+
+        copy_diff_weights_layer_ = (aprop == prop_kind::backward)
+                && (conf_.DWL_LD() != conf_.DWL_GLD());
+        copy_diff_weights_iter_ = (aprop == prop_kind::backward)
+                && (conf_.DWI_LD() != conf_.DWI_GLD());
+
+        use_workspace_ = (conf_.desc()->prop_kind != prop_kind::forward_inference);
+
+        size_t scratchpad_size = conf_.set_offsets(use_workspace_,
+            ws_gates_offset_, ws_states_offset_, ws_diff_states_offset_,
+            ws_grid_comp_offset_,
+            conf_.is_lbr(), ws_cell_comp_offset_,
+            copy_weights_layer_, ws_weights_layer_offset_,
+            copy_weights_iter_, ws_weights_iter_offset_,
+            copy_diff_weights_layer_, ws_diff_weights_layer_offset_,
+            copy_diff_weights_iter_, ws_diff_weights_iter_offset_);
+
+        scratchpad_ =
+            create_scratchpad(scratchpad_size * sizeof(float));
 
         int max_nparts = (conf_.cell_kind() == alg_kind::vanilla_gru) ? 2 : 1;
         int ptr_wei_sz = conf_.L() * conf_.D() * max_nparts;
@@ -300,8 +329,7 @@ struct _ref_rnn_common_t : public cpu_primitive_t {
         ptr_wei_state_ = (float **)malloc(sizeof(float *) * ptr_wei_sz, 64);
     }
     ~_ref_rnn_common_t() {
-        if (use_scratchpad_)
-            delete scratchpad_;
+        delete scratchpad_;
         free(ptr_wei_input_);
         free(ptr_wei_state_);
     }
@@ -349,16 +377,19 @@ private:
             int sic, int dic, int wic, int n_iter, float *dst_iter_,
             float *diff_src_iter, const float *ws_states_,
             const float *ws_diff_states_);
-    void gates_reduction(int n_gates, int dic, int batch,
+    void gates_reduction(int n_gates, int dic, int wic, int batch,
             const float *ws_gates_, float *diff_bias_);
     pd_t conf_;
-    bool use_scratchpad_;
-    bool use_scratchpad_for_ws_;
+    bool use_workspace_;
     scratchpad_t *scratchpad_;
 
     size_t ws_gates_offset_;
     size_t ws_states_offset_;
+    size_t ws_weights_layer_offset_;
+    size_t ws_weights_iter_offset_;
     size_t ws_diff_states_offset_;
+    size_t ws_diff_weights_layer_offset_;
+    size_t ws_diff_weights_iter_offset_;
     size_t ws_grid_comp_offset_;
     size_t ws_cell_comp_offset_;
 
@@ -367,6 +398,10 @@ private:
     float *ws_diff_states_;
     float *ws_cell_;
     float *ws_grid_;
+    float *ws_weights_layer_;
+    float *ws_weights_iter_;
+    float *ws_diff_weights_layer_;
+    float *ws_diff_weights_iter_;
     int n_output_features;
 
     float **ptr_wei_input_;
@@ -376,6 +411,14 @@ private:
     grid_execution_f grid_computation;
     cell_execution_f cell_func;
 
+    bool copy_weights_layer_;
+    bool copy_weights_iter_;
+    bool copy_diff_weights_layer_;
+    bool copy_diff_weights_iter_;
+    bool merge_gemm_layer;
+    bool merge_gemm_iter;
+    bool use_jit_sgemm_;
+
     packing_t weights_input_pack_func;
     packing_t weights_state_pack_func;
 
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_shuffle.cpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_shuffle.cpp
new file mode 100644 (file)
index 0000000..42234e9
--- /dev/null
@@ -0,0 +1,151 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <assert.h>
+#include <math.h>
+
+#include "c_types_map.hpp"
+#include "mkldnn_thread.hpp"
+#include "type_helpers.hpp"
+
+#include "ref_shuffle.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using namespace memory_format;
+
+template <int data_type_size>
+template <mkldnn_memory_format_t fmt>
+void ref_shuffle_t<data_type_size>::execute_() {
+    using namespace prop_kind;
+    using namespace utils;
+
+    const memory_desc_wrapper data_d(conf_.data_pd());
+
+    auto input = reinterpret_cast<const data_t*>(this->input_memory(0));
+    auto output = reinterpret_cast<data_t*>(this->memory(0));
+
+    const int axis = conf_.axis();
+    const int axis_size = conf_.axis_size();
+
+    const int MB = conf_.MB();
+    const int C = conf_.C();
+    int H = 1, W = 1, D = 1, HW = 1, SP = 1;
+    const bool has_spatial = utils::one_of(data_d.ndims(), 3, 4, 5);
+    if (has_spatial) {
+        D = conf_.D();
+        H = conf_.H();
+        W = conf_.W();
+        HW = H * W;
+        SP = D * HW;
+    }
+    const size_t stride_mb = data_d.blocking_desc().strides[0][0];
+    constexpr int blksize = one_of(fmt, nChw16c, nCdhw16c) ? 16 : 8;
+
+    if (axis == 1 && one_of(fmt, nChw16c, nChw8c, nCdhw16c, nCdhw8c)) {
+#if MKLDNN_THR == MKLDNN_THR_OMP
+#       pragma omp parallel for collapse(3) schedule(static)
+        for (int mb = 0; mb < MB; ++mb)
+        for (int cb = 0; cb < C; cb += blksize)
+        for (int sp = 0; sp < SP; ++sp) {
+            const size_t off = mb * stride_mb + sp * blksize;
+            const size_t output_off = off + cb * SP;
+            PRAGMA_OMP_SIMD()
+            for (int cc = 0; cc < nstl::min(blksize, C - cb); ++cc) {
+                int input_c = rev_transposed_[cb + cc];
+                const size_t input_off = off + input_c / blksize * SP * blksize
+                                           + input_c % blksize;
+                output[output_off + cc] = input[input_off];
+            }
+        }
+#else
+        parallel_nd(MB, utils::div_up(C, blksize), SP, [&](int mb, int c,
+                  int sp) {
+            const size_t off = mb * stride_mb + sp * blksize;
+            const int cb = c * blksize;
+            const size_t output_off = off + cb * SP;
+            for (int cc = 0; cc < nstl::min(blksize, C - cb); ++cc) {
+                int input_c = rev_transposed_[cb + cc];
+                const size_t input_off = off + input_c / blksize * SP * blksize
+                                           + input_c % blksize;
+                output[output_off + cc] = input[input_off];
+            }
+        });
+#endif
+    } else if (axis == 1 && one_of(fmt, nhwc, ndhwc)) {
+        parallel_nd(MB, SP, [&](int mb, int sp) {
+            const size_t off = mb * stride_mb + sp * C;
+            PRAGMA_OMP_SIMD()
+            for (int c = 0; c < C; ++c)
+                output[off + c] = input[off + rev_transposed_[c]];
+        });
+    } else if (axis == 1 && one_of(fmt, nchw, ncdhw)) {
+        parallel_nd(MB, C, [&](int mb, int c) {
+            const size_t output_off = mb * stride_mb + c * SP;
+            const size_t input_off = mb * stride_mb + rev_transposed_[c] * SP;
+            PRAGMA_OMP_SIMD()
+            for (int sp = 0; sp < SP; ++sp) {
+                output[output_off + sp] = input[input_off + sp];
+            }
+        });
+    } else {
+        auto dims = conf_.desc()->data_desc.dims;
+        auto ndims = conf_.desc()->data_desc.ndims;
+        const size_t outer_size = utils::array_product(dims, axis);
+        const size_t inner_size = utils::array_product(dims + axis + 1,
+                                         ndims - axis - 1);
+        const size_t dim = axis_size * inner_size;
+
+        parallel_nd(outer_size, axis_size, inner_size,
+                [&](size_t ou, int a, size_t in) {
+            const size_t off = ou * dim + in;
+            auto &o = output[data_d.off_l(off + a * inner_size)];
+            o = input[data_d.off_l(off + rev_transposed_[a] * inner_size)];
+        });
+    }
+}
+
+template void ref_shuffle_t<4>::execute_<nCdhw16c>();
+template void ref_shuffle_t<4>::execute_<nChw16c>();
+template void ref_shuffle_t<4>::execute_<nCdhw8c>();
+template void ref_shuffle_t<4>::execute_<nChw8c>();
+template void ref_shuffle_t<4>::execute_<ncdhw>();
+template void ref_shuffle_t<4>::execute_<nchw>();
+template void ref_shuffle_t<4>::execute_<ndhwc>();
+template void ref_shuffle_t<4>::execute_<nhwc>();
+template void ref_shuffle_t<4>::execute_<any>();
+
+template void ref_shuffle_t<1>::execute_<nCdhw16c>();
+template void ref_shuffle_t<1>::execute_<nChw16c>();
+template void ref_shuffle_t<1>::execute_<nCdhw8c>();
+template void ref_shuffle_t<1>::execute_<nChw8c>();
+template void ref_shuffle_t<1>::execute_<ncdhw>();
+template void ref_shuffle_t<1>::execute_<nchw>();
+template void ref_shuffle_t<1>::execute_<ndhwc>();
+template void ref_shuffle_t<1>::execute_<nhwc>();
+template void ref_shuffle_t<1>::execute_<any>();
+
+}
+}
+}
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
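
In the blocked branches above, a logical channel maps to a (channel block, lane) pair, which is where the input_off arithmetic comes from. A self-contained sketch of that offset math (names and numbers are illustrative only):

    #include <cstddef>

    // Mirrors: off + input_c / blksize * SP * blksize + input_c % blksize
    size_t blocked_channel_offset(size_t off, int input_c, int SP,
            int blksize) {
        return off + size_t(input_c / blksize) * SP * blksize
                + input_c % blksize;
    }
    // e.g. blksize = 16, input_c = 37 -> channel block 2, lane 5.
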
diff --git a/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_shuffle.hpp b/inference-engine/thirdparty/mkl-dnn/src/cpu/ref_shuffle.hpp
new file mode 100644 (file)
index 0000000..763bbaa
--- /dev/null
@@ -0,0 +1,105 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_REF_SHUFFLE_HPP
+#define CPU_REF_SHUFFLE_HPP
+
+#include <assert.h>
+
+#include "c_types_map.hpp"
+#include "cpu_shuffle_pd.hpp"
+#include "cpu_engine.hpp"
+#include "type_helpers.hpp"
+#include "utils.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+template<int data_type_size>
+struct ref_shuffle_t : public cpu_primitive_t {
+    using shuffle_class = ref_shuffle_t<data_type_size>;
+
+    struct pd_t: public cpu_shuffle_pd_t {
+        pd_t(engine_t *engine, const shuffle_desc_t *adesc,
+                const primitive_attr_t *attr,
+                const shuffle_pd_t *hint_fwd_pd)
+            : cpu_shuffle_pd_t(engine, adesc, attr, hint_fwd_pd) {}
+
+        DECLARE_COMMON_PD_T("ref:any",shuffle_class);
+
+        virtual status_t init() override {
+            assert(this->engine()->kind() == engine_kind::cpu);
+
+            bool ok = true
+                 && data_type_size ==
+                     types::data_type_size(this->desc()->data_desc.data_type);
+            if (!ok)
+                return status::unimplemented;
+            return status::success;
+        }
+    };
+
+    ref_shuffle_t(const pd_t *pd, const input_vector &inputs,
+            const output_vector &outputs)
+        : cpu_primitive_t(&conf_, inputs, outputs), conf_(*pd)
+    {
+        const int axis_size = conf_.axis_size();
+        const int group_size = conf_.group_size();
+        const int transpose_row = conf_.is_fwd() ? group_size
+                                                 : axis_size / group_size;
+        const int transpose_col = conf_.is_fwd() ? axis_size / group_size
+                                                 : group_size;
+        rev_transposed_ = (int *)malloc(axis_size * sizeof(int), 64);
+        parallel_nd(transpose_col, transpose_row, [&](int i, int j) {
+            rev_transposed_[j * transpose_col + i] = i * transpose_row + j;
+        });
+    }
+
+    ~ref_shuffle_t() { free(rev_transposed_); }
+
+    typedef typename typesize_traits<data_type_size>::type data_t;
+
+    virtual void execute(event_t *e) {
+        using namespace memory_format;
+        switch (conf_.data_pd()->desc()->format) {
+        case nCdhw16c: execute_<nCdhw16c>(); break;
+        case nChw16c:  execute_<nChw16c>(); break;
+        case nCdhw8c:  execute_<nCdhw8c>(); break;
+        case nChw8c:   execute_<nChw8c>(); break;
+        case ncdhw:    execute_<ncdhw>(); break;
+        case nchw:     execute_<nchw>(); break;
+        case ndhwc:    execute_<ndhwc>(); break;
+        case nhwc:     execute_<nhwc>(); break;
+        default:       execute_<mkldnn_any>(); break;
+        }
+
+        e->set_state(event_t::ready);
+    }
+
+private:
+    template<memory_format_t fmt>void execute_();
+    pd_t conf_;
+    int *rev_transposed_;
+};
+
+}
+}
+}
+
+#endif
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
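
The constructor builds rev_transposed_ as the inverse of the group transpose, so output channel c simply reads input channel rev_transposed_[c]. A worked example for axis_size = 6, group_size = 2 on the forward pass (transpose_row = 2, transpose_col = 3; a standalone sketch, not library code):

    #include <cstdio>

    int main() {
        const int axis_size = 6, transpose_row = 2, transpose_col = 3;
        int rev[axis_size];
        for (int i = 0; i < transpose_col; i++)
            for (int j = 0; j < transpose_row; j++)
                rev[j * transpose_col + i] = i * transpose_row + j;
        // Prints: 0 2 4 1 3 5 -- so the shuffled output channel order is
        // {in[0], in[2], in[4], in[1], in[3], in[5]}.
        for (int c = 0; c < axis_size; c++)
            std::printf("%d ", rev[c]);
        std::printf("\n");
        return 0;
    }
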
index 1dd5ebd..eb5723f 100644 (file)
@@ -25,10 +25,6 @@ namespace cpu {
 template <data_type_t data_type>
 void simple_concat_t<data_type>::execute() {
     const int num_arrs = conf_.n_inputs();
-    const data_t *input_ptrs[max_num_arrs];
-    data_t *output_ptrs[max_num_arrs];
-    size_t nelems_to_copy[max_num_arrs];
-    strides_t is[max_num_arrs];
     int *perm = conf_.perm_, *iperm = conf_.iperm_;
     int concat_dim = conf_.concat_dim();
     auto o_base_ptr = reinterpret_cast<data_t *>(this->memory());
@@ -37,12 +33,16 @@ void simple_concat_t<data_type>::execute() {
         const memory_desc_wrapper i_d(conf_.src_pd(a));
         const memory_desc_wrapper o_d(conf_.src_image_pd(a));
 
-        input_ptrs[a] = reinterpret_cast<const data_t *>(
+        input_ptrs_[a] = reinterpret_cast<const data_t *>(
                 this->input_memory(a)) + i_d.blk_off(0);
-        output_ptrs[a] = o_base_ptr + o_d.blk_off(0);
-        nelems_to_copy[a] = nelems_to_concat(concat_dim, perm, iperm, i_d);
-        for (int i = 0; i < perm[concat_dim]; i++)
-            is[a][i] = size_t(i_d.blocking_desc().strides[0][iperm[i]]);
+        output_ptrs_[a] = o_base_ptr + o_d.blk_off(0);
+        nelems_to_copy_[a] = nelems_to_concat(concat_dim, perm, iperm, i_d);
+        for (int i = 0; i < TENSOR_MAX_DIMS; i++) {
+            if (i < perm[concat_dim])
+                is_[a][i] = size_t(i_d.blocking_desc().strides[0][iperm[i]]);
+            else
+                is_[a][i] = 0;
+        }
     }
 
     const memory_desc_wrapper o_d(conf_.src_image_pd());
@@ -59,9 +59,9 @@ void simple_concat_t<data_type>::execute() {
     switch (perm[concat_dim]) {
     case (0): {
         for (int a = 0; a < num_arrs; ++a) {
-            const data_t *i = &input_ptrs[a][0];
-            data_t *o = &output_ptrs[a][0];
-            parallel_nd((ptrdiff_t)nelems_to_copy[a],
+            const data_t *i = &input_ptrs_[a][0];
+            data_t *o = &output_ptrs_[a][0];
+            parallel_nd((ptrdiff_t)nelems_to_copy_[a],
                     [&](ptrdiff_t e) { o[e] = i[e]; });
         }
         break;
@@ -70,24 +70,29 @@ void simple_concat_t<data_type>::execute() {
         parallel_nd(phys_dims[0], phys_dims[1], phys_dims[2], phys_dims[3],
             phys_dims[4], num_arrs,
             [&](int n0, int n1, int n2, int n3, int n4, int a) {
-            size_t in_off = is[a][0] * n0 + is[a][1] * n1
-                    + is[a][2] * n2 + is[a][3] * n3
-                    + is[a][4] * n4;
+            // XXX: this code may access uninitialized values in is_[*][0-4];
+            // that is why we set them to zero above, although reading them
+            // is probably benign
+            size_t in_off = is_[a][0] * n0 + is_[a][1] * n1
+                    + is_[a][2] * n2 + is_[a][3] * n3
+                    + is_[a][4] * n4;
             size_t out_off = os[0] * n0 + os[1] * n1
                     + os[2] * n2 + os[3] * n3 + os[4] * n4;
-            const data_t *i = &input_ptrs[a][in_off];
-            data_t *o = &output_ptrs[a][out_off];
+            const data_t *i = &input_ptrs_[a][in_off];
+            data_t *o = &output_ptrs_[a][out_off];
 
             PRAGMA_OMP_SIMD()
-            for (size_t e = 0; e < nelems_to_copy[a]; ++e)
+            for (size_t e = 0; e < nelems_to_copy_[a]; ++e)
                 o[e] = i[e];
         });
     }
 }
+
 template struct simple_concat_t<data_type::f32>;
 template struct simple_concat_t<data_type::u8>;
 template struct simple_concat_t<data_type::s8>;
 template struct simple_concat_t<data_type::s32>;
+
 }
 }
 }
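
The reason execute() now zero-fills is_[a][i] past perm[concat_dim] is visible
in the general branch above: the parallel_nd lambda always multiplies all five
stride slots regardless of the tensor's real rank, so the tail entries must
contribute nothing. A standalone sketch of that arithmetic with made-up stride
values:

#include <cassert>
#include <cstddef>

int main() {
    // two real dimensions, three zero-filled tail slots (like is_[a][i] = 0)
    std::size_t is[5] = {16, 1, 0, 0, 0};
    // n2..n4 may be arbitrary; the zeroed strides neutralize them
    std::size_t off = is[0] * 3 + is[1] * 2 + is[2] * 7 + is[3] * 8 + is[4] * 9;
    assert(off == 16 * 3 + 2);
    return 0;
}
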
index 9d09587..45193b2 100644 (file)
@@ -49,7 +49,6 @@ struct simple_concat_t: public cpu_primitive_t {
             const memory_desc_wrapper dst_d(&dst_pd_);
             bool ok = true
                 && cpu_concat_pd_t::init() == success
-                && src_pds_.size() <= max_num_arrs
                 && dst_d.ndims() <= 6;
 
             if (!ok) return unimplemented;
@@ -62,7 +61,8 @@ struct simple_concat_t: public cpu_primitive_t {
                             o_d.data_type())
                     && i_d.format() == o_d.format()
                     && !utils::one_of(i_d.format(), memory_format::blocked,
-                        memory_format::wino_fmt);
+                        memory_format::wino_fmt)
+                    && !i_d.is_additional_buffer();
             }
 
             if (!ok)
@@ -85,19 +85,36 @@ struct simple_concat_t: public cpu_primitive_t {
 
     simple_concat_t(const pd_t *conf, const input_vector &inputs,
             const output_vector &outputs)
-        : cpu_primitive_t(&conf_, inputs, outputs), conf_(*conf) {}
+        : cpu_primitive_t(&conf_, inputs, outputs), conf_(*conf)
+    {
+        const int n = conf_.n_inputs();
+        input_ptrs_ = (decltype(input_ptrs_))malloc(
+                sizeof(*input_ptrs_) * n, 64);
+        output_ptrs_ = (decltype(output_ptrs_))malloc(
+                sizeof(*output_ptrs_) * n, 64);
+        nelems_to_copy_ = (decltype(nelems_to_copy_))malloc(
+                sizeof(*nelems_to_copy_) * n, 64);
+        is_ = (decltype(is_))malloc(sizeof(*is_) * n, 64);
+    }
+
+    ~simple_concat_t() {
+        free(input_ptrs_);
+        free(output_ptrs_);
+        free(nelems_to_copy_);
+        free(is_);
+    }
 
     virtual void execute(event_t *e) {
         execute();
         e->set_state(event_t::ready);
     }
 
-    enum { max_num_arrs = 16 };
     typedef typename prec_traits<data_type>::type data_t;
 
 private:
     static void format_perm(
             const int ndims, const stride_t *strides, int *perm, int *iperm) {
+        assert(ndims >= 0);
         bool swapped;
         strides_t strides_tmp;
         utils::array_copy(strides_tmp, strides, ndims);
@@ -151,7 +168,13 @@ private:
 
     void execute();
     pd_t conf_;
+
+    const data_t **input_ptrs_ = nullptr;
+    data_t **output_ptrs_ = nullptr;
+    size_t *nelems_to_copy_ = nullptr;
+    strides_t *is_ = nullptr;
 };
+
 }
 }
 }
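
The two-argument malloc(size, 64) in the constructor is mkldnn's internal
aligned allocator, not libc malloc; sizing the scratch arrays per primitive is
what lets the old max_num_arrs = 16 cap disappear. A standard-C++17 sketch of
the same pattern (std::aligned_alloc, hypothetical wrapper names), assuming
the 64 is cache-line alignment:

#include <cstddef>
#include <cstdlib>

// round the request up to satisfy aligned_alloc's size-multiple contract
static void *malloc_aligned(std::size_t bytes, std::size_t align) {
    std::size_t rounded = (bytes + align - 1) / align * align;
    return std::aligned_alloc(align, rounded);
}

struct concat_scratch {
    explicit concat_scratch(int n_inputs)
        : nelems(static_cast<std::size_t *>(
                malloc_aligned(sizeof(std::size_t) * n_inputs, 64))) {}
    ~concat_scratch() { std::free(nelems); }
    std::size_t *nelems;  // one entry per concatenated input
};

int main() {
    concat_scratch s(8);
    s.nelems[0] = 42;
    return 0;
}
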
index db4c30c..0cacac7 100644 (file)
@@ -88,14 +88,16 @@ template <typename in_t> struct qz_b0<in_t, float> {
 /* Quantization */
 template <typename in_t, typename out_t> struct qz {
     out_t operator()(in_t in, out_t out, float alpha, float beta,
-            round_mode_t rmode)
-    { return round_and_saturate<out_t>(alpha * in + beta * out, rmode); }
+            round_mode_t rmode) {
+        return round_and_saturate<out_t>(
+                alpha * in + (beta ? beta * out : 0), rmode);
+    }
 };
 
 template <typename in_t> struct qz<in_t, float> {
     float operator()(in_t in, float out, float alpha, float beta,
             round_mode_t rmode)
-    { return alpha * in + beta * out; }
+    { return alpha * in + (beta ? beta * out : 0); }
 };
 
 }
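
The new (beta ? beta * out : 0) guard is not a micro-optimization: with
beta == 0 the destination may be uninitialized, and 0.f * NaN is NaN, so the
unguarded alpha * in + beta * out could turn a valid result into NaN. A small
self-contained check of exactly that behavior:

#include <cassert>
#include <cmath>

float qz_like(float in, float out, float alpha, float beta) {
    return alpha * in + (beta ? beta * out : 0);
}

int main() {
    float junk = std::nanf("");  // stand-in for an uninitialized destination
    assert(qz_like(2.f, junk, 1.f, 0.f) == 2.f);  // guarded: stays finite
    assert(std::isnan(1.f * 2.f + 0.f * junk));   // unguarded: NaN leaks in
    return 0;
}
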
index ad51a69..e78d6ad 100644 (file)
 #include "mkldnn_thread.hpp"
 #include "utils.hpp"
 
+#include "format_traits.hpp"
 #include "cpu_reorder_pd.hpp"
 #include "cpu_primitive.hpp"
 
 #include "simple_q10n.hpp"
+#include "cpu_isa_traits.hpp"
 
 namespace mkldnn {
 namespace impl {
@@ -38,12 +40,21 @@ using namespace mkldnn::impl::status;
 using namespace mkldnn::impl::memory_format;
 using namespace mkldnn::impl::data_type;
 
+using dk = data_kind_t;
+using bf = block_format_t;
+
 using namespace mkldnn::impl::utils;
 using math::saturate;
 
 template<impl::data_type_t type>
 using data_t = typename prec_traits<type>::type;
 
+template<impl::data_type_t type_i, impl::data_type_t type_o>
+using _qz_a1b0 = qz_a1b0<data_t<type_i>, data_t<type_o>>;
+
+template<impl::data_type_t type_i, impl::data_type_t type_o>
+using _qz = qz<data_t<type_i>, data_t<type_o>>;
+
 namespace fmt_order {
     const bool keep = true;
     const bool reverse = false;
@@ -66,7 +77,8 @@ struct reference {};
         const memory_desc_wrapper &input_d = pd->input_pd(); \
         const memory_desc_wrapper &output_d = pd->output_pd(); \
         const float alpha = pd->alpha(); MAYBE_UNUSED(alpha); \
-        const float beta = pd->beta(); MAYBE_UNUSED(beta);
+        const float beta = pd->beta(); MAYBE_UNUSED(beta); \
+        const round_mode_t rmode = pd->attr()->round_mode_; MAYBE_UNUSED(rmode);
 
 /* specific reorders: common template */
 template <SIMPLE_REORDER_TEMPL_DECL, typename spec = void>
@@ -82,323 +94,22 @@ bool simple_fmt_check(bool order_keep, impl::memory_format_t fmt_i,
 bool simple_attr_check(const primitive_attr_t *attr, bool many_scales_support) {
     if (many_scales_support)
         return true;
-    return utils::implication(attr, attr->output_scales_.mask_ == 0);
+    return IMPLICATION(attr, attr->output_scales_.mask_ == 0);
 }
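
IMPLICATION(a, b) is the macro form of the utils::implication(a, b) it
replaces, i.e. logical "a implies b"; the line above reads "if an attr is
supplied, its output-scales mask must be 0". A standalone illustration, with
the usual (!(a) || (b)) expansion assumed rather than copied from the tree:

#include <cassert>

#define IMPLICATION(cause, effect) (!(cause) || (effect))

int main() {
    assert(IMPLICATION(false, false));  // no attr: nothing to check
    assert(IMPLICATION(true, 0 == 0));  // attr present, mask is 0: accepted
    assert(!IMPLICATION(true, 1 == 0)); // attr present, nonzero mask: reject
    return 0;
}
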
-#define SIMPLE_IS_APPLICABLE(many_scales_support) \
-    static bool is_applicable(const memory_desc_wrapper &input_d, \
-            const memory_desc_wrapper &output_d, const primitive_attr_t *attr) \
-    { \
-        return simple_fmt_check(order_keep, fmt_i, fmt_o, input_d, output_d) \
-            && simple_attr_check(attr, many_scales_support); \
-    }
 }
 
 /* specific reorders: implementation */
-
-template <SIMPLE_REORDER_TEMPL_DECL>
-struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-        fmt_i == nchw && (fmt_o == nChw8c || fmt_o == nChw16c)
-    >::type>
-{
-    SIMPLE_IS_APPLICABLE(false);
-
-    static status_t execute(const cpu_reorder_pd_t *pd,
-        const data_t<type_i> *input, data_t<type_o> *output) {
-        DECLARE_COMMON_PARAMS();
-
-        const auto &nchw_d = order_keep ? input_d : output_d;
-        const auto &dims = input_d.dims();
-        constexpr int blksize = fmt_o == nChw8c ? 8 : 16;
-
-        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o, int blk_proc) {
-            if (alpha == 1.0 && beta == 0.0) {
-                for (int w = 0; w < dims[3]; ++w) {
-                    for (int c = 0; c < blk_proc; ++c) {
-                        const auto nchw_off =
-                            c * nchw_d.blocking_desc().strides[0][1] + w;
-                        if (order_keep) {
-                            o[w * blksize + c] = data_t<type_o>(i[nchw_off]);
-                        } else {
-                            o[nchw_off] = data_t<type_o>(i[w * blksize + c]);
-                        }
-                    }
-                }
-            } else {
-                for (int w = 0; w < dims[3]; ++w) {
-                    for (int c = 0; c < blk_proc; ++c) {
-                        const auto nchw_off =
-                            c * nchw_d.blocking_desc().strides[0][1] + w;
-                        if (order_keep) {
-                            o[w * blksize + c] = data_t<type_o>(
-                                alpha * i[nchw_off]
-                                + (beta ? beta * o[w * blksize + c] : 0));
-                        } else {
-                            o[nchw_off] = data_t<type_o>(
-                                alpha * i[w * blksize + c]
-                                + (beta ? beta * o[nchw_off] : 0));
-                        }
-                    }
-                }
-            }
-        };
-
-        const int CB = (dims[1] - 1) / blksize + 1;
-        int blktile  = (dims[1] - 1) % blksize + 1;
-
-        parallel_nd(dims[0], CB, dims[2], [&](int n, int C, int h) {
-            constexpr int i_c_mult = order_keep ? blksize : 1;
-            constexpr int o_c_mult = order_keep ? 1 : blksize;
-            auto i = &input[input_d.blk_off(n, i_c_mult * C, h)];
-            auto o = &output[output_d.blk_off(n, o_c_mult * C, h)];
-            ker(i, o, C < CB-1 ? blksize : blktile);
-        });
-
-        return success;
-    }
-};
-
-template <SIMPLE_REORDER_TEMPL_DECL>
-struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-        fmt_i == ncdhw && (fmt_o == nCdhw16c)
-    >::type>
-{
-    SIMPLE_IS_APPLICABLE(false);
-
-    static status_t execute(const cpu_reorder_pd_t *pd,
-        const data_t<type_i> *input, data_t<type_o> *output) {
-        DECLARE_COMMON_PARAMS();
-
-        const auto &ncdhw_d = order_keep ? input_d : output_d;
-        const auto &dims = input_d.dims();
-        constexpr int blksize = 16;
-
-        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
-            if (alpha == 1.0 && beta == 0.0) {
-                for (int w = 0; w < dims[4]; ++w) {
-                    for (int c = 0; c < blksize; ++c) {
-                        const auto ncdhw_off =
-                            c * ncdhw_d.blocking_desc().strides[0][1] + w; //to check
-                        if (order_keep) {
-                            o[w * blksize + c] = data_t<type_o>(i[ncdhw_off]);
-                        } else {
-                            o[ncdhw_off] = data_t<type_o>(i[w * blksize + c]);
-                        }
-                    }
-                }
-            } else {
-                for (int w = 0; w < dims[4]; ++w) {
-                    for (int c = 0; c < blksize; ++c) {
-                        const auto ncdhw_off =
-                            c * ncdhw_d.blocking_desc().strides[0][1] + w; //to check
-                        if (order_keep) {
-                            o[w * blksize + c] = data_t<type_o>(
-                                alpha * i[ncdhw_off]
-                                + (beta ? beta * o[w * blksize + c] : 0));
-                        } else {
-                            o[ncdhw_off] = data_t<type_o>(
-                                alpha * i[w * blksize + c]
-                                + (beta ? beta * o[ncdhw_off] : 0));
-                        }
-                    }
-                }
-            }
-        };
-        parallel_nd(dims[0], dims[1] / blksize, dims[2],
-            [&](int n, int C, int d) {
-            for (int h = 0; h < dims[3]; ++h) {
-                constexpr int i_c_mult = order_keep ? blksize : 1;
-                constexpr int o_c_mult = order_keep ? 1 : blksize;
-                auto i = &input[input_d.blk_off(n, i_c_mult * C, d, h)];
-                auto o = &output[output_d.blk_off(n, o_c_mult * C, d, h)];
-                ker(i, o);
-            }
-        });
-        return success;
-    }
-};
-
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-        fmt_i == nhwc && (fmt_o == nChw8c || fmt_o == nChw16c)
-    >::type>
-{
-    SIMPLE_IS_APPLICABLE(false);
-
-    static status_t execute(const cpu_reorder_pd_t *pd,
-        const data_t<type_i> *input, data_t<type_o> *output) {
-        DECLARE_COMMON_PARAMS();
-
-        const auto &dims = input_d.dims();
-        constexpr int blksize = fmt_o == nChw8c ? 8 : 16;
-        const auto is = input_d.blocking_desc().strides[0];
-        const auto os = output_d.blocking_desc().strides[0];
-
-        round_mode_t rmode = pd->attr()->round_mode_;
-
-        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
-            if (order_keep) {
-                if (alpha == 1.0 && beta == 0.0) {
-#                   pragma unroll
-                    for (int C = 0; C < dims[1] / blksize; ++C) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < blksize; ++c) {
-                            o[C * os[1] + c] = qz_a1b0<data_t<type_i>,
-                                data_t<type_o>>()(i[C * blksize + c], rmode);
-                        }
-                    }
-                } else if (alpha == 1.0) {
-#                   pragma unroll
-                    for (int C = 0; C < dims[1] / blksize; ++C) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < blksize; ++c) {
-                            o[C * os[1] + c] = qz_a1<data_t<type_i>,
-                                data_t<type_o>>()(i[C * blksize + c],
-                                o[C * os[1] + c], beta, rmode);
-                        }
-                    }
-                } else if (beta == 0.0) {
-#                   pragma unroll
-                    for (int C = 0; C < dims[1] / blksize; ++C) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < blksize; ++c) {
-                            o[C * os[1] + c] = qz_b0<data_t<type_i>,
-                                data_t<type_o>>()(i[C * blksize + c], alpha, rmode);
-                        }
-                    }
-                } else {
-#                   pragma unroll
-                    for (int C = 0; C < dims[1] / blksize; ++C) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < blksize; ++c) {
-                            o[C * os[1] + c] = qz<data_t<type_i>,
-                                data_t<type_o>>()(i[C * blksize + c],
-                                o[C * os[1] + c], alpha, beta, rmode);
-                        }
-                    }
-                }
-            } else {
-                if (alpha == 1.0 && beta == 0.0) {
-#                   pragma unroll
-                    for (int C = 0; C < dims[1] / blksize; ++C) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < blksize; ++c) {
-                            o[C * blksize + c] = qz_a1b0<data_t<type_i>,
-                                data_t<type_o>>()(i[C * is[1] + c], rmode);
-                        }
-                    }
-                } else if (alpha == 1.0) {
-#                   pragma unroll
-                    for (int C = 0; C < dims[1] / blksize; ++C) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < blksize; ++c) {
-                            o[C * blksize + c] = qz_a1<data_t<type_i>,
-                                data_t<type_o>>()(i[C * is[1] + c],
-                                o[C * blksize + c], beta, rmode);
-                        }
-                    }
-                } else if (beta == 0.0) {
-#                   pragma unroll
-                    for (int C = 0; C < dims[1] / blksize; ++C) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < blksize; ++c) {
-                            o[C * blksize + c] = qz_b0<data_t<type_i>,
-                                data_t<type_o>>()(i[C * is[1] + c], alpha, rmode);
-                        }
-                    }
-                } else {
-#                   pragma unroll
-                    for (int C = 0; C < dims[1] / blksize; ++C) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < blksize; ++c) {
-                            o[C * blksize + c] = qz<data_t<type_i>,
-                                data_t<type_o>>()(i[C * is[1] + c],
-                               o[C * blksize + c], alpha, beta, rmode);
-                        }
-                    }
-                }
-            }
-        };
-
-        parallel_nd(dims[0], dims[2], dims[3], [&](int n, int h, int w) {
-            auto i = &input[input_d.blk_off(n, 0, h, w)];
-            auto o = &output[output_d.blk_off(n, 0, h, w)];
-            ker(i, o);
-        });
-
-        return success;
-    }
-};
-
-template <SIMPLE_REORDER_TEMPL_DECL>
-struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<fmt_i == chwn
-    && (fmt_o == nChw8c || fmt_o == nChw16c)>::type>
+    typename utils::enable_if<fmt_i == nChw8c && fmt_o == nChw16c>::type>
 {
-    SIMPLE_IS_APPLICABLE(false);
-
-    static status_t execute(const cpu_reorder_pd_t *pd,
-        const data_t<type_i> *input, data_t<type_o> *output) {
-        DECLARE_COMMON_PARAMS();
-
-        const auto &dims = input_d.dims();
-        const auto i_st = input_d.blocking_desc().strides[0];
-        const auto o_st = output_d.blocking_desc().strides[0];
-
-        constexpr int blksize = fmt_o == nChw8c ? 8 : 16;
-        constexpr int tsize = 16;
-
-        constexpr int i_mult = order_keep ? blksize : 1;
-        constexpr int o_mult = order_keep ? 1 : blksize;
-
-        const auto ci_mult = order_keep ? i_st[1] : 1;
-        const auto co_mult = order_keep ? 1 : o_st[1];
-
-        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o,
-                const int nsize) {
-            if (alpha == 1.0 && beta == 0) {
-                PRAGMA_OMP_SIMD(collapse(2))
-                for (int n = 0; n < nsize; n++) {
-                    for (int c = 0; c < blksize; ++c) {
-                        o[n * o_st[0] + c * co_mult] =
-                            data_t<type_o>(i[n * i_st[0] + c * ci_mult]);
-                    }
-                }
-            } else {
-                PRAGMA_OMP_SIMD(collapse(2))
-                for (int n = 0; n < nsize; n++) {
-                    for (int c = 0; c < blksize; ++c) {
-                        o[n * o_st[0] + c * co_mult] = data_t<type_o>(
-                            alpha * i[n * i_st[0] + c * ci_mult]
-                            + (beta ? beta * o[n * o_st[0] + c * co_mult] : 0));
-                    }
-                }
-            }
-        };
-
-        parallel_nd(dims[1] / blksize, dims[2], div_up(dims[0], tsize), dims[3],
-            [&](int C, int h, int n_blk, int w) {
-            int n = n_blk * tsize;
-            const int nsize =
-                n + tsize > dims[0] ? dims[0] - n : tsize;
-            auto i = &input[n * i_st[0] + C * i_mult * i_st[1]
-                + h * i_st[2] + w * i_st[3]];
-            auto o = &output[n * o_st[0] + C * o_mult * o_st[1]
-                + h * o_st[2] + w * o_st[3]];
-            ker(i, o, nsize);
-        });
-
-        return success;
+    static bool is_applicable(const memory_desc_wrapper &input_d,
+            const memory_desc_wrapper &output_d, const primitive_attr_t *attr)
+    {
+        return simple_fmt_check(order_keep, fmt_i, fmt_o, input_d, output_d)
+            && simple_attr_check(attr, false);
     }
-};
 
-template <SIMPLE_REORDER_TEMPL_DECL>
-struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<fmt_i == nChw8c && fmt_o == nChw16c>::type>
-{
-    SIMPLE_IS_APPLICABLE(false);
 
     static status_t execute(const cpu_reorder_pd_t *pd,
         const data_t<type_i> *input, data_t<type_o> *output) {
@@ -456,280 +167,66 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
     }
 };
 
-template <SIMPLE_REORDER_TEMPL_DECL>
-struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<fmt_i == nchw && fmt_o == nhwc>::type>
-{
-    SIMPLE_IS_APPLICABLE(false);
-
-    static status_t execute(const cpu_reorder_pd_t *pd,
-        const data_t<type_i> *input, data_t<type_o> *output) {
-        DECLARE_COMMON_PARAMS();
-
-        const auto &dims = input_d.dims();
-        const auto is = input_d.blocking_desc().strides[0];
-        const auto os = output_d.blocking_desc().strides[0];
-
-        round_mode_t rmode = pd->attr()->round_mode_;
-
-        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
-            if (order_keep) {
-                if (alpha == 1.0 && beta == 0.0) {
-#                   pragma unroll
-                    for (int w = 0; w < dims[3]; ++w) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < dims[1]; ++c) {
-                            o[w * os[3] + c] = qz_a1b0<data_t<type_i>,
-                                data_t<type_o>>()(i[c * is[1] + w], rmode);
-                        }
-                    }
-                } else if (alpha == 1.0) {
-#                   pragma unroll
-                    for (int w = 0; w < dims[3]; ++w) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < dims[1]; ++c) {
-                            o[w * os[3] + c] = qz_a1<data_t<type_i>,
-                                data_t<type_o>>()(i[c * is[1] + w],
-                                 o[w * os[3] + c], beta, rmode);
-                        }
-                    }
-                } else if (beta == 0.0) {
-#                   pragma unroll
-                    for (int w = 0; w < dims[3]; ++w) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < dims[1]; ++c) {
-                            o[w * os[3] + c] = qz_b0<data_t<type_i>,
-                                data_t<type_o>>()(i[c * is[1] + w], alpha, rmode);
-                        }
-                    }
-                } else {
-#                   pragma unroll
-                    for (int w = 0; w < dims[3]; ++w) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < dims[1]; ++c) {
-                            o[w * os[3] + c] = qz<data_t<type_i>,
-                                data_t<type_o>>()(i[c * is[1] + w],
-                                o[w * os[3] + c], alpha, beta, rmode);
-                        }
-                    }
-                }
-            } else {
-                if (alpha == 1.0 && beta == 0.0) {
-#                   pragma unroll
-                    for (int w = 0; w < dims[3]; ++w) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < dims[1]; ++c) {
-                            o[c * os[1] + w] = qz_a1b0<data_t<type_i>,
-                                data_t<type_o>>()(i[w * is[3] + c], rmode);
-                        }
-                    }
-                } else if (alpha == 1.0) {
-#                   pragma unroll
-                    for (int w = 0; w < dims[3]; ++w) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < dims[1]; ++c) {
-                            o[c * os[1] + w] = qz_a1<data_t<type_i>,
-                                data_t<type_o>>()(i[w * is[3] + c],
-                                o[c * os[1] + w], beta, rmode);
-                        }
-                    }
-                } else if (beta == 0.0) {
-#                   pragma unroll
-                    for (int w = 0; w < dims[3]; ++w) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < dims[1]; ++c) {
-                            o[c * os[1] + w] = qz_b0<data_t<type_i>,
-                                data_t<type_o>>()(i[w * is[3] + c], alpha, rmode);
-                        }
-                    }
-                } else {
-#                   pragma unroll
-                    for (int w = 0; w < dims[3]; ++w) {
-                        PRAGMA_OMP_SIMD()
-                        for (int c = 0; c < dims[1]; ++c) {
-                            o[c * os[1] + w] = qz<data_t<type_i>,
-                                data_t<type_o>>()(i[w * is[3] + c],
-                                o[c * os[1] + w], alpha, beta, rmode);
-                        }
-                    }
-                }
-            }
-        };
-
-        parallel_nd(dims[0], dims[2], [&](int n, int h) {
-            auto i = &input[input_d.blk_off(n, 0, h)];
-            auto o = &output[output_d.blk_off(n, 0, h)];
-            ker(i, o);
-        });
-
-        return success;
-    }
-};
 
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<fmt_i == hwio && fmt_o == oihw>::type>
+typename utils::enable_if<fmt_i == any && (false
+    || fmt_o == hwio_s8s8
+    || fmt_o == hwigo_s8s8)>::type>
 {
-    SIMPLE_IS_APPLICABLE(false);
-
-    static status_t execute(const cpu_reorder_pd_t *pd,
-        const data_t<type_i> *input, data_t<type_o> *output) {
-        DECLARE_COMMON_PARAMS();
-
-        const auto &dims = input_d.dims();
-        const auto is = input_d.blocking_desc().strides[0];
-        const auto os = output_d.blocking_desc().strides[0];
-
-        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
-            if (alpha == 1.0 && beta == 0) {
-                for (int oc = 0; oc < dims[0]; ++oc) {
-                    for (int kw = 0; kw < dims[3]; ++kw) {
-                        if (order_keep) {
-                            o[oc * os[0] + kw] = data_t<type_o>(i[kw*is[3]+oc]);
-                        } else {
-                            o[kw * os[3] + oc] = data_t<type_o>(i[oc*is[0]+kw]);
-                        }
-                    }
-                }
-            } else {
-                for (int oc = 0; oc < dims[0]; ++oc) {
-                    for (int kw = 0; kw < dims[3]; ++kw) {
-                        const auto dst_off = order_keep ? oc * os[0] + kw :
-                                                          kw * os[3] + oc;
-                        const auto src_off = order_keep ? kw * is[3] + oc :
-                                                          oc * is[0] + kw;
-                        o[dst_off] = data_t<type_o>(alpha * i[src_off]
-                                     + (beta ? beta * o[dst_off] : 0));
-                    }
-                }
-            }
-        };
-
-        parallel_nd(dims[1], dims[2], [&](int ic, int kh) {
-            auto i = &input[input_d.blk_off(0, ic, kh)];
-            auto o = &output[output_d.blk_off(0, ic, kh)];
-            ker(i, o);
-        });
-
-        return success;
+    static bool is_applicable(const memory_desc_wrapper &input_d,
+            const memory_desc_wrapper &output_d, const primitive_attr_t *attr)
+    {
+        const size_t D_mask = utils::array_product(input_d.dims(),
+                                math::ilog2q(attr->output_scales_.mask_ + 1));
+        const int oc = (input_d.dims()[(fmt_o == hwigo_s8s8) + 0]);
+        const int g = (fmt_o == hwigo_s8s8) ? (input_d.dims()[0]) : 1;
+
+        return output_d.format() == fmt_o
+            && (input_d.data_type() == f32 || input_d.data_type() == s8)
+            && output_d.data_type() == s8
+            && (D_mask == 1 || D_mask == (size_t)g * oc);
     }
-};
-
-template <SIMPLE_REORDER_TEMPL_DECL>
-struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<fmt_i == nchw && fmt_o == chwn>::type>
-{
-    SIMPLE_IS_APPLICABLE(false);
 
     static status_t execute(const cpu_reorder_pd_t *pd,
         const data_t<type_i> *input, data_t<type_o> *output) {
         DECLARE_COMMON_PARAMS();
 
-        const auto &dims = input_d.dims();
-
-        constexpr int tsize = 16;
-
-        const auto istrides = input_d.blocking_desc().strides[0];
-        const auto ostrides = output_d.blocking_desc().strides[0];
-        const auto CHW = dims[1] * dims[2] * dims[3];
-
-        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o,
-                const int nrows, const int ncols) {
-            if (alpha == 1.0 && beta == 0) {
-                PRAGMA_OMP_SIMD(collapse(2))
-                for (int row = 0; row < nrows; ++row) {
-                    for (int col = 0; col < ncols; ++col) {
-                        const auto o_idx = row * ostrides[0]
-                            + col * ostrides[3];
-                        const auto i_idx = row * istrides[0]
-                            + col * istrides[3];
-                        o[o_idx] = data_t<type_o>(i[i_idx]);
-                    }
-                }
-            } else {
-                PRAGMA_OMP_SIMD(collapse(2))
-                for (int row = 0; row < nrows; ++row) {
-                    for (int col = 0; col < ncols; ++col) {
-                        const auto o_idx = row * ostrides[0]
-                            + col * ostrides[3];
-                        const auto i_idx = row * istrides[0]
-                            + col * istrides[3];
-                        o[o_idx] = data_t<type_o>(alpha * i[i_idx]
-                            + (beta ? beta * o[o_idx] : 0));
-                    }
-                }
-            }
-        };
-
-        parallel_nd(div_up(dims[0], tsize), div_up(CHW, tsize),
-            [&](int r_blk, int c_blk) {
-            int r = r_blk * tsize;
-            int c = c_blk * tsize;
-            const int nrows =
-                r + tsize > dims[0] ? dims[0] - r : tsize;
-            const int ncols = c + tsize > CHW ? CHW - c : tsize;
-            auto i = &input[r * istrides[0] + c * istrides[3]];
-            auto o = &output[r * ostrides[0] + c * ostrides[3]];
-            ker(i, o, nrows, ncols);
-        });
-
-        return success;
-    }
-};
-
-template <SIMPLE_REORDER_TEMPL_DECL>
-struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<fmt_i == hwio
-    && (fmt_o == Ohwi8o || fmt_o == Ohwi16o)>::type>
-{
-    SIMPLE_IS_APPLICABLE(false);
-
-    static status_t execute(const cpu_reorder_pd_t *pd,
-        const data_t<type_i> *input, data_t<type_o> *output) {
-        DECLARE_COMMON_PARAMS();
+        static constexpr bool w_groups = fmt_o == hwigo_s8s8;
 
         const auto &dims = input_d.dims();
-        const auto is = input_d.blocking_desc().strides[0];
-        const auto os = output_d.blocking_desc().strides[0];
+        const auto &pdims = output_d.blocking_desc().padding_dims;
 
-        constexpr int blksize = fmt_o == Ohwi8o ? 8 : 16;
+        const int G = w_groups ? dims[0] : 1;
+        const int OC = dims[w_groups + 0];
+        const int IC = dims[w_groups + 1];
+        const int H = dims[w_groups + 2];
+        const int W = dims[w_groups + 3];
 
-        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
-            if (alpha == 1.0 && beta == 0) {
-                PRAGMA_OMP_SIMD(collapse(2))
-                for (int O = 0; O < dims[0] / blksize; ++O) {
-                    for (int oc = 0; oc < blksize; ++oc) {
-                        if (order_keep) {
-                            o[O * os[0] + oc] =
-                                data_t<type_o>(i[O * blksize + oc]);
-                        } else {
-                            o[O * blksize + oc] =
-                                data_t<type_o>(i[O * is[0] + oc]);
-                        }
-                    }
-                }
-            } else {
-                PRAGMA_OMP_SIMD(collapse(2))
-                for (int O = 0; O < dims[0] / blksize; ++O) {
-                    for (int oc = 0; oc < blksize; ++oc) {
-                        const auto dst_off = order_keep ? O * os[0] + oc :
-                                                          O * blksize + oc;
-                        const auto src_off = order_keep ? O * blksize + oc :
-                                                          O * is[0] + oc;
-                        o[dst_off] = data_t<type_o>(alpha * i[src_off]
-                                     + (beta ? beta * o[dst_off] : 0));
-                    }
-                }
+        const float *scales = pd->attr()->output_scales_.scales_;
+        const size_t D_mask = utils::array_product(input_d.dims(),
+                math::ilog2q(pd->attr()->output_scales_.mask_ + 1));
+
+        float adj_scale = (mayiuse(avx512_core_vnni)) ? 1.0f : (1.0f / 2.0f);
+
+        size_t offset = G * pdims[w_groups + 0] * pdims[w_groups + 1] * H * W;
+        int32_t *cp = reinterpret_cast<int32_t *>(output + offset);
+
+        parallel_nd(G, OC, [&](int g, int oc) {
+            cp[g * OC + oc] = 0;
+            for (int ic = 0; ic < IC; ic++)
+            for (int h = 0; h < H; h++)
+            for (int w = 0; w < W; w++) {
+                auto i = input[input_d.blk_off<!w_groups>(g, oc, ic, h, w)];
+                auto &o = output[output_d.blk_off<!w_groups>(g, oc, ic, h, w)];
+                const float s = scales[(D_mask == 1) ? 0 : g * OC + oc];
+
+                o = qz_b0<data_t<type_i>, data_t<type_o>>()(
+                    i, s * adj_scale, rmode);
+                cp[g * OC + oc] -= (int32_t)o;
             }
-        };
-
-        parallel_nd(dims[2], dims[3], dims[1],
-            [&](int h, int w, int ic) {
-            auto i = &input[input_d.blk_off(0, ic, h, w)];
-            auto o = &output[output_d.blk_off(0, ic, h, w)];
-            ker(i, o);
+            cp[g * OC + oc] *= 128;
         });
-
         return success;
     }
 };
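
This s8s8 weights reorder does two extra things beyond quantization. It
appends an int32 compensation block past the padded weights (the cp pointer
above): for each output channel it stores -128 times the sum of the quantized
weights, which an int8 convolution can add back when it shifts s8 activations
into u8 by +128. And adj_scale halves the weights on AVX-512 parts without
VNNI, presumably to keep the intermediate 16-bit products of the u8*s8
multiply-add path from saturating. A compact standalone model of the
compensation arithmetic (hypothetical 2x3 weight block):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int OC = 2, IC = 3;
    std::vector<int8_t> w_q = {1, -2, 3,   // oc = 0
                               4,  0, -1}; // oc = 1
    std::vector<int32_t> cp(OC, 0);
    for (int oc = 0; oc < OC; ++oc) {
        for (int ic = 0; ic < IC; ++ic)
            cp[oc] -= (int32_t)w_q[oc * IC + ic];
        cp[oc] *= 128;  // same accumulate-then-scale order as above
    }
    std::printf("cp = {%d, %d}\n", (int)cp[0], (int)cp[1]);  // {-256, -384}
    return 0;
}
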
@@ -737,532 +234,231 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
     typename utils::enable_if<
-        (fmt_i == goihw && (fmt_o == gOIhw8i8o || fmt_o == gOIhw16i16o))
-        || ((fmt_i == oihw || fmt_i == ihwo)
-                && (fmt_o == OIhw8i8o || fmt_o == OIhw16i16o))
+          (fmt_i == goihw && fmt_o == gOIhw4i16o4i_s8s8)
+       || (fmt_i == oihw && fmt_o == OIhw4i16o4i_s8s8)
     >::type>
 {
-    SIMPLE_IS_APPLICABLE(false);
+    static bool is_applicable(const memory_desc_wrapper &input_d,
+            const memory_desc_wrapper &output_d, const primitive_attr_t *attr)
+    {
+        const size_t D_mask = utils::array_product(input_d.dims(),
+                                math::ilog2q(attr->output_scales_.mask_ + 1));
+        const int oc = (input_d.dims()[(fmt_i == goihw) + 0]);
+        const int g = (fmt_i == goihw) ? (input_d.dims()[0]) : 1;
+
+        return input_d.format() == fmt_i
+            && output_d.format() == fmt_o
+            && (input_d.data_type() == f32 || input_d.data_type() == s8)
+            && output_d.data_type() == s8
+            && (D_mask == 1 || D_mask == (size_t)g * oc);
+    }
 
     static status_t execute(const cpu_reorder_pd_t *pd,
         const data_t<type_i> *input, data_t<type_o> *output) {
         DECLARE_COMMON_PARAMS();
 
-        constexpr bool w_groups = fmt_i == goihw;
+        static constexpr bool w_groups = fmt_i == goihw;
+        const int blksize = 16;
+        const int sblk = 4;
 
         const auto &_g_oihw_d = order_keep ? input_d : output_d;
         const auto &dims = input_d.dims();
-        constexpr int blksize =
-            (fmt_o == OIhw8i8o || fmt_o == gOIhw8i8o) ? 8 : 16;
-
-        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
-            if (alpha == 1.0 && beta == 0.0) {
-                for (int ic = 0; ic < blksize; ++ic) {
-                for (int oc = 0; oc < blksize; ++oc) {
-                    const auto _g_oihw_off =
-                        oc * _g_oihw_d.blocking_desc().strides[0][w_groups + 0]
-                        + ic * _g_oihw_d.blocking_desc().strides[0]
-                            [w_groups + 1];
-                    if (order_keep) {
-                        o[ic * blksize + oc] = data_t<type_o>(i[_g_oihw_off]);
-                    } else {
-                        o[_g_oihw_off] = data_t<type_o>(i[ic * blksize + oc]);
-                    }
-                }
-                }
-            } else {
-                for (int ic = 0; ic < blksize; ++ic) {
-                for (int oc = 0; oc < blksize; ++oc) {
-                    const auto _g_oihw_off =
-                        oc * _g_oihw_d.blocking_desc().strides[0][w_groups + 0]
-                        + ic * _g_oihw_d.blocking_desc().strides[0]
-                            [w_groups + 1];
-                    if (order_keep) {
-                        o[ic * blksize + oc] =
-                            data_t<type_o>(alpha * i[_g_oihw_off]
-                            + (beta ? beta * o[ic * blksize + oc] : 0));
-                    } else {
-                        o[_g_oihw_off] =
-                            data_t<type_o>(alpha * i[ic * blksize + oc]
-                            + (beta ? beta * o[_g_oihw_off] : 0));
-                    }
-                }
-                }
-            }
-        };
-
-        const int _G = w_groups ? dims[0] : 1;
-
-        parallel_nd(_G, dims[w_groups + 0] / blksize,
-            dims[w_groups + 1] / blksize, dims[w_groups + 2],
-            dims[w_groups + 3], [&](int g, int O, int I, int h, int w) {
-            constexpr int i_mult = order_keep ? blksize : 1;
-            constexpr int o_mult = order_keep ? 1 : blksize;
-            auto i = &input[input_d.blk_off<!w_groups>(g,
-                    i_mult * O, i_mult * I, h, w)];
-            auto o = &output[output_d.blk_off<!w_groups>(
-                    g, o_mult * O, o_mult * I, h, w)];
-            ker(i, o);
-        });
-
-        return success;
-    }
-};
+        const auto &pdims = order_keep
+            ? output_d.blocking_desc().padding_dims
+            : input_d.blocking_desc().padding_dims;
 
-template <SIMPLE_REORDER_TEMPL_DECL>
-struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-        (fmt_i == goidhw && (fmt_o == gOIdhw16i16o))
-        || ((fmt_i == oidhw) && (fmt_o == OIdhw16i16o))
-    >::type>
-{
-    SIMPLE_IS_APPLICABLE(false);
+        const int G = w_groups ? dims[0] : 1;
+        const int OC = dims[w_groups + 0];
+        const int NB_OC = pdims[w_groups + 0] / blksize;
+        const int IC = dims[w_groups + 1];
+        const int NB_IC = pdims[w_groups + 1] / blksize;
+        const int H = dims[w_groups + 2];
+        const int W = dims[w_groups + 3];
 
-    static status_t execute(const cpu_reorder_pd_t *pd,
-        const data_t<type_i> *input, data_t<type_o> *output) {
-        DECLARE_COMMON_PARAMS();
+        const float *scales = pd->attr()->output_scales_.scales_;
+        const size_t D_mask = utils::array_product(input_d.dims(),
+                            math::ilog2q(pd->attr()->output_scales_.mask_ + 1));
 
-        constexpr bool w_groups = fmt_i == goidhw;
+        float adj_scale = (mayiuse(avx512_core_vnni)) ? 1.f : (1.f / 2.f);
 
-        const auto &_g_oihw_d = order_keep ? input_d : output_d;
-        const auto &dims = input_d.dims();
-        constexpr int blksize = 16;
+        auto index = [&](const int ic, const int oc) {
+            return ((ic / sblk) * blksize * sblk + sblk * oc + ic % sblk);
+        };
 
-        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
-            if (alpha == 1.0 && beta == 0.0) {
-                for (int ic = 0; ic < blksize; ++ic) {
-                for (int oc = 0; oc < blksize; ++oc) {
-                    const auto _g_oihw_off =
-                        oc * _g_oihw_d.blocking_desc().strides[0][w_groups + 0]
-                        + ic * _g_oihw_d.blocking_desc().strides[0]
-                            [w_groups + 1];
-                    if (order_keep) {
-                        o[ic * blksize + oc] = data_t<type_o>(i[_g_oihw_off]);
-                    } else {
-                        o[_g_oihw_off] = data_t<type_o>(i[ic * blksize + oc]);
-                    }
-                }
-                }
-            } else {
-                for (int ic = 0; ic < blksize; ++ic) {
-                for (int oc = 0; oc < blksize; ++oc) {
-                    const auto _g_oihw_off =
-                        oc * _g_oihw_d.blocking_desc().strides[0][w_groups + 0]
-                        + ic * _g_oihw_d.blocking_desc().strides[0]
-                            [w_groups + 1];
-                    if (order_keep) {
-                        o[ic * blksize + oc] =
-                            data_t<type_o>(alpha * i[_g_oihw_off]
-                            + (beta ? beta * o[ic * blksize + oc] : 0));
-                    } else {
-                        o[_g_oihw_off] =
-                            data_t<type_o>(alpha * i[ic * blksize + oc]
-                            + (beta ? beta * o[_g_oihw_off] : 0));
-                    }
-                }
-                }
+        auto ker = [&](const data_t<type_i> *inp, data_t<type_o> *out,
+            int32_t *c, const float *s, const int oc_block, const int ic_block) {
+            for (int ic = 0; ic < ic_block; ++ic) {
+            for (int oc = 0; oc < oc_block; ++oc) {
+                const auto _g_oihw_off =
+                    oc * _g_oihw_d.blocking_desc().strides[0][w_groups + 0]
+                  + ic * _g_oihw_d.blocking_desc().strides[0][w_groups + 1];
+                out[index(ic, oc)]
+                    = qz_b0<data_t<type_i>, data_t<type_o>>()(
+                            inp[_g_oihw_off], s[oc] * adj_scale, rmode);
+                c[oc] -= (128 * (int32_t)(out[index(ic, oc)]));
+            }
             }
         };
 
-        const int _G = w_groups ? dims[0] : 1;
-        parallel_nd(_G, dims[w_groups + 0] / blksize,
-            dims[w_groups + 1] / blksize, dims[w_groups + 2],
-            dims[w_groups + 3], [&](int g, int O, int I, int d, int h) {
-            for (int w = 0; w < dims[w_groups + 4]; ++w) {
-                    constexpr int i_mult = order_keep ? blksize : 1;
-                    constexpr int o_mult = order_keep ? 1 : blksize;
+        constexpr int i_mult = blksize;
+        constexpr int o_mult = 1;
+
+        size_t offset = G * pdims[w_groups+0] * pdims[w_groups+1] * H * W;
+        int32_t *cp = reinterpret_cast<int32_t *>(output + offset);
+        parallel_nd(G * NB_OC * blksize, [&](int i) {
+            cp[i] = 0;
+        });
+
+        parallel_nd(G, NB_OC, [&](int g, int O) {
+            for (int I = 0; I < NB_IC; I++)
+                for (int h = 0; h < H; h++)
+                for (int w = 0; w < W; w++) {
                     auto i = &input[input_d.blk_off<!w_groups>(g,
-                            i_mult * O, i_mult * I, d, h, w)];
+                            i_mult * O, i_mult * I, h, w)];
                     auto o = &output[output_d.blk_off<!w_groups>(
-                            g, o_mult * O, o_mult * I, d, h, w)];
-                    ker(i, o);
-            }
+                            g, o_mult * O, o_mult * I, h, w)];
+                    const int oc_block = nstl::min(blksize, OC - O * blksize);
+                    const int ic_block = nstl::min(blksize, IC - I * blksize);
+
+                    int _offset = (g * NB_OC + O) * blksize;
+                    ker(i, o, (order_keep) ? &cp[_offset] : nullptr,
+                            &scales[(D_mask == 1) ? 0 : _offset],
+                                        oc_block, ic_block);
+                }
         });
-
         return success;
     }
 };
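
The index() lambda above packs a 16x16 (ic, oc) tile into four ic-groups of
sblk = 4: within each group, every oc owns 4 consecutive ic values stored
adjacently, which is the operand grouping 4-wide int8 dot-product instructions
consume. A standalone check that the mapping touches each slot of the tile
exactly once:

#include <cassert>
#include <vector>

int main() {
    const int blksize = 16, sblk = 4;
    auto index = [&](int ic, int oc) {
        return (ic / sblk) * blksize * sblk + sblk * oc + ic % sblk;
    };
    std::vector<int> hits(blksize * blksize, 0);
    for (int ic = 0; ic < blksize; ++ic)
        for (int oc = 0; oc < blksize; ++oc)
            ++hits[index(ic, oc)];
    for (int h : hits)
        assert(h == 1);  // a bijection: every slot written exactly once
    return 0;
}
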
 
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-        (fmt_i == goihw && (fmt_o == gOIhw8o8i || fmt_o == gOIhw16o16i
-                            || fmt_o == gIOhw16o16i))
-        || (fmt_i == oihw && (fmt_o == OIhw8o8i || fmt_o == OIhw16o16i
-                            || fmt_o == IOhw16o16i))
-    >::type>
+    typename utils::enable_if<true
+    && format_traits<fmt_i>::blk_fmt == bf::_8i16o2i
+    && format_traits<fmt_o>::blk_fmt == bf::_8o16i2o>::type>
 {
-    SIMPLE_IS_APPLICABLE(false);
+    static bool is_applicable(const memory_desc_wrapper &input_d,
+            const memory_desc_wrapper &output_d, const primitive_attr_t *attr)
+    {
+        return simple_fmt_check(order_keep, fmt_i, fmt_o, input_d, output_d)
+            && simple_attr_check(attr, false);
+    }
 
     static status_t execute(const cpu_reorder_pd_t *pd,
         const data_t<type_i> *input, data_t<type_o> *output) {
         DECLARE_COMMON_PARAMS();
 
-        constexpr bool w_groups = fmt_i == goihw;
+        static constexpr bool w_groups
+            = format_traits<fmt_o>::data_kind == dk::gwei;
+        constexpr int is_1d = format_traits<fmt_o>::ndims_sp == 1;
+        constexpr int is_3d = format_traits<fmt_o>::ndims_sp == 3;
+        constexpr int blksize = format_traits<fmt_o>::blk_size;
 
-        const auto &_g_oihw_d = order_keep ? input_d : output_d;
         const auto &dims = input_d.dims();
-        constexpr int blksize =
-            (fmt_o == OIhw8o8i || fmt_o == gOIhw8o8i) ? 8 : 16;
 
-        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
-            if (alpha == 1.0 && beta == 0.0) {
-                for (int oc = 0; oc < blksize; ++oc) {
-                for (int ic = 0; ic < blksize; ++ic) {
-                    const auto _g_oihw_off =
-                        oc * _g_oihw_d.blocking_desc().strides[0][w_groups + 0]
-                        + ic * _g_oihw_d.blocking_desc().strides[0]
-                            [w_groups + 1];
-                    if (order_keep) {
-                        o[oc * blksize + ic] = data_t<type_o>(i[_g_oihw_off]);
-                    } else {
-                        o[_g_oihw_off] = data_t<type_o>(i[oc * blksize + ic]);
-                    }
-                }
-                }
-            } else {
-                for (int oc = 0; oc < blksize; ++oc) {
-                for (int ic = 0; ic < blksize; ++ic) {
-                    const auto _g_oihw_off =
-                        oc * _g_oihw_d.blocking_desc().strides[0][w_groups + 0]
-                        + ic * _g_oihw_d.blocking_desc().strides[0]
-                            [w_groups + 1];
-                    if (order_keep) {
-                        o[oc * blksize + ic] =
-                            data_t<type_o>(alpha * i[_g_oihw_off]
-                            + (beta ? beta * o[oc * blksize + ic] : 0));
-                    } else {
-                        o[_g_oihw_off] =
-                            data_t<type_o>(alpha * i[oc * blksize + ic]
-                            + (beta ? beta * o[_g_oihw_off] : 0));
-                    }
-                }
-                }
-            }
-        };
-
-        const int _G = w_groups ? dims[0] : 1;
-
-        parallel_nd(_G, dims[w_groups + 0] / blksize,
-            dims[w_groups + 1] / blksize, dims[w_groups + 2],
-            dims[w_groups + 3], [&](int g, int O, int I, int h, int w) {
-            constexpr int i_mult = order_keep ? blksize : 1;
-            constexpr int o_mult = order_keep ? 1 : blksize;
-            auto i = &input[input_d.blk_off<!w_groups>(g,
-                    i_mult * O, i_mult * I, h, w)];
-            auto o = &output[output_d.blk_off<!w_groups>(
-                    g, o_mult * O, o_mult * I, h, w)];
-            ker(i, o);
-        });
-
-        return success;
-    }
-};
-
-template <SIMPLE_REORDER_TEMPL_DECL>
-struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-        (fmt_i == goidhw && fmt_o == gOIdhw16o16i)
-        || (fmt_i == oidhw && fmt_o == OIdhw16o16i)
-    >::type>
-{
-    SIMPLE_IS_APPLICABLE(false);
-
-    static status_t execute(const cpu_reorder_pd_t *pd,
-        const data_t<type_i> *input, data_t<type_o> *output) {
-        DECLARE_COMMON_PARAMS();
+        const int G = w_groups ? dims[0] : 1;
+        const int NB_OC = dims[w_groups + 0] / blksize;
+        const int NB_IC = dims[w_groups + 1] / blksize;
+        const int D = is_3d ? dims[w_groups + 2] : 1;
+        const int H = is_1d ? 1 : dims[w_groups + 2 + is_3d];
+        const int W = dims[w_groups + 3 + is_3d - is_1d];
 
-        constexpr bool w_groups = fmt_i == goidhw;
+        auto idx_i = [&](const int oc, const int ic)
+        { return ((ic / 2) * blksize * 2 + 2 * oc + ic % 2); };
 
-        const auto &_g_oidhw_d = order_keep ? input_d : output_d;
-        const auto &dims = input_d.dims();
-        constexpr int blksize = 16;
+        auto idx_o = [&](const int oc, const int ic)
+        { return ((oc / 2) * blksize * 2 + 2 * ic + oc % 2); };
 
-        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
+        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) -> void {
             if (alpha == 1.0 && beta == 0.0) {
-                for (int oc = 0; oc < blksize; ++oc) {
                 for (int ic = 0; ic < blksize; ++ic) {
-                    const auto _g_oidhw_off =
-                        oc * _g_oidhw_d.blocking_desc().strides[0][w_groups + 0]
-                        + ic * _g_oidhw_d.blocking_desc().strides[0]
-                            [w_groups + 1];
-                    if (order_keep) {
-                        o[oc * blksize + ic] = data_t<type_o>(i[_g_oidhw_off]);
-                    } else {
-                        o[_g_oidhw_off] = data_t<type_o>(i[oc * blksize + ic]);
+                    for (int oc = 0; oc < blksize; ++oc) {
+                        o[idx_o(oc, ic)] = _qz_a1b0<type_i, type_o>()(
+                                i[idx_i(oc, ic)], rmode);
                     }
                 }
-                }
             } else {
-                for (int oc = 0; oc < blksize; ++oc) {
                 for (int ic = 0; ic < blksize; ++ic) {
-                    const auto _g_oidhw_off =
-                        oc * _g_oidhw_d.blocking_desc().strides[0][w_groups + 0]
-                        + ic * _g_oidhw_d.blocking_desc().strides[0]
-                            [w_groups + 1];
-                    if (order_keep) {
-                        o[oc * blksize + ic] =
-                            data_t<type_o>(alpha * i[_g_oidhw_off]
-                            + (beta ? beta * o[oc * blksize + ic] : 0));
-                    } else {
-                        o[_g_oidhw_off] =
-                            data_t<type_o>(alpha * i[oc * blksize + ic]
-                            + (beta ? beta * o[_g_oidhw_off] : 0));
+                    for (int oc = 0; oc < blksize; ++oc) {
+                        o[idx_o(oc, ic)] = _qz<type_i, type_o>()(
+                                i[idx_i(oc, ic)], o[idx_o(oc, ic)], alpha,
+                                beta, rmode);
                     }
                 }
-                }
             }
         };
 
-        const int _G = w_groups ? dims[0] : 1;
-
-        parallel_nd(_G, dims[w_groups + 0] / blksize,
-            dims[w_groups + 1] / blksize, dims[w_groups + 2],
-            dims[w_groups + 3], [&](int g, int O, int I, int d, int h) {
-            for (int w = 0; w < dims[w_groups + 4]; ++w) {
-                constexpr int i_mult = order_keep ? blksize : 1;
-                constexpr int o_mult = order_keep ? 1 : blksize;
-                auto i = &input[input_d.blk_off<!w_groups>(g,
-                        i_mult * O, i_mult * I, d, h, w)];
-                auto o = &output[output_d.blk_off<!w_groups>(
-                        g, o_mult * O, o_mult * I, d, h, w)];
-                ker(i, o);
-            }
+        parallel_nd(G, NB_OC, NB_IC, D, H, W,
+            [&](int g, int o, int i, int d, int h, int w) {
+            auto ptr_i = &input[wei_blk_off_like_gwei3D<fmt_i>(
+                    input_d, g, o, i, d,  h, w)];
+            auto ptr_o = &output[wei_blk_off_like_gwei3D<fmt_o>(
+                    output_d, g, o, i, d, h, w)];
+            ker(ptr_i, ptr_o);
         });
 
         return success;
     }
 };
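
This kernel converts between two interleaved 16x16 blockings: 8i16o2i keeps
ic pairs innermost while 8o16i2o keeps oc pairs innermost, and the copy simply
re-addresses every (oc, ic) element through the idx_i/idx_o maps (the
format_traits machinery only supplies blk_size, the data kind, and the spatial
rank). A standalone round-trip check of the two index functions:

#include <cassert>

int main() {
    const int blksize = 16;
    auto idx_i = [&](int oc, int ic)
    { return (ic / 2) * blksize * 2 + 2 * oc + ic % 2; };
    auto idx_o = [&](int oc, int ic)
    { return (oc / 2) * blksize * 2 + 2 * ic + oc % 2; };

    char src[16 * 16], dst[16 * 16];
    for (int oc = 0; oc < blksize; ++oc)
        for (int ic = 0; ic < blksize; ++ic)
            src[idx_i(oc, ic)] = (char)(oc ^ ic);
    for (int ic = 0; ic < blksize; ++ic)      // same loop order as ker()
        for (int oc = 0; oc < blksize; ++oc)
            dst[idx_o(oc, ic)] = src[idx_i(oc, ic)];
    for (int oc = 0; oc < blksize; ++oc)
        for (int ic = 0; ic < blksize; ++ic)
            assert(dst[idx_o(oc, ic)] == (char)(oc ^ ic));
    return 0;
}
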
 
-template <SIMPLE_REORDER_TEMPL_DECL>
-struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-        (fmt_i == goihw && fmt_o == gOihw16o)
-        || (fmt_i == oihw && fmt_o == Oihw16o)
-    >::type>
-{
-    SIMPLE_IS_APPLICABLE(false);
-
-    static status_t execute(const cpu_reorder_pd_t *pd,
-        const data_t<type_i> *input, data_t<type_o> *output) {
-        DECLARE_COMMON_PARAMS();
-
-        constexpr bool w_groups = fmt_i == goihw;
-
-        const auto &_g_oihw_d = order_keep ? input_d : output_d;
-        const auto strd_oc = _g_oihw_d.blocking_desc().strides[0][w_groups];
-        const auto &dims = input_d.dims();
-        const int blksize = 16;
-
-        const int _G = w_groups ? dims[0] : 1;
-        constexpr int i_mult = order_keep ? blksize : 1;
-        constexpr int o_mult = order_keep ? 1 : blksize;
-
-        parallel_nd(_G, dims[w_groups + 0] / blksize, dims[w_groups + 1],
-            dims[w_groups + 2], dims[w_groups + 3],
-            [&](int g, int O, int i, int h, int w) {
-            auto inp = &input [input_d.blk_off<!w_groups>(g,
-                    i_mult * O, i, h, w)];
-            auto out = &output[output_d.blk_off<!w_groups>(g,
-                    o_mult * O, i, h, w)];
-            if (alpha == 1.0 && beta == 0.0) {
-                for (int oc = 0; oc < blksize; ++oc) {
-                    const auto off = oc * strd_oc;
-                    if (order_keep) {
-                        out[oc] = data_t<type_o>(inp[off]);
-                    } else {
-                        out[off] = data_t<type_o>(inp[oc]);
-                    }
-                }
-            } else {
-                for (int oc = 0; oc < blksize; ++oc) {
-                    const auto off = oc * strd_oc;
-                    if (order_keep) {
-                        out[oc] = data_t<type_o>(alpha * inp[off] + (beta
-                                    ? beta * out[oc] : 0));
-                    } else {
-                        out[off] = data_t<type_o>(alpha * inp[oc] + (beta
-                                    ? beta * out[off] : 0));
-                    }
-                }
-            }
-        });
-
-        return success;
-    }
-};
+/* reorders with tail support */
 
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-        (fmt_i == goidhw && fmt_o == gOidhw16o)
-        || (fmt_i == oidhw && fmt_o == Oidhw16o)
-    >::type>
+typename utils::enable_if<fmt_i == nChw8c && fmt_o == nhwc && order_keep>::type>
 {
-    SIMPLE_IS_APPLICABLE(false);
-
-    static status_t execute(const cpu_reorder_pd_t *pd,
-        const data_t<type_i> *input, data_t<type_o> *output) {
-        DECLARE_COMMON_PARAMS();
-
-        constexpr bool w_groups = fmt_i == goidhw;
-
-        const auto &_g_oihw_d = order_keep ? input_d : output_d;
-        const auto strd_oc = _g_oihw_d.blocking_desc().strides[0][w_groups];
-        const auto &dims = input_d.dims();
-        const int blksize = 16;
-
-        const int _G = w_groups ? dims[0] : 1;
-        constexpr int i_mult = order_keep ? blksize : 1;
-        constexpr int o_mult = order_keep ? 1 : blksize;
-
-        parallel_nd(_G, dims[w_groups + 0] / blksize, dims[w_groups + 1],
-            dims[w_groups + 2], dims[w_groups + 3], dims[w_groups + 4],
-            [&](int g, int O, int i, int d, int h, int w) {
-                auto inp = &input [input_d.blk_off<!w_groups>(g,
-                        i_mult * O, i, d, h, w)];
-                auto out = &output[output_d.blk_off<!w_groups>(g,
-                        o_mult * O, i, d, h, w)];
-                if (alpha == 1.0 && beta == 0.0) {
-                    for (int oc = 0; oc < blksize; ++oc) {
-                        const auto off = oc * strd_oc;
-                        if (order_keep) {
-                            out[oc] = data_t<type_o>(inp[off]);
-                        } else {
-                            out[off] = data_t<type_o>(inp[oc]);
-                        }
-                    }
-                } else {
-                    for (int oc = 0; oc < blksize; ++oc) {
-                        const auto off = oc * strd_oc;
-                        if (order_keep) {
-                            out[oc] = data_t<type_o>(
-                                    alpha * inp[off] + (beta
-                                        ? beta * out[oc] : 0));
-                        } else {
-                            out[off] = data_t<type_o>(
-                                    alpha * inp[oc] + (beta
-                                        ? beta * out[off] : 0));
-                        }
-                    }
-                }
-        });
-
-        return success;
+    static bool is_applicable(const memory_desc_wrapper &input_d,
+        const memory_desc_wrapper &output_d, const primitive_attr_t *attr) {
+        int smask = attr ? attr->output_scales_.mask_ : 0;
+        return (smask == 0 || smask == 2) && order_keep
+            && input_d._md->format == nChw8c
+            && output_d._md->format == nhwc;
     }
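
smask here is a bitmask over tensor dimensions: bit d set means one scale per
slice of dimension d, so for an activation tensor smask == 2 (bit 1) selects
per-channel scales and smask == 0 a single shared scale; this reorder accepts
only those two cases. A minimal sketch of how such a mask picks the scale
index (hypothetical helper, assuming dimension 1 is channels):

#include <cassert>

int scale_index(int smask, int c) {
    return (smask & (1 << 1)) ? c : 0;  // per-channel vs. shared scale
}

int main() {
    assert(scale_index(0, 5) == 0);  // smask == 0: one scale for everything
    assert(scale_index(2, 5) == 5);  // smask == 2: one scale per channel
    return 0;
}
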
-};
-
-template <SIMPLE_REORDER_TEMPL_DECL>
-struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<(fmt_i == goihw && fmt_o == Goihw8g) ||
-                              (fmt_i == goihw && fmt_o == Goihw16g)>::type>
-{
-    SIMPLE_IS_APPLICABLE(false);
 
     static status_t execute(const cpu_reorder_pd_t *pd,
         const data_t<type_i> *input, data_t<type_o> *output) {
         DECLARE_COMMON_PARAMS();
 
-        constexpr bool w_groups = fmt_i == goihw;
-
-        const auto &_goihw_d = order_keep ? input_d : output_d;
+        const auto &pdims = input_d.blocking_desc().padding_dims;
         const auto &dims = input_d.dims();
-        const int blksize = fmt_o == Goihw8g ? 8 : 16;
-
-        const int NG = dims[0];
-        constexpr int i_mult = order_keep ? blksize : 1;
-        constexpr int o_mult = order_keep ? 1 : blksize;
-
-        parallel_nd(NG / blksize, dims[1], dims[2], dims[3], dims[4],
-            [&](int G, int oc, int ic, int h, int w) {
-            auto i = &input[input_d.blk_off<!w_groups>(
-                   G * i_mult, oc, ic, h, w)];
-            auto o = &output[output_d.blk_off<!w_groups>(
-                   G * o_mult, oc, ic, h, w)];
-            if (alpha == 1.0 && beta == 0.0) {
-                for (int g = 0; g < blksize; ++g) {
-                    const auto _goihw_off = g *
-                        _goihw_d.blocking_desc().strides[0][0];
-                    if (order_keep) {
-                        o[g] = data_t<type_o>(i[_goihw_off]);
-                    } else {
-                        o[_goihw_off] = data_t<type_o>(i[g]);
-                    }
-                }
-            } else {
-                for (int g = 0; g < blksize; ++g) {
-                    const auto _goihw_off = g *
-                        _goihw_d.blocking_desc().strides[0][0];
-                    if (order_keep) {
-                        o[g] = data_t<type_o>(alpha * i[_goihw_off] +
-                             (beta ? beta * o[g] : 0));
-                   } else {
-                        o[_goihw_off] = data_t<type_o>(alpha *
-                             i[g] + (beta ? beta * o[_goihw_off] : 0));
-                   }
-               }
-           }
-        });
-
-        return success;
-    }
-};
-
-template <SIMPLE_REORDER_TEMPL_DECL>
-struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-        fmt_i == hwio && (fmt_o == OIhw8i8o || fmt_o == OIhw16i16o)
-    >::type>
-{
-    SIMPLE_IS_APPLICABLE(false);
+        constexpr int blksize = format_traits<fmt_i>::blk_size;
+        const int C = dims[1];
+        const int H = dims[2];
+        const int W = dims[3];
 
-    static status_t execute(const cpu_reorder_pd_t *pd,
-        const data_t<type_i> *input, data_t<type_o> *output) {
-        DECLARE_COMMON_PARAMS();
+        constexpr int i_c_mult = 1;
+        constexpr int o_c_mult = blksize;
 
-        const auto &_hwio_d = order_keep ? input_d : output_d;
-        const auto &dims = input_d.dims();
-        constexpr int blksize = fmt_o == OIhw8i8o ? 8 : 16;
-        const auto _hwio_st = _hwio_d.blocking_desc().strides[0];
+        const float *scales = pd->attr()->output_scales_.scales_;
+        int smask = pd->attr()->output_scales_.mask_;
 
-        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
-            if (alpha == 1.0 && beta == 0.0) {
-                PRAGMA_OMP_SIMD(collapse(2))
-                for (int ic = 0; ic < blksize; ++ic) {
-                    for (int oc = 0; oc < blksize; ++oc) {
-                        if (order_keep) {
-                            o[ic * blksize + oc] =
-                                data_t<type_o>(i[oc + ic * _hwio_st[1]]);
-                        } else {
-                            o[oc + ic * _hwio_st[1]] =
-                                data_t<type_o>(i[ic * blksize + oc]);
-                        }
+        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o,
+                       const int nb_c, const int c_block) {
+            if (smask == 2) {
+                for (int w = 0; w < W; ++w) {
+                    const ptrdiff_t flat_off = w * output_d.blocking_desc().strides[0][3];
+                    PRAGMA_OMP_SIMD()
+                    for (int c = 0; c < c_block; ++c) {
+                        const float scale = scales[nb_c * blksize + c];
+
+                        o[flat_off + c] = _qz<type_i, type_o>()(
+                                i[w * blksize + c], o[flat_off + c],
+                                scale, beta, rmode);
                     }
                 }
             } else {
-                PRAGMA_OMP_SIMD(collapse(2))
-                for (int ic = 0; ic < blksize; ++ic) {
-                    for (int oc = 0; oc < blksize; ++oc) {
-                        const auto dst_off = order_keep ? ic * blksize + oc :
-                                                          ic * _hwio_st[1] + oc;
-                        const auto src_off = order_keep ? ic * _hwio_st[1] + oc :
-                                                          ic * blksize + oc;
-                        o[dst_off] = data_t<type_o>(alpha * i[src_off]
-                                     + (beta ? beta * o[dst_off] : 0));
+                for (int w = 0; w < W; ++w) {
+                    const ptrdiff_t flat_off = w * output_d.blocking_desc().strides[0][3];
+                    PRAGMA_OMP_SIMD()
+                    for (int c = 0; c < c_block; ++c) {
+                        o[flat_off + c] = _qz_a1b0<type_i, type_o>()(i[w * blksize + c], rmode);
                     }
                 }
             }
         };
 
-        parallel_nd(dims[2], dims[3], dims[0] / blksize, dims[1] / blksize,
-            [&](int h, int w, int O, int I) {
-            constexpr int i_mult = order_keep ? blksize : 1;
-            constexpr int o_mult = order_keep ? 1 : blksize;
-            auto i = &input[input_d.blk_off(
-                    i_mult * O, i_mult * I, h, w)];
-            auto o = &output[output_d.blk_off(
-                    o_mult * O, o_mult * I, h, w)];
-            ker(i, o);
+        parallel_nd(dims[0], pdims[1] / blksize, H,
+            [&](int n, int nb_c, int h) {
+                auto i = &input[input_d.blk_off(n, i_c_mult * nb_c, h)];
+                auto o = &output[output_d.blk_off(n, o_c_mult * nb_c, h)];
+                const int c_block = nstl::min(blksize, C - nb_c * blksize);
+                ker(i, o, nb_c, c_block);
         });
 
         return success;
@@ -1271,147 +467,101 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
 
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-          (fmt_i == goihw && (fmt_o == gOIhw4i16o4i || fmt_o == gOIhw8i16o2i))
-       || (fmt_i == oihw && (fmt_o == OIhw4i16o4i || fmt_o == OIhw8i16o2i))
-    >::type>
+typename utils::enable_if<fmt_i == nhwc && fmt_o == nChw8c>::type>
 {
-    SIMPLE_IS_APPLICABLE(false);
+    static bool is_applicable(const memory_desc_wrapper &input_d,
+        const memory_desc_wrapper &output_d, const primitive_attr_t *attr) {
+        int smask = attr ? attr->output_scales_.mask_ : 0;
+        return (smask == 2) && order_keep
+            && input_d._md->format == nhwc
+            && output_d._md->format == nChw8c;
+    }
 
     static status_t execute(const cpu_reorder_pd_t *pd,
         const data_t<type_i> *input, data_t<type_o> *output) {
         DECLARE_COMMON_PARAMS();
 
-        static constexpr bool w_groups = fmt_i == goihw;
-        int sblk = fmt_o == OIhw4i16o4i || fmt_o == gOIhw4i16o4i ? 4 : 2;
-
-        constexpr int is_3d = false;
-
-        const auto &_g_oihw_d = order_keep ? input_d : output_d;
+        const auto &pdims = output_d.blocking_desc().padding_dims;
         const auto &dims = input_d.dims();
-        const auto &pdims = order_keep
-            ? output_d.blocking_desc().padding_dims
-            : input_d.blocking_desc().padding_dims;
+        constexpr int blksize = format_traits<fmt_o>::blk_size;
+        const int C = dims[1];
+        const int H = dims[2];
+        const int W = dims[3];
 
-        const int blksize = 16;
-        const int G = w_groups ? dims[0] : 1;
-        const int OC = dims[w_groups + 0];
-        const int NB_OC = pdims[w_groups + 0] / blksize;
-        const int IC = dims[w_groups + 1];
-        const int NB_IC = pdims[w_groups + 1] / blksize;
-        //const int D = is_3d ? dims[w_groups + 2] : 1;
-        const int H = dims[w_groups + 2 + is_3d];
-        const int W = dims[w_groups + 3 + is_3d];
+        constexpr int i_c_mult = blksize;
+        constexpr int o_c_mult = 1;
 
-        auto index = [&](const int ic, const int oc) {
-            return ((ic / sblk) * blksize * sblk + sblk * oc + ic % sblk);
-        };
+        const float *scales = pd->attr()->output_scales_.scales_;
+        int smask = pd->attr()->output_scales_.mask_;
 
         auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o,
-                const int oc_block, const int ic_block) {
-            if (alpha == 1.0 && beta == 0.0) {
-                for (int ic = 0; ic < ic_block; ++ic) {
-                for (int oc = 0; oc < oc_block; ++oc) {
-                    const auto _g_oihw_off =
-                        oc * _g_oihw_d.blocking_desc().strides[0][w_groups + 0]
-                      + ic * _g_oihw_d.blocking_desc().strides[0][w_groups + 1];
-                    if (order_keep) {
-                        o[index(ic, oc)] =
-                            data_t<type_o>(i[_g_oihw_off]);
-                    } else {
-                        o[_g_oihw_off] =
-                            data_t<type_o>(i[index(ic, oc)]);
+                       const int nb_c, const int c_block) {
+            if (smask == 2) {
+                for (int w = 0; w < W; ++w) {
+                    const ptrdiff_t flat_off = w * input_d.blocking_desc().strides[0][3];
+                    PRAGMA_OMP_SIMD()
+                    for (int c = 0; c < c_block; ++c) {
+                        const float scale = scales[nb_c * blksize + c];
+
+                        o[w * blksize + c] = _qz<type_i, type_o>()(
+                                i[flat_off + c], o[w * blksize + c],
+                                scale, beta, rmode);
                     }
                 }
-                }
             } else {
-                for (int ic = 0; ic < ic_block; ++ic) {
-                for (int oc = 0; oc < oc_block; ++oc) {
-                    const auto _g_oihw_off =
-                        oc * _g_oihw_d.blocking_desc().strides[0][w_groups + 0]
-                      + ic * _g_oihw_d.blocking_desc().strides[0][w_groups + 1];
-                    if (order_keep) {
-                        o[index(ic, oc)] = data_t<type_o>(
-                            alpha * i[_g_oihw_off]
-                            + (beta ? beta * o[index(ic, oc)] : 0));
-                    } else {
-                        o[_g_oihw_off] = data_t<type_o>(
-                            alpha * i[index(ic, oc)]
-                            + (beta ? beta * o[_g_oihw_off] : 0));
+                for (int w = 0; w < W; ++w) {
+                    const ptrdiff_t flat_off = w * input_d.blocking_desc().strides[0][3];
+                    PRAGMA_OMP_SIMD()
+                    for (int c = 0; c < c_block; ++c) {
+                        o[w * blksize + c] = _qz_a1b0<type_i, type_o>()(i[flat_off + c], rmode);
                     }
                 }
-                }
             }
         };
 
-        constexpr int i_mult = order_keep ? blksize : 1;
-        constexpr int o_mult = order_keep ? 1 : blksize;
-
-        parallel_nd(G, NB_OC, NB_IC, H, W,
-            [&](int g, int O, int I, int h, int w) {
-            auto i = &input[input_d.blk_off<!w_groups>(g,
-                    i_mult * O, i_mult * I, h, w)];
-            auto o = &output[output_d.blk_off<!w_groups>(
-                    g, o_mult * O, o_mult * I, h, w)];
-            const int oc_block = nstl::min(blksize, OC - O * blksize);
-            const int ic_block = nstl::min(blksize, IC - I * blksize);
-            ker(i, o, oc_block, ic_block);
+        parallel_nd(dims[0], pdims[1] / blksize, H,
+            [&](int n, int nb_c, int h) {
+                auto i = &input[input_d.blk_off(n, i_c_mult * nb_c, h)];
+                auto o = &output[output_d.blk_off(n, o_c_mult * nb_c, h)];
+                const int c_block = nstl::min(blksize, C - nb_c * blksize);
+                ker(i, o, nb_c, c_block);
         });
+
         return success;
     }
 };
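For context on the `smask` checks in these `is_applicable` predicates: under mkl-dnn's output-scales convention, bit d of `mask_` selects one scale per element of dimension d, so mask 0 means a single common scale and mask 2 (that is, 1 << 1) means one scale per channel of an activation tensor, indexed as `scales[c]` (or `scales[nb_c * blksize + c]` inside a block). A hedged sketch with an illustrative helper name:

```
// Pick the output scale for channel c under the two masks these reorders
// accept (0 = one common scale, 2 = per-channel scales). Sketch only.
static inline float scale_for(const float *scales, int mask, int c) {
    return mask == 2 ? scales[c] : scales[0];
}
```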
 
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-        (fmt_i == gOIhw8i16o2i && fmt_o == gOIhw8o16i2o)
-        || (fmt_i == OIhw8i16o2i && fmt_o == OIhw8o16i2o)
-    >::type>
+typename utils::enable_if<fmt_i == nhwc && fmt_o == nhwc>::type>
 {
-    SIMPLE_IS_APPLICABLE(false);
+    static bool is_applicable(const memory_desc_wrapper &input_d,
+        const memory_desc_wrapper &output_d, const primitive_attr_t *attr) {
+        int smask = attr ? attr->output_scales_.mask_ : 0;
+        return (smask == 2) && order_keep
+            && input_d._md->format == nhwc
+            && output_d._md->format == nhwc;
+    }
 
     static status_t execute(const cpu_reorder_pd_t *pd,
         const data_t<type_i> *input, data_t<type_o> *output) {
         DECLARE_COMMON_PARAMS();
 
-        static constexpr bool w_groups = fmt_i == gOIhw8i16o2i;
-
         const auto &dims = input_d.dims();
-        const int blksize = 16;
+        const int C = dims[1];
+        const int H = dims[2];
+        const int W = dims[3];
 
-        auto index_src = [&](const int ic, const int oc) {
-            return ((ic / 2) * blksize * 2 + 2 * oc + ic % 2);
-        };
-        auto index_dst = [&](const int ic, const int oc) {
-            return ((oc / 2) * blksize * 2 + 2 * ic + oc % 2);
-        };
+        const float *scales = pd->attr()->output_scales_.scales_;
 
-        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) -> void {
-            if (alpha == 1.0 && beta == 0.0) {
-                for (int ic = 0; ic < blksize; ++ic) {
-                    for (int oc = 0; oc < blksize; ++oc) {
-                        o[index_dst(ic,oc)] = data_t<type_o>(i[index_src(ic,oc)]);
-                    }
-                }
-            } else {
-                for (int ic = 0; ic < blksize; ++ic) {
-                    for (int oc = 0; oc < blksize; ++oc) {
-                        o[index_dst(ic,oc)] = data_t<type_o>(
-                                alpha * i[index_src(ic,oc)]
-                                + (beta ? beta * o[index_dst(ic,oc)] : 0));
-                    }
+        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
+                for (int c = 0; c < C; ++c) {
+                    const float scale = scales[c];
+
+                    o[c] = _qz<type_i, type_o>()(i[c], o[c], scale, beta, rmode);
                 }
-            }
         };
 
-        const int _G = w_groups ? dims[0] : 1;
-
-        parallel_nd(_G, dims[w_groups + 0] / blksize,
-            dims[w_groups + 1] / blksize, dims[w_groups + 2],
-            dims[w_groups + 3], [&](int g, int o, int i, int h, int w) {
-            auto i_ptr = &input[input_d.blk_off<!w_groups>(g, o, i, h, w)];
-            auto o_ptr = &output[output_d.blk_off<!w_groups>(g, o, i, h, w)];
-            ker(i_ptr, o_ptr);
+        parallel_nd(dims[0], H, W,
+            [&](int n, int h, int w) {
+                auto i = &input[input_d.blk_off(n, 0, h, w)];
+                auto o = &output[output_d.blk_off(n, 0, h, w)];
+                ker(i, o);
         });
 
         return success;
@@ -1420,50 +570,49 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
 
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-        (fmt_i == gOIhw8i8o && fmt_o == gOIhw8o8i)
-        || (fmt_i == OIhw8i8o && fmt_o == OIhw8o8i)
-        || (fmt_i == gOIhw16i16o && fmt_o == gOIhw16o16i)
-        || (fmt_i == OIhw16i16o && fmt_o == OIhw16o16i)
-        || (fmt_i == gOIhw16i16o && fmt_o == gIOhw16o16i)
-        || (fmt_i == OIhw16i16o && fmt_o == IOhw16o16i)
-    >::type>
+typename utils::enable_if<fmt_i == nchw && fmt_o == nhwc>::type>
 {
-    SIMPLE_IS_APPLICABLE(false);
+    static bool is_applicable(const memory_desc_wrapper &input_d,
+        const memory_desc_wrapper &output_d, const primitive_attr_t *attr) {
+        int smask = attr ? attr->output_scales_.mask_ : 0;
+        return (smask == 0 || smask == 2) && order_keep
+            && input_d._md->format == nchw
+            && output_d._md->format == nhwc;
+    }
 
     static status_t execute(const cpu_reorder_pd_t *pd,
         const data_t<type_i> *input, data_t<type_o> *output) {
         DECLARE_COMMON_PARAMS();
 
-        constexpr bool w_groups = (fmt_i == gOIhw8i8o || fmt_i == gOIhw16i16o);
-
         const auto &dims = input_d.dims();
-        constexpr int blksize =
-            (fmt_i == OIhw8i8o || fmt_i == gOIhw8i8o) ? 8 : 16;
+        const int C = dims[1];
+        const int H = dims[2];
+        const int W = dims[3];
+
+        int smask = pd->attr()->output_scales_.mask_;
+        const float *scales = pd->attr()->output_scales_.scales_;
 
         auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
-            for (int ic = 0; ic < blksize; ++ic) {
-                for (int oc = 0; oc < blksize; ++oc) {
-                    const int o_idx = ic * blksize + oc;
-                    const int i_idx = oc * blksize + ic;
-                    o[o_idx] = (alpha == 1.0 && beta == 0.0)
-                        ? data_t<type_o>(i[i_idx])
-                        : data_t<type_o>(alpha * i[i_idx]
-                            + (beta ? beta * o[o_idx] : 0));
+            if (smask == 2) {
+                for (int c = 0; c < C; ++c) {
+                    const float scale = scales[c];
+
+                    const ptrdiff_t flat_off = c * input_d.blocking_desc().strides[0][1];
+
+                    o[c] = _qz<type_i, type_o>()(i[flat_off], o[c], scale, beta, rmode);
+                }
+            } else {
+                for (int c = 0; c < C; ++c) {
+                    const ptrdiff_t flat_off = c * input_d.blocking_desc().strides[0][1];
+
+                    o[c] = _qz_a1b0<type_i, type_o>()(i[flat_off], rmode);
                 }
             }
         };
 
-        const int _G = w_groups ? dims[0] : 1;
-
-        parallel_nd(_G, dims[w_groups + 0] / blksize,
-            dims[w_groups + 1] / blksize, dims[w_groups + 2],
-            dims[w_groups + 3], [&](int g, int o, int i, int h, int w) {
-            auto i_ptr = &input[input_d.blk_off<!w_groups>(g,
-                    o, i, h, w)];
-            auto o_ptr = &output[output_d.blk_off<!w_groups>(g,
-                    o, i, h, w)];
-            ker(i_ptr, o_ptr);
+        parallel_nd(dims[0], H, W,
+            [&](int n, int h, int w) {
+                auto i = &input[input_d.blk_off(n, 0, h, w)];
+                auto o = &output[output_d.blk_off(n, 0, h, w)];
+                ker(i, o);
         });
 
         return success;
@@ -1472,88 +621,142 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
 
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-        (fmt_i == gOIdhw16i16o && fmt_o == gOIdhw16o16i)
-        || (fmt_i == OIdhw16i16o && fmt_o == OIdhw16o16i)
-    >::type>
+typename utils::enable_if<fmt_i == nhwc && fmt_o == nchw>::type>
 {
-    SIMPLE_IS_APPLICABLE(false);
+    static bool is_applicable(const memory_desc_wrapper &input_d,
+        const memory_desc_wrapper &output_d, const primitive_attr_t *attr) {
+        int smask = attr ? attr->output_scales_.mask_ : 0;
+        return (smask == 0 || smask == 2) && order_keep
+            && input_d._md->format == nhwc
+            && output_d._md->format == nchw;
+    }
 
     static status_t execute(const cpu_reorder_pd_t *pd,
         const data_t<type_i> *input, data_t<type_o> *output) {
         DECLARE_COMMON_PARAMS();
 
-        constexpr bool w_groups = fmt_i == gOIdhw16i16o;
-
         const auto &dims = input_d.dims();
-        constexpr int blksize = 16;
+        const int C = dims[1];
+        const int H = dims[2];
+        const int W = dims[3];
+
+        int smask = pd->attr()->output_scales_.mask_;
+        const float *scales = pd->attr()->output_scales_.scales_;
 
         auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o) {
-            for (int ic = 0; ic < blksize; ++ic) {
-                for (int oc = 0; oc < blksize; ++oc) {
-                    const int o_idx = ic * blksize + oc;
-                    const int i_idx = oc * blksize + ic;
-                    o[o_idx] = (alpha == 1.0 && beta == 0.0)
-                        ? data_t<type_o>(i[i_idx])
-                        : data_t<type_o>(alpha * i[i_idx]
-                            + (beta ? beta * o[o_idx] : 0));
+            if (smask == 2) {
+                for (int c = 0; c < C; ++c) {
+                    const float scale = scales[c];
+
+                    const ptrdiff_t flat_off = c * output_d.blocking_desc().strides[0][1];
+
+                    o[flat_off] = _qz<type_i, type_o>()(i[c], o[flat_off], scale, beta, rmode);
+                }
+            } else {
+                for (int c = 0; c < C; ++c) {
+                    const ptrdiff_t flat_off = c * output_d.blocking_desc().strides[0][1];
+
+                    o[flat_off] = _qz_a1b0<type_i, type_o>()(i[c], rmode);
                 }
             }
         };
 
-        const int _G = w_groups ? dims[0] : 1;
-
-        parallel_nd(_G, dims[w_groups + 0] / blksize,
-            dims[w_groups + 1] / blksize, dims[w_groups + 2],
-            dims[w_groups + 3], [&](int g, int o, int i, int d, int h) {
-            for (int w = 0; w < dims[w_groups + 4]; ++w) {
-                auto i_ptr = &input[input_d.blk_off<!w_groups>(g,
-                        o, i, d, h, w)];
-                auto o_ptr = &output[output_d.blk_off<!w_groups>(g,
-                        o, i, d, h, w)];
-                ker(i_ptr, o_ptr);
-            }
+        parallel_nd(dims[0], H, W,
+            [&](int n, int h, int w) {
+                auto i = &input[input_d.blk_off(n, 0, h, w)];
+                auto o = &output[output_d.blk_off(n, 0, h, w)];
+                ker(i, o);
         });
 
         return success;
     }
 };
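The nchw/nhwc pairs above read the channel stride from `blocking_desc().strides` so that padded dimensions are honored; for a dense tensor those strides reduce to the familiar flat formulas, sketched here with illustrative helper names:

```
#include <cstddef>

// Dense flat offsets for the two plain layouts being converted (sketch;
// the real kernels must use descriptor strides because of padded dims).
inline size_t off_nchw(int n, int c, int h, int w, int C, int H, int W)
{ return ((size_t(n) * C + c) * H + h) * W + w; }

inline size_t off_nhwc(int n, int c, int h, int w, int C, int H, int W)
{ return ((size_t(n) * H + h) * W + w) * C + c; }
```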
 
+#define PLAIN_TO_BLOCKED_IS_APPLICABLE() \
+    static bool is_applicable(const memory_desc_wrapper &input_d, \
+        const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { \
+        return simple_attr_check(attr, false) && (order_keep \
+                ? output_d.format() == fmt_o && input_d.is_plain() \
+                : input_d.format() == fmt_o && output_d.is_plain()); \
+    }
+
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-        (fmt_i == Oihw16o && fmt_o == Ohwi16o)
-        || (fmt_i == gOihw16o && fmt_o == gOhwi16o)
-    >::type>
+typename utils::enable_if<fmt_i == any && (false
+    || format_traits<fmt_o>::blk_fmt == bf::_8c
+    || format_traits<fmt_o>::blk_fmt == bf::_16c)>::type>
 {
-    SIMPLE_IS_APPLICABLE(false);
+    PLAIN_TO_BLOCKED_IS_APPLICABLE();
 
     static status_t execute(const cpu_reorder_pd_t *pd,
         const data_t<type_i> *input, data_t<type_o> *output) {
         DECLARE_COMMON_PARAMS();
 
-        const bool w_groups = fmt_i == gOihw16o;
+        constexpr int is_1d = format_traits<fmt_o>::ndims_sp == 1;
+        constexpr int is_3d = format_traits<fmt_o>::ndims_sp == 3;
+        constexpr int blksize = format_traits<fmt_o>::blk_size;
 
+        const auto &flat_d = order_keep ? input_d : output_d;
         const auto &dims = input_d.dims();
-        const int blksize = 16;
+        const auto &pdims = order_keep
+            ? output_d.blocking_desc().padding_dims
+            : input_d.blocking_desc().padding_dims;
+
+        const int C = dims[1];
+        const int D = is_3d ? dims[2] : 1;
+        const int H = is_1d ? 1 : dims[2 + is_3d];
+        const int W = dims[3 + is_3d - is_1d];
 
-        const int _G = w_groups ? dims[0] : 1;
-
-        parallel_nd(_G, dims[w_groups + 0] / blksize, dims[w_groups + 1],
-            dims[w_groups + 2], dims[w_groups + 3],
-            [&](int g, int o, int i, int h, int w) {
-            auto i_ptr = &input[input_d.blk_off<!w_groups>(g,
-                    o, i, h, w)];
-            auto o_ptr = &output[output_d.blk_off<!w_groups>(g,
-                    o, i, h, w)];
-            for (int oc = 0; oc < blksize; ++oc) {
-                o_ptr[oc] = (alpha == 1.0 && beta == 0.0)
-                    ? data_t<type_o>(i_ptr[oc])
-                    : data_t<type_o>(alpha * i_ptr[oc]
-                        + (beta ? beta * o_ptr[oc] : 0));
+        auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o,
+            const int c_block) {
+            if (alpha == 1.0 && beta == 0.0) {
+                for (int w = 0; w < W; ++w)
+                for (int c = 0; c < c_block; ++c) {
+                    const ptrdiff_t flat_off = 0
+                        + c * flat_d.blocking_desc().strides[0][1]
+                        + w * flat_d.blocking_desc().strides[0][3 + is_3d
+                            - is_1d];
+                    if (order_keep) {
+                        o[w * blksize + c] = _qz_a1b0<type_i, type_o>()(
+                                i[flat_off], rmode);
+                    } else {
+                        o[flat_off] = _qz_a1b0<type_i, type_o>()(
+                                i[w * blksize + c], rmode);
+                    }
+                }
+            } else {
+                for (int w = 0; w < W; ++w)
+                for (int c = 0; c < c_block; ++c) {
+                    const ptrdiff_t flat_off = 0
+                        + c * flat_d.blocking_desc().strides[0][1]
+                        + w * flat_d.blocking_desc().strides[0][3 + is_3d
+                            - is_1d];
+                    if (order_keep) {
+                        o[w * blksize + c] = _qz<type_i, type_o>()(i[flat_off],
+                                o[w * blksize + c], alpha, beta, rmode);
+                    } else {
+                        o[flat_off] = _qz<type_i, type_o>()(i[w * blksize + c],
+                                o[flat_off], alpha, beta, rmode);
+                    }
+                }
             }
+        };
+
+        constexpr int i_c_mult = order_keep ? blksize : 1;
+        constexpr int o_c_mult = order_keep ? 1 : blksize;
+
+#       define data_blk_off(md, n, c, d, h) \
+        ( is_1d ? (md).blk_off(n, c) \
+          : is_3d ? (md).blk_off(n, c, d, h) : (md).blk_off(n, c, h))
+
+        parallel_nd(dims[0], pdims[1] / blksize, D, H,
+            [&](int n, int nb_c, int d, int h) {
+            auto i = &input[data_blk_off(input_d, n, i_c_mult * nb_c, d, h)];
+            auto o = &output[data_blk_off(output_d, n, o_c_mult * nb_c, d, h)];
+            const int c_block = nstl::min(blksize, C - nb_c * blksize);
+            ker(i, o, c_block);
         });
 
+#       undef data_blk_off
+
         return success;
     }
 };
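The `data_blk_off` macro above only dispatches on dimensionality; for the 2D nChw8c case, the blocked offset it lands on is equivalent, for a dense padded tensor, to the following sketch (`off_nChw8c` and `C_pad`, the channel count rounded up to the block size, are our names):

```
#include <cstddef>

// Dense nChw8c offset: channels are split into blocks of 8 and the in-block
// index c % 8 is the innermost, unit-stride dimension. Illustrative sketch.
inline size_t off_nChw8c(int n, int c, int h, int w,
        int C_pad, int H, int W) {
    const size_t nb = c / 8, cb = c % 8;
    return (((size_t(n) * (C_pad / 8) + nb) * H + h) * W + w) * 8 + cb;
}
```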
@@ -1561,35 +764,99 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
     typename utils::enable_if<
-        (fmt_i == Oidhw16o && fmt_o == Odhwi16o)
-        || (fmt_i == gOidhw16o && fmt_o == gOdhwi16o)
+          (fmt_i == goihw && fmt_o == gOhIw8o4i_s8s8)
+       || (fmt_i == oihw && fmt_o == OhIw8o4i_s8s8)
     >::type>
 {
-    SIMPLE_IS_APPLICABLE(false);
+    static bool is_applicable(const memory_desc_wrapper &input_d,
+            const memory_desc_wrapper &output_d, const primitive_attr_t *attr)
+    {
+        const size_t D_mask = utils::array_product(input_d.dims(),
+                                math::ilog2q(attr->output_scales_.mask_ + 1));
+        const int oc = (input_d.dims()[(fmt_i == goihw) + 0]);
+        const int g = (fmt_i == goihw) ? (input_d.dims()[0]) : 1;
+
+        return input_d.format() == fmt_i
+            && output_d.format() == fmt_o
+            && (input_d.data_type() == f32 || input_d.data_type() == s8)
+            && output_d.data_type() == s8
+            && (D_mask == 1 || D_mask == (size_t)g * oc);
+    }
 
     static status_t execute(const cpu_reorder_pd_t *pd,
         const data_t<type_i> *input, data_t<type_o> *output) {
         DECLARE_COMMON_PARAMS();
 
-        const bool w_groups = fmt_i == gOidhw16o;
+        static constexpr bool w_groups
+            = format_traits<fmt_o>::data_kind == dk::gwei;
+        constexpr int blksize_o = 8;
+        constexpr int blksize_i = 4;
 
+        const auto &flat_d = order_keep ? input_d : output_d;
         const auto &dims = input_d.dims();
-        const int blksize = 16;
+        const auto &pdims = order_keep
+            ? output_d.blocking_desc().padding_dims
+            : input_d.blocking_desc().padding_dims;
 
-        const int _G = w_groups ? dims[0] : 1;
+        const int G = w_groups ? dims[0] : 1;
+        const int OC = dims[w_groups + 0];
+        const int NB_OC = pdims[w_groups + 0] / blksize_o;
+        const int IC = dims[w_groups + 1];
+        const int NB_IC = pdims[w_groups + 1] / blksize_i;
+        const int H = dims[w_groups + 2];
+        const int W = dims[w_groups + 3];
 
-        parallel_nd(_G, dims[w_groups + 0] / blksize, dims[w_groups + 1],
-            dims[w_groups + 2], dims[w_groups + 3], dims[w_groups + 4],
-            [&](int g, int o, int i, int d, int h, int w) {
-            auto i_ptr = &input[input_d.blk_off<!w_groups>(g,
-                    o, i, d, h, w)];
-            auto o_ptr = &output[output_d.blk_off<!w_groups>(g,
-                    o, i, d, h, w)];
-            for (int oc = 0; oc < blksize; ++oc) {
-                o_ptr[oc] = (alpha == 1.0 && beta == 0.0)
-                    ? data_t<type_o>(i_ptr[oc])
-                    : data_t<type_o>(alpha * i_ptr[oc]
-                        + (beta ? beta * o_ptr[oc] : 0));
+        const float *scales = pd->attr()->output_scales_.scales_;
+        const size_t D_mask = utils::array_product(input_d.dims(),
+                                                   math::ilog2q(pd->attr()->output_scales_.mask_ + 1));
+
+        float adj_scale = (mayiuse(avx512_core_vnni)) ? 1.0 : (1.0 / 2.0);
+
+        auto ker = [&](const data_t<type_i> *inp, data_t<type_o> *out,
+            int32_t *c, const float *s, const int oc_block, const int ic_block) {
+#            define blk_off OI_blk_off<format_traits<fmt_o>::blk_fmt>
+
+            for (int ic = 0; ic < ic_block; ++ic) {
+                for (int oc = 0; oc < oc_block; ++oc) {
+                    const auto _g_oihw_off
+                        = oc * flat_d.blocking_desc().strides[0][w_groups + 0]
+                        + ic * flat_d.blocking_desc().strides[0][w_groups + 1];
+
+                    if (order_keep) {
+                        out[blk_off(oc, ic)]
+                            = qz_b0<data_t<type_i>, data_t<type_o>>()(
+                                    inp[_g_oihw_off], s[oc] * adj_scale, rmode);
+                        c[oc] -= (128 * (int32_t)(out[blk_off(oc, ic)]));
+                    } else {
+                        out[_g_oihw_off]
+                            = qz_b0<data_t<type_i>, data_t<type_o>>()(
+                                    inp[blk_off(oc, ic)], s[oc] * adj_scale, rmode);
+                        c[oc] -= (128 * (int32_t)(out[_g_oihw_off]));
+                    }
+                }
+            }
+
+#           undef blk_off
+        };
+
+        constexpr int i_mult_o = blksize_o;
+        constexpr int i_mult_i = blksize_i;
+
+        size_t offset = G * pdims[w_groups + 0] * pdims[w_groups + 1] * H * W;
+        int32_t *cp = reinterpret_cast<int32_t *>(output + offset);
+        parallel_nd(G * NB_OC * blksize_o, [&](int i) {
+            cp[i] = 0;
+        });
+
+        parallel_nd(G, NB_OC, [&](int g, int O) {
+            for (int I = 0; I < NB_IC; I++) {
+                for (int h = 0; h < H; h++) {
+                    for (int w = 0; w < W; w++) {
+                        auto i = &input[input_d.blk_off<!w_groups>(g, i_mult_o * O, i_mult_i * I, h, w)];
+                        auto o = &output[output_d.blk_off<!w_groups>(g, O, I, h, w)];
+                        const int oc_block = nstl::min(blksize_o, OC - O * blksize_o);
+                        const int ic_block = nstl::min(blksize_i, IC - I * blksize_i);
+
+                        int _offset = (g * NB_OC + O) * blksize_o;
+                        ker(i, o, order_keep ? &cp[_offset] : nullptr,
+                            &scales[(D_mask == 1) ? 0 : _offset],
+                            oc_block, ic_block);
+                    }
+                }
             }
         });
 
@@ -1599,26 +866,20 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
 
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<
-        fmt_i == any
-        && (fmt_o == nChw8c || fmt_o == nChw16c || fmt_o == nCdhw16c)
-    >::type>
+typename utils::enable_if<fmt_i == any && (fmt_o == OhIw8o4i || fmt_o == gOhIw8o4i)>::type>
 {
-    static bool is_applicable(const memory_desc_wrapper &input_d,
-        const memory_desc_wrapper &output_d, const primitive_attr_t *attr) {
-        return order_keep
-            ? output_d.format() == fmt_o && utils::one_of(input_d.format(),
-                    nchw, nhwc, chwn, ncdhw, ndhwc)
-            : input_d.format() == fmt_o && utils::one_of(output_d.format(),
-                    nchw, nhwc, chwn, ncdhw, ndhwc);
-    }
+    PLAIN_TO_BLOCKED_IS_APPLICABLE();
 
     static status_t execute(const cpu_reorder_pd_t *pd,
         const data_t<type_i> *input, data_t<type_o> *output) {
         DECLARE_COMMON_PARAMS();
 
-        constexpr int is_3d = fmt_o == nCdhw16c;
-        constexpr int blksize = fmt_o == nChw8c ? 8 : 16;
+        static constexpr bool w_groups
+            = format_traits<fmt_o>::data_kind == dk::gwei;
+        constexpr int is_1d = format_traits<fmt_o>::ndims_sp == 1;
+        constexpr int is_3d = format_traits<fmt_o>::ndims_sp == 3;
+        constexpr int blksize_o = 8; // format_traits<fmt_o>::blk_size
+        constexpr int blksize_i = 4;
 
         const auto &flat_d = order_keep ? input_d : output_d;
         const auto &dims = input_d.dims();
@@ -1626,57 +887,67 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
             ? output_d.blocking_desc().padding_dims
             : input_d.blocking_desc().padding_dims;
 
-        const int C = dims[1];
-        const int D = is_3d ? dims[2] : 1;
-        const int H = dims[2 + is_3d];
-        const int W = dims[3 + is_3d];
+        const int G = w_groups ? dims[0] : 1;
+        const int OC = dims[w_groups + 0];
+        const int NB_OC = pdims[w_groups + 0] / blksize_o;
+        const int IC = dims[w_groups + 1];
+        const int NB_IC = pdims[w_groups + 1] / blksize_i;
+        const int D = is_3d ? dims[w_groups + 2] : 1;
+        const int H = is_1d ? 1 : dims[w_groups + 2 + is_3d];
+        const int W = dims[w_groups + 3 + is_3d - is_1d];
 
         auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o,
-            const int c_block) {
+            const int oc_block, const int ic_block) {
+#           define blk_off OI_blk_off<format_traits<fmt_o>::blk_fmt>
+
             if (alpha == 1.0 && beta == 0.0) {
-                for (int w = 0; w < W; ++w)
-                for (int c = 0; c < c_block; ++c) {
+                for (int oc = 0; oc < oc_block; ++oc)
+                for (int ic = 0; ic < ic_block; ++ic) {
                     const ptrdiff_t flat_off = 0
-                        + c * flat_d.blocking_desc().strides[0][1]
-                        + w * flat_d.blocking_desc().strides[0][3 + is_3d];
+                        + oc * flat_d.blocking_desc().strides[0][w_groups + 0]
+                        + ic * flat_d.blocking_desc().strides[0][w_groups + 1];
                     if (order_keep) {
-                        o[w * blksize + c] = data_t<type_o>(i[flat_off]);
+                        o[blk_off(oc, ic)] = _qz_a1b0<type_i, type_o>()(
+                                i[flat_off], rmode);
                     } else {
-                        o[flat_off] = data_t<type_o>(i[w * blksize + c]);
+                        o[flat_off] = _qz_a1b0<type_i, type_o>()(
+                                i[blk_off(oc, ic)], rmode);
                     }
                 }
             } else {
-                for (int w = 0; w < W; ++w)
-                for (int c = 0; c < c_block; ++c) {
+                for (int oc = 0; oc < oc_block; ++oc)
+                for (int ic = 0; ic < ic_block; ++ic) {
                     const ptrdiff_t flat_off = 0
-                        + c * flat_d.blocking_desc().strides[0][1]
-                        + w * flat_d.blocking_desc().strides[0][3 + is_3d];
+                        + oc * flat_d.blocking_desc().strides[0][w_groups + 0]
+                        + ic * flat_d.blocking_desc().strides[0][w_groups + 1];
                     if (order_keep) {
-                        o[w * blksize + c] = data_t<type_o>(
-                            alpha * i[flat_off]
-                            + (beta ? beta * o[w * blksize + c] : 0));
+                        o[blk_off(oc, ic)] = _qz<type_i, type_o>()(i[flat_off],
+                                o[blk_off(oc, ic)], alpha, beta, rmode);
                     } else {
-                        o[flat_off] = data_t<type_o>(
-                            alpha * i[w * blksize + c]
-                            + (beta ? beta * o[flat_off] : 0));
+                        o[flat_off] = _qz<type_i, type_o>()(i[blk_off(oc, ic)],
+                                o[flat_off], alpha, beta, rmode);
                     }
                 }
             }
+
+#           undef blk_off
         };
 
-        constexpr int i_c_mult = order_keep ? blksize : 1;
-        constexpr int o_c_mult = order_keep ? 1 : blksize;
 
-        parallel_nd(dims[0], pdims[1] / blksize, D, H,
-            [&](int n, int nb_c, int d, int h) {
-            auto i = &input[is_3d
-                ? input_d.blk_off(n, i_c_mult * nb_c, d, h)
-                : input_d.blk_off(n, i_c_mult * nb_c, h)];
-            auto o = &output[is_3d
-                ? output_d.blk_off(n, o_c_mult * nb_c, d, h)
-                : output_d.blk_off(n, o_c_mult * nb_c, h)];
-            const int c_block = nstl::min(blksize, C - nb_c * blksize);
-            ker(i, o, c_block);
+        constexpr int i_mult_o = blksize_o;
+        constexpr int i_mult_i = blksize_i;
+
+        parallel_nd(G, NB_OC, NB_IC, D, H, W,
+            [&](int g, int nb_oc, int nb_ic, int d, int h, int w) {
+            int i_off = wei_blk_off_like_gwei3D<fmt_o>(input_d,
+                    g, i_mult_o * nb_oc, i_mult_i * nb_ic, d, h, w);
+            int o_off = wei_blk_off_like_gwei3D<fmt_o>(output_d,
+                    g, nb_oc, nb_ic, d, h, w);
+            auto i = &input[i_off];
+            auto o = &output[o_off];
+            const int oc_block = nstl::min(blksize_o, OC - nb_oc * blksize_o);
+            const int ic_block = nstl::min(blksize_i, IC - nb_ic * blksize_i);
+            ker(i, o, oc_block, ic_block);
         });
 
         return success;
@@ -1685,36 +956,20 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
 
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<fmt_i == any &&
-    (false
-     || fmt_o == OIhw16i16o || fmt_o == gOIhw16i16o
-     || fmt_o == OIdhw16i16o || fmt_o == gOIdhw16i16o
-     || fmt_o == OIhw16o16i || fmt_o == gOIhw16o16i
-     || fmt_o == OIdhw16o16i || fmt_o == gOIdhw16o16i
-     || fmt_o == IOhw16o16i || fmt_o == gIOhw16o16i
-     )>::type>
+typename utils::enable_if<fmt_i == any
+&& block_format_traits<format_traits<fmt_o>::blk_fmt>::blk_ndims == 2 && fmt_o != OhIw8o4i && fmt_o != gOhIw8o4i>::type>
 {
-    static bool is_applicable(const memory_desc_wrapper &input_d,
-        const memory_desc_wrapper &output_d, const primitive_attr_t *attr) {
-        return order_keep
-            ? output_d.format() == fmt_o && utils::one_of(input_d.format(),
-                    oihw, ihwo, hwio, goihw, hwigo, dhwio, oidhw, goidhw)
-            : input_d.format() == fmt_o &&  utils::one_of(output_d.format(),
-                    oihw, ihwo, hwio, goihw, hwigo, dhwio, oidhw, goidhw);
-    }
+    PLAIN_TO_BLOCKED_IS_APPLICABLE();
 
     static status_t execute(const cpu_reorder_pd_t *pd,
         const data_t<type_i> *input, data_t<type_o> *output) {
         DECLARE_COMMON_PARAMS();
 
-        static constexpr bool w_groups = false
-            || fmt_o == gOIhw16i16o || fmt_o == gOIdhw16i16o
-            || fmt_o == gOIhw16o16i || fmt_o == gOIdhw16o16i
-            || fmt_o == gIOhw16o16i;
-
-        constexpr int is_3d = false
-            || fmt_o == OIdhw16i16o || fmt_o == gOIdhw16i16o
-            || fmt_o == OIdhw16o16i || fmt_o == gOIdhw16o16i;
+        static constexpr bool w_groups
+            = format_traits<fmt_o>::data_kind == dk::gwei;
+        constexpr int is_1d = format_traits<fmt_o>::ndims_sp == 1;
+        constexpr int is_3d = format_traits<fmt_o>::ndims_sp == 3;
+        constexpr int blksize = format_traits<fmt_o>::blk_size;
 
         const auto &flat_d = order_keep ? input_d : output_d;
         const auto &dims = input_d.dims();
@@ -1722,26 +977,19 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
             ? output_d.blocking_desc().padding_dims
             : input_d.blocking_desc().padding_dims;
 
-        constexpr int blksize = 16;
         const int G = w_groups ? dims[0] : 1;
         const int OC = dims[w_groups + 0];
         const int NB_OC = pdims[w_groups + 0] / blksize;
         const int IC = dims[w_groups + 1];
         const int NB_IC = pdims[w_groups + 1] / blksize;
         const int D = is_3d ? dims[w_groups + 2] : 1;
-        const int H = dims[w_groups + 2 + is_3d];
-        const int W = dims[w_groups + 3 + is_3d];
-
-        auto index = [&](const int ic, const int oc) {
-            if (fmt_o == OIhw16i16o || fmt_o == gOIhw16i16o ||
-                    fmt_o == OIdhw16i16o || fmt_o == gOIdhw16i16o)
-                return ic * blksize + oc;
-            else
-                return oc * blksize + ic;
-        };
+        const int H = is_1d ? 1 : dims[w_groups + 2 + is_3d];
+        const int W = dims[w_groups + 3 + is_3d - is_1d];
 
         auto ker = [&](const data_t<type_i> *i, data_t<type_o> *o,
             const int oc_block, const int ic_block) {
+#           define blk_off OI_blk_off<format_traits<fmt_o>::blk_fmt>
+
             if (alpha == 1.0 && beta == 0.0) {
                 for (int oc = 0; oc < oc_block; ++oc)
                 for (int ic = 0; ic < ic_block; ++ic) {
@@ -1749,9 +997,11 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
                         + oc * flat_d.blocking_desc().strides[0][w_groups + 0]
                         + ic * flat_d.blocking_desc().strides[0][w_groups + 1];
                     if (order_keep) {
-                        o[index(ic, oc)] = data_t<type_o>(i[flat_off]);
+                        o[blk_off(oc, ic)] = _qz_a1b0<type_i, type_o>()(
+                                i[flat_off], rmode);
                     } else {
-                        o[flat_off] = data_t<type_o>(i[index(ic, oc)]);
+                        o[flat_off] = _qz_a1b0<type_i, type_o>()(
+                                i[blk_off(oc, ic)], rmode);
                     }
                 }
             } else {
@@ -1761,27 +1011,28 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
                         + oc * flat_d.blocking_desc().strides[0][w_groups + 0]
                         + ic * flat_d.blocking_desc().strides[0][w_groups + 1];
                     if (order_keep) {
-                        o[index(ic, oc)] = data_t<type_o>(alpha * i[flat_off]
-                                + (beta ? beta * o[index(ic, oc)] : 0));
+                        o[blk_off(oc, ic)] = _qz<type_i, type_o>()(i[flat_off],
+                                o[blk_off(oc, ic)], alpha, beta, rmode);
                     } else {
-                        o[flat_off] = data_t<type_o>(alpha * i[index(ic, oc)]
-                                + (beta ? beta * o[flat_off] : 0));
+                        o[flat_off] = _qz<type_i, type_o>()(i[blk_off(oc, ic)],
+                                o[flat_off], alpha, beta, rmode);
                     }
                 }
             }
+
+#           undef blk_off
         };
 
+
         constexpr int i_mult = order_keep ? blksize : 1;
         constexpr int o_mult = order_keep ? 1 : blksize;
 
         parallel_nd(G, NB_OC, NB_IC, D, H, W,
             [&](int g, int nb_oc, int nb_ic, int d, int h, int w) {
-            auto i = &input[is_3d
-                ? input_d.blk_off<!w_groups>(g, i_mult * nb_oc, i_mult * nb_ic, d, h, w)
-                : input_d.blk_off<!w_groups>(g, i_mult * nb_oc, i_mult * nb_ic, h, w)];
-            auto o = &output[is_3d
-                ? output_d.blk_off<!w_groups>(g, o_mult * nb_oc, o_mult * nb_ic, d, h, w)
-                : output_d.blk_off<!w_groups>(g, o_mult * nb_oc, o_mult * nb_ic, h, w)];
+            auto i = &input[wei_blk_off_like_gwei3D<fmt_o>(input_d,
+                    g, i_mult * nb_oc, i_mult * nb_ic, d, h, w)];
+            auto o = &output[wei_blk_off_like_gwei3D<fmt_o>(output_d,
+                    g, o_mult * nb_oc, o_mult * nb_ic, d, h, w)];
             const int oc_block = nstl::min(blksize, OC - nb_oc * blksize);
             const int ic_block = nstl::min(blksize, IC - nb_ic * blksize);
             ker(i, o, oc_block, ic_block);
@@ -1793,34 +1044,21 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
 
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
-    typename utils::enable_if<fmt_i == any &&
-    (false
-     || fmt_o == Oihw16o || fmt_o == Ohwi16o || fmt_o == Oidhw16o
-     || fmt_o == Odhwi16o
-     || fmt_o == gOihw16o || fmt_o == gOhwi16o || fmt_o == gOidhw16o
-     || fmt_o == gOdhwi16o
-     )>::type>
+typename utils::enable_if<fmt_i == any && (false
+    || format_traits<fmt_o>::blk_fmt == bf::_8o
+    || format_traits<fmt_o>::blk_fmt == bf::_16o)>::type>
 {
-    static bool is_applicable(const memory_desc_wrapper &input_d,
-        const memory_desc_wrapper &output_d, const primitive_attr_t *attr) {
-        return order_keep
-            ? output_d.format() == fmt_o && utils::one_of(input_d.format(),
-                    oihw, ihwo, hwio, goihw, hwigo, dhwio, oidhw, goidhw)
-            : input_d.format() == fmt_o && utils::one_of(output_d.format(),
-                    oihw, ihwo, hwio, goihw, hwigo, dhwio, oidhw, goidhw);
-    }
+    PLAIN_TO_BLOCKED_IS_APPLICABLE();
 
     static status_t execute(const cpu_reorder_pd_t *pd,
         const data_t<type_i> *input, data_t<type_o> *output) {
         DECLARE_COMMON_PARAMS();
 
-        constexpr int blksize = 16;
-
-        static constexpr bool w_groups = fmt_o == gOihw16o || fmt_o == gOhwi16o
-            || fmt_o == gOidhw16o || fmt_o == gOdhwi16o;
-        constexpr int is_3d = false
-            || fmt_o == gOidhw16o || fmt_o == Oidhw16o
-            || fmt_o == gOdhwi16o || fmt_o == Odhwi16o;
+        static constexpr bool w_groups
+            = format_traits<fmt_o>::data_kind == dk::gwei;
+        constexpr int is_1d = format_traits<fmt_o>::ndims_sp == 1;
+        constexpr int is_3d = format_traits<fmt_o>::ndims_sp == 3;
+        constexpr int blksize = format_traits<fmt_o>::blk_size;
 
         const auto &flat_d = order_keep ? input_d : output_d;
         const auto &dims = input_d.dims();
@@ -1832,8 +1070,8 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
         const int OC = dims[w_groups + 0];
         const int IC = dims[w_groups + 1];
         const int D = is_3d ? dims[w_groups + 2] : 1;
-        const int H = dims[w_groups + 2 + is_3d];
-        const int W = dims[w_groups + 3 + is_3d];
+        const int H = is_1d ? 1 : dims[w_groups + 2 + is_3d];
+        const int W = dims[w_groups + 3 + is_3d - is_1d];
 
         constexpr int i_mult = order_keep ? blksize : 1;
         constexpr int o_mult = order_keep ? 1 : blksize;
@@ -1841,31 +1079,30 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
 
         parallel_nd(G, pdims[w_groups + 0] / blksize, IC, D, H, W,
             [&](int g, int nb_oc, int ic, int d, int h, int w) {
-            auto inp = &input[is_3d
-                ? input_d.blk_off<!w_groups>(g, i_mult * nb_oc, ic, d, h, w)
-                : input_d.blk_off<!w_groups>(g, i_mult * nb_oc, ic, h, w)];
-            auto out = &output[is_3d
-                ? output_d.blk_off<!w_groups>(g, o_mult * nb_oc, ic, d, h, w)
-                : output_d.blk_off<!w_groups>(g, o_mult * nb_oc, ic, h, w)];
+            auto i = &input[wei_blk_off_like_gwei3D<fmt_o>(input_d,
+                    g, i_mult * nb_oc, ic, d, h, w)];
+            auto o = &output[wei_blk_off_like_gwei3D<fmt_o>(output_d,
+                    g, o_mult * nb_oc, ic, d, h, w)];
             const int oc_block = nstl::min(blksize, OC - nb_oc * blksize);
+
             if (alpha == 1.0 && beta == 0.0) {
                 for (int oc = 0; oc < oc_block; ++oc) {
                     const auto off = oc * strd_oc;
                     if (order_keep) {
-                        out[oc] = data_t<type_o>(inp[off]);
+                        o[oc] = _qz_a1b0<type_i, type_o>()(i[off], rmode);
                     } else {
-                        out[off] = data_t<type_o>(inp[oc]);
+                        o[off] = _qz_a1b0<type_i, type_o>()(i[oc], rmode);
                     }
                 }
             } else {
                 for (int oc = 0; oc < oc_block; ++oc) {
                     const auto off = oc * strd_oc;
                     if (order_keep) {
-                        out[oc] = data_t<type_o>(alpha * inp[off]
-                                + (beta ? beta * out[oc] : 0));
+                        o[oc] = _qz<type_i, type_o>()(i[off], o[oc], alpha,
+                                beta, rmode);
                     } else {
-                        out[off] = data_t<type_o>(alpha * inp[oc]
-                                + (beta ? beta * out[off] : 0));
+                        o[off] = _qz<type_i, type_o>()(i[oc], o[off], alpha,
+                                beta, rmode);
                     }
                 }
             }
@@ -1875,6 +1112,8 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
     }
 };
 
+/* generic and direct-copy reorders */
+
 template <SIMPLE_REORDER_TEMPL_DECL>
 struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
     typename utils::enable_if<
@@ -1909,7 +1148,6 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
             balance211(num_blocks, nthr, ithr, start, end);
             start = start * block_size;
             end = end * block_size;
-            round_mode_t rmode = pd->attr()->round_mode_;
 
             if (alpha == 1.0 && beta == 0.0) {
                 PRAGMA_OMP_SIMD()
@@ -2007,12 +1245,12 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
                 nd_iterator_init(start, n, N, dim1_s, nelems_no_d0);
                 while(start < end) {
                     size_t work_rem = end - start;
-                    size_t dim1_e =
-                        dim1_s + work_rem > nelems_no_d0 ? nelems_no_d0
-                        : dim1_s + work_rem;
+                    size_t dim1_e = dim1_s + work_rem > nelems_no_d0
+                        ? nelems_no_d0 : dim1_s + work_rem;
                     PRAGMA_OMP_SIMD()
-                    for (size_t e = dim1_s; e < dim1_e; ++e){
-                        output[os * n + e] = data_t<type_o>(input[is * n + e]);
+                    for (size_t e = dim1_s; e < dim1_e; ++e) {
+                        output[os * n + e] = _qz_a1b0<type_i, type_o>()(
+                                input[is * n + e], rmode);
                     }
                     nd_iterator_jump(start, end, n, N, dim1_s, nelems_no_d0);
                 }
@@ -2030,9 +1268,9 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
                         : dim1_s + work_rem;
                     PRAGMA_OMP_SIMD()
                     for (size_t e = dim1_s; e < dim1_e; ++e){
-                        output[os * n + e] = data_t<type_o>(
-                                alpha * input[is * n + e]
-                                + beta * output[os * n + e]);
+                        output[os * n + e] = _qz<type_i, type_o>()(
+                                input[is * n + e], output[os * n + e], alpha,
+                                beta, rmode);
                     }
                     nd_iterator_jump(start, end, n, N, dim1_s, nelems_no_d0);
                 }
@@ -2081,6 +1319,8 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
         return true
             && input_d.is_blocking_desc()
             && output_d.is_blocking_desc()
+            && !output_d.is_additional_buffer()
+            && !input_d.is_additional_buffer()
             && smask == 0;
     }
 
@@ -2109,19 +1349,10 @@ struct simple_reorder_impl<SIMPLE_REORDER_TEMPL_CALL,
             const float scale = scales[dm];
 
             const size_t e = (ds * D_mask + dm) * D_rest + dr;
-            float i = (float)input[input_d.off_l(e)];
+            const auto &i = input[input_d.off_l(e)];
             auto &o = output[output_d.off_l(e)];
 
-            i = scale * i + (beta ? beta * (float)o : 0);
-            if (type_o != f32) {
-                switch (pd->attr()->round_mode_) {
-                case round_mode::down: i = floorf(i); break;
-                case round_mode::nearest: i = nearbyintf(i); break;
-                }
-                o = saturate<data_t<type_o>>(i);
-            } else {
-                o = (data_t<type_o>)i;
-            }
+            o = _qz<type_i, type_o>()(i, o, scale, beta, rmode);
         });
 
         return success;
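The `_qz` functor that replaces the deleted scalar path implements, per element, exactly the sequence those removed lines spelled out: scale, optional beta-accumulate, round according to `rmode`, then saturate to the destination type (skipping round/saturate when the destination is f32). Reconstructed as a standalone sketch (`quantize_one` is our name):

```
#include <algorithm>
#include <cmath>
#include <limits>

// One-element quantization, reconstructed from the scalar code removed in
// this hunk. The real code skips round/saturate when out_t is float.
template <typename out_t>
out_t quantize_one(float in, float out_prev, float scale, float beta,
        bool round_nearest /* true: round_mode::nearest, false: ::down */) {
    float v = scale * in + (beta != 0.f ? beta * out_prev : 0.f);
    v = round_nearest ? std::nearbyint(v) : std::floor(v);
    const float lo = float(std::numeric_limits<out_t>::lowest());
    const float hi = float(std::numeric_limits<out_t>::max());
    return out_t(std::min(std::max(v, lo), hi));
}
```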
index 00f74a6..78d005e 100644 (file)
@@ -107,6 +107,8 @@ private:
         oc2_block_ = output_d.wino_desc().oc2_block;
         assert(nb_ic_ % ic2_block_ == 0 && nb_oc_ % oc2_block_ == 0);
 
+        adj_scale_ = output_d.wino_desc().adj_scale;
+
         size_wino_wei_ = w_alpha_ * w_alpha_ * oc_ * ic_;
         size_wspace_ = r_ * w_alpha_ * oc_block_;
 
@@ -189,7 +191,7 @@ private:
                         : scales[ob * oc_block_ + ioc];
                     _out[(i * w_alpha_ + j) * Z + ioc]
                             = qz_b0<in_data_t, out_data_t>()(
-                                    (in_data_t)t, scale, rmode);
+                                    (in_data_t)t, scale * adj_scale_, rmode);
                 } else {
                     _out[(i * w_alpha_ + j) * Z + ioc] = (out_data_t)t;
                 }
@@ -336,6 +338,7 @@ private:
     int r_, w_alpha_;
     int ic_, oc_, or_ic_, or_oc_, kh_, kw_;
     int oc_block_, ic_block_, oc2_block_, ic2_block_;
+    float adj_scale_;
     int nb_oc_, nb_ic_;
     mkldnn_wino_memory_format_t wino_format_;
     in_data_t *__restrict wspace_;
index 3ed1802..6e9caa6 100644 (file)
@@ -23,14 +23,17 @@ if(POLICY CMP0065)
 endif()
 
 # propagate TEST specific flags
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_TEST_CCXX_FLAGS}")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_TEST_CCXX_FLAGS}")
+append(CMAKE_C_FLAGS "${CMAKE_TEST_CCXX_FLAGS}")
+append(CMAKE_CXX_FLAGS "${CMAKE_TEST_CCXX_FLAGS}")
 
 set(CMAKE_TEST_CCXX_NOWARN_FLAGS)
 
 # propagate no warning flags
-set(CMAKE_TEST_CCXX_NOWARN_FLAGS
-    "${CMAKE_TEST_CCXX_NOWARN_FLAGS} ${CMAKE_CCXX_NOWARN_FLAGS}")
+append(CMAKE_TEST_CCXX_NOWARN_FLAGS "${CMAKE_CCXX_NOWARN_FLAGS}")
+
+# propagate sanitizer flags
+append(CMAKE_C_FLAGS "${CMAKE_CCXX_SANITIZER_FLAGS}")
+append(CMAKE_CXX_FLAGS "${CMAKE_CCXX_SANITIZER_FLAGS}")
 
 # allow tests to include internal header files with, e.g.
 # include "src/common/mkldnn_thread.hpp"
@@ -39,28 +42,24 @@ include_directories(${CMAKE_SOURCE_DIR})
 if(UNIX OR MINGW)
     # workaround for Intel Compiler 16.0 that doesn't suppress warning on
     # deprecation with "-Wno-deprecated" compiler flag
-    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel" AND
-            CMAKE_CXX_COMPILER_VERSION VERSION_LESS "17.0")
-        set(CMAKE_TEST_CCXX_NOWARN_FLAGS
-            "${CMAKE_TEST_CCXX_NOWARN_FLAGS} -diag-disable:1478")
+    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "17.0")
+        append(CMAKE_TEST_CCXX_NOWARN_FLAGS "-diag-disable:1478")
     else()
-        set(CMAKE_TEST_CCXX_NOWARN_FLAGS
-            "${CMAKE_TEST_CCXX_NOWARN_FLAGS} -Wno-deprecated-declarations")
+        append(CMAKE_TEST_CCXX_NOWARN_FLAGS "-Wno-deprecated-declarations")
     endif()
 elseif(WIN32 AND NOT MINGW)
     if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
-        # 1478: deprecated functions
-        set(CMAKE_TEST_CCXX_NOWARN_FLAGS
-            "${CMAKE_TEST_CCXX_NOWARN_FLAGS} /Qdiag-disable:1478")
+        # 1478, 1786: deprecated functions
+        append(CMAKE_TEST_CCXX_NOWARN_FLAGS "/Qdiag-disable:1478 /Qdiag-disable:1786")
     else()
         # c4244: conversion with possible loss of data
         # c4996: unsafe / deprecated functions
-        set(CMAKE_TEST_CCXX_NOWARN_FLAGS
-            "${CMAKE_TEST_CCXX_NOWARN_FLAGS} /wd4996 /wd4244")
+        append(CMAKE_TEST_CCXX_NOWARN_FLAGS "/wd4996 /wd4244")
     endif()
 endif()
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_TEST_CCXX_NOWARN_FLAGS}")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_TEST_CCXX_NOWARN_FLAGS}")
+
+append(CMAKE_C_FLAGS "${CMAKE_TEST_CCXX_NOWARN_FLAGS}")
+append(CMAKE_CXX_FLAGS "${CMAKE_TEST_CCXX_NOWARN_FLAGS}")
 
 register_exe(api-c api.c "test")
 
index 5fb5b91..aaaf7f8 100644 (file)
@@ -25,6 +25,7 @@ include_directories(
     ${CMAKE_CURRENT_SOURCE_DIR}
     ${CMAKE_CURRENT_SOURCE_DIR}/conv
     ${CMAKE_CURRENT_SOURCE_DIR}/ip
+    ${CMAKE_CURRENT_SOURCE_DIR}/shuffle
     ${CMAKE_CURRENT_SOURCE_DIR}/reorder
     )
 
@@ -33,12 +34,8 @@ if(BENCHDNN_USE_RDPMC)
 endif()
 
 if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-    if(WIN32)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qprec-div")
-    elseif(UNIX)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -prec-div")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fp-model precise")
-    endif()
+    append_if(WIN32 CMAKE_CXX_FLAGS "-Qprec-div")
+    append_if(UNIX  CMAKE_CXX_FLAGS "-prec-div -fp-model precise")
 endif()
 
 if(UNIX AND NOT APPLE)
@@ -75,3 +72,6 @@ register_benchdnn_test(test_benchdnn_regression
     "benchdnn --conv --batch=inputs/test_conv_regression"
     "benchdnn --bnorm --batch=inputs/bnorm/test_bnorm_regressions"
     )
+register_benchdnn_test(test_benchdnn_regression_large
+    "benchdnn --bnorm --batch=inputs/bnorm/test_bnorm_regressions_large"
+    )
index 6fb03c1..9d5ba2f 100644 (file)
@@ -23,7 +23,7 @@ The usage:
 ```
 where:
 
- - `HARNESS` is either `conv` [default], `ip`, `reorder`, `bnorm`, `rnn` or `self`
+ - `HARNESS` is either `conv` [default], `ip`, `shuffle`, `reorder`, `bnorm`, `rnn` or `self`
 
  - `MODE` -- string that contains flags for benchmark mode. Use `C` or `c` for correctness (used by default), and `P` or `p` for performance
 
@@ -114,9 +114,13 @@ configurations for **benchdnn**:
 | s32     | s16      | s16      | s32      | s32s16s16s32 | optimized for processors with support of 4vnni, backward wrt data only (aka BWD_D)
 | s16     | s32      | s16      | s32      | s16s32s16s32 | optimized for processors with support of 4vnni, backward wrt weights (aka BWD_W, BWD_WB)
 | u8      | s8       | f32      | s32      | u8s8f32s32   | optimized for processors with support of avx512vl, forward pass only (aka FWD_D, FWD_B)
-| u8      | s8       | s32      | s32      | u8s8s32s32   | same notes as for u8s8s32s32
-| u8      | s8       | s8       | s32      | u8s8s8s32    | same notes as for u8s8s32s32
-| u8      | s8       | u8       | s32      | u8s8u8s32    | same notes as for u8s8s32s32
+| u8      | s8       | s32      | s32      | u8s8s32s32   | same notes as for u8s8f32s32
+| u8      | s8       | s8       | s32      | u8s8s8s32    | same notes as for u8s8f32s32
+| u8      | s8       | u8       | s32      | u8s8u8s32    | same notes as for u8s8f32s32
+| s8      | s8       | f32      | s32      | s8s8f32s32   | same notes as for u8s8f32s32
+| s8      | s8       | s32      | s32      | s8s8s32s32   | same notes as for u8s8f32s32
+| s8      | s8       | s8       | s32      | s8s8s8s32    | same notes as for u8s8f32s32
+| s8      | s8       | u8       | s32      | s8s8u8s32    | same notes as for u8s8f32s32
 
 
 ## Performance measurements
index 58fc61c..aa91c06 100644 (file)
@@ -30,6 +30,7 @@
 #include "conv/conv.hpp"
 #include "conv/deconv.hpp"
 #include "ip/ip.hpp"
+#include "shuffle/shuffle.hpp"
 #include "reorder/reorder.hpp"
 #include "bnorm/bnorm.hpp"
 #include "rnn/rnn.hpp"
@@ -51,6 +52,7 @@ int main(int argc, char **argv) {
         else if (!strcmp("--conv", argv[0])) prim = CONV;
         else if (!strcmp("--deconv", argv[0])) prim = DECONV;
         else if (!strcmp("--ip", argv[0])) prim = IP;
+        else if (!strcmp("--shuffle", argv[0])) prim = SHUFFLE;
         else if (!strcmp("--reorder", argv[0])) prim = REORDER;
         else if (!strcmp("--bnorm", argv[0])) prim = BNORM;
         else if (!strcmp("--rnn", argv[0])) prim = RNN;
@@ -78,6 +80,7 @@ int main(int argc, char **argv) {
     case CONV: conv::bench(argc, argv); break;
     case DECONV: deconv::bench(argc, argv); break;
     case IP: ip::bench(argc, argv); break;
+    case SHUFFLE: shuffle::bench(argc, argv); break;
     case REORDER: reorder::bench(argc, argv); break;
     case BNORM: bnorm::bench(argc, argv); break;
     case RNN: rnn::bench(argc, argv); break;
index 2a723a8..3675176 100644 (file)
@@ -105,8 +105,8 @@ int bench(int argc, char **argv, bool main_bench) {
             perf_template = argv[arg] + 16;
         else if (!strcmp("--reset", argv[arg]))
             reset_parameters();
-        else if (!strncmp("--mode=", argv[0], 7))
-            bench_mode = str2bench_mode(argv[0] + 7);
+        else if (!strncmp("--mode=", argv[arg], 7))
+            bench_mode = str2bench_mode(argv[arg] + 7);
         else if (!strncmp("-v", argv[arg], 2))
             verbose = atoi(argv[arg] + 2);
         else if (!strncmp("--verbose=", argv[arg], 10))
index 97bce3e..7a6c81c 100644 (file)
@@ -510,8 +510,8 @@ static int cvt_mask_to_ws(const prb_t *p, const dnn_mem_t &mask_fp,
     DNN_SAFE(mkldnn_primitive_create(&b, bpd, inputs, outputs), WARN);
     SAFE(execute(b), WARN);
 
-    mkldnn_primitive_desc_destroy(bpd);
-    mkldnn_primitive_destroy(b);
+    DNN_SAFE(mkldnn_primitive_desc_destroy(bpd), CRIT);
+    DNN_SAFE(mkldnn_primitive_destroy(b), CRIT);
 
     return OK;
 }
@@ -675,6 +675,8 @@ int doit(const prb_t *p, res_t *r) {
     }
 
     delete p_ws_dt;
+    DNN_SAFE(mkldnn_primitive_desc_destroy(bpd), CRIT);
+    DNN_SAFE(mkldnn_primitive_destroy(b), CRIT);
 
     return OK;
 }
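
Both hunks above tighten resource handling: the destroy calls in `cvt_mask_to_ws` no longer discard their status, and `doit` now releases the primitive and descriptor it previously leaked. The checking macro is assumed to follow the usual shape (the real `DNN_SAFE` lives in benchdnn's common headers); a sketch:

```cpp
#include <cstdio>
#include <cstdlib>

// Assumed shape of a DNN_SAFE-style macro at the CRIT level: evaluate the
// call once and treat any non-success status as fatal.
#define CHECK_CRIT(call) do {                              \
    int status_ = (call);                                  \
    if (status_ != 0) {                                    \
        fprintf(stderr, "%s:%d: %s -> %d\n",               \
                __FILE__, __LINE__, #call, status_);       \
        abort();                                           \
    }                                                      \
} while (0)

static int fake_destroy() { return 0; }
int main() { CHECK_CRIT(fake_destroy()); return 0; }
```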
index c84bd56..7c65f30 100644 (file)
@@ -303,8 +303,8 @@ FILE *open_batch_file(const char *fname) {
     static char search_paths[max_paths][PATH_MAX] = {{0}};
 
     char *fdir = NULL;
+    char fname_copy[PATH_MAX];
     {
-        char fname_copy[PATH_MAX];
         strncpy(fname_copy, fname, PATH_MAX - 1);
         fname_copy[PATH_MAX - 1] = '\0';
         fdir = dirname(fname_copy);
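
Hoisting `fname_copy` out of the inner block fixes a lifetime bug: POSIX `dirname()` may modify its argument and return a pointer into it, so `fdir` could end up pointing into a buffer that had already gone out of scope by the time the search paths were built. The broken shape, condensed:

```cpp
#include <libgen.h>   // POSIX dirname()
#include <climits>    // PATH_MAX (assumed available, as in the code above)
#include <cstring>

char *broken_dir_of(const char *fname) {
    char *fdir = nullptr;
    {
        char fname_copy[PATH_MAX];
        strncpy(fname_copy, fname, PATH_MAX - 1);
        fname_copy[PATH_MAX - 1] = '\0';
        fdir = dirname(fname_copy);  // may alias fname_copy...
    }                                // ...which dies here
    return fdir;                     // dangling pointer
}
```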
index c194532..e86385d 100644 (file)
@@ -87,7 +87,7 @@ extern int verbose;
     } \
 } while (0)
 
-enum prim_t { SELF, CONV, DECONV, IP, REORDER, BNORM, RNN, DEF = CONV, };
+enum prim_t { SELF, CONV, DECONV, IP, SHUFFLE, REORDER, BNORM, RNN, DEF = CONV, };
 
 enum bench_mode_t { MODE_UNDEF = 0x0, CORR = 0x1, PERF = 0x2, };
 const char *bench_mode2str(bench_mode_t mode);
index f2cbff7..d3de6ed 100644 (file)
@@ -102,8 +102,8 @@ int bench(int argc, char **argv, bool main_bench) {
             perf_template = argv[arg] + 16;
         else if (!strcmp("--reset", argv[arg]))
             reset_parameters();
-        else if (!strncmp("--mode=", argv[0], 7))
-            bench_mode = str2bench_mode(argv[0] + 7);
+        else if (!strncmp("--mode=", argv[arg], 7))
+            bench_mode = str2bench_mode(argv[arg] + 7);
         else if (!strncmp("-v", argv[arg], 2))
             verbose = atoi(argv[arg] + 2);
         else if (!strncmp("--verbose=", argv[arg], 10))
index d2ff4e9..18792c1 100644 (file)
@@ -101,8 +101,8 @@ int bench(int argc, char **argv, bool main_bench) {
             perf_template = argv[arg] + 16;
         else if (!strcmp("--reset", argv[arg]))
             reset_parameters();
-        else if (!strncmp("--mode=", argv[0], 7))
-            bench_mode = str2bench_mode(argv[0] + 7);
+        else if (!strncmp("--mode=", argv[arg], 7))
+            bench_mode = str2bench_mode(argv[arg] + 7);
         else if (!strncmp("-v", argv[arg], 2))
             verbose = atoi(argv[arg] + 2);
         else if (!strncmp("--verbose=", argv[arg], 10))
index 201497a..a08e1d1 100644 (file)
@@ -51,10 +51,10 @@ const _dt_conf_t conf_f32_full = {
 };
 
 const _dt_conf_t conf_f32_wino = {
-    {mkldnn_f32, -FLT_MAX, FLT_MAX, -4,  16, 3, 1, .25, 1e-5},
-    {mkldnn_f32, -FLT_MAX, FLT_MAX,  2,  64, 2, 1, .75, 5e-5},
-    {mkldnn_f32, -FLT_MAX, FLT_MAX,  1, 128, 1, 1, .25,   0.},
-    {mkldnn_f32, -FLT_MAX, FLT_MAX,  0,  16, 3, 1, .25, 2e-5},
+    {mkldnn_f32, -FLT_MAX, FLT_MAX,  -16, 128, 3, 1, .25, 1e-5},
+    {mkldnn_f32, -FLT_MAX, FLT_MAX,  2,  64, 2, 1, .75, 6e-6},
+    {mkldnn_f32, -FLT_MAX, FLT_MAX,  1, 128, 1, 1, .25,  2e-7},
+    {mkldnn_f32, -FLT_MAX, FLT_MAX, -16, 128, 3, 1, .25, 2e-5},
     {mkldnn_f32,},
 };
 
@@ -114,38 +114,70 @@ const _dt_conf_t conf_u8s8u8s32 = {
     {mkldnn_s32,},
 };
 
-const _dt_conf_t conf_u8s8f32s32_wino = {
-    {mkldnn_u8,          0, UINT8_MAX,    0,   8, 0, 1, .25, 0.},
+const _dt_conf_t conf_s8s8f32s32 = {
+    {mkldnn_s8,   INT8_MIN,  INT8_MAX,   -5,   5, 0, 1, .25, 0.},
     {mkldnn_s8,   INT8_MIN,  INT8_MAX,   -8,   3, 0, 4, .25, 0.},
     {mkldnn_f32, INT32_MIN, INT32_MAX,   -8,  32, 0, 1, .25, 0.},
     {mkldnn_f32, INT32_MIN, INT32_MAX, -255, 255, 0, 1, .25, 0.},
     {mkldnn_s32,},
 };
 
-const _dt_conf_t conf_u8s8s32s32_wino = {
-    {mkldnn_u8,          0, UINT8_MAX,    0,   8, 0, 1, .25, 0.},
+const _dt_conf_t conf_s8s8s32s32 = {
+    {mkldnn_s8,   INT8_MIN,  INT8_MAX,   -5,   5, 0, 1, .25, 0.},
     {mkldnn_s8,   INT8_MIN,  INT8_MAX,   -8,   3, 0, 4, .25, 0.},
     {mkldnn_f32, INT32_MIN, INT32_MAX,   -8,  32, 0, 1, .25, 0.},
     {mkldnn_s32, INT32_MIN, INT32_MAX, -255, 255, 0, 1, .25, 0.},
     {mkldnn_s32,},
 };
 
-const _dt_conf_t conf_u8s8s8s32_wino = {
-    {mkldnn_u8,          0, UINT8_MAX,    0,   8, 0, 1, .25, 0.},
+const _dt_conf_t conf_s8s8s8s32 = {
+    {mkldnn_s8,   INT8_MIN,  INT8_MAX,   -5,   5, 0, 1, .25, 0.},
     {mkldnn_s8,   INT8_MIN,  INT8_MAX,   -8,   3, 0, 4, .25, 0.},
     {mkldnn_f32, INT32_MIN, INT32_MAX,   -8,  32, 0, 1, .25, 0.},
     {mkldnn_s8,   INT8_MIN,  INT8_MAX, -127, 127, 0, 1, .25, 0.},
     {mkldnn_s32,},
 };
 
-const _dt_conf_t conf_u8s8u8s32_wino = {
-    {mkldnn_u8,          0, UINT8_MAX,    0,   8, 0, 1, .25, 0.},
+const _dt_conf_t conf_s8s8u8s32 = {
+    {mkldnn_s8,   INT8_MIN,  INT8_MAX,   -5,   5, 0, 1, .25, 0.},
     {mkldnn_s8,   INT8_MIN,  INT8_MAX,   -8,   3, 0, 4, .25, 0.},
     {mkldnn_f32, INT32_MIN, INT32_MAX,   -8,  32, 0, 1, .25, 0.},
     {mkldnn_u8,          0, UINT8_MAX,    0, 255, 0, 1, .25, 0.},
     {mkldnn_s32,},
 };
 
+const _dt_conf_t conf_u8s8f32s32_wino = {
+    {mkldnn_u8,          0, UINT8_MAX,    0,   239, 0, 4, .25, 0.},
+    {mkldnn_s8,   INT8_MIN,  INT8_MAX,   -72,   71, 0, 9, .25, 0.},
+    {mkldnn_f32, INT32_MIN, INT32_MAX,   -9,  32, 0, 9, .25, 0.},
+    {mkldnn_f32, INT32_MIN, INT32_MAX, -255, 255, 0, 1, .25, 0.},
+    {mkldnn_s32,},
+};
+
+const _dt_conf_t conf_u8s8s32s32_wino = {
+    {mkldnn_u8,          0, UINT8_MAX,    0,   239, 0, 4, .25, 0.},
+    {mkldnn_s8,   INT8_MIN,  INT8_MAX,   -72,   71, 0, 9, .25, 0.},
+    {mkldnn_f32, INT32_MIN, INT32_MAX,   -9,  32, 0, 9, .25, 0.},
+    {mkldnn_s32, INT32_MIN, INT32_MAX, -255, 255, 0, 1, .25, 0.},
+    {mkldnn_s32,},
+};
+
+const _dt_conf_t conf_u8s8s8s32_wino = {
+    {mkldnn_u8,          0, UINT8_MAX,    0,   239, 0, 4, .25, 0.},
+    {mkldnn_s8,   INT8_MIN,  INT8_MAX,   -72,   71, 0, 9, .25, 0.},
+    {mkldnn_f32, INT32_MIN, INT32_MAX,   -9,  32, 0, 9, .25, 0.},
+    {mkldnn_s8,   INT8_MIN,  INT8_MAX, -127, 127, 0, 1, .25, 0.},
+    {mkldnn_s32,},
+};
+
+const _dt_conf_t conf_u8s8u8s32_wino = {
+    {mkldnn_u8,          0, UINT8_MAX,    0,   239, 0, 4, .25, 0.},
+    {mkldnn_s8,   INT8_MIN,  INT8_MAX,   -72,   71, 0, 9, .25, 0.},
+    {mkldnn_f32, INT32_MIN, INT32_MAX,   -9,  32, 0, 9, .25, 0.},
+    {mkldnn_u8,          0, UINT8_MAX,    0, 255, 0, 1, .25, 0.},
+    {mkldnn_s32,},
+};
+
 const dt_conf_t *str2cfg(const char *str) {
 #define CASE(cfg) \
     if (!strcasecmp(STRINGIFY(cfg), str)) return CONCAT2(conf_,cfg)
@@ -159,6 +191,10 @@ const dt_conf_t *str2cfg(const char *str) {
     CASE(u8s8s32s32);
     CASE(u8s8s8s32);
     CASE(u8s8u8s32);
+    CASE(s8s8f32s32);
+    CASE(s8s8s32s32);
+    CASE(s8s8s8s32);
+    CASE(s8s8u8s32);
     CASE(u8s8f32s32_wino);
     CASE(u8s8s32s32_wino);
     CASE(u8s8s8s32_wino);
@@ -180,6 +216,10 @@ const char *cfg2str(const dt_conf_t *cfg) {
     CASE(u8s8s32s32);
     CASE(u8s8s8s32);
     CASE(u8s8u8s32);
+    CASE(s8s8f32s32);
+    CASE(s8s8s32s32);
+    CASE(s8s8s8s32);
+    CASE(s8s8u8s32);
     CASE(u8s8f32s32_wino);
     CASE(u8s8s32s32_wino);
     CASE(u8s8s8s32_wino);
index 37d70a6..eb1e4ca 100644 (file)
@@ -37,6 +37,14 @@ inline bool is_conv_3d(const prb_t *p)
     return (p->id > 1) ? 1 : 0;
 }
 
+inline bool is_conv_1d(const prb_t *p)
+{
+    return (!is_conv_3d(p) && p->ih == 1 && p->kh == 1
+                   && p->cfg[SRC].dt != mkldnn_s8 // temporary workaround until
+                   && p->cfg[SRC].dt != mkldnn_u8) // int8 jit supports 1d
+            ? 1 : 0;
+}
+
 double get_trust_nz_level(const prb_t *p, data_kind_t kind, bool final_compare)
 {
     if (!final_compare)
@@ -72,6 +80,27 @@ double get_trust_nz_level(const prb_t *p, data_kind_t kind, bool final_compare)
     return trust;
 }
 
+inline double get_eps(const prb_t *p, const data_kind_t kind) {
+    if (p->alg & WINO && p->dir & FLAG_WEI) {
+        /* This is an empirical equation derived by observing error growth
+           with increasing 'k' dimension in the winograd GEMM. */
+        return p->cfg[kind].eps *
+            (MAX2(1, pow(10, 0.4 * log10(0.125 * p->mb * p->oh * p->ow))));
+    }
+    return p->cfg[kind].eps;
+}
+
+inline void get_result(const prb_t *p, const data_kind_t kind, res_t *r,
+        const diff_norm_t diff_norm) {
+    bool wino_test = (p->alg & WINO)
+        && (diff_norm.rel_diff(norm_t::L2) <= get_eps(p, kind));
+    /* Ignore elementwise errors for winograd, since a large relative error
+       in a few elements (which are anyway close to zero) results in
+       false-positive failures. */
+    if (wino_test) r->errors = 0;
+    r->state = r->errors ? FAILED : r->state;
+}
+
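
The empirical tolerance in `get_eps` reduces to a power law, since \(10^{0.4\log_{10}x} = x^{0.4}\):

```latex
\varepsilon_{\mathrm{wino}}
  = \varepsilon \cdot \max\bigl(1,\; 10^{\,0.4\log_{10}(0.125\,mb\cdot oh\cdot ow)}\bigr)
  = \varepsilon \cdot \max\bigl(1,\; (mb\cdot oh\cdot ow/8)^{0.4}\bigr)
```

so the allowed error grows roughly with the 0.4-th power of the GEMM reduction size, matching the comment about error growth in 'k'.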
 inline int compare_dat(const prb_t *p, data_kind_t kind, dnn_mem_t &mem_dt,
         dnn_mem_t &mem_fp, res_t *r, bool final_compare = false) {
     size_t nelems = mem_dt.nelems();
@@ -118,13 +147,13 @@ inline int compare_dat(const prb_t *p, data_kind_t kind, dnn_mem_t &mem_dt,
             above_ok += ok;
         } else {
             diff_norm.update(fp, dt);
-            ok = (fabs(fp) > 1e-5 ? rel_diff : diff) <= p->cfg[kind].eps;
+            ok = (fabs(fp) > 1e-5 ? rel_diff : diff) <= get_eps(p, kind);
             in += 1;
             in_ok += ok;
         }
         if (!ok) {
             r->errors++;
-            if (r->errors < 10 || verbose >= 10) {
+            if ((!(p->alg & WINO) && r->errors < 10) || verbose >=10) {
                 int mb_or_g = 0, g_or_oc = 0, c = 0, d = 0, h = 0, w = 0;
                 switch (kind) {
                 case SRC: inv_src_off_f(p, i, mb_or_g, g_or_oc, c, d, h, w); break;
@@ -207,8 +236,7 @@ inline int compare_dat(const prb_t *p, data_kind_t kind, dnn_mem_t &mem_dt,
                 non_zero, (unsigned long)r->total);
     }
 
-    if (r->errors)
-        r->state = FAILED;
+    get_result(p, kind, r, diff_norm);
 
     if (final_compare && r->state == UNTESTED)
         r->state = PASSED; /* optimism */
@@ -234,7 +262,7 @@ int fill_src(const prb_t *p, dnn_mem_t &mem_dt, dnn_mem_t &mem_fp,
     const bool extra_mem = mem_dt.dt() != mem_fp.dt();
     dnn_mem_t *p_mem_00 = extra_mem
         ? new dnn_mem_t(mem_dt.md_, mkldnn_f32,
-            is_conv_3d(p) ? mkldnn_ncdhw : mkldnn_nchw)
+            get_default_format(mem_dt.md_.ndims, DATA))
         : &mem_fp;
     dnn_mem_t &mem_00 = *p_mem_00;
 
@@ -264,12 +292,13 @@ int fill_src(const prb_t *p, dnn_mem_t &mem_dt, dnn_mem_t &mem_fp,
 int fill_wei(const prb_t *p, dnn_mem_t &mem_dt, dnn_mem_t &mem_fp,
     res_t *r) {
     const bool wino_s8 = p->alg == WINO && p->cfg[WEI].dt == mkldnn_s8;
+    const bool s8_s8 = p->cfg[WEI].dt == mkldnn_s8 && p->cfg[SRC].dt == mkldnn_s8;
     const bool diff_data_type = mem_dt.dt() != mem_fp.dt();
-    const bool extra_mem = diff_data_type && !wino_s8;
+    const bool check_reorder = diff_data_type && !wino_s8 && !s8_s8;
 
-    dnn_mem_t *p_mem_00 = extra_mem
+    dnn_mem_t *p_mem_00 = check_reorder
         ? new dnn_mem_t(mem_dt.md_, mkldnn_f32,
-            is_conv_3d(p) ? mkldnn_goidhw : mkldnn_goihw)
+            get_default_format(mem_dt.md_.ndims, GWEI))
         : &mem_fp;
     dnn_mem_t &mem_00 = *p_mem_00;
 
@@ -288,7 +317,7 @@ int fill_wei(const prb_t *p, dnn_mem_t &mem_dt, dnn_mem_t &mem_fp,
     });
 
     SAFE(mem_dt.reorder(mem_00), WARN);
-    if (extra_mem) {
+    if (check_reorder) {
         SAFE(mem_fp.reorder(mem_dt), WARN);
         SAFE(compare_wei(p, mem_fp, mem_00, r), WARN);
         delete &mem_00;
@@ -333,7 +362,7 @@ int fill_dst(const prb_t *p, dnn_mem_t &mem_dt, dnn_mem_t &mem_fp,
     const bool extra_mem = mem_dt.dt() != mem_fp.dt();
     dnn_mem_t *p_mem_00 = extra_mem
         ? new dnn_mem_t(mem_dt.md_, mkldnn_f32,
-            is_conv_3d(p) ? mkldnn_ncdhw : mkldnn_nchw)
+            get_default_format(mem_dt.md_.ndims, DATA))
         : &mem_fp;
     dnn_mem_t &mem_00 = *p_mem_00;
 
@@ -364,44 +393,45 @@ inline int init_pd(const prb_t *p, mkldnn_convolution_desc_t &cd,
         mkldnn_primitive_desc_t &cpd, res_t *r) {
     mkldnn_memory_desc_t src_d, wei_d, bia_d, dst_d;
 
-    int ndims = is_conv_3d(p) ? 5 : 4;
+    int ndims = is_conv_3d(p) ? 5 : is_conv_1d(p) ? 3 : 4;
     mkldnn_dims_t src_dims = {p->mb, p->ic, p->ih, p->iw};
+    mkldnn_dims_t src_1d_dims = {p->mb, p->ic, p->iw};
     mkldnn_dims_t src_3d_dims = {p->mb, p->ic, p->id, p->ih, p->iw};
     mkldnn_dims_t wei_dims = {p->g, p->oc / p->g, p->ic / p->g, p->kh, p->kw};
+    mkldnn_dims_t wei_1d_dims = {p->g, p->oc / p->g, p->ic / p->g, p->kw};
     mkldnn_dims_t wei_3d_dims = {p->g, p->oc / p->g, p->ic / p->g, p->kd, p->kh, p->kw};
     mkldnn_dims_t bia_dims = {p->oc};
     mkldnn_dims_t dst_dims = {p->mb, p->oc, p->oh, p->ow};
+    mkldnn_dims_t dst_1d_dims = {p->mb, p->oc, p->ow};
     mkldnn_dims_t dst_3d_dims = {p->mb, p->oc, p->od, p->oh, p->ow};
 
     DNN_SAFE(mkldnn_memory_desc_init(&src_d, ndims,
-        is_conv_3d(p) ? src_3d_dims : src_dims, p->cfg[SRC].dt, mkldnn_any), WARN);
+        is_conv_3d(p) ? src_3d_dims : is_conv_1d(p) ? src_1d_dims : src_dims,
+        p->cfg[SRC].dt, mkldnn_any), WARN);
     DNN_SAFE(mkldnn_memory_desc_init(&wei_d, ndims + 1,
-        is_conv_3d(p) ? wei_3d_dims : wei_dims, p->cfg[WEI].dt, mkldnn_any), WARN);
-    DNN_SAFE(mkldnn_memory_desc_init(&bia_d, 1, bia_dims, p->cfg[BIA].dt, mkldnn_any), WARN);
+        is_conv_3d(p) ? wei_3d_dims : is_conv_1d(p) ? wei_1d_dims : wei_dims,
+        p->cfg[WEI].dt, mkldnn_any), WARN);
+    DNN_SAFE(mkldnn_memory_desc_init(&bia_d, 1, bia_dims, p->cfg[BIA].dt,
+        mkldnn_any), WARN);
     DNN_SAFE(mkldnn_memory_desc_init(&dst_d, ndims,
-        is_conv_3d(p) ? dst_3d_dims : dst_dims, p->cfg[DST].dt, mkldnn_any), WARN);
-    int strides_2d[] = {p->sh, p->sw};
-    int dilates_2d[] = {p->dh, p->dw};
-    int padding_2d[] = {p->ph, p->pw};
-    int strides_3d[] = {p->sd, p->sh, p->sw};
-    int dilates_3d[] = {p->dd, p->dh, p->dw};
-    int padding_3d[] = {p->pd, p->ph, p->pw};
+        is_conv_3d(p) ? dst_3d_dims : is_conv_1d(p) ? dst_1d_dims : dst_dims,
+        p->cfg[DST].dt, mkldnn_any), WARN);
+    int strides_nd[] = {p->sd, p->sh, p->sw};
+    int dilates_nd[] = {p->dd, p->dh, p->dw};
+    int padding_nd[] = {p->pd, p->ph, p->pw};
 
     auto bph = [&](int ih, int oh, int kh, int sh, int ph, int dh) {
         return (oh - 1) * sh - ih + ((kh - 1) * (dh + 1) + 1) - ph;
     };
-    int padding_r_3d[] = {
+    int padding_r_nd[] = {
         bph(p->id, p->od, p->kd, p->sd, p->pd, p->dd),
         bph(p->ih, p->oh, p->kh, p->sh, p->ph, p->dh),
         bph(p->iw, p->ow, p->kw, p->sw, p->pw, p->dw)};
-    int padding_r_2d[] = {
-        bph(p->ih, p->oh, p->kh, p->sh, p->ph, p->dh),
-        bph(p->iw, p->ow, p->kw, p->sw, p->pw, p->dw)};
 
-    int *strides = is_conv_3d(p) ? strides_3d : strides_2d;
-    int *dilates = is_conv_3d(p) ? dilates_3d : dilates_2d;
-    int *padding = is_conv_3d(p) ? padding_3d : padding_2d;
-    int *padding_r = is_conv_3d(p) ? padding_r_3d : padding_r_2d;
+    int *strides = strides_nd + (5 - ndims);
+    int *dilates = dilates_nd + (5 - ndims);
+    int *padding = padding_nd + (5 - ndims);
+    int *padding_r = padding_r_nd + (5 - ndims);
 
     mkldnn_alg_kind_t alg = mkldnn_convolution_direct;
     if (p->alg == WINO) alg = mkldnn_convolution_winograd;
@@ -517,8 +547,8 @@ int doit(const prb_t *p, res_t *r) {
         ? new dnn_mem_t(bia_dt_d, p->cfg[BIA].dt) : new dnn_mem_t();
     dnn_mem_t &bia_dt = *p_bia_dt;
 
-    auto src_format = is_conv_3d(p) ? mkldnn_ncdhw : mkldnn_nchw;
-    auto wei_format = is_conv_3d(p) ? mkldnn_goidhw : mkldnn_goihw;
+    auto src_format = get_default_format(src_dt.md_.ndims, DATA);
+    auto wei_format = get_default_format(wei_dt.md_.ndims, GWEI);
 
     const auto fp = mkldnn_f32;
     dnn_mem_t src_fp(src_dt_d, fp, src_format);
@@ -597,6 +627,9 @@ int doit(const prb_t *p, res_t *r) {
         }
     }
 
+    DNN_SAFE(mkldnn_primitive_desc_destroy(cpd), CRIT);
+    DNN_SAFE(mkldnn_primitive_destroy(c), CRIT);
+
     delete p_bia_dt;
     delete p_bia_fp;
 
index 408a842..8301e87 100644 (file)
@@ -166,12 +166,12 @@ int str2desc(desc_t *desc, const char *str, bool is_deconv) {
         d.sw = d.sh;
         d.dw = d.dh;
     } else if (no_h) {
-        d.ih = d.iw;
-        d.kh = d.kw;
-        d.oh = d.ow;
-        d.ph = d.pw;
-        d.sh = d.sw;
-        d.dh = d.dw;
+        d.ih = 1;
+        d.kh = 1;
+        d.oh = 1;
+        d.ph = 0;
+        d.sh = 1;
+        d.dh = 0;
     }
     if (d.id<1) {d.id = 1; d.kd = 1; d.od = 1; d.sd = 1; d.pd = 0; d.dd = 0;}
 
index ca362d1..d3969ec 100644 (file)
@@ -85,6 +85,9 @@ extern const _dt_conf_t conf_s16s32s16s32;
 extern const _dt_conf_t conf_u8s8s32s32;
 extern const _dt_conf_t conf_u8s8s8s32;
 extern const _dt_conf_t conf_u8s8u8s32;
+extern const _dt_conf_t conf_s8s8s32s32;
+extern const _dt_conf_t conf_s8s8s8s32;
+extern const _dt_conf_t conf_s8s8u8s32;
 extern const _dt_conf_t conf_u8s8f32s32_wino;
 extern const _dt_conf_t conf_u8s8s32s32_wino;
 extern const _dt_conf_t conf_u8s8s8s32_wino;
index 7785b6f..ec0e0d0 100644 (file)
@@ -41,7 +41,7 @@ inline static void swap(int &a, int &b)
 }
 inline bool is_deconv_3d(const prb_t *p)
 {
-    return (p->id > 1) ? 1 : 0;
+    return (p->id > 1 || p->od > 1) ? 1 : 0;
 }
 
 inline int transpose_data_wei(const prb_t *p, dnn_mem_t &wei, dnn_mem_t &wei_tr) {
@@ -299,6 +299,9 @@ int doit(const prb_t *p, res_t *r) {
         }
     }
 
+    DNN_SAFE_V(mkldnn_primitive_destroy(c));
+    DNN_SAFE_V(mkldnn_primitive_desc_destroy(dpd));
+
     delete p_bia_dt;
     delete p_bia_fp;
     delete p_zero_fp;
index bc2f406..a471d21 100644 (file)
@@ -278,8 +278,7 @@ void compute_ref_bwd_bias(const prb_t *p, dnn_mem_t &diff_bia_m,
     dnn_mem_t &diff_dst_m) {
     mkldnn::impl::parallel_nd(p->g, p->oc / p->g, [&](int g, int oc) {
        size_t bia_off = bia_off_f(p, g, oc);
-       float &db = ((float*)diff_bia_m)[bia_off];
-       db = 0;
+       double sum = 0;
 
        for (int mb = 0; mb < p->mb; ++mb)
        for (int od = 0; od < p->od; ++od)
@@ -287,8 +286,9 @@ void compute_ref_bwd_bias(const prb_t *p, dnn_mem_t &diff_bia_m,
        for (int ow = 0; ow < p->ow; ++ow)
        {
            size_t dst_off = dst_off_f(p, mb, g, oc, od, oh, ow);
-           db += ((float*)diff_dst_m)[dst_off];
+           sum += ((float*)diff_dst_m)[dst_off];
        }
+       ((float *)diff_bia_m)[bia_off] = (float)sum;
     });
 }
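
Accumulating the reference bias gradient in `double` and converting once at the end avoids the stagnation a `float` running sum suffers across the `mb * od * oh * ow` reduction. A quick standalone illustration (counts are illustrative, not from benchdnn):

```cpp
#include <cstdio>

int main() {
    float  f = 0.f;
    double d = 0.0;
    for (int i = 0; i < 10 * 1000 * 1000; ++i) {
        f += 0.1f;  // once f is large, much of each 0.1f is rounded away
        d += 0.1f;  // double keeps enough bits for the whole reduction
    }
    printf("float  sum: %.1f\n", f);  // visibly off from 1000000
    printf("double sum: %.1f\n", d);  // ~1000000 (up to 0.1f's own rounding)
    return 0;
}
```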
 
index 3344911..2bb3429 100644 (file)
@@ -79,6 +79,10 @@ data_kind_t fmt2data_kind(mkldnn_memory_format_t fmt) {
     case mkldnn_tnc:
     case mkldnn_ntc:
 
+    case mkldnn_ncw:
+    case mkldnn_nwc:
+    case mkldnn_nCw16c:
+
     case mkldnn_nchw:
     case mkldnn_nhwc:
     case mkldnn_chwn:
@@ -90,11 +94,19 @@ data_kind_t fmt2data_kind(mkldnn_memory_format_t fmt) {
     case mkldnn_nCdhw16c:
         return DATA;
 
+    case mkldnn_goiw:
+    case mkldnn_gOIw16i16o:
+    case mkldnn_gOIw16o16i:
+    case mkldnn_gOiw16o:
+    case mkldnn_gOwi16o:
+    case mkldnn_gOIw8i16o2i:
     case mkldnn_goihw:
     case mkldnn_hwigo:
+    case mkldnn_hwigo_s8s8:
     case mkldnn_gOIhw8i8o:
     case mkldnn_gOIhw16i16o:
     case mkldnn_gOIhw4i16o4i:
+    case mkldnn_gOIhw4i16o4i_s8s8:
     case mkldnn_gOIhw8i16o2i:
     case mkldnn_gOIdhw8i16o2i:
     case mkldnn_gOIhw8o16i2o:
@@ -373,7 +385,32 @@ mkldnn_primitive_attr_t create_mkldnn_attr(const attr_t &attr, int scale_cnt,
         const_mkldnn_post_ops_t c_ops;
         DNN_SAFE_V(mkldnn_primitive_attr_get_post_ops(mkldnn_attr, &c_ops));
         SAFE_V(mkldnn_post_ops_len(c_ops) == attr.post_ops.len ? OK : FAIL);
+
+        DNN_SAFE_V(mkldnn_post_ops_destroy(ops));
     }
 
     return mkldnn_attr;
 }
+
+mkldnn_memory_format_t get_default_format(int ndims, data_kind_t kind) {
+    switch(kind) {
+    case DATA: return (ndims == 5)
+        ? mkldnn_ncdhw
+        : (ndims == 4)
+        ? mkldnn_nchw
+        : mkldnn_ncw;
+    case GWEI: return (ndims == 6)
+        ? mkldnn_goidhw
+        : (ndims == 5)
+        ? mkldnn_goihw
+        : mkldnn_goiw;
+    case WEI: return (ndims == 5)
+        ? mkldnn_oidhw
+        : (ndims == 4)
+        ? mkldnn_oihw
+        : mkldnn_oiw;
+    default:
+        assert(!"unknown kind");
+    }
+    return mkldnn_format_undef;
+}
index ccd9dca..7010c98 100644 (file)
@@ -108,6 +108,7 @@ const size_t max_attr_len = 128;
 int str2attr(attr_t *attr, const char *str);
 void attr2str(const attr_t *attr, char *buffer);
 
+mkldnn_memory_format_t get_default_format(int ndims, data_kind_t kind);
 mkldnn_primitive_attr_t create_mkldnn_attr(const attr_t &attr, int scale_cnt,
         int scale_mask, const float *scales);
 inline mkldnn_primitive_attr_t create_mkldnn_attr(const attr_t &attr,
index 134a505..4cb9265 100644 (file)
@@ -2,7 +2,6 @@
 --fmt=nhwc
 
 --dir=FWD_D
---flags=SR --batch=bnorm_large
 --flags=SR --batch=bnorm_regressions
 --flags=GS --batch=bnorm_regressions
 --flags=S  --batch=bnorm_regressions
@@ -13,7 +12,6 @@
 --attr=
 
 --dir=BWD_DW
---flags=SR --batch=bnorm_large
 --flags=SR --batch=bnorm_regressions
 --flags=GS --batch=bnorm_regressions
 --flags=S  --batch=bnorm_regressions
@@ -24,7 +22,6 @@
 --fmt=nchw
 
 --dir=FWD_D
---flags=SR --batch=bnorm_large
 --flags=SR --batch=bnorm_regressions
 --flags=GS --batch=bnorm_regressions
 --flags=S  --batch=bnorm_regressions
@@ -35,7 +32,6 @@
 --attr=
 
 --dir=BWD_DW
---flags=SR --batch=bnorm_large
 --flags=SR --batch=bnorm_regressions
 --flags=GS --batch=bnorm_regressions
 --flags=S  --batch=bnorm_regressions
@@ -46,7 +42,6 @@
 --fmt=nChw16c # avx512
 
 --dir=FWD_D
---flags=SR --batch=bnorm_large
 --flags=SR --batch=bnorm_regressions
 --flags=GS --batch=bnorm_regressions
 --flags=S  --batch=bnorm_regressions
@@ -57,7 +52,6 @@
 --attr=
 
 --dir=BWD_DW
---flags=SR --batch=bnorm_large
 --flags=SR --batch=bnorm_regressions
 --flags=GS --batch=bnorm_regressions
 --flags=S  --batch=bnorm_regressions
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/bnorm/test_bnorm_regressions_large b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/bnorm/test_bnorm_regressions_large
new file mode 100644 (file)
index 0000000..ff5062f
--- /dev/null
@@ -0,0 +1,24 @@
+--skip-impl=ref
+--fmt=nhwc
+
+--dir=FWD_D
+--flags=SR --batch=bnorm_large
+
+--dir=BWD_DW
+--flags=SR --batch=bnorm_large
+
+--fmt=nchw
+
+--dir=FWD_D
+--flags=SR --batch=bnorm_large
+
+--dir=BWD_DW
+--flags=SR --batch=bnorm_large
+
+--fmt=nChw16c # avx512
+
+--dir=FWD_D
+--flags=SR --batch=bnorm_large
+
+--dir=BWD_DW
+--flags=SR --batch=bnorm_large
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_1d b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_1d
new file mode 100644 (file)
index 0000000..ae6582e
--- /dev/null
@@ -0,0 +1,34 @@
+# 1D convolution
+
+g1mb50ic3iw224oc64kw7sw1pw0n"conv1d:1"
+
+g1mb50ic3iw224oc64ow112kw7sw2pw3n"conv1d:2"
+g1mb50ic256iw56oc512ow28kw1sw2pw0n"conv1d:3"
+g1mb50ic256iw56oc128ow28kw1sw2pw0n"conv1d:4"
+
+mb1ic32iw32oc32kw2pw0sw1n"conv1d:5"
+mb1ic32iw100oc32ow98kw3pw0sw1n"conv1d:6"
+mb1ic32iw121oc32ow119kw3pw0sw1n"conv1d:7"
+mb1ic512iw300oc512ow298kw3pw0sw1n"conv1d:8"
+
+mb1ic32iw32oc32kw5pw0sw1n"conv1d:9"
+mb1ic32iw100oc32kw2pw0sw1n"conv1d:10"
+mb1ic32iw121oc32kw6pw0sw1n"conv1d:11"
+mb1ic512iw300oc512kw1pw0sw1n"conv1d:12"
+
+mb1ic32iw32oc32kw5pw0sw1n"conv1d:13"
+mb1ic32iw100oc32kw2pw0sw1n"conv1d:14"
+mb1ic32iw121oc32kw6pw0sw4n"conv1d:15"
+
+mb1ic32iw32oc32kw5pw1sw1n"conv1d:16"
+mb1ic32iw100oc32kw2pw2sw1n"conv1d:17"
+mb1ic32iw121oc32kw6pw3sw1n"conv1d:18"
+
+mb1ic512iw32oc512kw5pw1sw1n"conv1d:19"
+mb1ic512iw100oc512kw2pw2sw1n"conv1d:20"
+mb1ic512iw121oc512kw6pw3sw1n"conv1d:21"
+
+ic8oc8_iw5ow2kw3sw1dw2pw3n"conv1d_gemm:1"
+ic32oc8_iw8ow2kw3sw1dw2pw0n"conv1d_gemm:2"
+ic512oc1024_iw19ow19kw3sw1dw5pw6n"conv1d_gemm:3"
+ic256oc512_iw15ow15kw3sw1dw5pw6n"conv1d_gemm:4"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_1d_wavenet b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/conv_1d_wavenet
new file mode 100644 (file)
index 0000000..fab218a
--- /dev/null
@@ -0,0 +1,10 @@
+# Wavenet sizes
+mb128_g1ic32oc32_iw643ow642kw2sw1dw0pw0n"wavenet:conv1d_1"
+mb256_g1ic32oc32_iw337ow336kw2sw1dw0pw0n"wavenet:conv1d_2"
+mb32_g1ic32oc32_iw269ow268kw2sw1dw0pw0n"wavenet:conv1d_3"
+mb512_g1ic32oc32_iw168ow167kw2sw1dw0pw0n"wavenet:conv1d_4"
+mb64_g1ic32oc32_iw1349ow1348kw2sw1dw0pw0n"wavenet:conv1d_5"
+
+mb1_g1ic32oc32_iw842ow842kw1sw1dw0pw0n"wavenet:conv1d_1x1_1"
+mb1_g1ic512oc256_iw812ow812kw1sw1dw0pw0n"wavenet:conv1d_1x1_2"
+mb1_g1ic512oc512_iw812ow812kw1sw1dw0pw0n"wavenet:conv1d_1x1_3"
index b692964..30949cc 100644 (file)
@@ -14,3 +14,7 @@ mb20ic4ih8iw8oc6oh4ow8kh3kw3sh2sw1ph1pw1dh1dw0n"dilated_conv:11"
 mb20ic4ih8iw8oc6oh8ow2kh3kw3sh1sw3ph1pw1dh0dw2n"dilated_conv:12"
 mb20ic8ih5iw8oc8oh3ow8kh3kw3ph2pw1dh1dw0n"dilated_conv:13"
 mb20ic8ih8iw5oc8oh8ow2kh3kw3ph1pw3dh0dw2n"dilated_conv:14"
+
+mb20ic16ih1iw8oc6oh1kh1kw3sh1pw0ph0sw1dh0dw2n"dilated_conv1d:15"
+mb20ic32ih1iw8oc8oh1kh1kw3sh1pw0ph0sw1dh0dw1n"dilated_conv1d:16"
+mb20ic32ih1iw8oc8oh1kh1kw3sh1pw0ph0sw1dh0dw2n"dilated_conv1d:17"
index 3f91569..284233c 100644 (file)
@@ -85,3 +85,4 @@ mb1ic16oc16_ih5oh5kh7ph6
 mb4_g1ic1oc2_ih5oh2kh2sh2dh0ph0_iw5ow2kw2sw2dw0pw0
 mb2ic16ih33oc16oh16kh2sh2
 mb1g2ic2oc4_ih2oh1kh2sh1ph0_iw3ow2kw3sw1pw0
+mb1_g1ic1oc1_id4od3kd3sd2dd0pd2_ih4oh2kh5sh2dh0ph2_iw4ow1kw7sw2dw0pw2
index f19836a..7d8b0fd 100644 (file)
@@ -57,4 +57,15 @@ ic17oc16_ih13oh12kh3ph0_id13od11kd3pd0_n"tails_conv_3d:3"
 ic35oc32_ih13oh13kh3ph1_id13od11kd3pd0_n"tails_conv_3d:4"
 
 ic47oc37_ih13oh12kh3ph0_id13od11kd3pd0_n"tails_conv_3d:5"
-ic27oc19_ih13oh13kh3ph1_id13od11kd3pd0_n"tails_conv_3d:6"
\ No newline at end of file
+ic27oc19_ih13oh13kh3ph1_id13od11kd3pd0_n"tails_conv_3d:6"
+
+# 1d conv
+ic19oc32_iw13ow12kw3pw1_n"tails_conv_1d:1"
+ic20oc32_iw13ow13kw3pw0_n"tails_conv_1d:2"
+ic23oc64_iw13ow13kw3pw1_n"tails_conv_1d:3"
+ic25oc24_iw13ow12kw3pw0_n"tails_conv_1d:4"
+ic29oc32_iw13ow12kw3pw0_n"tails_conv_1d:5"
+ic20oc32_iw13kw1pw0_n"tails_conv_1d_1x1:2"
+ic23oc64_iw13kw1pw0_n"tails_conv_1d_1x1:3"
+ic25oc24_iw13kw1pw0_n"tails_conv_1d_1x1:4"
+ic29oc32_iw13kw1pw0_n"tails_conv_1d_1x1:5"
index 271c2bd..e59a669 100644 (file)
@@ -5,7 +5,7 @@
 --dir=BWD_WB --batch=ip_all
 
 # i8 (skx)
---reset --dir=FWD_B --mb=2 --attr=irmode=down;oscale=per_oc:2.25;
+--reset --allow-unimpl=true --dir=FWD_B --mb=2 --attr=irmode=down;oscale=per_oc:2.25;
 --cfg=u8s8u8s32  --batch=ip_all
 --cfg=u8s8s8s32  --batch=ip_all
 --cfg=u8s8s32s32 --batch=ip_all
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_gru b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_gru
new file mode 100644 (file)
index 0000000..7496c8b
--- /dev/null
@@ -0,0 +1,4 @@
+l2t2mb128sic512n"exp-gru-0"
+l7t1mb128sic512slc1024dic512dlc512n"exp-gru-1"
+l1t10mb32sic128slc512dic128dlc128n"exp-gru-2"
+
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_gru_small b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_gru_small
new file mode 100644 (file)
index 0000000..c315e9e
--- /dev/null
@@ -0,0 +1,7 @@
+l1t1sic1
+l1t1sic2
+l1t1mb4sic2
+l1t1mb4sic1slc3dic1dlc1
+l1t2mb4sic2
+l2t1mb4sic2
+l2t2mb4sic2
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_inference b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_inference
new file mode 100644 (file)
index 0000000..be35247
--- /dev/null
@@ -0,0 +1,10 @@
+l1t30mb1sic512n"GNMT_enc-inference"
+l7t30mb1sic1024n"GNMT_enc-inference"
+l8t1mb1sic2048slc1024dic1024dlc1024n"GNMT_dec-inference"
+l1t50mb1sic1760n"deepspeech2-inference"
+l1t100mb1sic760n"deepspeech2-inference"
+l1t200mb1sic1760n"deepspeech2-inference"
+l1t50mb1sic500n"pytorch_testcase-inference"
+l1t629mb1sic128n"paddlepaddle_testcase-inference"
+l1t10mb1sic128slc512dic128dlc128n"exp-0"
+l10t1mb1sic512slc128dic128dlc128n"exp-1"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_small b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_small
new file mode 100644 (file)
index 0000000..f7e2c2d
--- /dev/null
@@ -0,0 +1,8 @@
+l1t1sic1
+l1t1sic2
+l1t1mb3sic2
+l1t1mb5sic1slc3dic1dlc1
+l1t1mb5sic3slc1dic1dlc1
+l1t2mb3sic2
+l2t1mb3sic2
+l2t2mb3sic2
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_training b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/rnn_training
new file mode 100644 (file)
index 0000000..5d9a0dd
--- /dev/null
@@ -0,0 +1,11 @@
+l1t1mb128sic512n"GNMT_enc-training"
+l2t2mb128sic1024n"GNMT_enc-training"
+l8t1mb128sic2048slc1024dic1024dlc1024n"GNMT_dec-training"
+l1t50mb32sic1760n"deepspeech2-training"
+l1t100mb32sic1760n"deepspeech2-training"
+l1t200mb32sic1760n"deepspeech2-training"
+l1t50mb64sic500n"pytorch_testcase-training"
+l1t629mb128sic128n"paddlepaddle_testcase-training"
+l1t952mb128sic128n"paddlepaddle_testcase-training"
+l1t10mb32sic128slc512dic128dlc128n"exp-0"
+l10t1mb32sic512slc128dic128dlc128n"exp-1"
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/test_rnn_inference b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/test_rnn_inference
new file mode 100644 (file)
index 0000000..86ef88f
--- /dev/null
@@ -0,0 +1,37 @@
+# f32
+
+# RNN
+--reset --alg=VANILLA_RNN
+--direction=left2right
+--activation=LOGISTIC
+--prop=FWD_D --batch=rnn_inference
+
+--reset
+--direction=concat
+--activation=LOGISTIC
+--prop=FWD_D --batch=rnn_inference
+
+# sum
+--reset
+--direction=sum
+--activation=LOGISTIC
+--prop=FWD_D --batch=rnn_inference
+
+# LSTM
+--reset --alg=VANILLA_LSTM
+--direction=left2right
+--activation=LOGISTIC
+--prop=FWD_D --batch=rnn_inference
+
+# GRU
+--reset --alg=VANILLA_GRU
+--direction=left2right
+--activation=LOGISTIC
+--prop=FWD_D --batch=rnn_gru
+
+# LBR_GRU
+--reset --alg=LBR_GRU
+--direction=left2right
+--activation=LOGISTIC
+--prop=FWD_D --batch=rnn_inference
+
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/test_rnn_small b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/test_rnn_small
new file mode 100644 (file)
index 0000000..6ca0cb3
--- /dev/null
@@ -0,0 +1,66 @@
+# f32
+
+# RNN
+# direction
+# l2r
+--reset --alg=VANILLA_RNN
+--direction=left2right
+--activation=RELU
+--prop=FWD_D --batch=rnn_small
+--prop=BWD_DW --batch=rnn_small
+
+# r2l
+--reset --alg=VANILLA_RNN
+--direction=right2left
+--activation=RELU
+--prop=FWD_D --batch=rnn_small
+--prop=BWD_DW --batch=rnn_small
+
+# concat
+--reset --alg=VANILLA_RNN
+--direction=concat
+--activation=RELU
+--prop=FWD_D --batch=rnn_small
+--prop=BWD_DW --batch=rnn_small
+
+# sum
+--reset --alg=VANILLA_RNN
+--direction=sum
+--activation=RELU
+--prop=FWD_D --batch=rnn_small
+--prop=BWD_DW --batch=rnn_small
+
+# activation
+--reset --alg=VANILLA_RNN
+--direction=left2right
+--activation=TANH
+--prop=FWD_D --batch=rnn_small
+--prop=BWD_DW --batch=rnn_small
+
+--reset --alg=VANILLA_RNN
+--direction=left2right
+--activation=LOGISTIC
+--prop=FWD_D --batch=rnn_small
+--prop=BWD_DW --batch=rnn_small
+
+# LSTM
+--reset --alg=VANILLA_LSTM
+--direction=left2right
+--activation=TANH
+--prop=FWD_D --batch=rnn_small
+--prop=BWD_DW --batch=rnn_small
+
+# GRU
+--reset --alg=VANILLA_GRU
+--direction=left2right
+--activation=TANH
+--prop=FWD_D --batch=rnn_gru_small
+--prop=BWD_DW --batch=rnn_gru_small
+
+# LBR_GRU
+--reset --alg=LBR_GRU
+--direction=left2right
+--activation=TANH
+--prop=FWD_D --batch=rnn_small
+--prop=BWD_DW --batch=rnn_small
+
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/test_rnn_training b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/rnn/test_rnn_training
new file mode 100644 (file)
index 0000000..0661966
--- /dev/null
@@ -0,0 +1,42 @@
+# f32
+
+# RNN
+--reset --alg=VANILLA_RNN
+--direction=left2right
+--activation=TANH
+--prop=FWD_D --batch=rnn_training
+--prop=BWD_DW --batch=rnn_training
+
+--reset
+--direction=concat
+--activation=TANH
+--prop=FWD_D --batch=rnn_training
+--prop=BWD_DW --batch=rnn_training
+
+# sum
+--reset
+--direction=sum
+--activation=TANH
+--prop=FWD_D --batch=rnn_training
+--prop=BWD_DW --batch=rnn_training
+
+# LSTM
+--reset --alg=VANILLA_LSTM
+--direction=left2right
+--activation=TANH
+--prop=FWD_D --batch=rnn_training
+--prop=BWD_DW --batch=rnn_training
+
+# GRU
+--reset --alg=VANILLA_GRU
+--direction=left2right
+--activation=TANH
+--prop=FWD_D --batch=rnn_gru
+--prop=BWD_DW --batch=rnn_gru
+
+# LBR_GRU
+--reset --alg=LBR_GRU
+--direction=left2right
+--activation=TANH
+--prop=FWD_D --batch=rnn_training
+--prop=BWD_DW --batch=rnn_training
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/shuffle/test_shuffle b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/shuffle/test_shuffle
new file mode 100644 (file)
index 0000000..4e05177
--- /dev/null
@@ -0,0 +1,97 @@
+--dir=FWD_D
+
+# group_size = 4
+# (1, 68, 56, 56)
+--dt=f32 --fmt=nchw --axis=1 --group=4 1x68x56x56
+--dt=f32 --fmt=nhwc --axis=1 --group=4 1x68x56x56
+
+--dt=u8 --fmt=nchw --axis=1 --group=4 1x68x56x56
+--dt=u8 --fmt=nhwc --axis=1 --group=4 1x68x56x56
+
+--dt=s8 --fmt=nchw --axis=1 --group=4 1x68x56x56
+--dt=s8 --fmt=nhwc --axis=1 --group=4 1x68x56x56
+
+--dt=s32 --fmt=nchw --axis=1 --group=4 1x68x56x56
+--dt=s32 --fmt=nhwc --axis=1 --group=4 1x68x56x56
+
+# (1, 136, 56, 56)
+--dt=f32 --fmt=nchw --axis=1 --group=4 1x136x56x56
+--dt=f32 --fmt=nhwc --axis=1 --group=4 1x136x56x56
+--dt=f32 --fmt=nChw8c --axis=1 --group=4 1x136x56x56
+
+--dt=u8 --fmt=nchw --axis=1 --group=4 1x136x56x56
+--dt=u8 --fmt=nhwc --axis=1 --group=4 1x136x56x56
+--dt=u8 --fmt=nChw8c --axis=1 --group=4 1x136x56x56
+
+--dt=s8 --fmt=nchw --axis=1 --group=4 1x136x56x56
+--dt=s8 --fmt=nhwc --axis=1 --group=4 1x136x56x56
+--dt=s8 --fmt=nChw8c --axis=1 --group=4 1x136x56x56
+
+--dt=s32 --fmt=nchw --axis=1 --group=4 1x136x56x56
+--dt=s32 --fmt=nhwc --axis=1 --group=4 1x136x56x56
+--dt=s32 --fmt=nChw8c --axis=1 --group=4 1x136x56x56
+
+# (1, 272, 56, 56)
+--dt=f32 --fmt=nchw --axis=1 --group=4 1x272x56x56
+--dt=f32 --fmt=nhwc --axis=1 --group=4 1x272x56x56
+--dt=f32 --fmt=nChw8c --axis=1 --group=4 1x272x56x56
+--dt=f32 --fmt=nChw16c --axis=1 --group=4 1x272x56x56
+
+--dt=u8 --fmt=nchw --axis=1 --group=4 1x272x56x56
+--dt=u8 --fmt=nhwc --axis=1 --group=4 1x272x56x56
+--dt=u8 --fmt=nChw8c --axis=1 --group=4 1x272x56x56
+--dt=u8 --fmt=nChw16c --axis=1 --group=4 1x272x56x56
+
+--dt=s8 --fmt=nchw --axis=1 --group=4 1x272x56x56
+--dt=s8 --fmt=nhwc --axis=1 --group=4 1x272x56x56
+--dt=s8 --fmt=nChw8c --axis=1 --group=4 1x272x56x56
+--dt=s8 --fmt=nChw16c --axis=1 --group=4 1x272x56x56
+
+--dt=s32 --fmt=nchw --axis=1 --group=4 1x272x56x56
+--dt=s32 --fmt=nhwc --axis=1 --group=4 1x272x56x56
+--dt=s32 --fmt=nChw8c --axis=1 --group=4 1x272x56x56
+--dt=s32 --fmt=nChw16c --axis=1 --group=4 1x272x56x56
+
+# 3D (1, 272, 2, 56, 56)
+--dt=f32 --fmt=ncdhw --axis=1 --group=4 1x272x2x56x56
+--dt=f32 --fmt=ndhwc --axis=1 --group=4 1x272x2x56x56
+--dt=f32 --fmt=nCdhw8c --axis=1 --group=4 1x272x2x56x56
+--dt=f32 --fmt=nCdhw16c --axis=1 --group=4 1x272x2x56x56
+
+--dt=u8 --fmt=ncdhw --axis=1 --group=4 1x272x2x56x56
+--dt=u8 --fmt=ndhwc --axis=1 --group=4 1x272x2x56x56
+--dt=u8 --fmt=nCdhw8c --axis=1 --group=4 1x272x2x56x56
+--dt=u8 --fmt=nCdhw16c --axis=1 --group=4 1x272x2x56x56
+
+--dt=s8 --fmt=ncdhw --axis=1 --group=4 1x272x2x56x56
+--dt=s8 --fmt=ndhwc --axis=1 --group=4 1x272x2x56x56
+--dt=s8 --fmt=nCdhw8c --axis=1 --group=4 1x272x2x56x56
+--dt=s8 --fmt=nCdhw16c --axis=1 --group=4 1x272x2x56x56
+
+--dt=s32 --fmt=ncdhw --axis=1 --group=4 1x272x2x56x56
+--dt=s32 --fmt=ndhwc --axis=1 --group=4 1x272x2x56x56
+--dt=s32 --fmt=nCdhw8c --axis=1 --group=4 1x272x2x56x56
+--dt=s32 --fmt=nCdhw16c --axis=1 --group=4 1x272x2x56x56
+
+--dir=BWD_D
+
+--dt=f32 --fmt=nchw --axis=1 --group=4 1x272x56x56
+--dt=f32 --fmt=nhwc --axis=1 --group=4 1x272x56x56
+--dt=f32 --fmt=nChw8c --axis=1 --group=4 1x272x56x56
+--dt=f32 --fmt=nChw16c --axis=1 --group=4 1x272x56x56
+
+--dt=u8 --fmt=nchw --axis=1 --group=4 1x272x56x56
+--dt=u8 --fmt=nhwc --axis=1 --group=4 1x272x56x56
+--dt=u8 --fmt=nChw8c --axis=1 --group=4 1x272x56x56
+--dt=u8 --fmt=nChw16c --axis=1 --group=4 1x272x56x56
+
+--dt=s8 --fmt=nchw --axis=1 --group=4 1x272x56x56
+--dt=s8 --fmt=nhwc --axis=1 --group=4 1x272x56x56
+--dt=s8 --fmt=nChw8c --axis=1 --group=4 1x272x56x56
+--dt=s8 --fmt=nChw16c --axis=1 --group=4 1x272x56x56
+
+--dt=s32 --fmt=nchw --axis=1 --group=4 1x272x56x56
+--dt=s32 --fmt=nhwc --axis=1 --group=4 1x272x56x56
+--dt=s32 --fmt=nChw8c --axis=1 --group=4 1x272x56x56
+--dt=s32 --fmt=nChw16c --axis=1 --group=4 1x272x56x56
+
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/shuffle/test_shuffle_axis b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/inputs/shuffle/test_shuffle_axis
new file mode 100644 (file)
index 0000000..20584de
--- /dev/null
@@ -0,0 +1,103 @@
+--dir=FWD_D
+
+# group_size = 4
+# (1, 68, 56, 56)
+--dt=f32 --fmt=nchw --axis=2 --group=4 1x68x56x56
+--dt=f32 --fmt=nhwc --axis=2 --group=4 1x68x56x56
+
+--dt=u8 --fmt=nchw --axis=2 --group=4 1x68x56x56
+--dt=u8 --fmt=nhwc --axis=2 --group=4 1x68x56x56
+
+--dt=s8 --fmt=nchw --axis=2 --group=4 1x68x56x56
+--dt=s8 --fmt=nhwc --axis=2 --group=4 1x68x56x56
+
+--dt=s32 --fmt=nchw --axis=2 --group=4 1x68x56x56
+--dt=s32 --fmt=nhwc --axis=2 --group=4 1x68x56x56
+
+# (1, 136, 56, 56)
+--dt=f32 --fmt=nchw --axis=2 --group=4 1x136x56x56
+--dt=f32 --fmt=nhwc --axis=2 --group=4 1x136x56x56
+--dt=f32 --fmt=nChw8c --axis=2 --group=4 1x136x56x56
+
+--dt=u8 --fmt=nchw --axis=2 --group=4 1x136x56x56
+--dt=u8 --fmt=nhwc --axis=2 --group=4 1x136x56x56
+--dt=u8 --fmt=nChw8c --axis=2 --group=4 1x136x56x56
+
+--dt=s8 --fmt=nchw --axis=2 --group=4 1x136x56x56
+--dt=s8 --fmt=nhwc --axis=2 --group=4 1x136x56x56
+--dt=s8 --fmt=nChw8c --axis=2 --group=4 1x136x56x56
+
+--dt=s32 --fmt=nchw --axis=2 --group=4 1x136x56x56
+--dt=s32 --fmt=nhwc --axis=2 --group=4 1x136x56x56
+--dt=s32 --fmt=nChw8c --axis=2 --group=4 1x136x56x56
+
+# (1, 272, 56, 56)
+--dt=f32 --fmt=nchw --axis=2 --group=4 1x272x56x56
+--dt=f32 --fmt=nhwc --axis=2 --group=4 1x272x56x56
+--dt=f32 --fmt=nChw8c --axis=2 --group=4 1x272x56x56
+--dt=f32 --fmt=nChw16c --axis=2 --group=4 1x272x56x56
+
+--dt=u8 --fmt=nchw --axis=2 --group=4 1x272x56x56
+--dt=u8 --fmt=nhwc --axis=2 --group=4 1x272x56x56
+--dt=u8 --fmt=nChw8c --axis=2 --group=4 1x272x56x56
+--dt=u8 --fmt=nChw16c --axis=2 --group=4 1x272x56x56
+
+--dt=s8 --fmt=nchw --axis=2 --group=4 1x272x56x56
+--dt=s8 --fmt=nhwc --axis=2 --group=4 1x272x56x56
+--dt=s8 --fmt=nChw8c --axis=2 --group=4 1x272x56x56
+--dt=s8 --fmt=nChw16c --axis=2 --group=4 1x272x56x56
+
+--dt=s32 --fmt=nchw --axis=2 --group=4 1x272x56x56
+--dt=s32 --fmt=nhwc --axis=2 --group=4 1x272x56x56
+--dt=s32 --fmt=nChw8c --axis=2 --group=4 1x272x56x56
+--dt=s32 --fmt=nChw16c --axis=2 --group=4 1x272x56x56
+
+# blocked with tail
+--dt=f32 --fmt=nChw16c --axis=1 --group=4 1x12x56x56
+--dt=f32 --fmt=nChw16c --axis=1 --group=4 1x24x56x56
+--dt=f32 --fmt=nChw16c --axis=1 --group=4 1x36x56x56
+--dt=f32 --fmt=nChw16c --axis=1 --group=4 1x68x56x56
+
+# 3D (1, 272, 2, 56, 56)
+--dt=f32 --fmt=ncdhw --axis=3 --group=4 1x272x2x56x56
+--dt=f32 --fmt=ndhwc --axis=3 --group=4 1x272x2x56x56
+--dt=f32 --fmt=nCdhw8c --axis=3 --group=4 1x272x2x56x56
+--dt=f32 --fmt=nCdhw16c --axis=3 --group=4 1x272x2x56x56
+
+--dt=u8 --fmt=ncdhw --axis=3 --group=4 1x272x2x56x56
+--dt=u8 --fmt=ndhwc --axis=3 --group=4 1x272x2x56x56
+--dt=u8 --fmt=nCdhw8c --axis=4 --group=4 1x272x2x56x56
+--dt=u8 --fmt=nCdhw16c --axis=4 --group=4 1x272x2x56x56
+
+--dt=s8 --fmt=ncdhw --axis=3 --group=4 1x272x2x56x56
+--dt=s8 --fmt=ndhwc --axis=3 --group=4 1x272x2x56x56
+--dt=s8 --fmt=nCdhw8c --axis=4 --group=4 1x272x2x56x56
+--dt=s8 --fmt=nCdhw16c --axis=4 --group=4 1x272x2x56x56
+
+--dt=s32 --fmt=ncdhw --axis=3 --group=4 1x272x2x56x56
+--dt=s32 --fmt=ndhwc --axis=3 --group=4 1x272x2x56x56
+--dt=s32 --fmt=nCdhw8c --axis=4 --group=4 1x272x2x56x56
+--dt=s32 --fmt=nCdhw16c --axis=4 --group=4 1x272x2x56x56
+
+--dir=BWD_D
+
+--dt=f32 --fmt=nchw --axis=2 --group=4 1x272x56x56
+--dt=f32 --fmt=nhwc --axis=2 --group=4 1x272x56x56
+--dt=f32 --fmt=nChw8c --axis=2 --group=4 1x272x56x56
+--dt=f32 --fmt=nChw16c --axis=2 --group=4 1x272x56x56
+
+--dt=u8 --fmt=nchw --axis=2 --group=4 1x272x56x56
+--dt=u8 --fmt=nhwc --axis=2 --group=4 1x272x56x56
+--dt=u8 --fmt=nChw8c --axis=2 --group=4 1x272x56x56
+--dt=u8 --fmt=nChw16c --axis=2 --group=4 1x272x56x56
+
+--dt=s8 --fmt=nchw --axis=2 --group=4 1x272x56x56
+--dt=s8 --fmt=nhwc --axis=2 --group=4 1x272x56x56
+--dt=s8 --fmt=nChw8c --axis=2 --group=4 1x272x56x56
+--dt=s8 --fmt=nChw16c --axis=2 --group=4 1x272x56x56
+
+--dt=s32 --fmt=nchw --axis=2 --group=4 1x272x56x56
+--dt=s32 --fmt=nhwc --axis=2 --group=4 1x272x56x56
+--dt=s32 --fmt=nChw8c --axis=2 --group=4 1x272x56x56
+--dt=s32 --fmt=nChw16c --axis=2 --group=4 1x272x56x56
+
index 5354842..6f725f9 100644 (file)
 # i8 (skx)
 --reset --dir=FWD_B --mb=2
 --skip-impl="ref:gemm"      # ! test jit version only
-
+--allow-unimpl=true
 --cfg=u8s8u8s32  --batch=conv_all
 --cfg=u8s8s8s32  --batch=conv_resnet_50
 --cfg=u8s8s32s32 --batch=conv_googlenet_v3
 --merge=RELU
 --cfg=u8s8s32s32 --batch=conv_vgg_19
+--cfg=s8s8u8s32  --batch=conv_all
+--cfg=s8s8s8s32  --batch=conv_resnet_50
+--cfg=s8s8s32s32 --batch=conv_googlenet_v3
+--merge=RELU
+--cfg=s8s8s32s32 --batch=conv_vgg_19
 
 # s16 (knm)
 --reset --mb=2
@@ -66,4 +71,4 @@
 --batch=test_conv_tails
 
 # 3D conv
---batch=test_conv_3d
\ No newline at end of file
+--batch=test_conv_3d
index 51ae42b..00d4cff 100644 (file)
@@ -7,10 +7,14 @@
 --attr=irmode=down;oscale=per_oc:2.25;post_ops='sum:1.5;relu'
 --cfg=u8s8u8s32  --batch=conv_vgg_19
 --cfg=u8s8f32s32 --batch=conv_googlenet_v2
+--cfg=s8s8u8s32  --batch=conv_vgg_19
+--cfg=s8s8f32s32 --batch=conv_googlenet_v2
 --dir=FWD_D
 --attr=irmode=nearest;oscale=common:2.25;post_ops='sum:1.5'
 --cfg=u8s8s8s32  --batch=conv_googlenet_v3
 --cfg=u8s8s32s32 --batch=conv_alexnet
+--cfg=s8s8s8s32  --batch=conv_googlenet_v3
+--cfg=s8s8s32s32 --batch=conv_alexnet
 
 # f32
 --reset --cfg=f32
@@ -19,6 +23,7 @@
 --allow-unimpl=true
 --dir=FWD_B --attr=post_ops='sum;relu' --batch=conv_resnet_50
 --dir=FWD_B --attr=post_ops='sum;relu' --batch=conv_3d
+--dir=FWD_B --attr=post_ops='sum;relu' --batch=conv_1d
 
 # f32_wino
 --reset --alg=wino --cfg=f32_wino
index af5c0f8..b5c8a32 100644 (file)
@@ -12,3 +12,5 @@
 # int8 (full)
 --reset --cfg=u8s8u8s32
 --dir=FWD_D --batch=conv_dilated_rfcn --match=.*fc6.* --batch=conv_ssd_300_voc0712
+--reset --allow-unimpl=true --cfg=s8s8u8s32
+--dir=FWD_D --batch=conv_dilated_rfcn --match=.*fc6.* --batch=conv_ssd_300_voc0712
index fda923b..e2cebb9 100644 (file)
@@ -29,6 +29,7 @@
 
 # MKLDNN-623
 --reset --cfg=u8s8u8s32 --dir=FWD_D --skip-impl="ref:gemm" mb2ic672ih29iw29oc192kh1kw1sh1sw1ph0pw0n"DENSENET_161:conv3_11/x1"
+--reset --allow-unimpl=true --cfg=s8s8u8s32 --dir=FWD_D --skip-impl="ref:gemm" mb2ic672ih29iw29oc192kh1kw1sh1sw1ph0pw0n"DENSENET_161:conv3_11/x1"
 --reset --cfg=s16s16s32s32 --skip-impl="ref:gemm" mb2ic672ih29iw29oc192kh1kw1sh1sw1ph0pw0n"DENSENET_161:conv3_11/x1"
 --reset --cfg=s32s16s16s32 --dir=BWD_D --skip-impl="ref:gemm" mb2ic672ih29iw29oc192kh1kw1sh1sw1ph0pw0n"DENSENET_161:conv3_11/x1"
 --reset --cfg=s16s32s16s32 --dir=BWD_WB --skip-impl="ref:gemm" mb2ic672ih29iw29oc192kh1kw1sh1sw1ph0pw0n"DENSENET_161:conv3_11/x1"
@@ -42,6 +43,7 @@
 # MKLDNN-796: large asymmetric padding
 --reset --cfg=f32_full mb2_g1ic32oc32_ih7oh11kh3ph2
 --reset --cfg=u8s8u8s32 mb2_g1ic32oc32_ih7oh11kh3ph2
+--reset --allow-unimpl=true --cfg=s8s8u8s32 mb2_g1ic32oc32_ih7oh11kh3ph2
 
 # MKLDNN-860
 --reset --dir=BWD_WB --cfg=f32_wino --allow-unimpl=true --alg=wino  mb2ic16ih13oc16oh13kh3ph1
 # MKLDNN-930
 --reset mb2ic3ih300oc32oh150kh3sh2
 --reset --cfg=u8s8u8s32 mb2ic3ih84oc16oh42kh3sh2
+--reset --allow-unimpl=true --cfg=s8s8u8s32 mb2ic3ih84oc16oh42kh3sh2
 
 # MKLDNN-949
 --reset --dir=BWD_D mb2_g1ic1oc2_ih3oh1kh2sh1dh1ph0_iw5ow3kw2sw1dw1pw0
 
 # special case for 4vnni and 4fma kernels had this FPE bug
 --reset --cfg=f32_full mb1_g1ic16oc16_ih7oh7kh3ph1
+
+# MKLDNN-982: FPE for large right-hand-side padding
+--reset --dir=FWD_D mb2_g1ic1oc2_ih3oh10kh2sh1dh1ph4_iw5ow12kw2sw1dw1pw2
+--reset --dir=FWD_D mb2_g1ic8oc8_ih1oh16kh2ph1
+
+# MKLDNN-1074: FPE for mb1 with ih < sh or iw < sw
+--reset --dir=FWD_D mb1_g1ic128oc256_ih1oh1kh3sh2dh0ph1_iw1ow1kw3sw2dw0pw1
+
index 142fa89..c81dd4e 100644 (file)
 --attr=irmode=down;oscale=per_oc:2.25;post_ops='sum:1.5;relu'
 --cfg=u8s8u8s32  --batch=conv_tails
 --cfg=u8s8f32s32 --batch=conv_tails
+--cfg=s8s8u8s32  --batch=conv_tails
+--cfg=s8s8f32s32 --batch=conv_tails
 --dir=FWD_D
 --attr=irmode=nearest;oscale=common:2.25;post_ops='sum:1.5'
 --cfg=u8s8s8s32  --batch=conv_tails
---cfg=u8s8s32s32 --batch=conv_tails
\ No newline at end of file
+--cfg=u8s8s32s32 --batch=conv_tails
+--cfg=s8s8s8s32  --batch=conv_tails
+--cfg=s8s8s32s32 --batch=conv_tails
index e5dc8cc..2b71b50 100644 (file)
@@ -8,13 +8,19 @@
 
 #int8
 --skip-impl=ref
---reset --dir=FWD_B --mb=2
+--reset --allow-unimpl=true --dir=FWD_B --mb=2
 --attr=irmode=down;oscale=per_oc:2.25;
 --cfg=u8s8u8s32 --batch=deconv_2d
 --cfg=u8s8s8s32 --batch=deconv_2d
 --cfg=u8s8s32s32 --batch=deconv_2d
+--cfg=s8s8u8s32 --batch=deconv_2d
+--cfg=s8s8s8s32 --batch=deconv_2d
+--cfg=s8s8s32s32 --batch=deconv_2d
 --attr=irmode=nearest;oscale=common:2.25;
 --attr=irmode=down;oscale=per_oc:2.25;
 --cfg=u8s8u8s32 --batch=deconv_2d
 --cfg=u8s8s8s32 --batch=deconv_2d
 --cfg=u8s8s32s32 --batch=deconv_2d
+--cfg=s8s8u8s32 --batch=deconv_2d
+--cfg=s8s8s8s32 --batch=deconv_2d
+--cfg=s8s8s32s32 --batch=deconv_2d
index c49f19a..3e44413 100644 (file)
@@ -78,8 +78,8 @@ int bench(int argc, char **argv, bool main_bench) {
             perf_template = argv[arg] + 16;
         else if (!strcmp("--reset", argv[arg]))
             reset_parameters();
-        else if (!strncmp("--mode=", argv[0], 7))
-            bench_mode = str2bench_mode(argv[0] + 7);
+        else if (!strncmp("--mode=", argv[arg], 7))
+            bench_mode = str2bench_mode(argv[arg] + 7);
         else if (!strncmp("-v", argv[arg], 2))
             verbose = atoi(argv[arg] + 2);
         else if (!strncmp("--verbose=", argv[arg], 10))
index ef96cdf..eba082c 100644 (file)
@@ -366,6 +366,9 @@ int doit(const prb_t *p, res_t *r) {
         }
     }
 
+    DNN_SAFE(mkldnn_primitive_desc_destroy(ippd), CRIT);
+    DNN_SAFE(mkldnn_primitive_destroy(ip), CRIT);
+
     return OK;
 }
 
index 1b4e3f3..decf41b 100644 (file)
@@ -76,6 +76,9 @@ mkldnn_memory_format_t str2fmt(const char *str) {
 } while (0)
     CASE(x);
     CASE(nc);
+    CASE(ncw);
+    CASE(nwc);
+    CASE(nCw16c);
     CASE(nchw);
     CASE(nhwc);
     CASE(chwn);
@@ -83,15 +86,24 @@ mkldnn_memory_format_t str2fmt(const char *str) {
     CASE(nChw16c);
     CASE(oi);
     CASE(io);
+    CASE(oiw);
+    CASE(wio);
+    CASE(OIw16i16o);
+    CASE(OIw16o16i);
+    CASE(Oiw16o);
+    CASE(Owi16o);
+    CASE(OIw8i16o2i);
     CASE(oihw);
     CASE(ihwo);
     CASE(hwio);
+    CASE(hwio_s8s8);
     CASE(dhwio);
     CASE(OIhw8i8o);
     CASE(OIhw16i16o);
     CASE(OIhw8i16o2i);
     CASE(OIdhw8i16o2i);
     CASE(OIhw4i16o4i);
+    CASE(OIhw4i16o4i_s8s8);
     CASE(OIhw8o16i2o);
     CASE(OIhw8o8i);
     CASE(OIhw16o16i);
@@ -99,13 +111,22 @@ mkldnn_memory_format_t str2fmt(const char *str) {
     CASE(Oihw16o);
     CASE(Ohwi8o);
     CASE(Ohwi16o);
+    CASE(goiw);
     CASE(goihw);
     CASE(hwigo);
+    CASE(hwigo_s8s8);
+    CASE(goiw);
+    CASE(gOIw16i16o);
+    CASE(gOIw16o16i);
+    CASE(gOiw16o);
+    CASE(gOwi16o);
+    CASE(gOIw8i16o2i);
     CASE(gOIhw8i8o);
     CASE(gOIhw16i16o);
     CASE(gOIhw8i16o2i);
     CASE(gOIdhw8i16o2i);
     CASE(gOIhw4i16o4i);
+    CASE(gOIhw4i16o4i_s8s8);
     CASE(gOIhw8o16i2o);
     CASE(gOIhw8o8i);
     CASE(gOIhw16o16i);
@@ -121,6 +142,7 @@ mkldnn_memory_format_t str2fmt(const char *str) {
     CASE(ndhwc);
     CASE(oidhw);
     CASE(goidhw);
+    CASE(nCdhw8c);
     CASE(nCdhw16c);
     CASE(OIdhw16i16o);
     CASE(gOIdhw16i16o);
index 50becb2..acabd39 100644 (file)
@@ -67,10 +67,10 @@ void check(const prb_t *p) {
 void run() {
     for (auto &idt: v_idt)
     for (auto &odt: v_odt)
-    for (int swap_dt = 0; swap_dt < 1 + both_dir_dt * (idt != odt); ++swap_dt)
+    for (int swap_dt = 0; swap_dt < (both_dir_dt && idt != odt ? 2 : 1); ++swap_dt)
     for (auto &ifmt: v_ifmt)
     for (auto &ofmt: v_ofmt)
-    for (int swap_fmt = 0; swap_fmt < 1 + both_dir_fmt * (ifmt != ofmt); ++swap_fmt)
+    for (int swap_fmt = 0; swap_fmt < (both_dir_fmt && ifmt != ofmt ? 2 : 1); ++swap_fmt)
     for (auto &dims: v_dims)
     {
         reorder_conf_t reorder_conf{dims,
@@ -138,8 +138,8 @@ int bench(int argc, char **argv, bool main_bench) {
             perf_template = argv[arg] + 16;
         else if (!strcmp("--reset", argv[arg]))
             reset_parameters();
-        else if (!strncmp("--mode=", argv[0], 7))
-            bench_mode = str2bench_mode(argv[0] + 7);
+        else if (!strncmp("--mode=", argv[arg], 7))
+            bench_mode = str2bench_mode(argv[arg] + 7);
         else if (!strncmp("-v", argv[arg], 2))
             verbose = atoi(argv[arg] + 2);
         else if (!strncmp("--verbose=", argv[arg], 10))
index 067623f..a19917b 100644 (file)
@@ -306,9 +306,10 @@ int check_reorder(const prb_t *p, res_t *res) {
             &check_rpd, mem_dt_in_fmt_in.mpd_, mem_dt_out_fmt_out.mpd_,
             mkldnn_attr);
     if (init_status == mkldnn_unimplemented) {
-        mkldnn_primitive_attr_destroy(mkldnn_attr);
-        return res->state = UNIMPLEMENTED, OK;
+        res->state = UNIMPLEMENTED;
+        goto cleanup;
     }
+    mkldnn_primitive_desc_destroy(check_rpd);
     SAFE(init_status, WARN);
 
     SAFE(mem_dt_out_fmt_out.reorder(mem_dt_in_fmt_in, mkldnn_attr), WARN);
@@ -353,6 +354,7 @@ int check_reorder(const prb_t *p, res_t *res) {
     }
 
     /* Step 8: clean up */
+cleanup:
     mkldnn_primitive_attr_destroy(mkldnn_attr);
     zfree(scales);
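
Routing the `mkldnn_unimplemented` early exit through a shared `cleanup:` label (and destroying `check_rpd` on the success path) closes the attribute and scales leaks in this driver. The idiom, reduced to its essentials:

```cpp
#include <cstdlib>

static int process(bool supported) {
    int status = 0;
    void *res = malloc(64);  // acquired before any early exit
    if (!supported) {
        status = 1;          // report "unimplemented"...
        goto cleanup;        // ...but still release everything
    }
    /* ... use res ... */
cleanup:
    free(res);
    return status;
}

int main() { return (process(true) == 0 && process(false) == 1) ? 0 : 2; }
```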
 
index 3d9a9b1..3d43c77 100644 (file)
 #include "mkldnn_debug.hpp"
 #include "mkldnn_memory.hpp"
 
-#include "rnn/input_rnn.hpp"
 #include "rnn/rnn.hpp"
 
 namespace rnn {
 
-int bench(int argc, char **argv) {
-    // !!?? TODO: check consistence of direction, dir ...
-    mkldnn_prop_kind_t direction = mkldnn_forward;
-    dir_t dir = FWD_D;
+/* global driver parameters */
+mkldnn_prop_kind_t prop = mkldnn_forward;
+alg_t alg = VANILLA_RNN;
+mkldnn_rnn_direction_t direction = mkldnn_unidirectional_left2right;
+activation_t activation = RELU;
+
+void reset_parameters() {
+    prop = mkldnn_forward;
+    alg = VANILLA_RNN;
+    direction = mkldnn_unidirectional_left2right;
+    activation = RELU;
+}
+
+int bench(int argc, char **argv, bool main_bench) {
     for (int arg = 0; arg < argc; ++arg) {
-        if (!strncmp("--dir=", argv[arg], 6)) {
-            dir = str2dir(argv[arg] + 6);
+        if (!strncmp("--batch=", argv[arg], 8))
+            SAFE(batch(argv[arg] + 8, bench), CRIT);
+        else if (!strncmp("--prop=", argv[arg], 7)) {
+            dir_t dir = str2dir(argv[arg] + 7);
             if (dir == FWD_D)
-                direction = mkldnn_forward;
-            else if (dir == BWD_D)
-                direction = mkldnn_backward;
+                prop = mkldnn_forward;
+            else if (dir == BWD_DW)
+                prop = mkldnn_backward;
             else
                 assert("unknown dir");
+        } else if (!strncmp("--alg=", argv[arg], 6))
+            alg = str2alg(argv[arg] + 6);
+        else if (!strncmp("--direction=", argv[arg], 12))
+            direction = str2direction(argv[arg] + 12);
+        else if (!strncmp("--activation=", argv[arg], 13))
+            activation = str2activation(argv[arg] + 13);
+        else if (!strncmp("--reset", argv[arg], 7))
+            reset_parameters();
+        else {
+            rnn_desc_t d;
+            if (str2desc(&d, argv[arg]) == FAIL) {
+                fprintf(stderr, "driver: unknown option: `%s`, exiting...\n",
+                        argv[arg]);
+                exit(2);
+            }
+            check(&d);
         }
     }
-    const int num_r = sizeof(rnns) / sizeof(rnns[0]);
-
-    for (int r = 0; r < num_r; ++r) {
-        const rnn_prb_t p(rnns[r], conf_f32, direction);
-        check(&p);
-    }
-
     return OK;
 }
 
-void check(const rnn_prb_t *p) {
+void check(rnn_desc_t *d) {
+    const rnn_prb_t p(*d, conf_f32, prop, alg, direction, activation);
     res_t res{};
     char pstr[max_prb_len];
-    prb2str(p, &res, pstr);
+    prb2str(&p, &res, pstr);
 
-    int status = rnn::doit(p, &res);
+    int status = rnn::doit(&p, &res);
 
-    prb2str(p, &res, pstr);
+    prb2str(&p, &res, pstr);
     bool want_perf_report = false;
 
     parse_result(res, want_perf_report, false, status, pstr);
 
     if (bench_mode & PERF)
-        perf_report(p, &res, pstr);
+        perf_report(&p, &res, pstr);
 
     benchdnn_stat.tests++;
 }
@@ -14,9 +14,6 @@
  * limitations under the License.
  *******************************************************************************/
 
-#ifndef _INPUT_LSTM_HPP
-#define _INPUT_LSTM_HPP
-
 #include "mkldnn_common.hpp"
 #include "rnn/rnn.hpp"
 
@@ -116,82 +113,4 @@ const char *cfg2str(const dt_conf_t *cfg) {
     }();
     return NULL;
 }
-
-// ?? TODO: need auto filling of strides if chunk without padding to avoid
-// specifying strides in each test case
-
-static rnn::rnn_desc_t rnns[] = {
-/* alg, activation, direction, sic, slc, dic, dlc, batch, n_layer, n_iter,
- * name */
-#if 0
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_left2right, 32, 32, 32, 32, 4 /*ok with 7 batch*/, 4, 3, "exp-fail-1" },
-#endif
-    { VANILLA_RNN, TANH, mkldnn_unidirectional_left2right, 1, 1, 1, 1, 1, 1, 1,
-            "exp0" },
-    { VANILLA_RNN, RELU, mkldnn_unidirectional_left2right, 1, 1, 1, 1, 1, 1, 1,
-            "exp1.0" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_left2right, 1, 1, 1, 1, 1, 1, 1,
-            "exp1.1" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_left2right, 1, 1, 1, 1, 2, 1, 1,
-            "exp1.2" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_left2right, 1, 1, 1, 1, 1, 2, 1,
-            "exp1.3" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_left2right, 1, 1, 1, 1, 1, 1, 2,
-            "exp1.4" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_right2left, 1, 1, 1, 1, 1, 1, 1,
-            "exp2.1" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_right2left, 1, 1, 1, 1, 2, 1, 1,
-            "exp2.2" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_right2left, 1, 1, 1, 1, 1, 2, 1,
-            "exp2.3" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_right2left, 1, 1, 1, 1, 1, 1, 2,
-            "exp2.4" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_right2left, 2, 2, 2, 2, 4, 5, 7,
-            "exp4" },
-    { VANILLA_LSTM, TANH, mkldnn_bidirectional_sum, 1, 1, 1, 1, 1, 2, 2,
-            "exp5.2" },
-    { VANILLA_LSTM, TANH, mkldnn_bidirectional_concat, 1, 1, 1, 1, 1, 1, 1,
-            "exp6.2" },
-    { VANILLA_LSTM, TANH, mkldnn_bidirectional_concat, 512, 512, 512, 512, 128,
-            1, 1, "GNMT-enc-bidir" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_left2right, 1024, 1024, 1024,
-            1024, 128, 7, 2, "GNMT-enc-unidir" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_left2right, 2, 1, 1, 1, 1, 1, 1,
-            "GNMT-dec-1" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_left2right, 2, 1, 1, 1, 2, 1, 1,
-            "GNMT-dec-mb2" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_left2right, 2, 1, 1, 1, 1, 2, 1,
-            "GNMT-dec-l2" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_left2right, 1, 2, 1, 1, 1, 1, 2,
-            "GNMT-dec-i2" },
-    { VANILLA_LSTM, TANH, mkldnn_unidirectional_left2right, 2048, 1024, 1024,
-            1024, 128, 8, 1, "GNMT-dec" },
-    { VANILLA_GRU, TANH, mkldnn_unidirectional_left2right, 1, 1, 1, 1, 1, 1, 1,
-            "exp-gru-0" },
-    { VANILLA_GRU, TANH, mkldnn_unidirectional_right2left, 1, 2, 1, 1, 1, 1, 4,
-            "exp-gru-1" },
-    { VANILLA_GRU, TANH, mkldnn_bidirectional_concat, 4, 4, 4, 4, 8, 5, 5,
-            "exp-gru-2" },
-    { VANILLA_GRU, TANH, mkldnn_bidirectional_sum, 512, 512, 512, 512, 128, 2, 2,
-            "exp-gru-3" },
-    { VANILLA_GRU, TANH, mkldnn_unidirectional_left2right, 512, 1024, 512, 512, 128, 1, 7,
-            "exp-gru-4" },
-    { GRU_LINEAR_BEFORE_RESET, TANH, mkldnn_unidirectional_left2right, 1, 1, 1, 1, 1, 1, 1,
-            "exp-gru-lbr-0" },
-    { GRU_LINEAR_BEFORE_RESET, TANH, mkldnn_unidirectional_right2left, 1, 2, 1, 1, 1, 1, 2,
-            "exp-gru-lbr-1" },
-    { GRU_LINEAR_BEFORE_RESET, TANH, mkldnn_unidirectional_right2left, 2, 1, 1, 1, 1, 2, 1,
-            "exp-gru-lbr-2" },
-    { GRU_LINEAR_BEFORE_RESET, TANH, mkldnn_bidirectional_concat, 4, 4, 4, 4, 8, 1, 1,
-            "exp-gru-lbr-3" },
-    { GRU_LINEAR_BEFORE_RESET, TANH, mkldnn_bidirectional_sum, 512, 512, 512, 512, 128, 2, 2,
-            "exp-gru-lbr-4" },
-    { GRU_LINEAR_BEFORE_RESET, TANH, mkldnn_unidirectional_left2right, 128, 512, 128, 128, 32, 1, 10,
-            "exp-gru-lbr-5" },
-    { GRU_LINEAR_BEFORE_RESET, TANH, mkldnn_unidirectional_left2right, 512, 128, 128, 128, 32, 10, 1,
-            "exp-gru-lbr-6" },
-};
-
 } // namespace rnn
-
-#endif
index 788cfe1..ed668c1 100644 (file)
@@ -35,19 +35,16 @@ void lstm_activation(int dic, int n_gates, int batch,
         float *a) {
     AOC<float> pa(a, batch, n_gates, dic);
     mkldnn::impl::parallel_nd(batch, [&](int ib) {
-        for (int ig = 0; ig < 3; ig++) {
-            for (int ih = 0; ih < dic; ih++) {
-                pa(ib, ig, ih) = logistic(pa(ib, ig, ih));
+        for (int ih = 0; ih < dic; ih++) {
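+            // gates 0 (i), 1 (f) and 3 (o) are sigmoids; gate 2 (c) is tanh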
+            pa(ib, 0, ih) = logistic(pa(ib, 0, ih));
+            pa(ib, 1, ih) = logistic(pa(ib, 1, ih));
+            pa(ib, 2, ih) = tanhf(pa(ib, 2, ih));
+            pa(ib, 3, ih) = logistic(pa(ib, 3, ih));
+            for (int ig = 0; ig < 4; ig++) {
                 print(80, "activation 1 a[%d][%d][%d] = %.7f\n", ib, ig, ih,
                         pa(ib, ig, ih));
             }
         }
-        int ig = 3;
-        for (int j = 0; j < dic; j++) {
-            pa(ib, ig, j) = tanhf(pa(ib, ig, j));
-            print(80, "activation 2 a[%d][%d][%d] = %.7f\n", ib, ig, j,
-                    pa(ib, ig, j));
-        }
     });
 }
 
@@ -178,10 +175,10 @@ void lstm_fwd(int sic, int slc, int dic, int wc, int batch, int n_gates,
     AOC<const float> src_iter_c(src_iter_c_, batch, wc);
     AOC<float> gates(gates_, batch, n_gates, dic);
 
-    const int ohf = 0;
-    const int ohi = 1;
-    const int oho = 2;
-    const int ohc = 3;
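+    // gate offsets now follow the i, f, c, o order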
+    const int ohi = 0;
+    const int ohf = 1;
+    const int ohc = 2;
+    const int oho = 3;
 
     gemm("C", "N", "N", batch, n_gates * dic, slc, 1.0, src_layer_, wc,
             weights_layer_, n_gates * dic, 0.0, gates_, n_gates * dic);
@@ -218,7 +215,7 @@ void rnn_cell_fwd(alg_t alg, activation_t f, int sic, int slc, int dic, int wc,
         gru_fwd(sic, slc, dic, wc, batch, n_gates, dst_iter_h, gates,
                 weights_layer, weights_iter, bias, src_layer, src_iter_h);
         break;
-    case GRU_LINEAR_BEFORE_RESET:
+    case LBR_GRU:
         gru_lbr_fwd(sic, slc, dic, wc, batch, n_gates, dst_iter_h, gates,
                 weights_layer, weights_iter, bias, src_layer, src_iter_h,
                 ws_local_);
@@ -256,14 +253,15 @@ void copy(int dimc, int dimr, int ld_src, int ld_dst, const float *src_,
  * fwd: ws keeps {h, c} for every cell
  * bwd: wsb keeps {dh, dc, dx} for every cell
  */
-void copy_init(alg_t alg, int sic, int slc, int dic, int wc, int batch,
+void copy_init(alg_t alg, int sic, int slc, int dic, int dlc, int wc, int batch,
         int n_layer, int n_iter, int n_states, float *ws_,
         const float *src_layer_, const float *firstit_states_,
         rnn_iter_direction_t iter_dir, rnn_layer_direction_t lay_dir,
-        int dir_val, int n_dir, bool is_bwd = false) {
+        int dir_val, int n_dir, bool is_bwd = false, bool is_concat = false) {
     AOC<float> ws(
             ws_, n_layer + 2, n_dir, n_iter + 2, n_states + is_bwd, batch, wc);
-    AOC<const float> src_layer(src_layer_, n_iter, batch, is_bwd ? dic : slc);
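+    /* on bwd with bidirectional concat, source rows are 2 * dlc wide */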
+    auto c_stride = is_bwd ? (is_concat ? 2 * dlc : dlc) : slc;
+    AOC<const float> src_layer(src_layer_, n_iter, batch * c_stride);
     AOC<const float> firstit_states(firstit_states_, n_layer, n_dir, n_states,
             batch, is_bwd ? dic : sic);
 
@@ -272,7 +270,7 @@ void copy_init(alg_t alg, int sic, int slc, int dic, int wc, int batch,
 
     if (!is_bwd) {
         for (int it = 0; it < n_iter; it++)
-            copy(batch, slc, slc, wc, &src_layer(it, 0, 0),
+            copy(batch, slc, slc, wc, &src_layer(it, 0),
                     &ws(lay_dest, dir_val, it + 1, H, 0, 0));
 
         for (int lay = 0; lay < n_layer; lay++) {
@@ -286,7 +284,8 @@ void copy_init(alg_t alg, int sic, int slc, int dic, int wc, int batch,
         }
     } else {
         for (int it = 0; it < n_iter; it++)
-            copy(batch, dic, dic, wc, &src_layer(it, 0, 0),
+            copy(batch, dic, c_stride, wc,
+                    &src_layer(it, dir_val * is_concat * dlc),
                     &ws(lay_dest, dir_val, it + 1, n_states, 0, 0));
 
         for (int lay = 0; lay < n_layer; lay++) {
@@ -325,7 +324,7 @@ void copy_res(alg_t alg, int sic, int slc, int dic, int dlc, int wc, int batch,
             auto to = &lastlay_states(
                     it, nb, (action == action_concat) && (!is_bwd) ? dlc : 0);
 
-            copy(1, lastlay_c, wc, lastlay_c, from, to, action);
+            copy(1, is_bwd ? slc : dlc, wc, lastlay_c, from, to, action);
         }
     }
 
@@ -355,14 +354,14 @@ void rnn_linear_fwd(const rnn_prb_t *p, mkldnn_rnn_direction_t direction,
     const int dic = p->dic;
     const int dlc = p->dlc;
     const int wc = max(sic, max(slc, dic));
-    bool is_lbr = p->alg == GRU_LINEAR_BEFORE_RESET;
+    bool is_lbr = p->alg == LBR_GRU;
 
     const int batch = p->mb;
-    const int n_gates = p->n_gates;
-    const int n_states = p->n_states;
+    const int n_gates = p->n_gates();
+    const int n_states = p->n_states();
     const int n_layer = p->n_layer;
     const int n_iter = p->n_iter;
-    const int n_dir = p->n_direction;
+    const int n_dir = p->n_directions();
     activation_t f = p->activation;
 
     AOC<const float> bias(bias_, n_layer, n_dir, (n_gates + is_lbr) * dic);
@@ -381,7 +380,7 @@ void rnn_linear_fwd(const rnn_prb_t *p, mkldnn_rnn_direction_t direction,
         // we first need to copy the initial states and input into ws
         // it simplifies the logic in the following code
         print(80, "rnn_linear_fwd: call copy_init dir_val = %d\n", dir_val);
-        copy_init(alg, sic, slc, dic, wc, batch, n_layer, n_iter, n_states, ws_,
+        copy_init(alg, sic, slc, dic, dlc, wc, batch, n_layer, n_iter, n_states, ws_,
                 src_layer_, src_iter_, iter_dir, lay_dir, dir_val, n_dir);
 
         // We run the grid of computation
@@ -444,11 +443,11 @@ void compute_ref_fwd(const rnn_prb_t *p, dnn_mem_t &src_layer_m,
             || direction == mkldnn_bidirectional_concat);
 
     const int wc = max(p->sic, max(p->slc, p->dic));
-    int ws_size = (p->n_layer + 2) * p->n_direction * (p->n_iter + 2)
-            * p->n_states * p->mb * wc;
+    int ws_size = (p->n_layer + 2) * p->n_directions() * (p->n_iter + 2)
+            * p->n_states() * p->mb * wc;
     auto *ws = new float[ws_size];
-    int gates_size = p->n_layer * p->n_direction * p->n_iter * p->mb
-            * p->n_gates * p->dic;
+    int gates_size = p->n_layer * p->n_directions() * p->n_iter * p->mb
+            * p->n_gates() * p->dic;
     auto *gates = new float[gates_size];
 
     rnn_linear_fwd(p, direction, (float *)src_iter_m, (float *)src_layer_m,
@@ -471,7 +470,6 @@ void rnn_bwd(alg_t alg, activation_t f, int sic, int slc, int dic, int wc,
         const float *weights_iter_h_, const float *bias_,
         const float *dst_iter_h_, const float *gates_,
         const float *diff_dst_layer_, const float *diff_dst_iter_h_) {
-
     AOC<const float> diff_dst_layer(diff_dst_layer_, batch, wc);
     AOC<const float> diff_dst_iter_h(diff_dst_iter_h_, batch, wc);
     AOC<const float> gates(gates_, batch, n_gates, dic);
@@ -508,7 +506,6 @@ void lstm_bwd(alg_t alg, int sic, int slc, int dic, int wc, int batch,
         const float *diff_dst_layer_, const float *diff_dst_iter_h_,
         const float *diff_dst_iter_c_) {
     // TODO: check sic and slc as last dimension in arrays and cycles
-
     // input
     AOC<const float> diff_dst_layer(diff_dst_layer_, batch, wc);
     AOC<const float> diff_dst_iter_c(diff_dst_iter_c_, batch, wc);
@@ -521,10 +518,10 @@ void lstm_bwd(alg_t alg, int sic, int slc, int dic, int wc, int batch,
     AOC<float> diff_src_iter_c(diff_src_iter_c_, batch, wc);
     AOC<float> b_gates(b_gates_, batch, n_gates, dic);
 
-    const int ohf = 0;
-    const int ohi = 1;
-    const int oho = 2;
-    const int ohc = 3;
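+    // gate offsets match the i, f, c, o order used in lstm_fwd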
+    const int ohi = 0;
+    const int ohf = 1;
+    const int ohc = 2;
+    const int oho = 3;
 
     for (int ib = 0; ib < batch; ib++)
         for (int ih = 0; ih < dic; ih++) {
@@ -583,11 +580,11 @@ void gru_bwd(alg_t alg, activation_t f, int sic, int slc, int dic, int wc,
     AOC<const float> diff_dst_layer(diff_dst_layer_, batch, wc);
     AOC<const float> diff_dst_iter_h(diff_dst_iter_h_, batch, wc);
     AOC<const float> gates(gates_, batch, n_gates, dic);
-    AOC<const float> weights_layer(weights_layer_, dic, n_gates, slc);
-    AOC<const float> weights_iter_h(weights_iter_h_, dic, n_gates, sic);
+    AOC<const float> weights_layer(weights_layer_, slc, n_gates, dic);
+    AOC<const float> weights_iter_h(weights_iter_h_, sic, n_gates, dic);
 
     AOC<float> diff_src_iter(diff_src_iter_, batch, wc);
-    AOC<float> diff_weights_iter_h(diff_weights_iter_h_, dic, n_gates, sic);
+    AOC<float> diff_weights_iter_h(diff_weights_iter_h_, sic, n_gates, dic);
     AOC<float> b_gates(b_gates_, batch, n_gates, dic);
 
     float *dhr_ = ws_local_;
@@ -664,8 +661,8 @@ void gru_lbr_bwd(alg_t alg, activation_t f, int sic, int slc, int dic, int wc,
     AOC<const float> diff_dst_layer(diff_dst_layer_, batch, wc);
     AOC<const float> diff_dst_iter_h(diff_dst_iter_h_, batch, wc);
     AOC<const float> gates(gates_, batch, n_gates, dic);
-    AOC<const float> weights_layer(weights_layer_, dic, n_gates, slc);
-    AOC<const float> weights_iter_h(weights_iter_h_, dic, n_gates, sic);
+    AOC<const float> weights_layer(weights_layer_, slc, n_gates, dic);
+    AOC<const float> weights_iter_h(weights_iter_h_, sic, n_gates, dic);
     AOC<const float> bias(bias_, n_gates + 1, dic);
 
     AOC<float> diff_src_iter(diff_src_iter_, batch, wc);
@@ -682,12 +679,12 @@ void gru_lbr_bwd(alg_t alg, activation_t f, int sic, int slc, int dic, int wc,
             Wh_b(ib, ih) = bias(3, ih);
 
     gemm("C", "N", "N", batch, dic, sic, 1.0, src_iter_, wc,
-            &weights_iter_h(0, 2, 0), dic, 1.0, Wh_b_, dic);
+            &weights_iter_h(0, 2, 0), n_gates * dic, 1.0, Wh_b_, dic);
 
 
 // dc = (1 - u) * dh; dc^ = dtanhf(c) * dc;
 // du = (h - u) * dh; du^ = dlogistic(u) * du;
-// dr = (Wh + b) * dc; dr^ = dlogistic(r) * dr;
+// dr = (Wh + b) * dc^; dr^ = dlogistic(r) * dr;
     const int ohu = 0;
     const int ohr = 1;
     const int ohc = 2;
@@ -700,12 +697,13 @@ void gru_lbr_bwd(alg_t alg, activation_t f, int sic, int slc, int dic, int wc,
             float c = gates(ib, ohc, ih);
             float du = (h - c) * dh;
             float dc = (1.0f - u) * dh;
-            float dr = Wh_b(ib, ih) * dc;
 
             b_gates(ib, ohu, ih) = dlogistic(u) * du;
-            b_gates(ib, ohr, ih) = dlogistic(r) * dr;
             b_gates(ib, ohc, ih) = dtanhf(c) * dc;
 
+            float dr = Wh_b(ib, ih) * b_gates(ib, ohc, ih);
+            b_gates(ib, ohr, ih) = dlogistic(r) * dr;
+
             b_gates_r(ib, ohu, ih) = b_gates(ib, ohu, ih);
             b_gates_r(ib, ohr, ih) = b_gates(ib, ohr, ih);
             b_gates_r(ib, ohc, ih) = b_gates(ib, ohc, ih) * r;
@@ -767,7 +765,7 @@ void rnn_cell_bwd(alg_t alg, activation_t f, int sic, int slc, int dic, int wc,
                 weights_iter, bias, dst_iter_h, gates, diff_dst_layer,
                 diff_dst_iter_h, ws_local_);
         break;
-    case GRU_LINEAR_BEFORE_RESET:
+    case LBR_GRU:
         gru_lbr_bwd(alg, f, sic, slc, dic, wc, batch, n_gates, diff_src_layer,
                 diff_src_iter_h, diff_weights_layer, diff_weights_iter,
                 diff_bias, b_gates, src_layer, src_iter_h, weights_layer,
@@ -790,14 +788,14 @@ void rnn_linear_bwd(const rnn_prb_t *p, mkldnn_rnn_direction_t direction,
     const int dic = p->dic;
     const int dlc = p->dlc;
     const int wc = max(sic, max(slc, dic));
-    bool is_lbr = p->alg == GRU_LINEAR_BEFORE_RESET;
+    bool is_lbr = p->alg == LBR_GRU;
 
     const int batch = p->mb;
-    const int n_gates = p->n_gates;
-    const int n_states = p->n_states;
+    const int n_gates = p->n_gates();
+    const int n_states = p->n_states();
     const int n_layer = p->n_layer;
     const int n_iter = p->n_iter;
-    const int n_dir = p->n_direction;
+    const int n_dir = p->n_directions();
     activation_t f = p->activation;
 
     const int X = n_states;
@@ -829,7 +827,7 @@ void rnn_linear_bwd(const rnn_prb_t *p, mkldnn_rnn_direction_t direction,
 
     int ws_local_size;
     switch (p->alg) {
-        case GRU_LINEAR_BEFORE_RESET:
+        case LBR_GRU:
             ws_local_size = batch * (n_gates + 1) * dic;
             break;
         case VANILLA_GRU:
@@ -843,9 +841,9 @@ void rnn_linear_bwd(const rnn_prb_t *p, mkldnn_rnn_direction_t direction,
             rnn_layer_direction_t lay_dir, int dir_val, rnn_action_t action) {
         // we first need to copy the initial states and input into ws
         // it simplifies the logic in the following code
-        copy_init(alg, sic, slc, dic, wc, batch, n_layer, n_iter, n_states,
+        copy_init(alg, sic, slc, dic, dlc, wc, batch, n_layer, n_iter, n_states,
                 wsb_, diff_dst_layer_, diff_dst_iter_, iter_dir, lay_dir,
-                dir_val, n_dir, true);
+                dir_val, n_dir, true, direction == mkldnn_bidirectional_concat);
 
         // We run the grid of computation
         for (int j = n_layer - 1; j >= 0; j--) {
@@ -929,12 +927,12 @@ void compute_ref_bwd(const rnn_prb_t *p, dnn_mem_t &input_m,
 
     assert(p->dlc == p->dic);
     int wc = max(p->sic, max(p->slc, p->dic));
-    int ws_size = (p->n_layer + 2) * p->n_direction * (p->n_iter + 2)
-            * p->n_states * p->mb * wc;
+    int ws_size = (p->n_layer + 2) * p->n_directions() * (p->n_iter + 2)
+            * p->n_states() * p->mb * wc;
     auto *ws = new float[ws_size];
     init_buffer(ws, ws_size, -55.); // ??!! Temporary. For debug.
-    int gates_size = p->n_layer * p->n_direction * p->n_iter * p->mb
-            * p->n_gates * p->dic;
+    int gates_size = p->n_layer * p->n_directions() * p->n_iter * p->mb
+            * p->n_gates() * p->dic;
     auto *gates = new float[gates_size];
 
     rnn_linear_fwd(p, direction, (float *)states_m, (float *)input_m,
index 042678e..d940831 100644 (file)
@@ -51,11 +51,13 @@ int fill_memory(const rnn_prb_t *p, rnn_data_kind_t kind, dnn_mem_t &mem1,
         size_t idx_end = MIN2(idx_start + chunk_size, nelems);
 
         std::minstd_rand msr;
-        std::uniform_real_distribution<float> gen(-1, 1);
+        std::normal_distribution<float> gen(.0f, .001f);
         msr.discard(idx_start);
 
-        for (size_t idx = idx_start; idx < idx_end; ++idx)
-            mem2.set_elem(idx, gen(msr));
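+        // clamp the gaussian samples so reference data stays within [-1, 1]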
+        for (size_t idx = idx_start; idx < idx_end; ++idx) {
+            auto val = gen(msr);
+            mem2.set_elem(idx, MAX2(MIN2(val, 1.0f), -1.0f));
+        }
     });
 
     mem1.reorder(mem2);
@@ -64,12 +66,11 @@ int fill_memory(const rnn_prb_t *p, rnn_data_kind_t kind, dnn_mem_t &mem1,
 
 inline int init_pd(const rnn_prb_t *p, mkldnn_rnn_desc_t rd[2],
         mkldnn_primitive_desc_t rpd[2], res_t *r) {
-    const bool is_bwd = p->prop_ == mkldnn_backward;
+    const bool is_bwd = p->prop == mkldnn_backward;
     // If we are testing backward, we have to first run forward
     // training first in order to generate a valid workspace.
     auto fwd_prop = is_bwd ? mkldnn_forward_training : mkldnn_forward_inference;
-
-    const bool is_gru_lbr = p->alg == GRU_LINEAR_BEFORE_RESET;
+    const bool is_gru_lbr = p->alg == LBR_GRU;
     int the_stride = 1;
     /// @todo we need to add stride support for diff_* tensors too
     mkldnn_memory_desc_t input_d, states_d, weights_input_d, weights_states_d,
@@ -81,11 +82,11 @@ inline int init_pd(const rnn_prb_t *p, mkldnn_rnn_desc_t rd[2],
     mkldnn_dims_t input_dims = { p->n_iter, p->mb, p->slc };
     // bidirectional = 2, s for lstm = 2, for all other = 1
     mkldnn_dims_t weights_input_dims
-            = { p->n_layer, p->n_direction, p->slc, p->n_gates, p->dic };
+            = { p->n_layer, p->n_directions(), p->slc, p->n_gates(), p->dic };
     mkldnn_dims_t weights_states_dims
-            = { p->n_layer, p->n_direction, p->sic, p->n_gates, p->dic };
+            = { p->n_layer, p->n_directions(), p->sic, p->n_gates(), p->dic };
     mkldnn_dims_t bias_dims
-            = { p->n_layer, p->n_direction, p->n_gates + is_gru_lbr, p->dic };
+            = { p->n_layer, p->n_directions(), p->n_gates() + is_gru_lbr, p->dic };
     // mkldnn_tnc
     int lastlay_dlc = (p->direction == mkldnn_bidirectional_concat) ?
             2 * p->dlc :
@@ -93,17 +94,17 @@ inline int init_pd(const rnn_prb_t *p, mkldnn_rnn_desc_t rd[2],
     mkldnn_dims_t dst_last_layer_dims = { p->n_iter, p->mb, lastlay_dlc };
 
     DNN_SAFE(mkldnn_memory_desc_init(
-                     &input_d, 3, input_dims, p->cfg_[SRC].dt, mkldnn_tnc),
+                     &input_d, 3, input_dims, p->cfg[SRC].dt, mkldnn_tnc),
             WARN);
     input_d.layout_desc.blocking.strides[0][0] += the_stride;
     DNN_SAFE(mkldnn_memory_desc_init(
-                     &diff_input_d, 3, input_dims, p->cfg_[SRC].dt, mkldnn_any),
+                     &diff_input_d, 3, input_dims, p->cfg[SRC].dt, mkldnn_any),
             WARN);
 
     mkldnn_dims_t states_dims
-            = { p->n_layer, p->n_direction, p->n_states, p->mb, p->sic };
+            = { p->n_layer, p->n_directions(), p->n_states(), p->mb, p->sic };
     DNN_SAFE(mkldnn_memory_desc_init(
-                     &states_d, 5, states_dims, p->cfg_[SRC].dt, mkldnn_ldsnc),
+                     &states_d, 5, states_dims, p->cfg[SRC].dt, mkldnn_ldsnc),
             WARN);
 
     states_d.layout_desc.blocking.strides[0][3] = p->sic + the_stride;
@@ -116,42 +117,42 @@ inline int init_pd(const rnn_prb_t *p, mkldnn_rnn_desc_t rd[2],
                 * states_d.dims[d + 1];
 
     DNN_SAFE(mkldnn_memory_desc_init(&diff_states_d, 5, states_dims,
-                     p->cfg_[SRC].dt, mkldnn_any),
+                     p->cfg[SRC].dt, mkldnn_any),
             WARN);
 
     DNN_SAFE(mkldnn_memory_desc_init(&weights_input_d, 5, weights_input_dims,
-                     p->cfg_[SRC].dt, mkldnn_any),
+                     p->cfg[SRC].dt, mkldnn_any),
             WARN);
     DNN_SAFE(mkldnn_memory_desc_init(&diff_weights_input_d, 5,
-                     weights_input_dims, p->cfg_[SRC].dt, mkldnn_any),
+                     weights_input_dims, p->cfg[SRC].dt, mkldnn_any),
             WARN);
 
     DNN_SAFE(mkldnn_memory_desc_init(&weights_states_d, 5, weights_states_dims,
-                     p->cfg_[SRC].dt, mkldnn_any),
+                     p->cfg[SRC].dt, mkldnn_any),
             WARN);
     DNN_SAFE(mkldnn_memory_desc_init(&diff_weights_states_d, 5,
-                     weights_states_dims, p->cfg_[SRC].dt, mkldnn_any),
+                     weights_states_dims, p->cfg[SRC].dt, mkldnn_any),
             WARN);
 
     DNN_SAFE(mkldnn_memory_desc_init(
-                     &bias_d, 4, bias_dims, p->cfg_[SRC].dt, mkldnn_any),
+                     &bias_d, 4, bias_dims, p->cfg[SRC].dt, mkldnn_any),
             WARN);
     DNN_SAFE(mkldnn_memory_desc_init(
-                     &diff_bias_d, 4, bias_dims, p->cfg_[SRC].dt, mkldnn_any),
+                     &diff_bias_d, 4, bias_dims, p->cfg[SRC].dt, mkldnn_any),
             WARN);
 
     DNN_SAFE(mkldnn_memory_desc_init(&dst_last_layer_d, 3, dst_last_layer_dims,
-                     p->cfg_[SRC].dt, mkldnn_tnc),
+                     p->cfg[SRC].dt, mkldnn_tnc),
             WARN);
     dst_last_layer_d.layout_desc.blocking.strides[0][0] += the_stride;
     DNN_SAFE(mkldnn_memory_desc_init(&diff_last_layer_d, 3, dst_last_layer_dims,
-                     p->cfg_[SRC].dt, mkldnn_any),
+                     p->cfg[SRC].dt, mkldnn_any),
             WARN);
 
     mkldnn_dims_t dst_last_iteration_dims
-            = { p->n_layer, p->n_direction, p->n_states, p->mb, p->dic };
+            = { p->n_layer, p->n_directions(), p->n_states(), p->mb, p->dic };
     DNN_SAFE(mkldnn_memory_desc_init(&dst_last_iteration_d, 5,
-                     dst_last_iteration_dims, p->cfg_[SRC].dt, mkldnn_ldsnc),
+                     dst_last_iteration_dims, p->cfg[SRC].dt, mkldnn_ldsnc),
             WARN);
 
     dst_last_iteration_d.layout_desc.blocking.strides[0][3]
@@ -166,7 +167,7 @@ inline int init_pd(const rnn_prb_t *p, mkldnn_rnn_desc_t rd[2],
                 * dst_last_iteration_d.dims[d + 1];
 
     DNN_SAFE(mkldnn_memory_desc_init(&diff_last_iteration_d, 5,
-                     dst_last_iteration_dims, p->cfg_[SRC].dt, mkldnn_any),
+                     dst_last_iteration_dims, p->cfg[SRC].dt, mkldnn_any),
             WARN);
 
     mkldnn_alg_kind_t kind = alg2kind(p->alg);
@@ -186,7 +187,7 @@ inline int init_pd(const rnn_prb_t *p, mkldnn_rnn_desc_t rd[2],
     }
 
     if (is_bwd) {
-        DNN_SAFE(mkldnn_rnn_backward_desc_init(&rd[1], p->prop_, &rcd,
+        DNN_SAFE(mkldnn_rnn_backward_desc_init(&rd[1], p->prop, &rcd,
                          p->direction, &input_d, &states_d, &weights_input_d,
                          &weights_states_d, &bias_d, &dst_last_layer_d,
                          &dst_last_iteration_d, &diff_input_d, &diff_states_d,
@@ -241,13 +242,13 @@ int doit(const rnn_prb_t *p, res_t *r) {
     const auto fp = mkldnn_f32;
 
     if (p->alg != VANILLA_LSTM && p->alg != VANILLA_RNN
-        && p->alg != VANILLA_GRU && p->alg != GRU_LINEAR_BEFORE_RESET) {
+        && p->alg != VANILLA_GRU && p->alg != LBR_GRU) {
         printf("p->alg: %d\n", (int)p->alg);
         r->state = UNIMPLEMENTED;
         return OK;
     }
 
-    const bool is_bwd = p->prop_ == mkldnn_backward;
+    const bool is_bwd = p->prop == mkldnn_backward;
 
     dnn_mem_t *input_dt = nullptr;
     dnn_mem_t *states_dt = nullptr;
@@ -257,6 +258,8 @@ int doit(const rnn_prb_t *p, res_t *r) {
     dnn_mem_t *dst_last_layer_dt = nullptr;
     dnn_mem_t *dst_last_iteration_dt = nullptr;
 
+    dnn_mem_t *bwd_weights_input_dt = nullptr;
+    dnn_mem_t *bwd_weights_states_dt = nullptr;
     dnn_mem_t *dst_diff_input_dt = nullptr;
     dnn_mem_t *dst_diff_states_dt = nullptr;
     dnn_mem_t *dst_diff_weights_input_dt = nullptr;
@@ -284,8 +287,8 @@ int doit(const rnn_prb_t *p, res_t *r) {
     dnn_mem_t *workspace_dt = nullptr;
 
     mkldnn_rnn_desc_t rd[2];
-    mkldnn_primitive_desc_t rpd[2];
-    mkldnn_primitive_t c;
+    mkldnn_primitive_desc_t rpd[2] = {nullptr};
+    mkldnn_primitive_t c{};
     SAFE(init_pd(p, rd, rpd, r), WARN);
     if (r->state == SKIPPED || r->state == UNIMPLEMENTED)
         return OK;
@@ -298,6 +301,8 @@ int doit(const rnn_prb_t *p, res_t *r) {
     auto &dst_last_layer_dt_d = rd[0].dst_layer_desc;
     auto &dst_last_iteration_dt_d = rd[0].dst_iter_desc;
 
+    auto &bwd_weights_input_dt_d = rd[1].weights_layer_desc;
+    auto &bwd_weights_states_dt_d = rd[1].weights_iter_desc;
     auto &diff_src_layer_dt_d = rd[1].diff_src_layer_desc;
     auto &diff_src_iter_dt_d = rd[1].diff_src_iter_desc;
     auto &diff_weights_layer_dt_d = rd[1].diff_weights_layer_desc;
@@ -315,6 +320,8 @@ int doit(const rnn_prb_t *p, res_t *r) {
     dst_last_iteration_dt = new dnn_mem_t(dst_last_iteration_dt_d, fp);
 
     if (is_bwd) {
+        bwd_weights_input_dt = new dnn_mem_t(bwd_weights_input_dt_d, fp);
+        bwd_weights_states_dt = new dnn_mem_t(bwd_weights_states_dt_d, fp);
         dst_diff_input_dt = new dnn_mem_t(diff_src_layer_dt_d, fp);
         dst_diff_states_dt = new dnn_mem_t(diff_src_iter_dt_d, fp);
         dst_diff_weights_input_dt = new dnn_mem_t(diff_weights_layer_dt_d, fp);
@@ -367,6 +374,8 @@ int doit(const rnn_prb_t *p, res_t *r) {
             WARN);
 
     if (is_bwd) {
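+        // bwd may pick different weights layouts; fill them from the fwd copies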
+        SAFE(bwd_weights_states_dt->reorder(*weights_states_dt), WARN);
+        SAFE(bwd_weights_input_dt->reorder(*weights_input_dt), WARN);
         SAFE(fill_memory(
                      p, dst_diff_input, *dst_diff_input_dt, *dst_diff_input_fp),
                 WARN);
@@ -401,7 +410,7 @@ int doit(const rnn_prb_t *p, res_t *r) {
         DNN_SAFE(mkldnn_primitive_create(&c, rpd[0], inputs, outputs), WARN);
         SAFE(execute(c), WARN);
 #endif
-        if ((p->prop_ == mkldnn_forward) && (bench_mode & CORR)) {
+        if ((p->prop == mkldnn_forward) && (bench_mode & CORR)) {
             compute_ref_fwd(p, *input_fp, *states_fp, *weights_input_fp,
                     *weights_states_fp, *bias_fp, *dst_last_layer_fp,
                     *dst_last_iteration_fp, p->direction);
@@ -422,7 +431,7 @@ int doit(const rnn_prb_t *p, res_t *r) {
     if (is_bwd) {
         mkldnn_primitive_at_t inputs[] = {
             { input_dt->p_, 0 }, { states_dt->p_, 0 },
-            { weights_input_dt->p_, 0 }, { weights_states_dt->p_, 0 },
+            { bwd_weights_input_dt->p_, 0 }, { bwd_weights_states_dt->p_, 0 },
             { bias_dt->p_, 0 }, { dst_last_layer_dt->p_, 0 },
             { dst_last_iteration_dt->p_, 0 }, { diff_last_layer_dt->p_, 0 },
             { diff_last_iteration_dt->p_, 0 }, { workspace_dt->p_, 0 },
@@ -513,6 +522,8 @@ int doit(const rnn_prb_t *p, res_t *r) {
     delete dst_last_iteration_fp;
 
     if (is_bwd) {
+        delete bwd_weights_input_dt;
+        delete bwd_weights_states_dt;
         delete dst_diff_input_fp;
         delete dst_diff_states_fp;
         delete dst_diff_weights_input_fp;
@@ -542,6 +553,10 @@ int doit(const rnn_prb_t *p, res_t *r) {
 
     delete workspace_dt;
 
+    DNN_SAFE(mkldnn_primitive_desc_destroy(rpd[0]), CRIT);
+    DNN_SAFE(mkldnn_primitive_desc_destroy(rpd[1]), CRIT);
+    DNN_SAFE(mkldnn_primitive_destroy(c), CRIT);
+
     return OK;
 }
 } // namespace rnn
index 5ee988c..36d6a56 100644 (file)
@@ -29,7 +29,7 @@
 
 namespace rnn {
 
-enum alg_t { VANILLA_RNN, VANILLA_LSTM, VANILLA_GRU, GRU_LINEAR_BEFORE_RESET };
+enum alg_t { VANILLA_RNN, VANILLA_LSTM, VANILLA_GRU, LBR_GRU };
 alg_t str2alg(const char *str);
 const char *alg2str(alg_t alg);
 mkldnn_alg_kind_t alg2kind(alg_t alg);
@@ -39,6 +39,9 @@ activation_t str2activation(const char *str);
 const char *activation2str(activation_t alg);
 mkldnn_alg_kind_t activation2kind(activation_t alg);
 
+mkldnn_rnn_direction_t str2direction(const char *str);
+const char *direction2str(mkldnn_rnn_direction_t direction);
+
 const int H = 0;
 const int C = 1;
 
@@ -84,9 +87,6 @@ private:
 };
 
 struct rnn_desc_t {
-    alg_t alg;
-    activation_t activation;
-    mkldnn_rnn_direction_t direction;
     int sic;
     int slc;
     int dic;
@@ -96,6 +96,7 @@ struct rnn_desc_t {
     int n_iter;
     const char *name;
 };
+int str2desc(rnn_desc_t *desc, const char *str);
 
 enum rnn_data_kind_t {
     input,
@@ -158,44 +159,31 @@ extern const _dt_conf_t conf_f32;
 
 struct rnn_prb_t : public rnn_desc_t {
     rnn_prb_t(const rnn_desc_t desc, const dt_conf_t *cfg,
-            mkldnn_prop_kind_t prop)
-        : rnn_desc_t(desc), cfg_(cfg), prop_(prop) {
-        switch (alg) {
-        case VANILLA_LSTM:
-            n_weights = 1;
-            n_states = 2;
-            n_gates = 4;
-            break;
-        case VANILLA_GRU:
-            n_weights = 1;
-            n_states = 1;
-            n_gates = 3;
-            break;
-        case GRU_LINEAR_BEFORE_RESET:
-            n_weights = 1;
-            n_states = 1;
-            n_gates = 3;
-            break;
-        default:
-            n_weights = 1;
-            n_states = 1;
-            n_gates = 1;
-            break;
-        }
-
-        // TODO: recheck below condition
-        if (direction == mkldnn_bidirectional_concat
-                || direction == mkldnn_bidirectional_sum)
-            n_direction = 2;
-        else
-            n_direction = 1;
+            mkldnn_prop_kind_t prop, alg_t alg,
+            mkldnn_rnn_direction_t direction, activation_t activation)
+        : rnn_desc_t(desc), cfg(cfg), prop(prop), alg(alg),
+        direction(direction), activation(activation) {
     }
 
-    const dt_conf_t *cfg_;
-    mkldnn_prop_kind_t prop_;
-    int n_direction; // 1 for unidirectional, 2 for bidirectional
+    int n_directions() const {
+        return (direction == mkldnn_bidirectional_concat
+                       || direction == mkldnn_bidirectional_sum) ?
+                2 :
+                1;
+    }
+    int n_weights() const { return 1; }
+    int n_states() const { return alg == VANILLA_LSTM ? 2 : 1; }
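+    // LSTM carries 4 gates (i, f, c, o), GRU variants 3, vanilla RNN 1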
+    int n_gates() const {
+        return alg == VANILLA_LSTM ?
+                4 :
+                (alg == VANILLA_GRU || alg == LBR_GRU ? 3 : 1);
+    }
 
-    int n_weights, n_states, n_gates;
+    const dt_conf_t *cfg;
+    mkldnn_prop_kind_t prop;
+    alg_t alg;
+    mkldnn_rnn_direction_t direction;
+    activation_t activation;
 
 private:
     rnn_prb_t(const rnn_prb_t &) = delete;
@@ -240,7 +228,8 @@ inline void inv_ntc_off_f(
 // mkldnn_ldsnc
 inline size_t ldsnc_off_f(
         const rnn_prb_t *p, int l, int d, int s, int n, int c) {
-    return ((((size_t)l * p->n_direction + d) * p->n_states + s) * p->mb + n)
+    return ((((size_t)l * p->n_directions() + d) * p->n_states() + s) * p->mb
+                   + n)
             * p->sic
             + c;
 }
@@ -251,10 +240,10 @@ inline void inv_ldsnc_off_f(const rnn_prb_t *p, size_t off, int &l, int &d,
     off /= p->sic;
     n = off % p->mb;
     off /= p->mb;
-    s = off % p->n_states;
-    off /= p->n_states;
-    d = off % p->n_direction;
-    off /= p->n_direction;
+    s = off % p->n_states();
+    off /= p->n_states();
+    d = off % p->n_directions();
+    off /= p->n_directions();
     l = off % p->n_layer;
     off /= p->n_layer;
     assert(off == 0);
@@ -263,7 +252,8 @@ inline void inv_ldsnc_off_f(const rnn_prb_t *p, size_t off, int &l, int &d,
 // mkldnn_ldigo
 inline size_t ldigo_off_f(
         const rnn_prb_t *p, int l, int d, int w, int ic, int oc) {
-    return ((((size_t)l * p->n_direction + d) * p->n_weights + w) * (4 * p->slc)
+    return ((((size_t)l * p->n_directions() + d) * p->n_weights() + w)
+                           * (4 * p->slc)
                    + ic)
             * p->sic
             + oc;
@@ -275,10 +265,10 @@ inline void inv_ldigo_off_f(const rnn_prb_t *p, size_t off, int &l, int &d,
     off /= p->sic;
     ic = off % (4 * p->slc);
     off /= (4 * p->slc);
-    w = off % p->n_weights;
-    off /= p->n_weights;
-    d = off % p->n_direction;
-    off /= p->n_direction;
+    w = off % p->n_weights();
+    off /= p->n_weights();
+    d = off % p->n_directions();
+    off /= p->n_directions();
     l = off % p->n_layer;
     off /= p->n_layer;
     assert(off == 0);
@@ -287,7 +277,8 @@ inline void inv_ldigo_off_f(const rnn_prb_t *p, size_t off, int &l, int &d,
 // mkldnn_ldwOcIc
 inline size_t ldwOcIc_off_f(
         const rnn_prb_t *p, int l, int d, int w, int oc, int ic) {
-    return ((((size_t)l * p->n_direction + d) * p->n_weights + w) * (4 * p->sic)
+    return ((((size_t)l * p->n_directions() + d) * p->n_weights() + w)
+                           * (4 * p->sic)
                    + oc)
             * p->slc
             + ic;
@@ -299,10 +290,10 @@ inline void inv_ldwOcIc_off_f(const rnn_prb_t *p, size_t off, int &l, int &d,
     off /= p->slc;
     oc = off % (4 * p->sic);
     off /= (4 * p->sic);
-    w = off % p->n_weights;
-    off /= p->n_weights;
-    d = off % p->n_direction;
-    off /= p->n_direction;
+    w = off % p->n_weights();
+    off /= p->n_weights();
+    d = off % p->n_directions();
+    off /= p->n_directions();
     l = off % p->n_layer;
     off /= p->n_layer;
     assert(off == 0);
@@ -310,17 +301,18 @@ inline void inv_ldwOcIc_off_f(const rnn_prb_t *p, size_t off, int &l, int &d,
 
 // bias: mkldnn_ldgo
 inline size_t ldgo_off_f(const rnn_prb_t *p, int l, int d, int b, int c) {
-    return (((size_t)l * p->n_direction + d) * p->n_gates + b) * p->sic + c;
+    return (((size_t)l * p->n_directions() + d) * p->n_gates() + b) * p->sic
+            + c;
 }
 
 inline void inv_ldgo_off_f(
         const rnn_prb_t *p, size_t off, int &l, int &d, int &b, int &c) {
     c = off % p->sic;
     off /= p->sic;
-    b = off % p->n_gates;
-    off /= p->n_gates;
-    d = off % p->n_direction;
-    off /= p->n_direction;
+    b = off % p->n_gates();
+    off /= p->n_gates();
+    d = off % p->n_directions();
+    off /= p->n_directions();
     l = off % p->n_layer;
     off /= p->n_layer;
     assert(off == 0);
@@ -339,16 +331,16 @@ inline void inv_tnc_off_f(
     off /= p->mb;
     t = off % p->n_iter;
     off /= p->n_iter;
-    s = off % p->n_states;
-    off /= p->n_states;
+    s = off % p->n_states();
+    off /= p->n_states();
     assert(off == 0);
 }
 
 void perf_report(const rnn_prb_t *p, const res_t *r, const char *pstr);
 
 int doit(const rnn_prb_t *p, res_t *res);
-void check(const rnn_prb_t *p);
-int bench(int argc, char **argv);
+void check(rnn_desc_t *p);
+int bench(int argc, char **argv, bool main_bench = true);
 } // namespace rnn
 
 #endif
index 5c221fd..124cbec 100644 (file)
@@ -33,7 +33,7 @@ alg_t str2alg(const char *str) {
     CASE(VANILLA_RNN);
     CASE(VANILLA_LSTM);
     CASE(VANILLA_GRU);
-    CASE(GRU_LINEAR_BEFORE_RESET);
+    CASE(LBR_GRU);
 #undef CASE
     assert(!"unknown algorithm");
     return VANILLA_RNN;
@@ -46,8 +46,8 @@ const char *alg2str(alg_t alg) {
         return "VANILLA_LSTM";
     if (alg == VANILLA_GRU)
         return "VANILLA_GRU";
-    if (alg == GRU_LINEAR_BEFORE_RESET)
-        return "GRU_LINEAR_BEFORE_RESET";
+    if (alg == LBR_GRU)
+        return "LBR_GRU";
     assert(!"unknown algorithm");
     return "unknown algorithm";
 }
@@ -59,7 +59,7 @@ mkldnn_alg_kind_t alg2kind(alg_t alg) {
         return mkldnn_vanilla_lstm;
     if (alg == VANILLA_GRU)
         return mkldnn_vanilla_gru;
-    if (alg == GRU_LINEAR_BEFORE_RESET)
+    if (alg == LBR_GRU)
         return mkldnn_gru_linear_before_reset;
     assert(!"unknown algorithm");
     return mkldnn_alg_kind_undef;
@@ -99,16 +99,20 @@ mkldnn_alg_kind_t activation2kind(activation_t act) {
     return alg_kind;
 }
 
-const char *direction2str(mkldnn_rnn_direction_t direction) {
-
-    // typedef enum {
-    //     mkldnn_unidirectional_left2right,
-    //     mkldnn_unidirectional_right2left,
-    //     mkldnn_unidirectional = mkldnn_unidirectional_left2right,
-    //     mkldnn_bidirectional_concat,
-    //     mkldnn_bidirectional_sum,
-    // } mkldnn_rnn_direction_t;
+mkldnn_rnn_direction_t str2direction(const char *str) {
+    if (!strcasecmp("left2right", str))
+        return mkldnn_unidirectional_left2right;
+    if (!strcasecmp("right2left", str))
+        return mkldnn_unidirectional_right2left;
+    if (!strcasecmp("concat", str))
+        return mkldnn_bidirectional_concat;
+    if (!strcasecmp("sum", str))
+        return mkldnn_bidirectional_sum;
+    assert(!"unknown direction");
+    return mkldnn_unidirectional_left2right;
+}
 
+const char *direction2str(mkldnn_rnn_direction_t direction) {
     if (direction == mkldnn_unidirectional_left2right)
         return "left2right";
     if (direction == mkldnn_unidirectional_right2left)
@@ -121,14 +125,71 @@ const char *direction2str(mkldnn_rnn_direction_t direction) {
     return "unknown direction";
 }
 
+int str2desc(rnn_desc_t *desc, const char *str) {
+    rnn_desc_t d{0};
+
+    /* canonical form:
+     * lXtXmbXsicXslcXdicXdlcXnS
+     *
+     * where: X is a number, S is a string
+     * note: symbol `_` is ignored
+     *
+     * implicit rules:
+     *  - default values:
+     *      l = 1, t = 1, mb = 2, S="wip"
+     *  - if slc/dlc/dic is undefined => slc/dlc/dic = sic
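+     *
+     *  example: `l2t3mb32sic512slc128nexp0` gives n_layer = 2, n_iter = 3,
+     *  mb = 32, sic = 512, slc = 128, dic = dlc = sic, name = "exp0"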
+     */
+
+    d.n_layer = 1;
+    d.n_iter = 1;
+    d.mb = 2;
+    d.name = "\"wip\"";
+
+    const char *s = str;
+    assert(s);
+
+#   define CASE_NN(p, c) do { \
+        if (!strncmp(p, s, strlen(p))) { \
+            ok = 1; s += strlen(p); \
+            char *end_s; d. c = strtol(s, &end_s, 10); s += (end_s - s); \
+        } \
+    } while (0)
+#   define CASE_N(c) CASE_NN(#c, c)
+    while (*s) {
+        int ok = 0;
+        CASE_NN("l", n_layer);
+        CASE_NN("t", n_iter);
+        CASE_N(mb);
+        CASE_N(sic);
+        CASE_N(slc);
+        CASE_N(dic);
+        CASE_N(dlc);
+        if (*s == 'n') { d.name = s + 1; break; }
+        if (*s == '_') ++s;
+        if (!ok) return FAIL;
+    }
+#   undef CASE_NN
+#   undef CASE_N
+
+    if (d.sic == 0) return FAIL;
+    if (d.slc == 0) d.slc = d.sic;
+    if (d.dlc == 0) d.dlc = d.sic;
+    if (d.dic == 0) d.dic = d.sic;
+
+    *desc = d;
+
+    return OK;
+}
+
+
 void prb2str(const rnn_prb_t *p, const res_t *res, char *buffer) {
     int rem_len = max_prb_len;
 
-    DPRINT("%s(%s,%s)", alg2str(p->alg), activation2str(p->activation),
+    DPRINT("%s,%s,%s,", alg2str(p->alg), activation2str(p->activation),
             direction2str(p->direction));
     DPRINT("l%d", p->n_layer);
     DPRINT("t%d", p->n_iter);
-    DPRINT("m%d", p->mb);
+    DPRINT("mb%d", p->mb);
     DPRINT("sic%d", p->sic);
     DPRINT("slc%d", p->slc);
     DPRINT("dic%d", p->dic);
@@ -145,8 +206,7 @@ float logistic(float x) {
     return 1.0f / (1.0f + expf(-x));
 }
 float dlogistic(float x) {
-    float tmp = logistic(x);
-    return tmp * (1 - tmp);
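+    // x is already the logistic output, hence f'(x) = x * (1 - x)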
+    return x * (1 - x);
 }
 float relu(float x) {
     return x > 0 ? x : 0;
@@ -155,7 +215,7 @@ float drelu(float x) {
     return float(x > 0);
 }
 float dtanhf(float x) {
-    return (1 - (tanhf(x) * tanhf(x)));
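+    // x is already tanh's output: 1 - tanh^2 = (1 - x) * (1 + x)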
+    return (1 - x) * (1 + x);
 }
 
 int compare_dat(const rnn_prb_t *p, rnn_data_kind_t kind, dnn_mem_t &mem_dt,
@@ -164,10 +224,6 @@ int compare_dat(const rnn_prb_t *p, rnn_data_kind_t kind, dnn_mem_t &mem_dt,
 
     const char *skind = rnn_data_kind2str(kind);
 
-    int in = 0, below = 0, above = 0;
-    int in_ok = 0, below_ok = 0, above_ok = 0;
-    int non_zero = 0;
-
     diff_norm_t diff_norm;
 
     r->errors = 0;
@@ -175,30 +231,14 @@ int compare_dat(const rnn_prb_t *p, rnn_data_kind_t kind, dnn_mem_t &mem_dt,
 
     for (size_t i = 0; i < nelems; ++i) {
         const float dt = ((float *)mem_dt)[i];
-        const float fp0 = ((float *)mem_fp)[i];
-
-        float fp = fp0;
+        const float fp = ((float *)mem_fp)[i];
+        diff_norm.update(fp, dt);
 
         const float diff = fabsf(fp - dt);
         const float rel_diff = diff / (fabsf(fp) > FLT_MIN ? fabsf(fp) : 1);
 
-        bool ok = true;
-        if (fp < p->cfg_[kind].min) {
-            diff_norm.update(p->cfg_[kind].min, dt);
-            ok = dt == p->cfg_[kind].min;
-            below += 1;
-            below_ok += ok;
-        } else if (fp > p->cfg_[kind].max) {
-            diff_norm.update(p->cfg_[kind].max, dt);
-            ok = dt == p->cfg_[kind].max;
-            above += 1;
-            above_ok += ok;
-        } else {
-            diff_norm.update(fp, dt);
-            ok = (fabs(fp) > 1e-5 ? rel_diff : diff) <= p->cfg_[kind].eps;
-            in += 1;
-            in_ok += ok;
-        }
+        const bool ok = (fabs(fp) > 1e-5 ? rel_diff : diff) <= p->cfg[kind].eps;
+
         if (!ok) {
             r->errors++;
             if (r->errors < 10 || verbose >= 10) {
@@ -208,58 +248,58 @@ int compare_dat(const rnn_prb_t *p, rnn_data_kind_t kind, dnn_mem_t &mem_dt,
                 case input:
                     inv_ntc_off_f(p, i, n, t, c);
                     print(0, "%lu, %s, [%s][%d,%d,%d] "
-                             "fp:%8g fp0:%8g dt:%8g diff:%8g rdiff:%8g\n",
+                             "fp:%8g dt:%8g diff:%8g rdiff:%8g\n",
                             (unsigned long)i,
                             final_compare == false ? "REORDER " : "", skind, n,
-                            t, c, fp, fp0, dt, diff, rel_diff);
+                            t, c, fp, dt, diff, rel_diff);
                     break;
                 case states:
                     inv_ldsnc_off_f(p, i, l, d, s, n, c);
                     print(0, "%lu, %s, [%s][%d,%d,%d,%d,%d] "
-                             "fp:%8g fp0:%8g dt:%8g diff:%8g rdiff:%8g\n",
+                             "fp:%8g dt:%8g diff:%8g rdiff:%8g\n",
                             (unsigned long)i,
                             final_compare == false ? "REORDER " : "", skind, l,
-                            d, s, n, c, fp, fp0, dt, diff, rel_diff);
+                            d, s, n, c, fp, dt, diff, rel_diff);
                     break;
                 case weights_input:
                     inv_ldigo_off_f(p, i, l, d, w, ic, oc);
                     print(0, "%lu, %s, [%s][%d,%d,%d,%d,%d] "
-                             "fp:%8g fp0:%8g dt:%8g diff:%8g rdiff:%8g\n",
+                             "fp:%8g dt:%8g diff:%8g rdiff:%8g\n",
                             (unsigned long)i,
                             final_compare == false ? "REORDER " : "", skind, l,
-                            d, w, ic, oc, fp, fp0, dt, diff, rel_diff);
+                            d, w, ic, oc, fp, dt, diff, rel_diff);
                     break;
                 case weights_states:
                     inv_ldigo_off_f(p, i, l, d, w, ic, oc);
                     print(0, "%lu, %s, [%s][%d,%d,%d,%d,%d] "
-                             "fp:%8g fp0:%8g dt:%8g diff:%8g rdiff:%8g\n",
+                             "fp:%8g dt:%8g diff:%8g rdiff:%8g\n",
                             (unsigned long)i,
                             final_compare == false ? "REORDER " : "", skind, l,
-                            d, w, ic, oc, fp, fp0, dt, diff, rel_diff);
+                            d, w, ic, oc, fp, dt, diff, rel_diff);
                     break;
                 case bias:
                     inv_ldgo_off_f(p, i, l, d, b, c);
                     print(0, "%lu, %s, [%s][%d,%d,%d,%d] "
-                             "fp:%8g fp0:%8g dt:%8g diff:%8g rdiff:%8g\n",
+                             "fp:%8g dt:%8g diff:%8g rdiff:%8g\n",
                             (unsigned long)i,
                             final_compare == false ? "REORDER " : "", skind, l,
-                            d, b, c, fp, fp0, dt, diff, rel_diff);
+                            d, b, c, fp, dt, diff, rel_diff);
                     break;
                 case dst_last_layer:
                     inv_tnc_off_f(p, i, s, t, n, c);
                     print(0, "%lu, %s, [%s][%d,%d,%d,%d] "
-                             "fp:%8g fp0:%8g dt:%8g diff:%8g rdiff:%8g\n",
+                             "fp:%8g dt:%8g diff:%8g rdiff:%8g\n",
                             (unsigned long)i,
                             final_compare == false ? "REORDER " : "", skind, s,
-                            t, n, c, fp, fp0, dt, diff, rel_diff);
+                            t, n, c, fp, dt, diff, rel_diff);
                     break;
                 case dst_last_iteration:
                     inv_ldsnc_off_f(p, i, l, d, s, n, c);
                     print(0, "%lu, %s, [%s][%d,%d,%d,%d,%d "
-                             "fp:%8g fp0:%8g dt:%8g diff:%8g rdiff:%8g\n",
+                             "fp:%8g dt:%8g diff:%8g rdiff:%8g\n",
                             (unsigned long)i,
                             final_compare == false ? "REORDER " : "", skind, l,
-                            d, s, n, c, fp, fp0, dt, diff, rel_diff);
+                            d, s, n, c, fp, dt, diff, rel_diff);
                     break;
                default: assert(!"unknown data kind"); return FAIL;
                 }
@@ -275,48 +315,46 @@ int compare_dat(const rnn_prb_t *p, rnn_data_kind_t kind, dnn_mem_t &mem_dt,
             switch (kind) {
             case input:
                 inv_ntc_off_f(p, i, n, t, c);
-                print(0, "[%4lu][%s][%d,%d,%d] fp:%8g fp0:%8g dt:%8g\n",
-                        (unsigned long)i, skind, n, t, c, fp, fp0, dt);
+                print(0, "[%4lu][%s][%d,%d,%d] fp:%8g dt:%8g\n",
+                        (unsigned long)i, skind, n, t, c, fp, dt);
                 break;
             case states:
                 inv_ldsnc_off_f(p, i, l, d, s, n, c);
-                print(0, "[%4lu][%s][%d,%d,%d,%d,%d] fp:%8g fp0:%8g dt:%8g\n",
-                        (unsigned long)i, skind, l, d, s, n, c, fp, fp0, dt);
+                print(0, "[%4lu][%s][%d,%d,%d,%d,%d] fp:%8g dt:%8g\n",
+                        (unsigned long)i, skind, l, d, s, n, c, fp, dt);
                 break;
             case weights_input:
                 inv_ldigo_off_f(p, i, l, d, w, ic, oc);
-                print(0, "[%4lu][%s][%d,%d,%d,%d,%d] fp:%8g fp0:%8g dt:%8g\n",
-                        (unsigned long)i, skind, l, d, w, ic, oc, fp, fp0, dt);
+                print(0, "[%4lu][%s][%d,%d,%d,%d,%d] fp:%8g dt:%8g\n",
+                        (unsigned long)i, skind, l, d, w, ic, oc, fp, dt);
                 break;
             case weights_states:
                 inv_ldigo_off_f(p, i, l, d, w, ic, oc);
                 break;
-                print(0, "[%4lu][%s][%d,%d,%d,%d,%d] fp:%8g fp0:%8g dt:%8g\n",
-                        (unsigned long)i, skind, l, d, w, ic, oc, fp, fp0, dt);
+                print(0, "[%4lu][%s][%d,%d,%d,%d,%d] fp:%8g dt:%8g\n",
+                        (unsigned long)i, skind, l, d, w, ic, oc, fp, dt);
             case bias:
                 inv_ldgo_off_f(p, i, l, d, b, c);
                 break;
-                print(0, "[%4lu][%s][%d,%d,%d,%d] fp:%8g fp0:%8g dt:%8g\n",
-                        (unsigned long)i, skind, l, d, b, c, fp, fp0, dt);
+                print(0, "[%4lu][%s][%d,%d,%d,%d] fp:%8g dt:%8g\n",
+                        (unsigned long)i, skind, l, d, b, c, fp, dt);
             case dst_last_layer:
                 inv_tnc_off_f(p, i, s, t, n, c);
-                print(0, "[%4lu][%s][%d,%d,%d] fp:%8g fp0:%8g dt:%8g\n",
-                        (unsigned long)i, skind, n, t, c, fp, fp0, dt);
+                print(0, "[%4lu][%s][%d,%d,%d] fp:%8g dt:%8g\n",
+                        (unsigned long)i, skind, n, t, c, fp, dt);
                 break;
             case dst_last_iteration:
                 inv_ldsnc_off_f(p, i, l, d, s, n, c);
-                print(0, "[%4lu][%s][%d,%d,%d,%d,%d] fp:%8g fp0:%8g dt:%8g\n",
-                        (unsigned long)i, skind, l, d, s, n, c, fp, fp0, dt);
+                print(0, "[%4lu][%s][%d,%d,%d,%d,%d] fp:%8g dt:%8g\n",
+                        (unsigned long)i, skind, l, d, s, n, c, fp, dt);
                 break;
             default:
-                print(0, "[%4lu][unknown] fp:%8g fp0:%8g dt:%8g\n",
-                        (unsigned long)i, fp, fp0, dt);
+                print(0, "[%4lu][unknown] fp:%8g dt:%8g\n",
+                        (unsigned long)i, fp, dt);
                 break;
             }
         }
 #endif
-
-        non_zero += fp != 0;
     }
 
     diff_norm.done();
@@ -338,42 +376,6 @@ int compare_dat(const rnn_prb_t *p, rnn_data_kind_t kind, dnn_mem_t &mem_dt,
                 diff_norm.rel_diff(norm_t::L8));
     }
 
-    // const double trust_rg_level = 0.3;
-    //??        const double trust_nz_level = get_trust_nz_level(p, kind,
-    // final_compare);
-
-    // const double trust_rg = (double)in / r->total;
-    // const double trust_nz = (double)non_zero / r->total;
-
-    // const bool no_trust = true /* ...in the test ...at all */
-    // && final_compare
-    //??            && (trust_rg < trust_rg_level || trust_nz <
-    // trust_nz_level)
-    //;
-
-    // const bool dump = verbose >= 20
-    // || (verbose >= 10 && (trust_rg < 1. || trust_nz < 1.));
-    /*??
-    if (dump) {
-        print(0, "@@@ [%s] %strust range:%.2f nz:%.2f "
-            "(level range:%.2f nz:%.2f). "
-            "in:%d (ok:%d) below:%d (ok:%d) above:%d (ok:%d) nz:%d "
-            "total:%lu\n", skind, final_compare ? "final: " : "",
-            trust_rg, trust_nz, trust_rg_level, trust_nz_level, in, in_ok,
-            below, below_ok, above, above_ok, non_zero,
-            (unsigned long)r->total);
-    }
-    */
-
-    /*??
-    if (no_trust) {
-        r->state = MISTRUSTED;
-        print(0, "@@@ [%s] test-bug: trust is too low. "
-            "range:%.2f (?<%.2f) nz:%.2f (?<%.2f) (nz: %d total: %lu)\n",
-            skind, trust_rg, trust_rg_level, trust_nz, trust_nz_level,
-            non_zero, (unsigned long)r->total);
-    }*/
-
     if (r->errors)
         r->state = FAILED;
 
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/bench_shuffle.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/bench_shuffle.cpp
new file mode 100644 (file)
index 0000000..7b7c2df
--- /dev/null
@@ -0,0 +1,110 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <float.h>
+#include <math.h>
+
+#include "mkldnn.h"
+
+#include "mkldnn_common.hpp"
+#include "mkldnn_memory.hpp"
+#include "mkldnn_debug.hpp"
+
+#include "shuffle/shuffle.hpp"
+
+namespace shuffle {
+
+/* global driver parameters */
+int mb = 0;
+dir_t dir = FWD_D;
+mkldnn_data_type_t dt = mkldnn_f32;
+mkldnn_memory_format_t fmt = mkldnn_nchw;
+dims_t dims;
+int axis = 1, group = 1;
+const char *pattern = NULL;
+bool allow_unimpl = false;
+const char *perf_template = "perf,%z,%q,%f,%D,%a,%g,%-t,%0t";
+
+void reset_parameters() {
+    dir = FWD_D;
+    dt = mkldnn_f32;
+    fmt = mkldnn_nchw;
+    axis = 1;
+    group = 1;
+    pattern = NULL;
+}
+
+void check_correctness() {
+    const prb_t p(dims, dir, dt, fmt, axis, group);
+    char pstr[max_prb_len];
+    prb2str(&p, pstr);
+
+    if (pattern && !match_regex(pstr, pattern))
+        return;
+    print(1, "run: %s\n", pstr);
+
+    res_t res{};
+    const int status = shuffle::doit(&p, &res);
+
+    bool want_perf_report = false;
+    parse_result(res, want_perf_report, allow_unimpl, status, pstr);
+
+    if (want_perf_report && bench_mode & PERF)
+        perf_report(&p, &res, pstr);
+
+    benchdnn_stat.tests++;
+}
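+
+/* Illustrative invocation, assuming the usual benchdnn driver syntax:
+ *   ./benchdnn --shuffle --dt=s8 --fmt=nchw --axis=1 --group=4 2x272x56x56
+ * Every token that is not an option is parsed as a dims descriptor and
+ * checked immediately via check_correctness(). */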
+
+int bench(int argc, char **argv, bool main_bench) {
+    for (int arg = 0; arg < argc; ++arg) {
+        if (!strncmp("--batch=", argv[arg], 8))
+            SAFE(batch(argv[arg] + 8, bench), CRIT);
+        else if (!strncmp("--dir=", argv[arg], 6))
+            dir = str2dir(argv[arg] + 6);
+        else if (!strncmp("--dt=", argv[arg], 5))
+            dt = str2dt(argv[arg] + 5);
+        else if (!strncmp("--fmt=", argv[arg], 6))
+            fmt = str2fmt(argv[arg] + 6);
+        else if (!strncmp("--axis=", argv[arg], 7))
+            axis = atoi(argv[arg] + 7);
+        else if (!strncmp("--group=", argv[arg], 8))
+            group = atoi(argv[arg] + 8);
+        else if (!strncmp("--match=", argv[arg], 8))
+            pattern = argv[arg] + 8;
+        else if (!strncmp("--mode=", argv[0], 7))
+            bench_mode = str2bench_mode(argv[0] + 7);
+        else if (!strncmp("-v", argv[arg], 2))
+            verbose = atoi(argv[arg] + 2);
+        else if (!strncmp("--verbose=", argv[arg], 10))
+            verbose = atoi(argv[arg] + 10);
+        else {
+            if (!strncmp("--", argv[arg], 2)) {
+                fprintf(stderr, "driver: unknown option: `%s`, exiting...\n",
+                        argv[arg]);
+                exit(2);
+            }
+            dims = str2dims(argv[arg]);
+            check_correctness();
+        }
+    }
+
+    return OK;
+}
+
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/perf_report.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/perf_report.cpp
new file mode 100644 (file)
index 0000000..3ed4018
--- /dev/null
@@ -0,0 +1,125 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <float.h>
+#include <math.h>
+
+#include "mkldnn.h"
+#include "mkldnn_memory.hpp"
+
+#include "shuffle/shuffle.hpp"
+
+namespace shuffle {
+
+#if 0
+See conv/perf_report.cpp for details; the time modifiers described there
+apply here as well.
+
+| abbreviation  | description
+|:------------  |:-----------
+| %d            | problem descriptor
+| %D            | expanded problem descriptor (dims, 'x'-separated)
+| %z            | direction
+| %q            | data type (precision)
+| %f            | data format (layout)
+| %a            | axis
+| %g            | group size
+| %@t           | time in ms
+
+The definition of expanded problem descriptor is: `dxdxdxdxd`.
+#endif
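+
+/* Illustrative expansion (not emitted by the driver itself): with the
+ * default template "perf,%z,%q,%f,%D,%a,%g,%-t,%0t", a problem run as
+ * --dir=FWD_D --dt=f32 --fmt=nchw --axis=1 --group=2 2x16x3x3 would be
+ * reported roughly as
+ *   perf,FWD_D,f32,nchw,2x16x3x3,1,2,0.0123,0.0145
+ * where %-t is the minimum and %0t the average time in ms (times here
+ * are made up). */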
+
+void perf_report(const prb_t *p, const res_t *r, const char *pstr) {
+    const auto &t = r->timer;
+    const int max_len = 400;
+    int rem_len = max_len - 1;
+    char buffer[max_len], *buf = buffer;
+
+#   define DPRINT(...) do { \
+        int l = snprintf(buf, rem_len, __VA_ARGS__); \
+        buf += l; rem_len -= l; \
+    } while(0)
+
+    auto modifier2mode = [](char c) {
+        if (c == '-') return benchdnn_timer_t::min;
+        if (c == '0') return benchdnn_timer_t::avg;
+        if (c == '+') return benchdnn_timer_t::max;
+        return benchdnn_timer_t::min;
+    };
+
+    auto modifier2unit = [](char c) {
+        if (c == 'K') return 1e3;
+        if (c == 'M') return 1e6;
+        if (c == 'G') return 1e9;
+        return 1e0;
+    };
+
+    const char *pt = perf_template;
+    char c;
+
+    while ((c = *pt++) != '\0') {
+        if (c != '%') { *buf++ = c; rem_len--; continue; }
+
+        c = *pt++;
+
+        benchdnn_timer_t::mode_t mode = benchdnn_timer_t::min;
+        double unit = 1e0;
+
+        if (c == '-' || c == '0' || c == '+') {
+            mode = modifier2mode(c);
+            c = *pt++;
+        }
+
+        if (c == 'K' || c == 'M' || c == 'G') {
+            unit = modifier2unit(c);
+            c = *pt++;
+        }
+
+        if (c == 'd')
+            DPRINT("%s", pstr);
+        else if (c == 'D') {
+            dims2str(p->dims, buf);
+            int len = (int)strnlen(buf, rem_len);
+            rem_len -= len; buf += len;
+        }
+        else if (c == 'a')
+            DPRINT("%d", p->a);
+        else if (c == 'g')
+            DPRINT("%d", p->g);
+        else if (c == 'z')
+            DPRINT("%s", dir2str(p->dir));
+        else if (c == 'q')
+            DPRINT("%s", dt2str(p->dt));
+        else if (c == 'f')
+            DPRINT("%s", fmt2str(p->fmt));
+        else if (c == 't')
+            DPRINT("%g", t.ms(mode) / unit);
+        else
+            []() { SAFE_V(FAIL); return 0; }();
+    }
+
+    *buf = '\0';
+    assert(rem_len >= 0);
+
+#   undef DPRINT
+    print(0, "%s\n", buffer);
+}
+
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/ref_shuffle.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/ref_shuffle.cpp
new file mode 100644 (file)
index 0000000..425f7f7
--- /dev/null
@@ -0,0 +1,58 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "shuffle/shuffle.hpp"
+#include "src/common/mkldnn_thread.hpp"
+
+namespace shuffle {
+
+void compute_shuffle(const prb_t *p, const dnn_mem_t &src, dnn_mem_t &dst)
+{
+    const int axis = p->a;
+    const int group_size = p->g;
+    const int ndims = (int)p->dims.size();
+    const int axis_size = p->dims[axis];
+    size_t inner_size = 1, outer_size = 1;
+
+    auto transpose = [=] (int a) {
+        int R, C;
+        if (p->dir == FWD_D) {
+            R = group_size;
+            C = axis_size / group_size;
+        } else {
+            R = axis_size / group_size;
+            C = group_size;
+        }
+        int col = a / R;
+        int row = a % R;
+        return C * row + col;
+    };
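+    // Illustrative mapping (assumption: axis_size = 6, group_size = 2,
+    // FWD_D): R = 2, C = 3, so a -> transpose(a) is 0->0, 1->3, 2->1,
+    // 3->4, 4->2, 5->5, i.e. dst channels read {0, 2, 4, 1, 3, 5} from src.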
+
+    for (int i = 0; i < axis ; ++i)
+        outer_size *= (size_t)p->dims[i];
+    for (int i = axis + 1; i < ndims; ++i)
+        inner_size *= (size_t)p->dims[i];
+    const size_t dim = axis_size * inner_size;
+
+    mkldnn::impl::parallel_nd(outer_size, axis_size, inner_size,
+           [&](size_t ou, int a, size_t in) {
+        auto src_off = ou * dim + a * inner_size + in;
+        auto dst_off = ou * dim + transpose(a) * inner_size + in;
+        dst.set_elem(dst_off, src.get_elem(src_off));
+    });
+}
+
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/shuffle.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/shuffle.cpp
new file mode 100644 (file)
index 0000000..f2db808
--- /dev/null
@@ -0,0 +1,188 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <float.h>
+#include <math.h>
+#include <time.h>
+
+#include "mkldnn.h"
+
+#include "mkldnn_common.hpp"
+#include "mkldnn_memory.hpp"
+#include "norm.hpp"
+
+#include "shuffle/shuffle.hpp"
+
+namespace shuffle {
+
+inline float saturate(float value, float min, float max) {
+    return MAX2(min, MIN2(max, value));
+}
+
+int fill_memory(const prb_t *p, dnn_mem_t &mem) {
+    dt_conf_t c_src;
+    switch (p->dt) {
+        case mkldnn_u8: c_src = conf_u8; break;
+        case mkldnn_s8: c_src = conf_s8; break;
+        case mkldnn_s32: c_src = conf_s32; break;
+        default: c_src = conf_f32; break;
+    }
+    const int range = c_src.range;
+    const int max = c_src.min + range - 1;
+
+    const size_t nelems = mem.nelems();
+
+    for (size_t idx = 0; idx < nelems; ++idx) {
+        float value = saturate((float)(idx % c_src.range), c_src.min, max);
+        mem.set_elem(idx, value);
+    }
+
+    return OK;
+}
+
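+/* Shuffle is a pure permutation of the input, so the implementation under
+ * test must match the reference bit-exactly: any non-zero difference fails
+ * the run, and the first 10 mismatches are printed. */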
+static int compare(const prb_t *p, const dnn_mem_t &fp_mem,
+        const dnn_mem_t &dt_mem, res_t *r) {
+    size_t nelems = fp_mem.nelems();
+    assert(nelems == dt_mem.nelems());
+    r->errors = 0;
+
+    for (size_t i = 0; i < nelems; ++i) {
+        const float fp = fp_mem.get_elem(i);
+        const float dt = dt_mem.get_elem(i);
+        const float diff = fabsf(fp - dt);
+        if (r->errors < 10 && diff != 0.0) {
+            printf("idx: %zu fp: %f dt:%f\n", i, fp, dt);
+            r->errors++;
+        }
+    }
+
+    if (r->errors)
+        r->state = FAILED;
+
+    if (r->state == UNTESTED)
+        r->state = PASSED; /* optimism */
+
+    return r->state == FAILED ? FAIL : OK;
+}
+
+static int init_pd(const prb_t *p, mkldnn_shuffle_desc_t &sd,
+        mkldnn_primitive_desc_t &spd, res_t *r) {
+
+    mkldnn_memory_desc_t data_d;
+    mkldnn_dims_t data_dims;
+    const int ndims = (int)p->dims.size();
+
+    for (int i = 0; i < ndims; ++i) data_dims[i] = p->dims[i];
+    DNN_SAFE(mkldnn_memory_desc_init(&data_d, ndims, data_dims, p->dt, p->fmt),
+           WARN);
+
+    mkldnn_status_t init_status = mkldnn_success;
+    mkldnn_primitive_desc_t hint_fwd_pd = NULL;
+    if (p->dir == FWD_D) {
+        auto prop = mkldnn_forward_training;
+        DNN_SAFE(mkldnn_shuffle_forward_desc_init(&sd, prop,
+                    &data_d, p->a, p->g), WARN);
+    } else if (p->dir == BWD_D) {
+        DNN_SAFE(mkldnn_shuffle_backward_desc_init(&sd, &data_d, p->a,
+                    p->g), WARN);
+        mkldnn_shuffle_desc_t sd_fwd;
+        DNN_SAFE(mkldnn_shuffle_forward_desc_init(&sd_fwd,
+                    mkldnn_forward_training, &data_d, p->a, p->g), WARN);
+        DNN_SAFE(mkldnn_primitive_desc_create(&hint_fwd_pd, &sd_fwd, engine,
+                    NULL), WARN);
+    }
+    init_status = mkldnn_primitive_desc_create(&spd, &sd, engine, hint_fwd_pd);
+    mkldnn_primitive_desc_destroy(hint_fwd_pd);
+
+    if (init_status == mkldnn_unimplemented)
+        return r->state = UNIMPLEMENTED, OK;
+    else
+        SAFE(init_status, WARN);
+
+    const char *impl_str = query_impl_info(spd);
+    print(5, "mkldnn implementation: %s\n", impl_str);
+
+    return OK;
+}
+
+int doit(const prb_t *p, res_t *r) {
+
+    res_t res_zero{};
+    *r = res_zero;
+
+    mkldnn_shuffle_desc_t sd;
+    mkldnn_primitive_desc_t spd;
+    mkldnn_primitive_t s{};
+
+    SAFE(init_pd(p, sd, spd, r), WARN);
+    if (r->state == SKIPPED || r->state == UNIMPLEMENTED)
+        return OK;
+
+    const auto fp = p->dt;
+    auto &src_dt_d = sd.data_desc;
+
+    const int ndims = (int)p->dims.size();
+    const auto src_format = (ndims == 1)
+           ? mkldnn_x
+           : (ndims == 2)
+           ? mkldnn_nc
+           : get_default_format(ndims, fmt2data_kind(p->fmt));
+
+    dnn_mem_t data_fp(src_dt_d, fp, src_format),
+              data_dt(src_dt_d);
+    dnn_mem_t d_data_fp(src_dt_d, fp, src_format),
+              d_data_dt(src_dt_d);
+
+    SAFE(fill_memory(p, data_fp), WARN);
+
+    mkldnn_primitive_at_t inputs[1];
+    const_mkldnn_primitive_t outputs[1];
+    SAFE(data_dt.reorder(data_fp), WARN);
+    inputs[0] = {data_dt.p_, 0};
+    outputs[0] = d_data_dt.p_;
+    DNN_SAFE(mkldnn_primitive_create(&s, spd, inputs, outputs), WARN);
+    DNN_SAFE_V(mkldnn_primitive_desc_destroy(spd));
+    SAFE(execute(s), WARN);
+    if (bench_mode & CORR) {
+        compute_shuffle(p, data_fp, d_data_fp);
+        dnn_mem_t data(d_data_dt.md_, fp, src_format);
+        SAFE(data.reorder(d_data_dt), WARN);
+        SAFE(compare(p, d_data_fp, data, r), WARN);
+    }
+
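+    /* Timing policy implemented below: with fix_times_per_prb set the loop
+     * runs exactly that many iterations; otherwise it stops once at least
+     * max_ms_per_prb milliseconds and min_times_per_prb iterations have
+     * both been accumulated. */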
+    if (bench_mode & PERF) {
+        auto &t = r->timer;
+        t.reset();
+        while (true) {
+            SAFE(execute(s), WARN);
+            t.stamp();
+            const bool stop = false
+                || (fix_times_per_prb && t.times() >= fix_times_per_prb)
+                || (!fix_times_per_prb
+                        && t.total_ms() >= max_ms_per_prb
+                        && t.times() >= min_times_per_prb);
+            if (stop) break;
+        }
+    }
+
+    DNN_SAFE_V(mkldnn_primitive_destroy(s));
+    return OK;
+}
+
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/shuffle.hpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/shuffle.hpp
new file mode 100644 (file)
index 0000000..13770ee
--- /dev/null
@@ -0,0 +1,85 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef _SHUFFLE_HPP
+#define _SHUFFLE_HPP
+
+#include <stdint.h>
+#include <limits.h>
+#include <assert.h>
+#include <vector>
+
+#include "common.hpp"
+#include "dnn_types.hpp"
+#include "mkldnn_common.hpp"
+#include "mkldnn_memory.hpp"
+#include "mkldnn_debug.hpp"
+
+namespace shuffle {
+
+using dims_t = std::vector<int>;
+
+struct dt_conf_t {
+    mkldnn_data_type_t dt;
+    int min;
+    int range;
+};
+
+const int int_max_exact = 1<<24;
+
+const dt_conf_t conf_f32 = {mkldnn_f32, -int_max_exact, 2*int_max_exact};
+const dt_conf_t conf_s8 = {mkldnn_s8, INT8_MIN, -2*INT8_MIN};
+const dt_conf_t conf_u8 = {mkldnn_u8, 0, UINT8_MAX};
+const dt_conf_t conf_s32 = {mkldnn_s32, -int_max_exact, 2*int_max_exact};
+
+const size_t max_desc_len = 196;
+
+struct prb_t {
+    prb_t(dims_t &dims, dir_t dir, mkldnn_data_type_t dt,
+            mkldnn_memory_format_t fmt, int axis, int group)
+        : dims(dims), dir(dir), dt(dt), fmt(fmt), a(axis), g(group) {}
+    ~prb_t() {}
+
+    dims_t dims;
+    dir_t dir;
+    mkldnn_data_type_t dt;
+    mkldnn_memory_format_t fmt;
+    int a, g;
+};
+
+const size_t max_dims_len = 20;
+dims_t str2dims(const char *str);
+void dims2str(const dims_t &dims, char *buffer);
+const size_t max_prb_len = max_desc_len + 196;
+void prb2str(const prb_t *p, char *buffer, bool canonical = false);
+
+extern const char *perf_template; /* performance output template */
+void perf_report(const prb_t *p, const res_t *r, const char *pstr);
+
+inline size_t data_off(const prb_t *p, int mb, int c, int d, int h, int w) {
+    const auto &dims = p->dims;
+    return ((((size_t)mb * dims[1] + c) * dims[2] + d) * dims[3] + h) * dims[4]
+             + w;
+}
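+
+/* Illustrative expansion (hypothetical dims): for p->dims = {2, 16, 4, 8, 8}
+ * (mb, c, d, h, w), data_off(p, 1, 3, 2, 5, 7)
+ *   = (((1*16 + 3)*4 + 2)*8 + 5)*8 + 7 = 5039. */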
+
+void compute_shuffle(const prb_t *p, const dnn_mem_t &src, dnn_mem_t &dst);
+
+int fill_memory(const prb_t *p, dnn_mem_t &src);
+int doit(const prb_t *p, res_t *res);
+int bench(int argc, char **argv, bool main_bench = true);
+}
+
+#endif
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/shuffle_aux.cpp b/inference-engine/thirdparty/mkl-dnn/tests/benchdnn/shuffle/shuffle_aux.cpp
new file mode 100644 (file)
index 0000000..90eaeb5
--- /dev/null
@@ -0,0 +1,67 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <stdlib.h>
+#include <assert.h>
+#include "shuffle/shuffle.hpp"
+
+namespace shuffle {
+
+#define DPRINT(...) do { \
+    int l = snprintf(buffer, rem_len, __VA_ARGS__); \
+    buffer += l; rem_len -= l; \
+} while(0)
+
+dims_t str2dims(const char *str) {
+    dims_t dims;
+    do {
+        int dim, len;
+        int scan = sscanf(str, "%d%n", &dim, &len);
+        SAFE_V(scan == 1 ? OK : FAIL);
+        dims.push_back(dim);
+        str += len;
+        SAFE_V(*str == 'x' || *str == '\0' ? OK : FAIL);
+    } while (*str++ != '\0');
+    return dims;
+}
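+
+/* e.g. str2dims("2x16x3x3") yields {2, 16, 3, 3}; malformed descriptors
+ * such as "2x" or "2,16" abort via SAFE_V (illustrative). */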
+
+void dims2str(const dims_t &dims, char *buffer) {
+    int rem_len = max_dims_len;
+    for (size_t d = 0; d < dims.size() - 1; ++d)
+        DPRINT("%dx", dims[d]);
+    DPRINT("%d", dims[dims.size() - 1]);
+}
+
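+/* prb2str reconstructs a canonical command line for a problem, e.g.
+ * "--dir=FWD_D --dt=f32 --fmt=nchw --axis=1 --group=1 2x16x3x3"
+ * (illustrative output). */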
+void prb2str(const prb_t *p, char *buffer, bool canonical) {
+    char dims_buf[max_dims_len] = {0};
+    dims2str(p->dims, dims_buf);
+
+    char dir_str[32] = {0};
+    char dt_str[16] = {0};
+    char fmt_str[32] = {0};
+    char axis_str[16] = {0};
+    char group_str[16] = {0};
+
+    snprintf(dir_str, sizeof(dir_str), "--dir=%s ", dir2str(p->dir));
+    snprintf(dt_str, sizeof(dt_str), "--dt=%s ", dt2str(p->dt));
+    snprintf(fmt_str, sizeof(fmt_str), "--fmt=%s ", fmt2str(p->fmt));
+    snprintf(axis_str, sizeof(axis_str), "--axis=%d ", p->a);
+    snprintf(group_str, sizeof(group_str), "--group=%d ", p->g);
+    snprintf(buffer, max_prb_len, "%s%s%s%s%s%s", dir_str, dt_str, fmt_str,
+           axis_str, group_str, dims_buf);
+}
+
+}
index c29057d..9439423 100644 (file)
@@ -28,7 +28,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}
 if(WIN32)
     # Correct 'jnl' macro/jit issue
     if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Qlong-double")
+        append(CMAKE_CXX_FLAGS "/Qlong-double")
     endif()
 endif()
 
@@ -54,6 +54,7 @@ file(GLOB PRIM_TEST_CASES_SRC
                               test_inner_product_forward.cpp
                               test_inner_product_backward_data.cpp
                               test_inner_product_backward_weights.cpp
+                              test_shuffle.cpp
                               test_convolution_format_any.cpp
                               test_convolution_forward_f32.cpp
                               test_convolution_forward_f32_3d.cpp
@@ -68,7 +69,9 @@ file(GLOB PRIM_TEST_CASES_SRC
                               test_convolution_backward_weights_f32.cpp
                               test_convolution_backward_weights_s16s16s32.cpp
                               test_deconvolution.cpp
-                              test_gemm.cpp
+                              test_gemm_f32.cpp
+                              test_gemm_s8u8s32.cpp
+                              test_gemm_s8s8s32.cpp
                               test_roi_pooling_forward.cpp
                               test_convolution_eltwise_forward_f32.cpp
                               test_convolution_depthwise_forward_f32.cpp
index 854d532..6fc6d85 100644 (file)
 #define ALGORITHM mkldnn::convolution_direct
 
 #ifdef DIRECTION_FORWARD
+#if defined(FP32)
+#define FMT_DATA_BLOCKED nChw8c
+#define FMT_DATA_BLOCKED16 nChw16c
 #define FMT_WEIGHTS_BLOCKED OIhw8i8o
 #define FMT_WEIGHTS_BLOCKED_G gOIhw8i8o
-#if defined(FP32)
 #define FMT_WEIGHTS_BLOCKED16 OIhw16i16o
 #define FMT_WEIGHTS_BLOCKED16_G gOIhw16i16o
 #elif defined(S16S16S32)
+#define FMT_DATA_BLOCKED nChw8c
+#define FMT_DATA_BLOCKED16 nChw16c
+#define FMT_WEIGHTS_BLOCKED OIhw8i8o
+#define FMT_WEIGHTS_BLOCKED_G gOIhw8i8o
 #define FMT_WEIGHTS_BLOCKED16 OIhw8i16o2i
 #define FMT_WEIGHTS_BLOCKED16_G gOIhw8i16o2i
 #elif defined(U8S8)
+#define FMT_DATA_BLOCKED nhwc
+#define FMT_DATA_BLOCKED16 nhwc
+#define FMT_WEIGHTS_BLOCKED OhIw8o4i
+#define FMT_WEIGHTS_BLOCKED_G gOhIw8o4i
 #define FMT_WEIGHTS_BLOCKED16 OIhw4i16o4i
 #define FMT_WEIGHTS_BLOCKED16_G gOIhw4i16o4i
 #endif
 #define FMT_WEIGHTS_BLOCKED16_IOhw16o16i FMT_WEIGHTS_BLOCKED16
 #define TEST_CASE_NAME_PREFIX Forward
 #elif defined DIRECTION_BACKWARD_DATA
+#define FMT_DATA_BLOCKED nChw8c
+#define FMT_DATA_BLOCKED16 nChw16c
 #define FMT_WEIGHTS_BLOCKED OIhw8o8i
 #define FMT_WEIGHTS_BLOCKED_G gOIhw8o8i
 #if defined(FP32)
@@ -56,6 +68,8 @@
 #endif
 #define TEST_CASE_NAME_PREFIX BackwardData
 #elif defined DIRECTION_BACKWARD_WEIGHTS
+#define FMT_DATA_BLOCKED nChw8c
+#define FMT_DATA_BLOCKED16 nChw16c
 #define FMT_WEIGHTS_BLOCKED OIhw8i8o
 #define FMT_WEIGHTS_BLOCKED_G gOIhw8i8o
 #define FMT_WEIGHTS_BLOCKED16 OIhw16i16o
@@ -67,8 +81,6 @@
 
 #define FMT_BIAS x
 #define FMT_NO_BIAS format_undef
-#define FMT_DATA_BLOCKED nChw8c
-#define FMT_DATA_BLOCKED16 nChw16c
 
 #define CONCAT_WITH_UNDERSCORE_(a,b) a ## _ ## b
 #define CONCAT_WITH_UNDERSCORE(a,b) CONCAT_WITH_UNDERSCORE_(a,b)
index a01a369..a9f4f90 100644 (file)
@@ -267,6 +267,12 @@ void DeathTestAbort(const std::string& message) {
       GetUnitTestImpl()->internal_run_death_test_flag();
   if (flag != NULL) {
     FILE* parent = posix::FDOpen(flag->write_fd(), "w");
+    if (parent == NULL) {
+      fprintf(stderr, "Unable to associate stream with file descriptor %d\n",
+        flag->write_fd());
+      fflush(stderr);
+      posix::Abort();
+    }
     fputc(kDeathTestInternalError, parent);
     fprintf(parent, "%s", message.c_str());
     fflush(parent);
index e5bf3dd..5830855 100644 (file)
@@ -855,6 +855,10 @@ void RE::Init(const char* regex) {
   // full match: we need space to prepend a '^', append a '$', and
   // terminate the string with '\0'.
   char* buffer = static_cast<char*>(malloc(len + 3));
+  if (buffer == NULL) {
+      return;
+  }
+
   full_pattern_ = buffer;
 
   if (*regex != '^')
@@ -995,6 +999,11 @@ class CapturedStream {
     }
 
     FILE* const file = posix::FOpen(filename_.c_str(), "r");
+    if (file == NULL) {
+        printf("Unable to open file \"%s\"\n", filename_.c_str());
+        fflush(stdout);
+        return "";
+    }
     const std::string content = ReadEntireFile(file);
     posix::FClose(file);
     return content;
index fb0e354..0f68a34 100644 (file)
@@ -57,7 +57,8 @@ std::string TestPartResult::ExtractSummary(const char* message) {
 // Prints a TestPartResult object.
 std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
   return os
-      << result.file_name() << ":" << result.line_number() << ": "
+      << (result.file_name() == NULL ? "" : result.file_name())
+      << ":" << result.line_number() << ": "
       << (result.type() == TestPartResult::kSuccess ? "Success" :
           result.type() == TestPartResult::kFatalFailure ? "Fatal failure" :
           "Non-fatal failure") << ":\n"
index d391f68..7ac0ba1 100644 (file)
@@ -2658,8 +2658,10 @@ void TestInfo::Run() {
 
   // Deletes the test object.
   impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      test, &Test::DeleteSelf_, "the test fixture's destructor");
+  if (test != NULL) {
+      internal::HandleExceptionsInMethodIfSupported(
+          test, &Test::DeleteSelf_, "the test fixture's destructor");
+  }
 
   result_.set_elapsed_time(internal::GetTimeInMillis() - start);
 
@@ -3173,12 +3175,25 @@ void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
   }
 
   for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
-    const TestCase& test_case = *unit_test.GetTestCase(i);
+    const auto p_test_case = unit_test.GetTestCase(i);
+    if (!p_test_case) {
+      ColoredPrintf(COLOR_RED, "Could not get test case %d\n", i);
+      fflush(stdout);
+      continue;
+    }
+    const TestCase& test_case = *p_test_case;
     if (!test_case.should_run() || (test_case.failed_test_count() == 0)) {
       continue;
     }
     for (int j = 0; j < test_case.total_test_count(); ++j) {
-      const TestInfo& test_info = *test_case.GetTestInfo(j);
+      const auto p_test_info = test_case.GetTestInfo(j);
+      if (!p_test_info) {
+        ColoredPrintf(COLOR_RED, "Could not get test info %d of test case %d\n",
+               j, i);
+        fflush(stdout);
+        continue;
+      }
+      const TestInfo& test_info = *p_test_info;
       if (!test_info.should_run() || test_info.result()->Passed()) {
         continue;
       }
@@ -3398,10 +3413,19 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener {
                                 const char* test_case_name,
                                 const TestInfo& test_info);
 
+  // Streams an XML representation of a NULL TestInfo object.
+  static void OutputXmlNullTestInfo(::std::ostream* stream,
+                                const char* test_case_name,
+                                const int test_info_index);
+
   // Prints an XML representation of a TestCase object
   static void PrintXmlTestCase(::std::ostream* stream,
                                const TestCase& test_case);
 
+  // Prints an XML representation of a NullTestCase object
+  static void PrintXmlNullTestCase(::std::ostream* stream,
+                               const int test_case_index);
+
   // Prints an XML summary of unit_test to output stream out.
   static void PrintXmlUnitTest(::std::ostream* stream,
                                const UnitTest& unit_test);
@@ -3671,6 +3695,17 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
     *stream << "    </testcase>\n";
 }
 
+// Prints an XML representation of a NULL TestInfo object.
+void XmlUnitTestResultPrinter::OutputXmlNullTestInfo(::std::ostream* stream,
+       const char* test_case_name, const int test_info_index) {
+  *stream << "    <testcase";
+  OutputXmlAttribute(stream, "testcase", "classname", test_case_name);
+  *stream << "      <failure message=\""
+          << "failed to get test info " << test_info_index << "\""
+          << ">\n</failure>\n"
+          << ">\n    </testcase>\n";
+}
+
 // Prints an XML representation of a TestCase object
 void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream,
                                                 const TestCase& test_case) {
@@ -3691,12 +3726,27 @@ void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream,
           << ">\n";
 
   for (int i = 0; i < test_case.total_test_count(); ++i) {
-    if (test_case.GetTestInfo(i)->is_reportable())
-      OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i));
+    const auto p_test_info = test_case.GetTestInfo(i);
+    if (!p_test_info) {
+      OutputXmlNullTestInfo(stream, test_case.name(), i);
+      continue;
+    }
+    if (p_test_info->is_reportable())
+      OutputXmlTestInfo(stream, test_case.name(), *p_test_info);
   }
   *stream << "  </" << kTestsuite << ">\n";
 }
 
+// Prints an XML representation of a NULL TestCase object
+void XmlUnitTestResultPrinter::PrintXmlNullTestCase(std::ostream* stream,
+       const int test_case_index) {
+  *stream << "  <testsuite"
+          << "      <failure message=\""
+          << "failed to get test case " << test_case_index << "\""
+          << ">\n</failure>\n"
+          << ">\n  </testsuite>\n";
+}
+
 // Prints an XML summary of unit_test to output stream out.
 void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
                                                 const UnitTest& unit_test) {
@@ -3730,8 +3780,13 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
   *stream << ">\n";
 
   for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
-    if (unit_test.GetTestCase(i)->reportable_test_count() > 0)
-      PrintXmlTestCase(stream, *unit_test.GetTestCase(i));
+    const auto p_test_case = unit_test.GetTestCase(i);
+    if (!p_test_case) {
+      PrintXmlNullTestCase(stream, i);
+      continue;
+    }
+    if (p_test_case->reportable_test_count() > 0)
+      PrintXmlTestCase(stream, *p_test_case);
   }
   *stream << "</" << kTestsuites << ">\n";
 }
@@ -3866,6 +3921,13 @@ class ScopedPrematureExitFile {
       // errors are ignored as there's nothing better we can do and we
       // don't want to fail the test because of this.
       FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
+      if (pfile == NULL) {
+          fprintf(stderr,
+              "Unable to open file \"%s\"\n", premature_exit_filepath);
+          fflush(stderr);
+          exit(EXIT_FAILURE);
+      }
+
       size_t unused = fwrite("0", 1, 1, pfile);
       (void)unused;
       fclose(pfile);
@@ -4647,6 +4709,10 @@ bool UnitTestImpl::RunAllTests() {
       if (!Test::HasFatalFailure()) {
         for (int test_index = 0; test_index < total_test_case_count();
              test_index++) {
+          if (GetMutableTestCase(test_index) == NULL) {
+              printf("\nCould not get test case %d\n", test_index);
+              continue;
+          }
           GetMutableTestCase(test_index)->Run();
         }
       }
index fd74408..c9bf46a 100644 (file)
@@ -30,31 +30,31 @@ INST_TEST_CASE(SimpleSmall_NCHW_expected_failures,
     PARAMS_EXPECT_FAIL(nchw, oihw, FMT_BIAS, nchw, mkldnn_invalid_arguments, 1, 1, 4, 4, 4, 6, 4, 4, 3, 3, 1, 1, 0, 0)
 );
 
-INST_TEST_CASE(SimpleSmall_Blocked16_padded,
-    // non-1x1 (all)
-    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 17, 13, 13, 23, 12, 12, 3, 3, 0, 0, 1, 1),
-    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 21, 13, 13, 16, 12, 12, 3, 3, 0, 0, 1, 1),
-    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 23, 13, 13, 19, 12, 12, 3, 3, 0, 0, 1, 1),
-    // 1x1 (fwd, bwd_w)
-    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 17, 13, 13, 23, 13, 13, 1, 1, 0, 0, 1, 1),
-    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 21, 13, 13, 16, 13, 13, 1, 1, 0, 0, 1, 1),
-    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 23, 13, 13, 19, 13, 13, 1, 1, 0, 0, 1, 1),
-    // 1x1 (bwd_d)
-    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16_IOhw16o16i, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 17, 13, 13, 23, 13, 13, 1, 1, 0, 0, 1, 1),
-    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16_IOhw16o16i, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 21, 13, 13, 16, 13, 13, 1, 1, 0, 0, 1, 1),
-    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16_IOhw16o16i, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 23, 13, 13, 19, 13, 13, 1, 1, 0, 0, 1, 1)
-);
-
-INST_TEST_CASE(SimpleSmall_Blocked8_padded,
-    // non-1x1 (all)
-    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, 2, 1, 17, 13, 13, 23, 12, 12, 3, 3, 0, 0, 1, 1),
-    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, 2, 1, 21, 13, 13, 16, 12, 12, 3, 3, 0, 0, 1, 1),
-    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, 2, 1, 23, 13, 13, 19, 12, 12, 3, 3, 0, 0, 1, 1),
-    // 1x1 (all)
-    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, 2, 1, 17, 13, 13, 23, 13, 13, 1, 1, 0, 0, 1, 1),
-    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, 2, 1, 21, 13, 13, 16, 13, 13, 1, 1, 0, 0, 1, 1),
-    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, 2, 1, 23, 13, 13, 19, 13, 13, 1, 1, 0, 0, 1, 1)
-);
+//INST_TEST_CASE(SimpleSmall_Blocked16_padded,
+//    // non-1x1 (all)
+//    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 17, 13, 13, 23, 12, 12, 3, 3, 0, 0, 1, 1),
+//    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 21, 13, 13, 16, 12, 12, 3, 3, 0, 0, 1, 1),
+//    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 23, 13, 13, 19, 12, 12, 3, 3, 0, 0, 1, 1),
+//    // 1x1 (fwd, bwd_w)
+//    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 17, 13, 13, 23, 13, 13, 1, 1, 0, 0, 1, 1),
+//    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 21, 13, 13, 16, 13, 13, 1, 1, 0, 0, 1, 1),
+//    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 23, 13, 13, 19, 13, 13, 1, 1, 0, 0, 1, 1),
+//    // 1x1 (bwd_d)
+//    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16_IOhw16o16i, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 17, 13, 13, 23, 13, 13, 1, 1, 0, 0, 1, 1),
+//    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16_IOhw16o16i, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 21, 13, 13, 16, 13, 13, 1, 1, 0, 0, 1, 1),
+//    PARAMS(FMT_DATA_BLOCKED16, FMT_WEIGHTS_BLOCKED16_IOhw16o16i, FMT_BIAS, FMT_DATA_BLOCKED16, 2, 1, 23, 13, 13, 19, 13, 13, 1, 1, 0, 0, 1, 1)
+//);
+//
+//INST_TEST_CASE(SimpleSmall_Blocked8_padded,
+//    // non-1x1 (all)
+//    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, 2, 1, 17, 13, 13, 23, 12, 12, 3, 3, 0, 0, 1, 1),
+//    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, 2, 1, 21, 13, 13, 16, 12, 12, 3, 3, 0, 0, 1, 1),
+//    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, 2, 1, 23, 13, 13, 19, 12, 12, 3, 3, 0, 0, 1, 1),
+//    // 1x1 (all)
+//    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, 2, 1, 17, 13, 13, 23, 13, 13, 1, 1, 0, 0, 1, 1),
+//    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, 2, 1, 21, 13, 13, 16, 13, 13, 1, 1, 0, 0, 1, 1),
+//    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED, 2, 1, 23, 13, 13, 19, 13, 13, 1, 1, 0, 0, 1, 1)
+//);
 
 INST_TEST_CASE(SimpleSmall_NCHW,
     PARAMS(nchw, oihw, FMT_BIAS, nchw,
@@ -89,7 +89,37 @@ INST_TEST_CASE(SimpleSmall_Blocked,
     PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
         2, 1, 32, 13, 13, 48, 13, 13, 3, 3, 1, 1, 1, 1),
     PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
-        2, 1, 32, 13, 13, 48, 11, 11, 3, 3, 0, 0, 1, 1)
+        2, 1, 32, 13, 13, 48, 11, 11, 3, 3, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 2, 8, 10, 10, 16, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 4, 16, 10, 10, 32, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 8, 32, 10, 10, 256, 10, 10, 3, 3, 1, 1, 1, 1)
+);
+
+
+INST_TEST_CASE(SimpleSmall_Blocked_1x1,
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 32, 13, 13, 32, 13, 13, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 32, 3, 3, 32, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 32, 4, 4, 32, 4, 4, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 32, 3, 3, 32, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 32, 2, 2, 32, 2, 2, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 32, 13, 13, 48, 13, 13, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 32, 13, 13, 48, 13, 13, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 2, 8, 10, 10, 16, 10, 10, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 4, 16, 10, 10, 32, 10, 10, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED_G, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 8, 32, 10, 10, 256, 10, 10, 1, 1, 0, 0, 1, 1)
 );
 
 INST_TEST_CASE(SimpleSmall_Blocked16,
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/in/gemm_in.h b/inference-engine/thirdparty/mkl-dnn/tests/gtests/in/gemm_in.h
new file mode 100644 (file)
index 0000000..5b3c34a
--- /dev/null
@@ -0,0 +1,136 @@
+constexpr char unused = 'x';
+
+#if defined(FP32)
+INST_TEST_CASE(TestGEMM,
+    test_params{unused, 'n', 'n', 3, 2, 1, 1.0, 0.0, 2, 5, 8, true, mkldnn_invalid_arguments},
+    test_params{unused, 't', 'n', 3, 2, 2, 1.0, 0.0, 1, 5, 8, true, mkldnn_invalid_arguments},
+    test_params{unused, 'n', 't', 3, 2, 1, 1.0, 0.0, 3, 1, 8, true, mkldnn_invalid_arguments},
+    test_params{unused, 'n', 'd', 3, 2, 1, 1.0, 0.0, 3, 3, 3, true, mkldnn_invalid_arguments},
+
+    test_params{unused, 'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
+    test_params{unused, 'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
+    test_params{unused, 'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
+    test_params{unused, 't', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
+    test_params{unused, 'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, false},
+    test_params{unused, 'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, false},
+    test_params{unused, 't', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false},
+    test_params{unused, 't', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false},
+    test_params{unused, 'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, false},
+
+    test_params{unused, 'n', 'n', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false},
+    test_params{unused, 'n', 'n', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false},
+    test_params{unused, 't', 'n', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false},
+    test_params{unused, 't', 'n', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false},
+    test_params{unused, 'n', 't', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false},
+    test_params{unused, 'n', 't', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false},
+    test_params{unused, 't', 't', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false},
+    test_params{unused, 't', 't', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false}
+);
+
+#else
+
+INST_TEST_CASE(TestGEMM_expected_failures,
+    test_params{'f', 'n', 'n', 3, 2, 1, 1.0, 0.0, 2, 5, 8, true, mkldnn_invalid_arguments},
+    test_params{'f', 't', 'n', 3, 2, 2, 1.0, 0.0, 1, 5, 8, true, mkldnn_invalid_arguments},
+    test_params{'f', 'n', 't', 3, 2, 1, 1.0, 0.0, 3, 1, 8, true, mkldnn_invalid_arguments},
+    test_params{'f', 'n', 'd', 3, 2, 1, 1.0, 0.0, 3, 3, 3, true, mkldnn_invalid_arguments},
+
+    test_params{'r', 'n', 'n', 3, 2, 1, 1.0, 0.0, 2, 5, 8, true, mkldnn_invalid_arguments},
+    test_params{'R', 't', 'n', 3, 2, 2, 1.0, 0.0, 1, 5, 8, true, mkldnn_invalid_arguments},
+    test_params{'r', 'n', 't', 3, 2, 1, 1.0, 0.0, 3, 1, 8, true, mkldnn_invalid_arguments},
+    test_params{'R', 'n', 'd', 3, 2, 1, 1.0, 0.0, 3, 3, 3, true, mkldnn_invalid_arguments},
+
+    test_params{'c', 'n', 'n', 3, 2, 1, 1.0, 0.0, 2, 5, 8, true, mkldnn_invalid_arguments},
+    test_params{'C', 't', 'n', 3, 2, 2, 1.0, 0.0, 1, 5, 8, true, mkldnn_invalid_arguments},
+    test_params{'c', 'n', 't', 3, 2, 1, 1.0, 0.0, 3, 1, 8, true, mkldnn_invalid_arguments},
+    test_params{'C', 'n', 'd', 3, 2, 1, 1.0, 0.0, 3, 3, 3, true, mkldnn_invalid_arguments}
+);
+
+INST_TEST_CASE(TestGEMM_general_cases,
+    /* offsetc is fixed */
+    test_params{'f', 'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
+    test_params{'f', 'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
+    test_params{'f', 'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
+    test_params{'f', 't', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
+    test_params{'f', 'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, false},
+    test_params{'f', 'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, false},
+    test_params{'f', 't', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false},
+    test_params{'f', 't', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false},
+    test_params{'f', 'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, false},
+
+    /* offsetc is row */
+    test_params{'r', 'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
+    test_params{'R', 'n', 'T', 30, 20, 10, 2.0, 1.0, 120, 120, 120, false},
+    test_params{'r', 'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
+    test_params{'R', 't', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
+    test_params{'r', 'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, false},
+    test_params{'r', 'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, false},
+    test_params{'R', 't', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false},
+    test_params{'R', 't', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false},
+    test_params{'R', 'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, false},
+
+    /* offsetc is column */
+    test_params{'C', 'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
+    test_params{'c', 'n', 'T', 30, 20, 10, 2.0, 1.0, 120, 120, 120, false},
+    test_params{'c', 'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
+    test_params{'c', 't', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
+    test_params{'C', 'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, false},
+    test_params{'C', 'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, false},
+    test_params{'C', 't', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false},
+    test_params{'c', 't', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false},
+    test_params{'c', 'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, false}
+);
+
+INST_TEST_CASE(TestGEMM_fractional_scales,
+    /* alpha and beta have non-zero fractional part */
+    test_params{'f', 'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, false},
+    test_params{'F', 'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, false},
+    test_params{'f', 'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, false},
+    test_params{'F', 't', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, false},
+    test_params{'f', 'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, false},
+    test_params{'f', 'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, false},
+    test_params{'F', 't', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, false},
+    test_params{'F', 't', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, false},
+    test_params{'f', 'n', 'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, false},
+
+    test_params{'r', 'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, false},
+    test_params{'R', 'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, false},
+    test_params{'r', 'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, false},
+    test_params{'R', 't', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, false},
+    test_params{'r', 'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, false},
+    test_params{'r', 'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, false},
+    test_params{'R', 't', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, false},
+    test_params{'R', 't', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, false},
+    test_params{'r', 'n', 'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, false},
+
+    test_params{'C', 'n', 'T', 30, 20, 10, 2.33f, 1.66f, 60, 50, 80, false},
+    test_params{'c', 'n', 'T', 30, 20, 10, 2.19f, 1.99f, 120, 120, 120, false},
+    test_params{'c', 'T', 'N', 30, 20, 10, 2.01f, 1.01f, 60, 50, 80, false},
+    test_params{'c', 't', 't', 30, 20, 10, 2.99f, 1.19f, 60, 50, 80, false},
+    test_params{'C', 'n', 'n', 100, 100, 2, 1.33f, 2.33f, 100, 100, 100, false},
+    test_params{'C', 'n', 't', 100, 2, 100, 1.19f, 2.99f, 100, 100, 100, false},
+    test_params{'C', 't', 'n', 2, 100, 100, 1.01f, 2.01f, 100, 100, 100, false},
+    test_params{'c', 't', 't', 2, 100, 100, 1.99f, 2.19f, 100, 100, 100, false},
+    test_params{'c', 'n', 'n', 2, 2, 10000, 1.66f, 2.33f, 2, 10000, 2, false}
+);
+
+INST_TEST_CASE(TestGEMM_heavy,
+    test_params{'f', 'n', 'n', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false},
+    test_params{'f', 'n', 'n', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false},
+    test_params{'f', 't', 'n', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false},
+    test_params{'f', 't', 'n', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false},
+    test_params{'f', 'n', 't', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false},
+    test_params{'f', 'n', 't', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false},
+    test_params{'f', 't', 't', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false},
+    test_params{'f', 't', 't', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false},
+
+    test_params{'f', 'n', 'n', 2000, 2000, 2000, 2.33f, 1.66f, 2000, 2000, 2000, false},
+    test_params{'f', 'n', 'n', 3000, 3000, 3000, 2.19f, 1.99f, 3000, 3000, 3000, false},
+    test_params{'f', 't', 'n', 2000, 2000, 2000, 2.01f, 1.01f, 2000, 2000, 2000, false},
+    test_params{'f', 't', 'n', 3000, 3000, 3000, 2.99f, 1.19f, 3000, 3000, 3000, false},
+    test_params{'f', 'n', 't', 2000, 2000, 2000, 1.33f, 2.33f, 2000, 2000, 2000, false},
+    test_params{'f', 'n', 't', 3000, 3000, 3000, 1.19f, 2.99f, 3000, 3000, 3000, false},
+    test_params{'f', 't', 't', 2000, 2000, 2000, 1.01f, 2.01f, 2000, 2000, 2000, false},
+    test_params{'f', 't', 't', 3000, 3000, 3000, 1.99f, 2.19f, 3000, 3000, 3000, false}
+);
+#endif
index bf2083c..317c086 100644 (file)
@@ -17,6 +17,7 @@
 #ifndef MKLDNN_TEST_COMMON_HPP
 #define MKLDNN_TEST_COMMON_HPP
 
+#include <limits>
 #include <numeric>
 #include <vector>
 #include <cmath>
@@ -60,6 +61,16 @@ template <typename data_t> inline data_t out_round(float x,
 template <> inline float out_round<float>(float x, mkldnn_round_mode_t rmode)
 { (void)rmode; return x; }
 
+template <typename data_t, typename out_t>
+out_t saturate(const out_t &x) {
+    out_t v = x;
+    if (v <= std::numeric_limits<data_t>::min())
+        v = std::numeric_limits<data_t>::min();
+    if (v > std::numeric_limits<data_t>::max())
+        v = std::numeric_limits<data_t>::max();
+    return v;
+}
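+
+/* e.g. saturate<int8_t>(300) == 127 and saturate<int8_t>(-300) == -128
+ * (out_t deduced as int; illustrative, intended for integral data_t). */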
+
 inline int right_padding(int i, int o, int k, int p, int s, int d = 0) {
     return (o - 1) * s + (k - 1) * (d + 1) - (p + i - 1);
 }
@@ -226,6 +237,8 @@ inline mkldnn::memory::desc create_md(mkldnn::memory::dims dims,
     case f::nChw16c:
     case f::oihw:
     case f::hwio:
+    case f::oIhw8i:
+    case f::oIhw16i:
     case f::OIhw8i8o:
     case f::OIhw16i16o:
     case f::OIhw8i16o2i:
@@ -236,6 +249,7 @@ inline mkldnn::memory::desc create_md(mkldnn::memory::dims dims,
     case f::IOhw16o16i:
     case f::Ohwi8o:
     case f::Ohwi16o:
+    case f::OhIw8o4i:
         ndims = 4; break;
     case f::ncdhw:
     case f::ndhwc:
@@ -245,6 +259,8 @@ inline mkldnn::memory::desc create_md(mkldnn::memory::dims dims,
     case f::oidhw:
     case f::goihw:
     case f::hwigo:
+    case f::oIdhw8i:
+    case f::oIdhw16i:
     case f::OIdhw8i8o:
     case f::OIdhw16i16o:
     case f::OIdhw8o8i:
@@ -260,6 +276,7 @@ inline mkldnn::memory::desc create_md(mkldnn::memory::dims dims,
     case f::gOIhw8o8i:
     case f::gOIhw16o16i:
     case f::gIOhw16o16i:
+    case f::gOhIw8o4i:
         ndims = 5; break;
     case f::gOIdhw8i8o:
     case f::gOIdhw16i16o:
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_common.hpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_convolution_eltwise_forward_common.hpp
new file mode 100644 (file)
index 0000000..5337807
--- /dev/null
@@ -0,0 +1,280 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "mkldnn_test_common.hpp"
+#include "gtest/gtest.h"
+
+#include "mkldnn.hpp"
+
+namespace mkldnn {
+
+
+template <typename T, typename A> inline T relu_fwd(T s, A alpha) {
+    return s > 0 ? s : static_cast<T>(s * alpha);
+}
+
+template <typename T> T tanh_fwd(T s) {
+    const float e = ::expf(2*s); /* maybe replace with -2*s? */
+    return static_cast<T>((e - 1.0) / (e + 1.0));
+}
+
+template <typename T, typename A> T elu_fwd(T s, A alpha) {
+    return s > 0 ? s : static_cast<T>(alpha * (::expf(s) - 1));
+}
+
+template <typename T>
+T square_fwd(T s) {
+    return s * s;
+}
+
+template <typename T>
+T abs_fwd(T s) {
+    return s > 0 ? s : -s;
+}
+
+template <typename T>
+T sqrt_fwd(T s) {
+    return s > 0 ? ::sqrtf(s) : 0;
+}
+
+template <typename T, typename A>
+T linear_fwd(T s, A alpha, A beta) {
+    return alpha * s + beta;
+}
+
+template <typename T, typename A>
+T bounded_relu_fwd(T s, A alpha) {
+    s = s > 0 ? s : 0;
+    return s > alpha ? alpha : s;
+}
+
+template <typename T>
+T soft_relu_fwd(T s) {
+    return logf(1 + ::expf(s));
+}
+
+template <typename T>
+T logistic_fwd(T s) {
+    T v = ::expf(s);
+    return v / (v + 1);
+}
+
+template <typename T, typename A>
+T clamp_fwd(T s, A alpha, A beta) {
+    return s > alpha ? (T)(alpha) : s < beta ? (T)(beta) : s;
+}
+
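+/* These scalar helpers mirror the eltwise post-op formulas and feed the
+ * reference computation below, e.g. relu_fwd(-2.f, 0.1f) == -0.2f and
+ * bounded_relu_fwd(5.f, 3.f) == 3.f (illustrative values). */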
+template <typename data_t_src, typename data_t_wei,
+          typename data_t_acc, typename data_t_dst>
+void compute_ref_conv_eltwise_fwd(const test_convolution_sizes_t &c,
+        const memory &src, const memory &weights, const memory &bias,
+        const memory &dst, bool w_bias, algorithm elt_alg,
+        float elt_alpha, float elt_beta)
+{
+    data_t_src *src_data = (data_t_src *)src.get_data_handle();
+    data_t_wei *weights_data = (data_t_wei *)weights.get_data_handle();
+    data_t_dst *bias_data
+            = (data_t_dst *)(w_bias ? bias.get_data_handle() : nullptr);
+    data_t_dst *dst_data = (data_t_dst *)dst.get_data_handle();
+
+    const memory::desc src_d = src.get_primitive_desc().desc();
+    const memory::desc weights_d = weights.get_primitive_desc().desc();
+    const memory::desc dst_d = dst.get_primitive_desc().desc();
+
+    mkldnn::impl::parallel_nd(c.mb, c.ng, c.oc / c.ng, c.oh, c.ow,
+        [&](int n, int g, int oc, int oh, int ow) {
+            int oidx = n * c.oc * c.oh * c.ow
+                       + g * c.oc / c.ng * c.oh * c.ow
+                       + oc * c.oh * c.ow + oh * c.ow + ow;
+
+            int didx = map_index(dst_d, oidx);
+            dst_data[didx] = bias_data ?
+                             bias_data[map_index(
+                                     bias.get_primitive_desc().desc(),
+                                     g * c.oc / c.ng + oc)] :
+                             data_t_dst{0};
+            for (int ic = 0; ic < c.ic / c.ng; ic++) {
+                for (int kh = 0; kh < c.kh; kh++) {
+                    for (int kw = 0; kw < c.kw; kw++) {
+                        int iw = ow * c.strw
+                                 - c.padw + kw * (1 + c.dilw);
+                        int ih = oh * c.strh
+                                 - c.padh + kh * (1 + c.dilh);
+                        if (iw < 0 || iw >= c.iw) continue;
+                        if (ih < 0 || ih >= c.ih) continue;
+                        int iidx = n * c.ic * c.ih * c.iw
+                                   + g * c.ic / c.ng * c.ih * c.iw
+                                   + ic * c.ih * c.iw + ih * c.iw + iw;
+                        int widx = g * c.oc / c.ng * c.ic
+                                   / c.ng * c.kh * c.kw
+                                   + oc * c.ic / c.ng * c.kh * c.kw
+                                   + ic * c.kh * c.kw + kh * c.kw + kw;
+
+                        dst_data[didx]
+                                += src_data[map_index(src_d, iidx)]
+                                   * weights_data[map_index(
+                                weights_d, widx)];
+                    }
+                }
+            }
+
+            switch (elt_alg) {
+                case eltwise_relu:
+                    dst_data[didx] = relu_fwd(dst_data[didx], elt_alpha);
+                    break;
+                case eltwise_tanh:
+                    dst_data[didx] = tanh_fwd(dst_data[didx]);
+                    break;
+                case eltwise_elu:
+                    dst_data[didx] = elu_fwd(dst_data[didx], elt_alpha);
+                    break;
+                case eltwise_square:
+                    dst_data[didx] = square_fwd(dst_data[didx]);
+                    break;
+                case eltwise_abs:
+                    dst_data[didx] = abs_fwd(dst_data[didx]);
+                    break;
+                case eltwise_sqrt:
+                    dst_data[didx] = sqrt_fwd(dst_data[didx]);
+                    break;
+                case eltwise_linear:
+                    dst_data[didx] = linear_fwd(dst_data[didx], elt_alpha, elt_beta);
+                    break;
+                case eltwise_bounded_relu:
+                    dst_data[didx] = bounded_relu_fwd(dst_data[didx], elt_alpha);
+                    break;
+                case eltwise_soft_relu:
+                    dst_data[didx] = soft_relu_fwd(dst_data[didx]);
+                    break;
+                case eltwise_logistic:
+                    dst_data[didx] = logistic_fwd(dst_data[didx]);
+                    break;
+                default:
+                    assert(!"unknown alg_kind");
+            }
+        }
+    );
+}
+
+template <typename data_t_src, typename data_t_wei,
+          typename data_t_acc, typename data_t_dst>
+class convolution_eltwise_test
+    : public ::testing::TestWithParam<test_convolution_eltwise_params_t> {
+protected:
+    virtual void SetUp()
+    {
+        test_convolution_eltwise_params_t p
+                = ::testing::TestWithParam<
+                test_convolution_eltwise_params_t>::GetParam();
+
+        ASSERT_TRUE(p.engine_kind == engine::kind::cpu);
+        ASSERT_EQ(p.aalgorithm, convolution_direct);
+        auto eng = engine(p.engine_kind, 0);
+        float eltwise_alpha = p.eltwise_alpha;
+        float eltwise_beta = p.eltwise_beta;
+
+        memory::data_type data_type_src = data_traits<data_t_src>::data_type;
+        memory::data_type data_type_dst = data_traits<data_t_dst>::data_type;
+        memory::data_type data_type_wei = data_traits<data_t_wei>::data_type;
+
+        test_convolution_sizes_t cd = p.sizes;
+
+        auto c_src_desc = create_md({ cd.mb, cd.ic, cd.ih, cd.iw },
+                data_type_src, p.formats.src_format);
+        auto c_weights_desc = cd.ng > 1 ?
+                create_md({ cd.ng, cd.oc / cd.ng, cd.ic / cd.ng, cd.kh, cd.kw },
+                        data_type_wei, p.formats.weights_format) :
+                create_md({ cd.oc, cd.ic, cd.kh, cd.kw },
+                        data_type_wei, p.formats.weights_format);
+        auto c_dst_desc = create_md({ cd.mb, cd.oc, cd.oh, cd.ow },
+                data_type_dst, p.formats.dst_format);
+
+        auto c_src = memory({c_src_desc, eng});
+        auto c_weights = memory({c_weights_desc, eng});
+        auto c_dst = memory({c_dst_desc, eng});
+
+        auto dst_ref = memory({c_dst_desc, eng});
+
+        fill_data<data_t_src>(
+                c_src.get_primitive_desc().get_size() / sizeof(data_t_src),
+                (data_t_src *)c_src.get_data_handle(),
+                data_t_src(0), data_t_src(1));
+
+        fill_data<data_t_wei>(
+                c_weights.get_primitive_desc().get_size() / sizeof(data_t_wei),
+                (data_t_wei *)c_weights.get_data_handle(),
+                data_t_wei(0), data_t_wei(1));
+
+        bool with_bias = p.formats.bias_format != memory::format::format_undef;
+        auto c_bias_desc = with_bias ?
+                create_md({ cd.oc }, data_type_dst, p.formats.bias_format) :
+                create_md({}, data_type_dst, p.formats.bias_format);
+        auto c_bias = memory({c_bias_desc, eng});
+        if (with_bias) {
+            fill_data<data_t_dst>(
+                    c_bias.get_primitive_desc().get_size() / sizeof(data_t_dst),
+                    (data_t_dst *)c_bias.get_data_handle(), 1., true);
+        }
+
+        std::vector<int> padR = { cd.padh, cd.padw };
+        for (int i = 0; i < 2; ++i) {
+            if ((cd.ih - ((cd.kh - 1) * (cd.dilh + 1) + 1) + cd.padh + padR[0])
+                / cd.strh + 1 != cd.oh)
+                ++padR[0];
+            if ((cd.iw - ((cd.kw - 1) * (cd.dilw + 1) + 1) + cd.padw + padR[1])
+                / cd.strw + 1 != cd.ow)
+                ++padR[1];
+        }
+
+        auto test = [&]() {
+            mkldnn::post_ops ops;
+            ops.append_eltwise(1.0, p.alg, p.eltwise_alpha, p.eltwise_beta);
+
+            mkldnn::primitive_attr attr;
+            attr.set_post_ops(ops);
+
+            auto conv_desc = with_bias
+                ? convolution_forward::desc(prop_kind::forward_scoring,
+                        p.aalgorithm, c_src_desc, c_weights_desc, c_bias_desc,
+                        c_dst_desc, { cd.strh, cd.strw }, { cd.dilh, cd.dilw },
+                        { cd.padh, cd.padw }, padR, padding_kind::zero)
+                : convolution_forward::desc(prop_kind::forward_scoring,
+                        p.aalgorithm, c_src_desc, c_weights_desc, c_dst_desc,
+                        { cd.strh, cd.strw }, { cd.dilh, cd.dilw },
+                        { cd.padh, cd.padw }, padR, padding_kind::zero);
+
+            auto conv_primitive_desc =
+                convolution_forward::primitive_desc(conv_desc, attr, eng);
+
+            auto conv = with_bias
+                ? convolution_forward(conv_primitive_desc,
+                        c_src, c_weights, c_bias, c_dst)
+                : convolution_forward(conv_primitive_desc,
+                        c_src, c_weights, c_dst);
+            std::vector<primitive> pipeline;
+            pipeline.push_back(conv);
+
+            stream(stream::kind::lazy).submit(pipeline).wait();
+        };
+
+        if (catch_expected_failures(test, p.expect_to_fail, p.expected_status))
+            return;
+
+        compute_ref_conv_eltwise_fwd<data_t_src, data_t_wei, data_t_acc,
+            data_t_dst>(cd, c_src, c_weights, c_bias, dst_ref, with_bias,
+                        p.alg, eltwise_alpha, eltwise_beta);
+        compare_data<data_t_dst>(dst_ref, c_dst);
+    }
+};
+
+}
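
The fused pipeline built in SetUp() attaches the eltwise as a post-op on the convolution, which the reference above emulates by applying the activation per output element after accumulation. For comparison, an unfused equivalent with the same v0.x API would run a standalone eltwise primitive on the convolution output; a sketch reusing c_dst_desc, c_dst, conv, and eng from the test above (illustrative only):

    auto relu_desc = eltwise_forward::desc(prop_kind::forward_scoring,
            algorithm::eltwise_relu, c_dst_desc, /*alpha=*/0.f, /*beta=*/0.f);
    auto relu_pd = eltwise_forward::primitive_desc(relu_desc, eng);
    // in-place activation on the convolution output
    auto relu = eltwise_forward(relu_pd, c_dst, c_dst);

    std::vector<primitive> unfused = { conv, relu };
    stream(stream::kind::lazy).submit(unfused).wait();

Fusion avoids the extra pass over c_dst, which is the point of the post-ops attribute.
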
index dc8550a..e3f2ac5 100644
@@ -50,6 +50,11 @@ void compute_ref_conv_fwd(const test_convolution_sizes_t &c,
     size_t padded_ic = src_d.data.layout_desc.blocking.padding_dims[1];
     size_t padded_oc = dst_d.data.layout_desc.blocking.padding_dims[1];
 
+    size_t padded_ic_w = weights_d.data.format == mkldnn_OhIw8o4i ? weights_d.data.layout_desc.blocking.padding_dims[1] :
+                                                                    src_d.data.layout_desc.blocking.padding_dims[1];
+    size_t padded_oc_w = weights_d.data.format == mkldnn_OhIw8o4i ? weights_d.data.layout_desc.blocking.padding_dims[0] :
+                                                                    dst_d.data.layout_desc.blocking.padding_dims[1];
+
     mkldnn::impl::parallel_nd(c.mb, c.ng, c.oc / c.ng, c.oh, c.ow,
         [&](int n, int g, int oc, int oh, int ow) {
             data_t_acc a = 0;
@@ -65,14 +70,17 @@ void compute_ref_conv_fwd(const test_convolution_sizes_t &c,
                         size_t iidx = n * padded_ic * c.ih * c.iw
                             + g * padded_ic / c.ng * c.ih * c.iw
                             + ic * c.ih * c.iw + ih * c.iw + iw;
-                        size_t widx = g * padded_oc / c.ng * padded_ic
+                        size_t widx = g * padded_oc_w / c.ng * padded_ic_w
                             / c.ng * c.kh * c.kw
-                            + oc * padded_ic / c.ng * c.kh * c.kw
+                            + oc * padded_ic_w / c.ng * c.kh * c.kw
                             + ic * c.kh * c.kw + kh * c.kw + kw;
+
+                        int iidx_ = map_index(src_d, iidx);
+                        int widx_ = map_index(weights_d, widx);
+
                         a += ((data_t_acc)
-                            src_data[map_index(src_d, iidx)])
-                            *  weights_data[map_index(
-                            weights_d, widx)];
+                            src_data[iidx_]
+                            *  weights_data[widx_]);
                     }
                 }
             }
@@ -154,8 +162,7 @@ protected:
         auto c_bias = test_memory(c_bias_desc, eng);
         auto c_dst = test_memory(c_dst_desc, eng);
 
-        std::shared_ptr<data_t_dst>
-            ref_dst_data(new data_t_dst[c_dst.get_size()]);
+        std::vector<data_t_dst> ref_dst_data(c_dst.get_size());
 
         // Only true for dense format
         fill_data<data_t_dst>(c_dst.get_size() / sizeof(data_t_dst),
@@ -202,7 +209,7 @@ protected:
         s.submit(pipeline).wait();
 
         auto ref_memory = memory(memory::primitive_desc(c_dst_desc, eng),
-                ref_dst_data.get());
+                &ref_dst_data[0]);
         compute_ref_conv_fwd<data_t_src,data_t_wei,data_t_acc,data_t_dst>(
                 cd, attr, c_src_desc, c_weights_desc, c_bias_desc, c_dst_desc,
                 c_src.get(), c_weights.get(), c_bias.get(), ref_memory);
index bc23a46..7a1618f 100644
@@ -28,10 +28,68 @@ TEST_P(convolution_test, TestConvolution)
 {
 }
 
-#define TEST_PARAM_ATTR
+//#define TEST_PARAM_ATTR
 #define U8S8
 #define DIRECTION_FORWARD
 #include "convolution_common.h"
-#undef TEST_PARAM_ATTR
+
+INST_TEST_CASE(SimpleSmall_Blocked_Padded_Channels,
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 7, 3, 3, 5, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 15, 3, 3, 37, 4, 4, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 14, 4, 4, 1, 4, 4, 3, 3, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 7, 3, 3, 33, 3, 3, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 19, 2, 2, 22, 2, 2, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 126, 13, 13, 126, 13, 13, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 77, 13, 13, 99, 11, 11, 3, 3, 0, 0, 1, 1)
+);
+
+INST_TEST_CASE(SimpleSmall_Blocked_1x1_Padded_Channels,
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 13, 13, 35, 13, 13, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 7, 3, 3, 11, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 1, 4, 4, 58, 4, 4, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 27, 3, 3, 33, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 81, 2, 2, 81, 2, 2, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 126, 13, 13, 13, 13, 13, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 111, 13, 13, 71, 13, 13, 1, 1, 0, 0, 1, 1)
+);
+
+INST_TEST_CASE(SimpleSmall_Depthwise_Blocked_Padded_Channels,
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 126, 126, 10, 10, 126, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 77, 77, 9, 9, 77, 2, 2, 5, 5, 0, 0, 3, 3),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 68, 68, 26, 26, 68, 13, 13, 4, 4, 1, 1, 2, 2),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 33, 33, 111, 111, 33, 112, 112, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        1, 111, 111, 1, 2, 111, 1, 1, 3, 3, 1, 1, 1, 2),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        1, 29, 29, 16, 32, 29, 16, 18, 3, 3, 1, 2, 1, 2),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        1, 53, 53, 32, 16, 53, 16, 14, 3, 3, 1, 0, 2, 1),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        1, 13, 13, 32, 16, 13, 18, 16, 3, 3, 2, 1, 2, 1),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        1, 9, 9, 500, 500, 9, 698, 698, 3, 3, 100, 100, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        1, 2, 2, 500, 500, 2, 698, 698, 3, 3, 100, 100, 1, 1)
+);
+
+//#undef TEST_PARAM_ATTR
 
 }
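
Each PARAMS entry above packs the convolution shape in the test_convolution_sizes_t field order assumed by convolution_common.h: mb, ng, ic, ih, iw, oc, oh, ow, kh, kw, padh, padw, strh, strw. Decoded for one of the cases:

    // PARAMS(..., 2, 1, 15, 3, 3, 37, 4, 4, 3, 3, 1, 1, 1, 1):
    //   mb = 2, ng = 1 (no groups), ic = 15, ih = iw = 3,
    //   oc = 37, oh = ow = 4, kh = kw = 3, padh = padw = 1, strh = strw = 1
    // ic = 15 and oc = 37 are deliberately not multiples of the channel
    // block, which is what the Padded_Channels suites are probing.
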
index 9c9d9ba..bd04f94 100644
@@ -28,10 +28,68 @@ TEST_P(convolution_test, TestConvolution)
 {
 }
 
-#define TEST_PARAM_ATTR
+//#define TEST_PARAM_ATTR
 #define U8S8
 #define DIRECTION_FORWARD
 #include "convolution_common.h"
-#undef TEST_PARAM_ATTR
+
+INST_TEST_CASE(SimpleSmall_Blocked_Padded_Channels,
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 7, 3, 3, 5, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 15, 3, 3, 37, 4, 4, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 14, 4, 4, 1, 4, 4, 3, 3, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 7, 3, 3, 33, 3, 3, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 19, 2, 2, 22, 2, 2, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 126, 13, 13, 126, 13, 13, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 77, 13, 13, 99, 11, 11, 3, 3, 0, 0, 1, 1)
+);
+
+INST_TEST_CASE(SimpleSmall_Blocked_1x1_Padded_Channels,
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 3, 13, 13, 35, 13, 13, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 7, 3, 3, 11, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 1, 4, 4, 58, 4, 4, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 27, 3, 3, 33, 3, 3, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 81, 2, 2, 81, 2, 2, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 126, 13, 13, 13, 13, 13, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, FMT_WEIGHTS_BLOCKED, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 1, 111, 13, 13, 71, 13, 13, 1, 1, 0, 0, 1, 1)
+);
+
+INST_TEST_CASE(SimpleSmall_Depthwise_Blocked_Padded_Channels,
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 126, 126, 10, 10, 126, 10, 10, 3, 3, 1, 1, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 77, 77, 9, 9, 77, 2, 2, 5, 5, 0, 0, 3, 3),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 68, 68, 26, 26, 68, 13, 13, 4, 4, 1, 1, 2, 2),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        2, 33, 33, 111, 111, 33, 112, 112, 1, 1, 0, 0, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        1, 111, 111, 1, 2, 111, 1, 1, 3, 3, 1, 1, 1, 2),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        1, 29, 29, 16, 32, 29, 16, 18, 3, 3, 1, 2, 1, 2),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        1, 53, 53, 32, 16, 53, 16, 14, 3, 3, 1, 0, 2, 1),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        1, 13, 13, 32, 16, 13, 18, 16, 3, 3, 2, 1, 2, 1),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        1, 9, 9, 500, 500, 9, 698, 698, 3, 3, 100, 100, 1, 1),
+    PARAMS(FMT_DATA_BLOCKED, Goihw8g, FMT_BIAS, FMT_DATA_BLOCKED,
+        1, 2, 2, 500, 500, 2, 698, 698, 3, 3, 100, 100, 1, 1)
+);
+
+//#undef TEST_PARAM_ATTR
 
 }
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm.cpp
deleted file mode 100644
index 0fe2c1f..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-/*******************************************************************************
-* Copyright 2018 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#include "mkldnn_test_common.hpp"
-#include "gtest/gtest.h"
-
-#include "mkldnn_types.h"
-#include "mkldnn.h"
-
-namespace mkldnn {
-struct test_params {
-    char transA;
-    char transB;
-    int M;
-    int N;
-    int K;
-    float alpha;
-    float beta;
-    int lda;
-    int ldb;
-    int ldc;
-
-    bool expect_to_fail;
-    mkldnn_status_t expected_status;
-};
-
-void ref_gemm(const char *transa, const char *transb, int m, int n, int k,
-        const float alpha, const float *a, int lda, const float *b,
-        int ldb, float beta, float *c, int ldc) {
-
-    const bool tr_a = transa && (*transa == 'T' || *transa == 't');
-    const bool tr_b = transb && (*transb == 'T' || *transb == 't');
-
-    auto pa = [=] (int i, int j) { return a[j*lda + i]; };
-    auto pb = [=] (int i, int j) { return b[j*ldb + i]; };
-    auto pc = [=] (int i, int j) { return c[j*ldc + i]; };
-
-    mkldnn::impl::parallel_nd(m, n, [&](int im, int in) {
-        float c_elem = (beta == 0.) ? 0. : pc(im, in) * beta;
-        for (int ik = 0; ik < k; ik++) {
-            const float a_elem = tr_a ? pa(ik, im) : pa(im, ik);
-            const float b_elem = tr_b ? pb(in, ik) : pb(ik, in);
-            c_elem += alpha * a_elem * b_elem;
-        }
-        c[in*ldc + im] = c_elem;
-    });
-}
-
-void compare(int M, int N, int ldc, float *C, float *C_ref) {
-    mkldnn::impl::parallel_nd(N, ldc, [&](int i, int j) {
-        float ref = C_ref[i*ldc + j];
-        float got = C[i*ldc + j];
-        float diff = got - ref;
-        float e = (std::abs(ref) > 1e-4) ? diff / ref : diff;
-        EXPECT_NEAR(e, 0.0, 1e-4)
-            << "Row: " << j << " Column: " << i;
-    });
-}
-
-class sgemm_test: public ::testing::TestWithParam<test_params> {
-protected:
-    virtual void SetUp() {
-        test_params p
-            = ::testing::TestWithParam<test_params>::GetParam();
-        catch_expected_failures([=](){Test();}, p.expect_to_fail,
-                    p.expected_status);
-    }
-    virtual void Test() {
-        mkldnn_status_t status;
-        test_params p
-            = ::testing::TestWithParam<test_params>::GetParam();
-        const bool tr_a = (p.transA == 'T' || p.transA == 't');
-        const bool tr_b = (p.transB == 'T' || p.transB == 't');
-        size_t sizeA = !tr_a ? p.lda * p.K : p.lda * p.M,
-                sizeB = !tr_b ? p.ldb * p.N : p.ldb * p.K,
-                sizeC = p.ldc * p.N;
-        float *A = nullptr, *B = nullptr, *C = nullptr, *C_ref = nullptr;
-        A = (float *)test_malloc(sizeA*sizeof(float));
-        B = (float *)test_malloc(sizeB*sizeof(float));
-        C = (float *)test_malloc(sizeC*sizeof(float));
-        C_ref = (float *)test_malloc(sizeC*sizeof(float));
-
-        fill_data<float>(sizeA, A);
-        fill_data<float>(sizeB, B);
-        fill_data<float>(sizeC, C);
-
-        mkldnn::impl::parallel_nd(p.N * p.ldc, [&](int i) { C_ref[i] = C[i]; });
-
-        status = mkldnn_sgemm(&p.transA, &p.transB, &p.M, &p.N, &p.K, &p.alpha, A,
-                &p.lda, B, &p.ldb, &p.beta, C, &p.ldc);
-        if (status != mkldnn_success)
-            throw error(status, "mkldnn_sgemm returned error");
-
-        ref_gemm(&p.transA, &p.transB, p.M, p.N, p.K, p.alpha, A, p.lda,
-                B, p.ldb, p.beta, C_ref, p.ldc);
-        compare(p.M, p.N, p.ldc, C, C_ref);
-
-        test_free((char *)A);
-        test_free((char *)B);
-        test_free((char *)C);
-        test_free((char *)C_ref);
-    }
-};
-TEST_P(sgemm_test, TestSGEMM) {}
-INSTANTIATE_TEST_CASE_P(TestSGEMM, sgemm_test, ::testing::Values(
-    test_params{'n', 'n', 3, 2, 1, 1.0, 0.0, 2, 5, 8, true, mkldnn_invalid_arguments},
-    test_params{'t', 'n', 3, 2, 2, 1.0, 0.0, 1, 5, 8, true, mkldnn_invalid_arguments},
-    test_params{'n', 't', 3, 2, 1, 1.0, 0.0, 3, 1, 8, true, mkldnn_invalid_arguments},
-    test_params{'n', 'd', 3, 2, 1, 1.0, 0.0, 3, 3, 3, true, mkldnn_invalid_arguments},
-
-    test_params{'N', 'n', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
-    test_params{'n', 'T', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
-    test_params{'T', 'N', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
-    test_params{'t', 't', 30, 20, 10, 2.0, 1.0, 60, 50, 80, false},
-    test_params{'n', 'n', 100, 100, 2, 1.0, 2.0, 100, 100, 100, false},
-    test_params{'n', 't', 100, 2, 100, 1.0, 2.0, 100, 100, 100, false},
-    test_params{'t', 'n', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false},
-    test_params{'t', 't', 2, 100, 100, 1.0, 2.0, 100, 100, 100, false},
-    test_params{'n', 'n', 2, 2, 10000, 1.0, 2.0, 2, 10000, 2, false},
-
-    test_params{'n', 'n', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false},
-    test_params{'n', 'n', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false},
-    test_params{'t', 'n', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false},
-    test_params{'t', 'n', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false},
-    test_params{'n', 't', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false},
-    test_params{'n', 't', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false},
-    test_params{'t', 't', 2000, 2000, 2000, 1.0, 0.0, 2000, 2000, 2000, false},
-    test_params{'t', 't', 3000, 3000, 3000, 1.0, 0.0, 3000, 3000, 3000, false}
-));
-}
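
Both the deleted monolithic test above and its templated replacement below (test_gemm_common.hpp) index matrices column-major, BLAS style:

    // element (i, j) of an m x n column-major matrix with leading dimension ld:
    //   a[j * ld + i]
    // -- exactly what the pa/pb/pc accessors in ref_gemm compute.
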
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_common.hpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_common.hpp
new file mode 100644
index 0000000..fa8e683
--- /dev/null
@@ -0,0 +1,316 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef TEST_GEMM_COMMON_H
+#define TEST_GEMM_COMMON_H
+
+#include "mkldnn_test_common.hpp"
+#include "gtest/gtest.h"
+
+#include "mkldnn_types.h"
+#include "mkldnn.h"
+
+#define CONCAT_WITH_UNDERSCORE_(a,b) a ## _ ## b
+#define CONCAT_WITH_UNDERSCORE(a,b) CONCAT_WITH_UNDERSCORE_(a,b)
+
+#define INST_TEST_CASE_(str, ...) INSTANTIATE_TEST_CASE_P( \
+        str, gemm_test, ::testing::Values(__VA_ARGS__))
+#define INST_TEST_CASE(str, ...) INST_TEST_CASE_( \
+        CONCAT_WITH_UNDERSCORE(str,TEST_CASE_NAME_PREFIX), __VA_ARGS__)
+
+namespace mkldnn {
+
+struct test_params {
+    char offsetc;
+    char transA;
+    char transB;
+    int M;
+    int N;
+    int K;
+    float alpha;
+    float beta;
+    int lda;
+    int ldb;
+    int ldc;
+
+    bool expect_to_fail;
+    mkldnn_status_t expected_status;
+};
+
+template <typename data_t>
+void ref_gemm(const char *transa, const char *transb, int m, int n, int k,
+        const data_t alpha, const data_t *a, int lda, const data_t *b,
+        int ldb, data_t beta, data_t *c, int ldc) {
+
+    const bool tr_a = transa && (*transa == 'T' || *transa == 't');
+    const bool tr_b = transb && (*transb == 'T' || *transb == 't');
+
+    auto pa = [=] (int i, int j) { return a[j*lda + i]; };
+    auto pb = [=] (int i, int j) { return b[j*ldb + i]; };
+    auto pc = [=] (int i, int j) { return c[j*ldc + i]; };
+
+    mkldnn::impl::parallel_nd(m, n, [&](int im, int in) {
+        data_t c_elem = (beta == 0.) ? 0. : pc(im, in) * beta;
+
+        for (int ik = 0; ik < k; ik++) {
+            const data_t a_elem = tr_a ? pa(ik, im) : pa(im, ik);
+            const data_t b_elem = tr_b ? pb(in, ik) : pb(ik, in);
+            c_elem += alpha * a_elem * b_elem;
+        }
+        c[in*ldc + im] = c_elem;
+    });
+}
+
+template <typename b_dt>
+void ref_gemm_s8x8s32(const char *transa, const char *transb,
+        const char *offsetc, int m, int n, int k, const float alpha,
+        const int8_t *A, int lda, const int8_t *ao, const b_dt *B, int ldb,
+        const int8_t *bo, const float beta, int32_t *C, int ldc,
+        const int32_t *co) {
+
+    bool OCisR = (*offsetc == 'R' || *offsetc == 'r');
+    bool OCisC = (*offsetc == 'C' || *offsetc == 'c');
+    bool AisN = (*transa == 'N' || *transa == 'n');
+    bool BisN = (*transb == 'N' || *transb == 'n');
+
+    size_t sizeA = AisN ? lda * k : lda * m;
+    size_t sizeB = BisN ? ldb * n : ldb * k;
+    size_t sizeC = ldc * n;
+
+    double *dA = (double *)test_malloc(sizeA * sizeof(double));
+    double *dB = (double *)test_malloc(sizeB * sizeof(double));
+    double *dC = (double *)test_malloc(sizeC * sizeof(double));
+
+    auto da_setter = [=] (int i, int j, double v) { dA[j * lda + i] = v; };
+    auto db_setter = [=] (int i, int j, double v) { dB[j * ldb + i] = v; };
+
+    auto ia_accessor = [=] (int i, int j) { return A[j * lda + i]; };
+    auto ib_accessor = [=] (int i, int j) { return B[j * ldb + i]; };
+
+    const int a_rows = AisN ? m : k;
+    const int a_cols = AisN ? k : m;
+    mkldnn::impl::parallel_nd(a_cols, a_rows, [&](int j, int i) {
+        da_setter(i, j,
+            static_cast<double>(ia_accessor(i, j)) + static_cast<double>(ao[0]));
+    });
+
+    const int b_rows = BisN ? k : n;
+    const int b_cols = BisN ? n : k;
+    mkldnn::impl::parallel_nd(b_cols, b_rows, [&](int j, int i) {
+        db_setter(i, j,
+            static_cast<double>(ib_accessor(i, j)) + static_cast<double>(bo[0]));
+    });
+
+    ref_gemm(transa, transb, m, n, k, 1.0, dA, lda, dB, ldb, 0.0, dC, ldc);
+
+    auto i2d = [=] (int32_t v) { return static_cast<double>(v); };
+    auto f2d = [=] (float v) { return static_cast<double>(v); };
+
+    mkldnn::impl::parallel_nd(n, m, [&] (int j, int i) {
+        double coffset = OCisR ? i2d(co[j]) : OCisC ? i2d(co[i]) : i2d(co[0]);
+        double val = ((beta == 0.0f) ? 0.0 : f2d(beta) * i2d(C[i + j * ldc]))
+            + f2d(alpha) * dC[i + j * ldc] + coffset;
+        C[i + j * ldc] =
+            static_cast<int32_t>(nearbyint(saturate<int32_t, double>(val)));
+    });
+
+    test_free((char *)dA);
+    test_free((char *)dB);
+    test_free((char *)dC);
+}
+
+template <typename T>
+void compare(int M, int N, int ldc, T *C, T *C_ref, int K = 1) {
+    mkldnn::impl::parallel_nd(N, ldc, [&](int i, int j) {
+        T ref = C_ref[i*ldc + j];
+        T got = C[i*ldc + j];
+        T diff = got - ref;
+        if (data_traits<T>::data_type == memory::data_type::f32) {
+            T e = (std::abs(ref) > 1e-4) ? diff / ref : diff;
+            EXPECT_NEAR(e, 0.0, 1e-4)
+                << "Row: " << j << " Column: " << i;
+        } else {
+            T eps = K / 1000 + 1;
+            EXPECT_NEAR(diff, 0, eps)
+                << "Row: " << j << " Column: " << i;
+        }
+    });
+}
+
+inline void get_matrix_size(const test_params &p, size_t &sizeA,
+        size_t &sizeB, size_t &sizeC) {
+    const bool tr_a = (p.transA == 'T' || p.transA == 't');
+    const bool tr_b = (p.transB == 'T' || p.transB == 't');
+    sizeA = !tr_a ? p.lda * p.K : p.lda * p.M;
+    sizeB = !tr_b ? p.ldb * p.N : p.ldb * p.K;
+    sizeC = p.ldc * p.N;
+}
+
+template <typename T>
+inline T* get_matrix_buffer(size_t n) {
+    return (T*)test_malloc(n * sizeof(T));
+}
+
+template <typename a_dt, typename b_dt, typename c_dt>
+inline void fill_matrix(size_t sizeA, size_t sizeB, size_t sizeC, size_t sizeco,
+        a_dt *A, b_dt *B, c_dt *C, a_dt *ao, a_dt *bo, c_dt *co) {
+    fill_data<a_dt>(sizeA, A);
+    fill_data<b_dt>(sizeB, B);
+    fill_data<c_dt>(sizeC, C);
+    if (ao != nullptr && bo != nullptr && co != nullptr) {
+        fill_data<a_dt>(1, ao);
+        fill_data<a_dt>(1, bo);
+        fill_data<c_dt>(sizeco, co);
+    }
+}
+
+template <typename a_dt, typename b_dt, typename c_dt>
+void run_test_gemm(const test_params &p) {}
+
+template <>
+void run_test_gemm<int8_t, uint8_t, int32_t>(const test_params &p) {
+    size_t sizeA, sizeB, sizeC;
+    get_matrix_size(p, sizeA, sizeB, sizeC);
+
+    int8_t  *A = get_matrix_buffer<int8_t>(sizeA);
+    uint8_t *B = get_matrix_buffer<uint8_t>(sizeB);
+    int32_t *C = get_matrix_buffer<int32_t>(sizeC);
+    int32_t *C_ref = get_matrix_buffer<int32_t>(sizeC);
+
+    bool OCisR = (p.offsetc == 'R' || p.offsetc == 'r');
+    bool OCisC = (p.offsetc == 'C' || p.offsetc == 'c');
+    size_t sizeco = OCisR ? p.N : OCisC ? p.M : 1;
+
+    int8_t ao, bo;
+    int32_t *co = get_matrix_buffer<int32_t>(sizeco);
+
+    fill_matrix<int8_t, uint8_t, int32_t>(sizeA, sizeB, sizeC, sizeco, A, B, C,
+        &ao, &bo, co);
+
+    mkldnn::impl::parallel_nd(p.ldc * p.N,
+        [&](int i) { C_ref[i] = static_cast<int32_t>(C[i]); });
+
+    auto status = mkldnn_gemm_s8u8s32(&p.transA, &p.transB, &p.offsetc,
+        &p.M, &p.N, &p.K, &p.alpha, A, &p.lda, &ao, B, &p.ldb, &bo,
+        &p.beta, C, &p.ldc, co);
+
+    if (status != mkldnn_success)
+        throw error(status, "mkldnn_gemm_s8u8s32 returned error");
+
+    ref_gemm_s8x8s32<uint8_t>(&p.transA, &p.transB, &p.offsetc, p.M, p.N,
+        p.K, p.alpha, A, p.lda, &ao, B, p.ldb, &bo, p.beta, C_ref,
+        p.ldc, co);
+
+    compare(p.M, p.N, p.ldc, C, C_ref, p.K);
+
+    test_free((char *)A);
+    test_free((char *)B);
+    test_free((char *)C);
+    test_free((char *)C_ref);
+    test_free((char *)co);
+}
+
+template <>
+void run_test_gemm<int8_t, int8_t, int32_t>(const test_params &p) {
+    size_t sizeA, sizeB, sizeC;
+    get_matrix_size(p, sizeA, sizeB, sizeC);
+
+    int8_t  *A = get_matrix_buffer<int8_t>(sizeA);
+    int8_t  *B = get_matrix_buffer<int8_t>(sizeB);
+    int32_t *C = get_matrix_buffer<int32_t>(sizeC);
+    int32_t *C_ref = get_matrix_buffer<int32_t>(sizeC);
+
+    bool OCisR = (p.offsetc == 'R' || p.offsetc == 'r');
+    bool OCisC = (p.offsetc == 'C' || p.offsetc == 'c');
+    size_t sizeco = OCisR ? p.N : OCisC ? p.M : 1;
+
+    int8_t ao, bo;
+    int32_t* co = get_matrix_buffer<int32_t>(sizeco);
+
+    fill_matrix<int8_t, int8_t, int32_t>(sizeA, sizeB, sizeC, sizeco, A, B, C,
+        &ao, &bo, co);
+
+    mkldnn::impl::parallel_nd(p.ldc * p.N,
+        [&](int i) { C_ref[i] = static_cast<int32_t>(C[i]); });
+
+    auto status = mkldnn_gemm_s8s8s32(&p.transA, &p.transB, &p.offsetc,
+        &p.M, &p.N, &p.K, &p.alpha, A, &p.lda, &ao, B, &p.ldb, &bo,
+        &p.beta, C, &p.ldc, co);
+
+    if (status != mkldnn_success)
+        throw error(status, "mkldnn_gemm_s8s8s32 returned error");
+
+    ref_gemm_s8x8s32<int8_t>(&p.transA, &p.transB, &p.offsetc, p.M, p.N,
+        p.K, p.alpha, A, p.lda, &ao, B, p.ldb, &bo, p.beta, C_ref,
+        p.ldc, co);
+
+    compare(p.M, p.N, p.ldc, C, C_ref, p.K);
+
+    test_free((char *)A);
+    test_free((char *)B);
+    test_free((char *)C);
+    test_free((char *)C_ref);
+    test_free((char *)co);
+}
+
+template <>
+void run_test_gemm<float, float, float>(const test_params &p) {
+    size_t sizeA, sizeB, sizeC;
+    get_matrix_size(p, sizeA, sizeB, sizeC);
+
+    float *A = get_matrix_buffer<float>(sizeA);
+    float *B = get_matrix_buffer<float>(sizeB);
+    float *C = get_matrix_buffer<float>(sizeC);
+    float *C_ref = get_matrix_buffer<float>(sizeC);
+
+    fill_matrix<float, float, float>(sizeA, sizeB, sizeC, 0, A, B, C,
+        nullptr, nullptr, nullptr);
+
+    mkldnn::impl::parallel_nd(p.N * p.ldc, [&](int i) { C_ref[i] = C[i]; });
+
+    auto status = mkldnn_sgemm(&p.transA, &p.transB, &p.M, &p.N, &p.K, &p.alpha,
+        A, &p.lda, B, &p.ldb, &p.beta, C, &p.ldc);
+    if (status == mkldnn_success) {
+        ref_gemm(&p.transA, &p.transB, p.M, p.N, p.K, p.alpha, A, p.lda, B, p.ldb,
+            p.beta, C_ref, p.ldc);
+        compare(p.M, p.N, p.ldc, C, C_ref);
+    }
+
+    test_free((char *)A);
+    test_free((char *)B);
+    test_free((char *)C);
+    test_free((char *)C_ref);
+
+    if (status != mkldnn_success)
+        throw error(status, "mkldnn_sgemm returned error");
+}
+
+template <typename a_dt, typename b_dt, typename c_dt>
+class gemm_test_common: public ::testing::TestWithParam<test_params> {
+protected:
+    virtual void SetUp() {
+        test_params p
+            = ::testing::TestWithParam<test_params>::GetParam();
+        catch_expected_failures([=](){Test();}, p.expect_to_fail,
+                    p.expected_status);
+    }
+    virtual void Test() {
+        test_params p = ::testing::TestWithParam<test_params>::GetParam();
+        run_test_gemm<a_dt, b_dt, c_dt>(p);
+    }
+};
+}
+#endif
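
ref_gemm_s8x8s32 above evaluates, in double precision, the extended-BLAS contract C := alpha * (A + ao) * (B + bo) + beta * C + offset, where the co offset broadcasts per row ('R'), per column ('C'), or as a scalar, and the result is rounded and saturated to int32. A self-contained check of that formula for a 1x1x1 case (hypothetical values, scalar offset):

    #include <cassert>
    #include <cmath>
    #include <cstdint>

    int main() {
        int8_t  A = -3, ao = 1;           // effective A element: -3 + 1 = -2
        uint8_t B = 5;  int8_t bo = -1;   // effective B element:  5 - 1 =  4
        float alpha = 2.0f, beta = 1.0f;
        int32_t C = 10, co = 7;           // offsetc == 'F' (scalar)
        double val = alpha * double(A + ao) * double(B + bo) + beta * C + co;
        assert(int32_t(std::nearbyint(val)) == 1);  // 2*(-8) + 10 + 7 = 1
        return 0;
    }
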
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_f32.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_f32.cpp
new file mode 100644
index 0000000..a9cec53
--- /dev/null
@@ -0,0 +1,33 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "mkldnn_test_common.hpp"
+#include "gtest/gtest.h"
+
+#include "mkldnn.h"
+#include "test_gemm_common.hpp"
+
+namespace mkldnn {
+
+using gemm_test = gemm_test_common<float, float, float>;
+
+TEST_P(gemm_test, TestGEMM)
+{}
+
+#define TEST_CASE_NAME_PREFIX fp32
+#define FP32
+#include "gemm_in.h"
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_s8s8s32.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_s8s8s32.cpp
new file mode 100644
index 0000000..74f9291
--- /dev/null
@@ -0,0 +1,33 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "mkldnn_test_common.hpp"
+#include "gtest/gtest.h"
+
+#include "mkldnn.h"
+#include "test_gemm_common.hpp"
+
+namespace mkldnn {
+
+using gemm_test = gemm_test_common<int8_t, int8_t, int32_t>;
+
+TEST_P(gemm_test, TestGEMM)
+{}
+
+#define TEST_CASE_NAME_PREFIX s8s8s32
+#define S8S8S32
+#include "gemm_in.h"
+}
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_s8u8s32.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_gemm_s8u8s32.cpp
new file mode 100644
index 0000000..836d799
--- /dev/null
@@ -0,0 +1,33 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "mkldnn_test_common.hpp"
+#include "gtest/gtest.h"
+
+#include "mkldnn.h"
+#include "test_gemm_common.hpp"
+
+namespace mkldnn {
+
+using gemm_test = gemm_test_common<int8_t, uint8_t, int32_t>;
+
+TEST_P(gemm_test, TestGEMM)
+{}
+
+#define TEST_CASE_NAME_PREFIX s8u8s32
+#define S8U8S32
+#include "gemm_in.h"
+}
index 2376cd9..c60885b 100644
@@ -64,4 +64,24 @@ TEST_F(pd_iter_test, TestReLUImpls) {
     mkldnn_primitive_desc_iterator_destroy(it);
 }
 
+TEST(pd_next_impl, TestEltwiseImpl) {
+    auto eng = engine(engine::kind::cpu, 0);
+    memory::desc md({8, 32, 4, 4}, memory::data_type::f32, memory::format::nChw8c);
+    memory data({md, eng});
+
+    eltwise_forward::desc ed(prop_kind::forward_training,
+            algorithm::eltwise_relu, md, 0, 0);
+    eltwise_forward::primitive_desc epd(ed, eng);
+
+    std::string impl0(epd.impl_info_str());
+    eltwise_forward(epd, data, data);
+
+    while (epd.next_impl()) {
+        std::string impl1(epd.impl_info_str());
+        eltwise_forward(epd, data, data);
+        EXPECT_NE(impl0, impl1);
+        impl0 = impl1;
+    }
+}
+
 }
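
The loop above exercises the implementation iterator this fork exposes: primitive_desc::next_impl() advances to the next implementation matching the same operation descriptor, and impl_info_str() names the current one, so the test asserts every step lands on a different kernel. The same mechanism can pin a particular implementation; a sketch, given the ed and eng objects built above (the "ref" substring is illustrative):

    eltwise_forward::primitive_desc pd(ed, eng);
    // walk candidates until a reference (non-JIT) implementation comes up
    while (std::string(pd.impl_info_str()).find("ref") == std::string::npos)
        if (!pd.next_impl())
            break;  // none found; pd stays on the last candidate
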
index b3f3378..6589e16 100644
@@ -696,6 +696,15 @@ INSTANTIATE_TEST_CASE_P(
             memory::format::nChw16c, { 2, 192, 56, 56, 1.0e-4f, 0.75f, 1.0f, 5, ACROSS } }
             ));
 
+// This tests compatibility with MKL-DNN 0.14
+INSTANTIATE_TEST_CASE_P(
+        TestLRNRegressionWeightFormat, lrn_test_float,
+        ::testing::Values(
+            lrn_test_params_float{ prop_kind::forward_training,
+            engine::kind::cpu, algorithm::lrn_across_channels, memory::format::oihw,
+            memory::format::oihw, { 2, 64, 56, 56, 1.0e-4f, 0.75f, 1.0f, 5, ACROSS } }
+            ));
+
 // Backward does not support WITHIN yet.
 /*
 INSTANTIATE_TEST_CASE_P(
index 559dc1c..4a557e8 100644
@@ -445,4 +445,42 @@ INSTANTIATE_TEST_CASE_P(
             memory::format::nChw8c, { 2, 256, 27, 27, 1.0e-4f, 0.75f, 1.0f, 5, WITHIN } }
             ));
 
+// This tests compatibility with MKL-DNN 0.14
+INSTANTIATE_TEST_CASE_P(
+        TestLRNRegressionWeightFormat, lrn_forward_test_float,
+        ::testing::Values(
+            lrn_fwd_test_params_float{ prop_kind::forward_training,
+            engine::kind::cpu, algorithm::lrn_across_channels, memory::format::oihw,
+            memory::format::oihw, { 2, 64, 56, 56, 1.0e-4f, 0.75f, 1.0f, 5, ACROSS } }
+        ));
+
+INSTANTIATE_TEST_CASE_P(
+        TestLRNForwardNCHWTail, lrn_forward_test_float,
+        ::testing::Values(
+            lrn_fwd_test_params_float{ prop_kind::forward_training,
+            engine::kind::cpu, algorithm::lrn_across_channels, memory::format::nchw,
+            memory::format::nchw, { 1, 64, 1, 9, 1.0e-4f, 0.75f, 1.0f, 5, ACROSS } }
+            , lrn_fwd_test_params_float{ prop_kind::forward_training,
+            engine::kind::cpu, algorithm::lrn_across_channels, memory::format::nchw,
+            memory::format::nchw, { 1, 64, 2, 9, 1.0e-4f, 0.75f, 1.0f, 5, ACROSS } }
+            , lrn_fwd_test_params_float{ prop_kind::forward_training,
+            engine::kind::cpu, algorithm::lrn_across_channels, memory::format::nchw,
+            memory::format::nchw, { 1, 64, 3, 9, 1.0e-4f, 0.75f, 1.0f, 5, ACROSS } }
+            , lrn_fwd_test_params_float{ prop_kind::forward_training,
+            engine::kind::cpu, algorithm::lrn_across_channels, memory::format::nchw,
+            memory::format::nchw, { 1, 64, 4, 9, 1.0e-4f, 0.75f, 1.0f, 5, ACROSS } }
+            , lrn_fwd_test_params_float{ prop_kind::forward_training,
+            engine::kind::cpu, algorithm::lrn_across_channels, memory::format::nchw,
+            memory::format::nchw, { 1, 64, 5, 9, 1.0e-4f, 0.75f, 1.0f, 5, ACROSS } }
+            , lrn_fwd_test_params_float{ prop_kind::forward_training,
+            engine::kind::cpu, algorithm::lrn_across_channels, memory::format::nchw,
+            memory::format::nchw, { 1, 64, 9, 6, 1.0e-4f, 0.75f, 1.0f, 5, ACROSS } }
+            , lrn_fwd_test_params_float{ prop_kind::forward_training,
+            engine::kind::cpu, algorithm::lrn_across_channels, memory::format::nchw,
+            memory::format::nchw, { 1, 64, 7, 9, 1.0e-4f, 0.75f, 1.0f, 5, ACROSS } }
+            , lrn_fwd_test_params_float{ prop_kind::forward_training,
+            engine::kind::cpu, algorithm::lrn_across_channels, memory::format::nchw,
+            memory::format::nchw, { 1, 64, 8, 9, 1.0e-4f, 0.75f, 1.0f, 5, ACROSS } }
+            ));
+
 }
index 56beaa0..7de9067 100644
@@ -48,18 +48,18 @@ TEST_F(memory_test, DataPaddingTest) {
     data_t *mem0_ptr = (data_t *)mem0.get_data_handle();
     fill_data<data_t>(N*C_16*H*W, mem0_ptr);
 
-    std::shared_ptr<data_t> mem1_shr_ptr(new data_t[phys_sz]);
-    data_t *mem1_ptr = mem1_shr_ptr.get();
-    std::memcpy((void*)mem1_ptr, mem0_ptr, phys_sz * sizeof(data_t));
+    std::vector<data_t> mem1_vec(phys_sz);
+    mem1_vec.assign(mem0_ptr,
+            mem0_ptr + mem0.get_primitive_desc().get_size() / sizeof(data_t));
 
     mkldnn::memory mem1({{{N, C, H, W}, memory::data_type::f32,
-            memory::format::nChw16c}, e}, mem1_ptr);
+            memory::format::nChw16c}, e}, &mem1_vec[0]);
 
     check_zero_tail<data_t>(0, mem1);
     check_zero_tail<data_t>(1, mem0);
 
     for (size_t i = 0; i < phys_sz; ++i)
-        EXPECT_NEAR(mem0_ptr[i], mem1_ptr[i], 1e-7) << i;
+        EXPECT_NEAR(mem0_ptr[i], mem1_vec[i], 1e-7) << i;
 }
 
 TEST_F(memory_test, WeightPaddingTest) {
@@ -73,18 +73,18 @@ TEST_F(memory_test, WeightPaddingTest) {
     data_t *mem0_ptr = (data_t *)mem0.get_data_handle();
     fill_data<data_t>(O_16*I_16*H*W, mem0_ptr);
 
-    std::shared_ptr<data_t> mem1_shr_ptr(new data_t[phys_sz]);
-    data_t *mem1_ptr = mem1_shr_ptr.get();
-    std::memcpy((void*)mem1_ptr, mem0_ptr, phys_sz * sizeof(data_t));
+    std::vector<data_t> mem1_vec(phys_sz);
+    mem1_vec.assign(mem0_ptr,
+            mem0_ptr + mem0.get_primitive_desc().get_size() / sizeof(data_t));
 
     mkldnn::memory mem1({{{O, I, H, W}, memory::data_type::f32,
-            memory::format::OIhw16i16o}, e}, mem1_ptr);
+            memory::format::OIhw16i16o}, e}, &mem1_vec[0]);
 
     check_zero_tail<data_t>(0, mem1);
     check_zero_tail<data_t>(1, mem0);
 
     for (size_t i = 0; i < phys_sz; ++i)
-        EXPECT_NEAR(mem0_ptr[i], mem1_ptr[i], 1e-7) << i;
+        EXPECT_NEAR(mem0_ptr[i], mem1_vec[i], 1e-7) << i;
 }
 
 }
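
Presumably the motivation for this change: before C++17, std::shared_ptr<data_t>(new data_t[n]) destroys its buffer with delete rather than delete[], which is undefined behavior for arrays. A std::vector owns the array correctly and still hands a raw pointer to the memory constructor:

    std::vector<float> buf(phys_sz);  // new[]/delete[] correctly paired
    float *raw = &buf[0];             // or buf.data() in C++11 and later
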
index a2dc932..f882c6c 100644
@@ -42,7 +42,7 @@ protected:
         p = ::testing::TestWithParam<decltype(p)>::GetParam();
         size = 1;
         for (auto &d: p.dims) size *= d;
-        data.reserve((size_t)size);
+        data.resize((size_t)size);
     }
 
     void CheckID() {
index 0f09cb2..e32c4f4 100644
@@ -137,8 +137,9 @@ void check_pool_bwd(const pool_bwd_test_params &p, const memory &diff_src,
     const memory::desc ws_d = ws.get_primitive_desc().desc();
 
     auto pd = p.test_pd;
-    data_t *ref_diff_src
-            = new data_t[(size_t)pd.mb * pd.c * pd.id * pd.ih * pd.iw];
+    std::vector<data_t>
+        ref_diff_src_vec((size_t)pd.mb * pd.c * pd.id * pd.ih * pd.iw);
+    data_t *ref_diff_src = &ref_diff_src_vec[0];
 
     auto apply_offset = [=](int index, int offset) {
         return (index > offset) ? index - offset : 0;
index 99867ff..aa1a191 100644
@@ -132,8 +132,9 @@ void check_pool_fwd(const pool_test_params &p, const memory &src,
                 num_summands = pd.kw * pd.kh * pd.kd;
             }
 
-            if (p.aalgorithm == pooling_avg_include_padding ||
-                p.aalgorithm == pooling_avg_exclude_padding) {
+            if ((p.aalgorithm == pooling_avg_include_padding ||
+                p.aalgorithm == pooling_avg_exclude_padding) &&
+                num_summands) {
                 acc_ref = out_round<data_t>(
                     (float)acc_ref / num_summands);
             }
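
The added && num_summands guard matters for the exclude-padding flavor: a window that falls entirely inside the padding contributes zero in-bounds elements, so the old code divided by zero. With the guard, acc_ref keeps its initial value for such outputs:

    // pooling_avg_exclude_padding, 3x3 window fully inside a large pad:
    //   num_summands == 0  ->  skip the divide, leave acc_ref unchanged
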
diff --git a/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_shuffle.cpp b/inference-engine/thirdparty/mkl-dnn/tests/gtests/test_shuffle.cpp
new file mode 100644
index 0000000..e73e589
--- /dev/null
@@ -0,0 +1,372 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <cmath>
+
+#include "mkldnn_test_common.hpp"
+#include "gtest/gtest.h"
+
+#include "mkldnn.hpp"
+
+namespace mkldnn {
+
+struct shuffle_test_params {
+    prop_kind aprop_kind;
+    engine::kind engine_kind;
+    memory::format data_format;
+    memory::dims dims;
+    int axis;
+    int group_size;
+    bool expect_to_fail;
+    mkldnn_status_t expected_status;
+};
+
+template <typename data_t>
+void check_shuffle(const shuffle_test_params &p, const memory &input,
+    const memory &output, int ROW)
+{
+    data_t *in_ptr = (data_t *)input.get_data_handle();
+    data_t *out_ptr = (data_t *)output.get_data_handle();
+
+    const memory::desc in_d = input.get_primitive_desc().desc();
+    const memory::desc out_d = output.get_primitive_desc().desc();
+
+    auto dims = in_d.data.dims;
+    auto ndims = in_d.data.ndims;
+    const int axis = p.axis;
+    size_t inner_size = 1, outer_size = 1;
+    const int axis_size = dims[axis];
+    const int padded_axis = in_d.data.layout_desc.blocking.padding_dims[axis];
+
+    auto rev_transpose = [=] (int a) {
+        int COL = axis_size / ROW;
+        int row = a / COL;
+        int col = a % COL;
+        return ROW * col + row;
+    };
+
+    for (int i = 0; i < axis ; ++i)
+        outer_size *= (size_t)dims[i];
+    for (int i = axis + 1; i < ndims; ++i)
+        inner_size *= (size_t)dims[i];
+    const size_t dim = padded_axis * inner_size;
+
+    mkldnn::impl::parallel_nd(outer_size, axis_size, inner_size,
+           [&](size_t ou, int a, size_t in) {
+        data_t refout = in_ptr[map_index(in_d, ou*dim +
+                                 rev_transpose(a)*inner_size + in)];
+        data_t out = out_ptr[map_index(out_d, ou*dim + a*inner_size + in)];
+        EXPECT_NEAR(out, refout, 0);
+    });
+}
+
+template <typename data_t>
+class shuffle_test : public ::testing::TestWithParam<shuffle_test_params> {
+private:
+    std::shared_ptr<test_memory> src;
+    std::shared_ptr<test_memory> dst;
+    std::shared_ptr<test_memory> diff_dst;
+    std::shared_ptr<test_memory> diff_src;
+    std::shared_ptr<memory::desc> src_desc;
+    std::shared_ptr<memory::desc> dst_desc;
+    std::shared_ptr<memory::desc> diff_dst_desc;
+    std::shared_ptr<memory::desc> diff_src_desc;
+    std::shared_ptr<shuffle_forward::primitive_desc> shuffle_fwd_prim_desc;
+    std::shared_ptr<shuffle_forward::primitive_desc> shuffle_bwd_prim_desc;
+    shuffle_test_params p;
+    memory::dims padR;
+    std::shared_ptr<engine> eng;
+    memory::data_type data_type;
+
+protected:
+    virtual void SetUp() {
+        p = ::testing::TestWithParam<decltype(p)>::GetParam();
+        catch_expected_failures([=](){Test();}, p.expect_to_fail,
+                    p.expected_status);
+    }
+
+    void Test() {
+        p = ::testing::TestWithParam<decltype(p)>::GetParam();
+
+        ASSERT_TRUE(p.engine_kind == engine::kind::cpu);
+        eng.reset(new engine(p.engine_kind, 0));
+        data_type = data_traits<data_t>::data_type;
+
+        src_desc.reset(new memory::desc(p.dims, data_type, p.data_format));
+        dst_desc.reset(new memory::desc(p.dims, data_type, p.data_format));
+        diff_dst_desc.reset(new memory::desc(p.dims, data_type, p.data_format));
+        diff_src_desc.reset(new memory::desc(p.dims, data_type, p.data_format));
+
+        bool is_training = p.aprop_kind == prop_kind::forward_training;
+
+        Forward();
+        if (is_training)
+            Backward();
+    }
+
+    void Forward() {
+        auto shuffle_desc = shuffle_forward::desc(p.aprop_kind, *src_desc,
+                 p.axis, p.group_size);
+        shuffle_fwd_prim_desc.reset(new shuffle_forward::
+                 primitive_desc(shuffle_desc, *eng));
+
+        src.reset(new test_memory(*src_desc, *eng));
+        dst.reset(new test_memory(*dst_desc, *eng));
+
+        fill_data<data_t>(src->get_size() / sizeof(data_t),
+                (data_t *)src->get().get_data_handle());
+        check_zero_tail<data_t>(1, src->get());
+        check_zero_tail<data_t>(1, dst->get());
+
+        // Execute
+        std::vector<primitive> pipeline;
+        auto st = stream(stream::kind::lazy);
+        auto s = shuffle_forward(*shuffle_fwd_prim_desc, src->get(), dst->get());
+        pipeline.push_back(s);
+        st.submit(pipeline).wait();
+
+        check_shuffle<data_t>(p, src->get(), dst->get(), p.group_size);
+    }
+
+    void Backward()
+    {
+        auto shuffle_desc = shuffle_backward::desc(*diff_dst_desc, p.axis,
+               p.group_size);
+        diff_dst.reset(new test_memory(*diff_dst_desc, *eng));
+        diff_src.reset(new test_memory(*diff_src_desc, *eng));
+
+        auto shuffle_prim_desc = shuffle_backward::primitive_desc(shuffle_desc,
+                *eng, *shuffle_fwd_prim_desc);
+
+        fill_data<data_t>(diff_dst->get_size() / sizeof(data_t),
+                (data_t *)diff_dst->get().get_data_handle());
+
+        check_zero_tail<data_t>(1, diff_dst->get());
+        check_zero_tail<data_t>(1, diff_src->get());
+
+        // Execute
+        std::vector<primitive> pipeline;
+        auto st = stream(stream::kind::lazy);
+        auto s = shuffle_backward(shuffle_prim_desc, diff_dst->get(),
+            diff_src->get());
+        pipeline.push_back(s);
+        st.submit(pipeline).wait();
+
+        const int axis_size = diff_dst_desc->data.dims[p.axis];
+        check_shuffle<data_t>(p, diff_dst->get(), diff_src->get(),
+            axis_size / p.group_size);
+    }
+};
+
+using shuffle_test_float = shuffle_test<float>;
+using shuffle_test_s8 = shuffle_test<int8_t>;
+using shuffle_test_u8 = shuffle_test<uint8_t>;
+
+#define INST_TEST_CASE(test) \
+TEST_P(test, TestsShuffle) {} \
+INSTANTIATE_TEST_CASE_P(TestShuffle_nChw16c, \
+        test, \
+        ::testing::Values( \
+            shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nChw16c, {2, 16, 4, 4}, 2, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nChw16c, {2, 64, 4, 4}, 2, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nChw16c, {2, 32, 4, 4}, 2, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nChw16c, {2, 16, 4, 4}, 1, 2 } \
+            )); \
+ \
+INSTANTIATE_TEST_CASE_P(TestShuffle_nChw16c_Tail, \
+        test, \
+        ::testing::Values( \
+            shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nChw16c, {2, 24, 4, 4}, 2, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nChw16c, {2, 66, 4, 4}, 1, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nChw16c, {2, 34, 4, 4}, 2, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nChw16c, {2, 12, 10, 10}, 1, 2 } \
+            )); \
+ \
+INSTANTIATE_TEST_CASE_P(TestShuffle_NCHW, test, \
+        ::testing::Values( \
+            shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nchw, {2, 10, 4, 4}, 2, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nchw, {2, 10, 4, 4}, 1, 5 } \
+            )); \
+ \
+INSTANTIATE_TEST_CASE_P(TestShuffle_NCDHW, test, \
+        ::testing::Values( \
+            shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::ncdhw, {2, 10, 2, 4, 4}, 2, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::ncdhw, {2, 10, 2, 4, 4}, 1, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::ncdhw, {2, 10, 2, 4, 4}, 1, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::ncdhw, {2, 10, 2, 4, 4}, 1, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::ncdhw, {2, 12, 1, 7, 7}, 1, 4 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::ncdhw, {2, 12, 2, 7, 7}, 1, 4 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::ncdhw, {2, 12, 3, 7, 7}, 1, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::ncdhw, {2, 12, 1, 7, 7}, 1, 4 } \
+            )); \
+ \
+INSTANTIATE_TEST_CASE_P(TestShuffleNHWC, test, \
+        ::testing::Values( \
+            shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nhwc, {2, 10, 4, 4}, 3, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nhwc, {2, 10, 4, 4}, 2, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nhwc, {2, 10, 4, 4}, 1, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nhwc, {2, 10, 4, 4}, 1, 2 } \
+            )); \
+ \
+INSTANTIATE_TEST_CASE_P(TestShuffle_nChw8c, test, \
+        ::testing::Values( \
+            shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nChw8c, {2, 16, 4, 4}, 2, 4 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nChw8c, {2, 16, 4, 4}, 2, 4 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nChw8c, {2, 16, 4, 4}, 1, 8 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nChw8c, {2, 16, 4, 4}, 1, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nChw8c, {1, 8, 1, 1}, 1, 4 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nChw8c, {1, 8, 1, 1}, 1, 2 } \
+            )); \
+ \
+INSTANTIATE_TEST_CASE_P(TestShuffle_nCdhw16c, test, \
+        ::testing::Values( \
+            shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nCdhw16c, \
+            {2, 16, 2, 4, 4}, 1, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nCdhw16c, \
+            {2, 16, 2, 4, 4}, 3, 4 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nCdhw16c, \
+            {2, 16, 2, 4, 4}, 1, 8 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nCdhw16c, \
+            {2, 16, 2, 4, 4}, 1, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nCdhw16c, \
+            {1, 16, 2, 1, 1}, 1, 4 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nCdhw16c, \
+            {1, 16, 2, 1, 1}, 1, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nCdhw16c, \
+            {1, 16, 2, 1, 1}, 1, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nCdhw16c, \
+            {1, 16, 2, 1, 1}, 1, 4 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nCdhw16c, \
+            {1, 32, 1, 5, 5}, 1, 4 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nCdhw16c, \
+            {1, 32, 1, 5, 5}, 1, 8 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nCdhw16c, \
+            {1, 32, 1, 5, 5}, 1, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nCdhw16c, \
+            {1, 32, 1, 15, 15}, 3, 5 } \
+            )); \
+ \
+INSTANTIATE_TEST_CASE_P(TestShuffle_OIHW, \
+        test, \
+        ::testing::Values( \
+            shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::oihw, {2, 16, 4, 4}, 2, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::oihw, {2, 64, 4, 4}, 2, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::oihw, {2, 32, 4, 4}, 2, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::oihw, {2, 16, 4, 4}, 1, 2 } \
+            )); \
+ \
+INSTANTIATE_TEST_CASE_P(TestShuffle_NC, test, \
+        ::testing::Values( \
+            shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nc, {10, 8}, 1, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nc, {10, 8}, 1, 4 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nc, {2, 32}, 0, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nc, {10, 32}, 0, 5 } \
+            )); \
+ \
+INSTANTIATE_TEST_CASE_P(TestShuffle_NCW, test, \
+        ::testing::Values( \
+            shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::ncw, {10, 8, 5}, 1, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::ncw, {10, 8, 5}, 1, 4 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::ncw, {2, 32, 5}, 0, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::ncw, {10, 32, 5}, 0, 5 } \
+            )); \
+ \
+INSTANTIATE_TEST_CASE_P(TestShuffle_X, test, \
+        ::testing::Values( \
+            shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::x, {10}, 0, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::x, {8}, 0, 4 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::x, {2}, 0, 2 } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::x, {10}, 0, 5 } \
+            )); \
+ \
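+/* EF cases: the group size does not divide the shuffle axis dimension (or the axis is out of range), so mkldnn_invalid_arguments is expected */ \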
+INSTANTIATE_TEST_CASE_P(TestShuffleEF_NCHW, \
+        test, \
+        ::testing::Values( \
+            shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nchw, {2, 15, 4, 4}, 1, 2, \
+            true, mkldnn_invalid_arguments } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nchw, {2, 64, 7, 7}, 2, 2, \
+            true, mkldnn_invalid_arguments  } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nchw, {2, 32, 11, 11}, 2, 2, \
+            true, mkldnn_invalid_arguments  } \
+            , shuffle_test_params{ prop_kind::forward_training, \
+            engine::kind::cpu, memory::format::nchw, {2, 16, 4, 4}, 4, 2, \
+            true, mkldnn_invalid_arguments  } \
+));
+
+INST_TEST_CASE(shuffle_test_float)
+INST_TEST_CASE(shuffle_test_s8)
+INST_TEST_CASE(shuffle_test_u8)
+
+}
index 2ed26a2..e938da6 100644
@@ -164,12 +164,8 @@ protected:
         auto mem_desc = memory::desc(p.dims, prec, p.memory_format);
         auto mem_prim_desc = memory::primitive_desc(mem_desc, eng);
 
-        // TODO: free
-        auto src_data = new data_t[mem_prim_desc.get_size()];
-        auto dst_data = new data_t[mem_prim_desc.get_size()];
-
-        auto src = memory(mem_prim_desc, src_data);
-        auto dst = memory(mem_prim_desc, dst_data);
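+        // let the library own the buffers (fixes the leak noted in the removed TODO)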
+        auto src = memory(mem_prim_desc);
+        auto dst = memory(mem_prim_desc);
 
         auto softmax_desc = softmax_forward::desc(p.aprop_kind, mem_desc,
                     p.axis);
@@ -178,7 +174,7 @@ protected:
         auto softmax = softmax_forward(softmax_prim_desc, src, dst);
 
         auto test_with_given_fill = [&](data_t mean, data_t var) {
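+            // get_size() returns bytes, while fill_data expects an element count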
-            fill_data<data_t>(mem_prim_desc.get_size(),
+            fill_data<data_t>(mem_prim_desc.get_size() / sizeof(data_t),
                     (data_t *)src.get_data_handle(), mean, var);
 
             stream(stream::kind::lazy).submit({softmax}).wait();
index 54d20b8..1b4a457 100644
@@ -27,6 +27,7 @@ struct sum_test_params {
     memory::format dst_format;
     memory::dims dims;
     std::vector<float> scale;
+    bool is_output_omitted;
     bool expect_to_fail;
     mkldnn_status_t expected_status;
 };
@@ -90,7 +91,6 @@ protected:
         sum_test_params p
             = ::testing::TestWithParam<sum_test_params>::GetParam();
 
-        ASSERT_EQ(p.srcs_format.size(), p.scale.size());
         const auto num_srcs = p.srcs_format.size();
 
         ASSERT_TRUE(p.engine_kind == engine::kind::cpu);
@@ -101,8 +101,11 @@ protected:
         std::vector<memory> srcs;
 
         for (size_t i = 0; i < num_srcs; i++) {
-            auto desc =
-                memory::desc(p.dims, data_type, p.srcs_format[i]);
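+            // memory::desc cannot take format::blocked directly; build an nchw
+            // descriptor and patch its raw format field below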
+            bool is_fmt_blocked = p.srcs_format[i] == memory::format::blocked;
+            auto desc = memory::desc(p.dims, data_type, is_fmt_blocked
+                ? memory::format::nchw
+                : p.srcs_format[i]);
+            if (is_fmt_blocked) desc.data.format = mkldnn_blocked;
             auto mpd = memory::primitive_desc(desc, eng);
             auto src_memory = memory(mpd);
             const size_t sz =
@@ -113,15 +116,26 @@ protected:
         }
 
         std::shared_ptr<memory> dst;
+        std::shared_ptr<sum::primitive_desc> sum_pd;
 
-        auto dst_desc = memory::desc(p.dims, data_type, p.dst_format);
-        auto sum_pd = sum::primitive_desc(dst_desc, p.scale, srcs_pd);
-        dst.reset(new memory(sum_pd.dst_primitive_desc()));
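+        // when the output is omitted, sum::primitive_desc deduces the
+        // destination descriptor from the sources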
+        if (p.is_output_omitted) {
+            ASSERT_NO_THROW(sum_pd.reset(
+                new sum::primitive_desc(p.scale, srcs_pd)));
+        } else {
+            bool is_fmt_blocked = p.dst_format == memory::format::blocked;
+            auto dst_desc = memory::desc(p.dims, data_type, is_fmt_blocked
+                ? memory::format::nchw
+                : p.dst_format);
+            if (is_fmt_blocked) dst_desc.data.format = mkldnn_blocked;
+            sum_pd.reset(
+                new sum::primitive_desc(dst_desc, p.scale, srcs_pd));
 
-        ASSERT_EQ(sum_pd.dst_primitive_desc().desc().data.format,
-                dst_desc.data.format);
-        ASSERT_EQ(sum_pd.dst_primitive_desc().desc().data.ndims,
-                dst_desc.data.ndims);
+            ASSERT_EQ(sum_pd->dst_primitive_desc().desc().data.format,
+                    dst_desc.data.format);
+            ASSERT_EQ(sum_pd->dst_primitive_desc().desc().data.ndims,
+                    dst_desc.data.ndims);
+        }
+        ASSERT_NO_THROW(dst.reset(new memory(sum_pd->dst_primitive_desc())));
 
         data_t *dst_data = (data_t *)dst->get_data_handle();
         const size_t sz =
@@ -135,8 +149,7 @@ protected:
         for (size_t i = 0; i < num_srcs; i++) {
             inputs.push_back(srcs[i]);
         }
-
-        auto c = sum(sum_pd, inputs, *dst);
+        auto c = sum(*sum_pd, inputs, *dst);
         std::vector<primitive> pipeline;
         pipeline.push_back(c);
         auto s = stream(stream::kind::eager);
@@ -150,65 +163,92 @@ protected:
 #define CASE_CC(ifmt0, ifmt1, ofmt, dims_, ef, st) \
     sum_test_params{engine::kind::cpu, \
         {memory::format::ifmt0, memory::format::ifmt1}, memory::format::ofmt, \
-        memory::dims dims_, {1.0f, 1.0f}, ef, st}
+        memory::dims dims_, {1.0f, 1.0f}, 0, ef, st}
 
-#define INST_TEST_CASE(test) \
+#define INST_TEST_CASE(test, omit_output) \
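+/* omit_output is forwarded to sum_test_params::is_output_omitted in every case below */ \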
 TEST_P(test, TestsSum) {} \
 INSTANTIATE_TEST_CASE_P(TestSum, test, ::testing::Values( \
     sum_test_params{engine::kind::cpu, \
+    {memory::format::blocked, memory::format::blocked}, memory::format::blocked, \
+    {2, 8, 4, 4}, {1.0f, 1.0f}, omit_output}, \
+    sum_test_params{engine::kind::cpu, \
+    {memory::format::nchw, memory::format::blocked}, memory::format::blocked, \
+    {2, 8, 4, 4}, {1.0f, 1.0f}, omit_output}, \
+    sum_test_params{engine::kind::cpu, \
+    {memory::format::blocked, memory::format::nchw}, memory::format::blocked, \
+    {2, 8, 4, 4}, {1.0f, 1.0f}, omit_output}, \
+    sum_test_params{engine::kind::cpu, \
+    {memory::format::nchw, memory::format::nchw}, memory::format::blocked, \
+    {2, 8, 4, 4}, {1.0f, 1.0f}, omit_output}, \
+    sum_test_params{engine::kind::cpu, \
     {memory::format::nchw, memory::format::nChw8c}, memory::format::nchw, \
-    {0, 7, 4, 4}, {1.0f, 1.0f}}, \
+    {0, 7, 4, 4}, {1.0f, 1.0f}, omit_output}, \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nchw, memory::format::nChw8c}, memory::format::nchw, \
-    {1, 0, 4, 4}, {1.0f, 1.0f}}, \
+    {1, 0, 4, 4}, {1.0f, 1.0f}, omit_output}, \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nchw, memory::format::nChw8c}, memory::format::nchw, \
-    {1, 8, 0, 4}, {1.0f, 1.0f}}, \
+    {1, 8, 0, 4}, {1.0f, 1.0f}, omit_output}, \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nchw, memory::format::nChw8c}, memory::format::nchw, \
-    {-1, 8, 4, 4}, {1.0f, 1.0f}, true, mkldnn_invalid_arguments}, \
+    {-1, 8, 4, 4}, {1.0f, 1.0f}, omit_output, true, mkldnn_invalid_arguments}, \
     \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nchw, memory::format::nChw8c}, memory::format::nchw, \
-    {1, 1024, 38, 50}, {1.0f, 1.0f}}, \
+    {1, 1024, 38, 50}, {1.0f, 1.0f}, omit_output}, \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nchw, memory::format::nchw}, memory::format::nchw, \
-    {2, 8, 2, 2}, {1.0f, 1.0f}}, \
+    {2, 8, 2, 2}, {1.0f, 1.0f}, omit_output}, \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nChw8c, memory::format::nChw8c}, memory::format::nChw8c, \
-    {2, 16, 3, 4}, {1.0f, 1.0f}}, \
+    {2, 16, 3, 4}, {1.0f, 1.0f}, omit_output}, \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nchw, memory::format::nchw}, memory::format::nChw8c, \
-    {2, 16, 2, 2}, {1.0f, 1.0f}}, \
+    {2, 16, 2, 2}, {1.0f, 1.0f}, omit_output}, \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nChw8c, memory::format::nChw8c}, memory::format::nchw, \
-    {2, 16, 3, 4}, {1.0f, 1.0f}}, \
+    {2, 16, 3, 4}, {1.0f, 1.0f}, omit_output}, \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nchw, memory::format::nchw}, memory::format::nchw, \
-    {2, 8, 2, 2}, {2.0f, 3.0f}}, \
+    {2, 8, 2, 2}, {2.0f, 3.0f}, omit_output}, \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nChw8c, memory::format::nChw8c}, memory::format::nChw8c,\
-    {2, 16, 3, 4}, {2.0f, 3.0f}}, \
+    {2, 16, 3, 4}, {2.0f, 3.0f}, omit_output}, \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nchw, memory::format::nchw}, memory::format::nChw8c, \
-    {2, 16, 2, 2}, {2.0f, 3.0f}}, \
+    {2, 16, 2, 2}, {2.0f, 3.0f}, omit_output}, \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nChw8c, memory::format::nChw8c}, memory::format::nchw, \
-    {2, 16, 3, 4}, {2.0f, 3.0f}}, \
+    {2, 16, 3, 4}, {2.0f, 3.0f}, omit_output}, \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nchw, memory::format::nChw8c}, memory::format::nchw, \
-    {5, 8, 3, 3}, {2.0f, 3.0f}}, \
+    {5, 8, 3, 3}, {2.0f, 3.0f}, omit_output}, \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nchw, memory::format::nChw8c}, memory::format::nchw, \
-    {32, 32, 13, 14}, {2.0f, 3.0f}}, \
+    {32, 32, 13, 14}, {2.0f, 3.0f}, omit_output}, \
     sum_test_params{engine::kind::cpu, \
     {memory::format::nChw16c, memory::format::nChw8c}, \
     memory::format::nChw16c, \
-    {2, 16, 3, 3}, {2.0f, 3.0f}} \
+    {2, 16, 3, 3}, {2.0f, 3.0f}, omit_output} \
+)); \
+\
+INSTANTIATE_TEST_CASE_P(TestSumEF, test, ::testing::Values( \
+    sum_test_params{engine::kind::cpu, \
+    {memory::format::nchw, memory::format::nChw8c}, memory::format::nchw, \
+    {1, 8, 4 ,4}, {1.0f}, 0, true, mkldnn_invalid_arguments}, \
+    sum_test_params{engine::kind::cpu, \
+    {memory::format::nchw, memory::format::nChw8c}, memory::format::nchw, \
+    {2, 8, 4 ,4}, {0.1f}, 0, true, mkldnn_invalid_arguments} \
 ));
 
+using sum_test_float_omit_output = sum_test<float,float>;
+using sum_test_u8_omit_output = sum_test<uint8_t,float>;
+using sum_test_s8_omit_output = sum_test<int8_t,float>;
+using sum_test_s32_omit_output = sum_test<int32_t,float>;
+
 using sum_test_float = sum_test<float,float>;
 using sum_test_u8 = sum_test<uint8_t,float>;
+using sum_test_s8 = sum_test<int8_t,float>;
 using sum_test_s32 = sum_test<int32_t,float>;
 
 using sum_cc_f32 = sum_test<float,float>;
@@ -221,9 +261,15 @@ INSTANTIATE_TEST_CASE_P(TestSumCornerCases, sum_cc_f32, ::testing::Values(
     ));
 #undef CASE_CC
 
-INST_TEST_CASE(sum_test_float)
-INST_TEST_CASE(sum_test_u8)
-INST_TEST_CASE(sum_test_s32)
+INST_TEST_CASE(sum_test_float_omit_output, 1)
+INST_TEST_CASE(sum_test_u8_omit_output, 1)
+INST_TEST_CASE(sum_test_s8_omit_output, 1)
+INST_TEST_CASE(sum_test_s32_omit_output, 1)
+
+INST_TEST_CASE(sum_test_float, 0)
+INST_TEST_CASE(sum_test_u8, 0)
+INST_TEST_CASE(sum_test_s8, 0)
+INST_TEST_CASE(sum_test_s32, 0)
 
 #undef INST_TEST_CASE
 }
index a45865f..0cf5045 100644
@@ -1,5 +1,18 @@
-# Copyright (C) 2018 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
+#===============================================================================
+# Copyright (c) 2016 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
 #
 #  Brief description: This cmake file replaces the original mkl-dnn build scripts
 #  for more convenient integration into the IE build process
 set (CMAKE_CXX_STANDARD 11)
 set (CMAKE_CXX_STANDARD_REQUIRED ON)
 
+function(detect_mkl LIBNAME)
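+    # locates mkl_cblas.h and the requested mklml library under ${MKL};
+    # exports MKLINC/MKLLIB (and MKLDLL on Windows) to the parent scope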
+    message(STATUS "Detecting Intel(R) MKL: trying ${LIBNAME}")
+    find_path(MKLINC mkl_cblas.h ${MKL}/include)
+    find_library(MKLLIB ${LIBNAME} "${MKL}/lib")
+
+    if(NOT MKLLIB OR NOT MKLINC)
+        message(FATAL_ERROR "${MKLINC} or ${MKLLIB} are not found")
+        return()
+    endif()
+
+    if(WIN32)
+        find_file(MKLDLL ${LIBNAME}.dll PATHS "${MKL}/lib")
+        if(NOT MKLDLL)
+            message(FATAL_ERROR "${LIBNAME} not found")
+            return()
+        endif()
+    endif()
+
+    set(MKLINC ${MKLINC} PARENT_SCOPE)
+    set(MKLLIB "${MKLLIB}" PARENT_SCOPE)
+    message(STATUS "Intel(R) MKL: include ${MKLINC}")
+    message(STATUS "Intel(R) MKL: lib ${MKLLIB}")
+
+    if(WIN32)
+        set(MKLDLL "${MKLDLL}" PARENT_SCOPE)
+        message(STATUS "Intel(R) MKL: dll ${MKLDLL}")
+    endif()
+endfunction()
+
 set(TARGET mkldnn)
 set(MKLDNN_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/mkl-dnn)
 
 if (THREADING STREQUAL "TBB")
     add_definitions(-DMKLDNN_THR=MKLDNN_THR_TBB)
-    include_directories(${TBB_INCLUDE_DIRS})
 elseif (THREADING STREQUAL "OMP")
     add_definitions(-DMKLDNN_THR=MKLDNN_THR_OMP)
 else()
@@ -47,21 +88,29 @@ if(WIN32)
     endif()
 endif()
 
-if(THREADING STREQUAL "OMP")
-    enable_omp()
-endif()
-
 add_library(${TARGET} STATIC ${HDR} ${SRC})
+set_ie_threading_interface_for(${TARGET})
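+# threading (OMP/TBB) compile and link settings are applied by the common IE
+# helper above, replacing the removed enable_omp()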
+
 if(GEMM STREQUAL "OPENBLAS")
     ## enable cblas_gemm from OpenBLAS package
     add_definitions(-DUSE_CBLAS)
     include_directories(${BLAS_INCLUDE_DIRS})
-    target_link_libraries(${TARGET} ${BLAS_LIBRARIES})
+    list(APPEND ${TARGET}_LINKER_LIBS ${BLAS_LIBRARIES})
 elseif (GEMM STREQUAL "MKL")
-    ## enable cblas_gemm from mklml package
-    include(MKL.cmake)
+    ## enable cblas_gemm from the mklml package
+if(WIN32)
+    detect_mkl("mklml")
+else()
+    if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+        detect_mkl("mklml_intel")
+    else()
+        detect_mkl("mklml_gnu")
+    endif()
 endif()
-## enable internal jit_gemm from mkl-dnn if neither MKL nor OPENBLAS defined
-
-target_link_libraries(${TARGET} ${${TARGET}_LINKER_LIBS})
+    add_definitions(-DUSE_MKL -DUSE_CBLAS)
+    include_directories(AFTER ${MKLINC})
+    list(APPEND ${TARGET}_LINKER_LIBS ${MKLLIB})
+endif()
+## enable internal jit_gemm from mkl-dnn if neither MKL nor OPENBLAS is defined
 
+target_link_libraries(${TARGET} PRIVATE ${${TARGET}_LINKER_LIBS})
\ No newline at end of file
diff --git a/inference-engine/thirdparty/ocv/opencv_hal_sse.hpp b/inference-engine/thirdparty/ocv/opencv_hal_sse.hpp
new file mode 100644
index 0000000..fe1327a
--- /dev/null
@@ -0,0 +1,2745 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_HAL_SSE_HPP
+#define OPENCV_HAL_SSE_HPP
+
+#include <algorithm>
+
+/**********************************
+ * YL: removed needless dependency:
+#include "opencv2/core/utility.hpp"
+**********************************/
+
+// DM: declare missing types
+using uint64 = uint64_t;
+using int64  = int64_t;
+
+#ifndef CV_StaticAssert
+#  define CV_StaticAssert static_assert
+#endif
+
+#ifndef CV_DECL_ALIGNED
+#  ifdef __GNUC__
+#    define CV_DECL_ALIGNED(x) __attribute__ ((aligned (x)))
+#  elif defined _MSC_VER
+#    define CV_DECL_ALIGNED(x) __declspec(align(x))
+#  else
+#    define CV_DECL_ALIGNED(x)
+#  endif
+#endif // CV_DECL_ALIGNED
+
+
+#define CV_SIMD128 1
+#define CV_SIMD128_64F 1
+#define CV_SIMD128_FP16 0  // no native operations with FP16 type.
+
+namespace cv
+{
+
+//! @cond IGNORED
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
+
+struct v_uint8x16
+{
+    typedef uchar lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 16 };
+
+    v_uint8x16() : val(_mm_setzero_si128()) {}
+    explicit v_uint8x16(__m128i v) : val(v) {}
+    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
+               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
+    {
+        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
+                            (char)v4, (char)v5, (char)v6, (char)v7,
+                            (char)v8, (char)v9, (char)v10, (char)v11,
+                            (char)v12, (char)v13, (char)v14, (char)v15);
+    }
+    uchar get0() const
+    {
+        return (uchar)_mm_cvtsi128_si32(val);
+    }
+
+    __m128i val;
+};
+
+struct v_int8x16
+{
+    typedef schar lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 16 };
+
+    v_int8x16() : val(_mm_setzero_si128()) {}
+    explicit v_int8x16(__m128i v) : val(v) {}
+    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
+              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
+    {
+        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
+                            (char)v4, (char)v5, (char)v6, (char)v7,
+                            (char)v8, (char)v9, (char)v10, (char)v11,
+                            (char)v12, (char)v13, (char)v14, (char)v15);
+    }
+    schar get0() const
+    {
+        return (schar)_mm_cvtsi128_si32(val);
+    }
+
+    __m128i val;
+};
+
+struct v_uint16x8
+{
+    typedef ushort lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 8 };
+
+    v_uint16x8() : val(_mm_setzero_si128()) {}
+    explicit v_uint16x8(__m128i v) : val(v) {}
+    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
+    {
+        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
+                             (short)v4, (short)v5, (short)v6, (short)v7);
+    }
+    ushort get0() const
+    {
+        return (ushort)_mm_cvtsi128_si32(val);
+    }
+
+    __m128i val;
+};
+
+struct v_int16x8
+{
+    typedef short lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 8 };
+
+    v_int16x8() : val(_mm_setzero_si128()) {}
+    explicit v_int16x8(__m128i v) : val(v) {}
+    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
+    {
+        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
+                             (short)v4, (short)v5, (short)v6, (short)v7);
+    }
+    short get0() const
+    {
+        return (short)_mm_cvtsi128_si32(val);
+    }
+
+    __m128i val;
+};
+
+struct v_uint32x4
+{
+    typedef unsigned lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 4 };
+
+    v_uint32x4() : val(_mm_setzero_si128()) {}
+    explicit v_uint32x4(__m128i v) : val(v) {}
+    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+    {
+        val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
+    }
+    unsigned get0() const
+    {
+        return (unsigned)_mm_cvtsi128_si32(val);
+    }
+
+    __m128i val;
+};
+
+struct v_int32x4
+{
+    typedef int lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 4 };
+
+    v_int32x4() : val(_mm_setzero_si128()) {}
+    explicit v_int32x4(__m128i v) : val(v) {}
+    v_int32x4(int v0, int v1, int v2, int v3)
+    {
+        val = _mm_setr_epi32(v0, v1, v2, v3);
+    }
+    int get0() const
+    {
+        return _mm_cvtsi128_si32(val);
+    }
+
+    __m128i val;
+};
+
+struct v_float32x4
+{
+    typedef float lane_type;
+    typedef __m128 vector_type;
+    enum { nlanes = 4 };
+
+    v_float32x4() : val(_mm_setzero_ps()) {}
+    explicit v_float32x4(__m128 v) : val(v) {}
+    v_float32x4(float v0, float v1, float v2, float v3)
+    {
+        val = _mm_setr_ps(v0, v1, v2, v3);
+    }
+    float get0() const
+    {
+        return _mm_cvtss_f32(val);
+    }
+
+    __m128 val;
+};
+
+struct v_uint64x2
+{
+    typedef uint64 lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 2 };
+
+    v_uint64x2() : val(_mm_setzero_si128()) {}
+    explicit v_uint64x2(__m128i v) : val(v) {}
+    v_uint64x2(uint64 v0, uint64 v1)
+    {
+        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
+    }
+    uint64 get0() const
+    {
+        int a = _mm_cvtsi128_si32(val);
+        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
+        return (unsigned)a | ((uint64)(unsigned)b << 32);
+    }
+
+    __m128i val;
+};
+
+struct v_int64x2
+{
+    typedef int64 lane_type;
+    typedef __m128i vector_type;
+    enum { nlanes = 2 };
+
+    v_int64x2() : val(_mm_setzero_si128()) {}
+    explicit v_int64x2(__m128i v) : val(v) {}
+    v_int64x2(int64 v0, int64 v1)
+    {
+        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
+    }
+    int64 get0() const
+    {
+        int a = _mm_cvtsi128_si32(val);
+        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
+        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
+    }
+
+    __m128i val;
+};
+
+struct v_float64x2
+{
+    typedef double lane_type;
+    typedef __m128d vector_type;
+    enum { nlanes = 2 };
+
+    v_float64x2() : val(_mm_setzero_pd()) {}
+    explicit v_float64x2(__m128d v) : val(v) {}
+    v_float64x2(double v0, double v1)
+    {
+        val = _mm_setr_pd(v0, v1);
+    }
+    double get0() const
+    {
+        return _mm_cvtsd_f64(val);
+    }
+
+    __m128d val;
+};
+
+namespace hal_sse_internal
+{
+    template <typename to_sse_type, typename from_sse_type>
+    to_sse_type v_sse_reinterpret_as(const from_sse_type& val);
+
+#define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
+    template<> inline \
+    to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
+    { return sse_cast_intrin(a); }
+
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd)
+    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP)
+}  // namespace hal_sse_internal
+
+#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
+inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
+inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
+template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
+{ return _Tpvec(cast(a.val)); }
+
+OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
+OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
+
+inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
+inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
+inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
+inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
+
+template<typename _Tpvec> inline
+v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
+template<typename _Tpvec> inline
+v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
+inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
+{ return v_float32x4(_mm_castsi128_ps(a.val)); }
+inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
+{ return v_float32x4(_mm_castsi128_ps(a.val)); }
+inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
+{ return v_float64x2(_mm_castsi128_pd(a.val)); }
+inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
+{ return v_float64x2(_mm_castsi128_pd(a.val)); }
+
+#define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
+inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
+{ return _Tpvec(_mm_castps_si128(a.val)); } \
+inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
+{ return _Tpvec(_mm_castpd_si128(a.val)); }
+
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
+OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
+
+inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; }
+inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; }
+inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); }
+inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); }
+
+//////////////// PACK ///////////////
+inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
+{
+    __m128i delta = _mm_set1_epi16(255);
+    return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
+                                       _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
+}
+
+inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
+{
+    __m128i delta = _mm_set1_epi16(255);
+    __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
+}
+
+inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
+{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
+{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
+
+template<int n> inline
+v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
+{
+    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
+    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
+    return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
+                                       _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
+}
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
+{
+    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
+    __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
+}
+
+template<int n> inline
+v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
+{
+    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
+    return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
+                                       _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
+{
+    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
+    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
+}
+
+inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
+{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
+
+inline void v_pack_store(schar* ptr, const v_int16x8& a)
+{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
+
+template<int n> inline
+v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
+{
+    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
+    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
+    return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
+                                     _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
+}
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
+{
+    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
+    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
+    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
+}
+
+
+// byte-wise "mask ? a : b"
+inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
+{
+#if CV_SSE4_1
+    return _mm_blendv_epi8(b, a, mask);
+#else
+    return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
+#endif
+}
+
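+// SSE2 lacks an unsigned 32->16 saturating pack: clamp lanes with the sign bit
+// set to 65535, bias by 32768 so the signed pack saturates at the right bounds,
+// then undo the bias.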
+inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
+{
+    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
+    __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32);
+    __m128i r = _mm_packs_epi32(a1, b1);
+    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
+}
+
+inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
+{
+    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
+    __m128i r = _mm_packs_epi32(a1, a1);
+    _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
+}
+
+template<int n> inline
+v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
+    __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
+    return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
+}
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
+    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
+    _mm_storel_epi64((__m128i*)ptr, a2);
+}
+
+inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
+{
+    __m128i delta32 = _mm_set1_epi32(32768);
+
+    // preliminarily saturate negative values to zero
+    __m128i a1 = _mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, _mm_set1_epi32(0)));
+    __m128i b1 = _mm_and_si128(b.val, _mm_cmpgt_epi32(b.val, _mm_set1_epi32(0)));
+
+    __m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32));
+    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
+}
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
+{
+    __m128i delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(a.val, delta32);
+    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
+    _mm_storel_epi64((__m128i*)ptr, r);
+}
+
+template<int n> inline
+v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
+    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
+    __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
+    __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
+    return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
+    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
+    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
+    _mm_storel_epi64((__m128i*)ptr, a2);
+}
+
+inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
+{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); }
+
+inline void v_pack_store(short* ptr, const v_int32x4& a)
+{
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
+}
+
+template<int n> inline
+v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1));
+    return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
+                                     _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
+}
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x4& a)
+{
+    __m128i delta = _mm_set1_epi32(1 << (n-1));
+    __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
+}
+
+
+// [a0 0 | b0 0]  [a1 0 | b1 0]
+inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
+{
+    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
+    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
+    return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
+}
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
+{
+    __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
+    _mm_storel_epi64((__m128i*)ptr, a1);
+}
+
+// [a0 0 | b0 0]  [a1 0 | b1 0]
+inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
+{
+    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
+    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
+    return v_int32x4(_mm_unpacklo_epi32(v0, v1));
+}
+
+inline void v_pack_store(int* ptr, const v_int64x2& a)
+{
+    __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
+    _mm_storel_epi64((__m128i*)ptr, a1);
+}
+
+template<int n> inline
+v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
+{
+    uint64 delta = (uint64)1 << (n-1);
+    v_uint64x2 delta2(delta, delta);
+    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
+    __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
+    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
+    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
+    return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
+}
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
+{
+    uint64 delta = (uint64)1 << (n-1);
+    v_uint64x2 delta2(delta, delta);
+    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
+    __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
+    _mm_storel_epi64((__m128i*)ptr, a2);
+}
+
+inline __m128i v_sign_epi64(__m128i a)
+{
+    return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
+}
+
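+// SSE2 has no 64-bit arithmetic right shift; ((a ^ sign) >> imm) ^ sign
+// restores the sign bits after a logical shift.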
+inline __m128i v_srai_epi64(__m128i a, int imm)
+{
+    __m128i smask = v_sign_epi64(a);
+    return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
+}
+
+template<int n> inline
+v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
+{
+    int64 delta = (int64)1 << (n-1);
+    v_int64x2 delta2(delta, delta);
+    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
+    __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
+    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
+    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
+    return v_int32x4(_mm_unpacklo_epi32(v0, v1));
+}
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x2& a)
+{
+    int64 delta = (int64)1 << (n-1);
+    v_int64x2 delta2(delta, delta);
+    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
+    __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
+    _mm_storel_epi64((__m128i*)ptr, a2);
+}
+
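+// 4x4 matrix * vector with m0..m3 as the matrix columns: broadcast each lane
+// of v and accumulate v[i] * m[i].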
+inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
+                            const v_float32x4& m1, const v_float32x4& m2,
+                            const v_float32x4& m3)
+{
+    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
+    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
+    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
+    __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);
+
+    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
+}
+
+inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
+                               const v_float32x4& m1, const v_float32x4& m2,
+                               const v_float32x4& a)
+{
+    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
+    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
+    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
+
+    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
+}
+
+#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
+    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
+    { \
+        return _Tpvec(intrin(a.val, b.val)); \
+    } \
+    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
+    { \
+        a.val = intrin(a.val, b.val); \
+        return a; \
+    }
+
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
+OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
+OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
+
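+// _mm_mul_epu32 multiplies only the even 32-bit lanes; compute the odd lanes
+// from shifted inputs and interleave the low 32-bit results back together.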
+inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
+{
+    __m128i c0 = _mm_mul_epu32(a.val, b.val);
+    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
+    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
+    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
+    return v_uint32x4(_mm_unpacklo_epi64(d0, d1));
+}
+inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
+{
+#if CV_SSE4_1
+    return v_int32x4(_mm_mullo_epi32(a.val, b.val));
+#else
+    __m128i c0 = _mm_mul_epu32(a.val, b.val);
+    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
+    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
+    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
+    return v_int32x4(_mm_unpacklo_epi64(d0, d1));
+#endif
+}
+inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
+{
+    a = a * b;
+    return a;
+}
+inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b)
+{
+    a = a * b;
+    return a;
+}
+
+inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
+                         v_int32x4& c, v_int32x4& d)
+{
+    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
+    __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
+    c.val = _mm_unpacklo_epi16(v0, v1);
+    d.val = _mm_unpackhi_epi16(v0, v1);
+}
+
+inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
+                         v_uint32x4& c, v_uint32x4& d)
+{
+    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
+    __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
+    c.val = _mm_unpacklo_epi16(v0, v1);
+    d.val = _mm_unpackhi_epi16(v0, v1);
+}
+
+inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
+                         v_uint64x2& c, v_uint64x2& d)
+{
+    __m128i c0 = _mm_mul_epu32(a.val, b.val);
+    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
+    c.val = _mm_unpacklo_epi64(c0, c1);
+    d.val = _mm_unpackhi_epi64(c0, c1);
+}
+
+inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); }
+inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); }
+
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
+{
+    return v_int32x4(_mm_madd_epi16(a.val, b.val));
+}
+
+inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
+{
+    return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val));
+}
+
+#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
+    OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
+    inline _Tpvec operator ~ (const _Tpvec& a) \
+    { \
+        return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
+    }
+
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
+OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
+
+inline v_float32x4 v_sqrt(const v_float32x4& x)
+{ return v_float32x4(_mm_sqrt_ps(x.val)); }
+
+inline v_float32x4 v_invsqrt(const v_float32x4& x)
+{
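+    // rsqrt estimate refined with one Newton-Raphson step: t <- t * (1.5 - 0.5 * x * t * t)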
+    const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
+    __m128 t = x.val;
+    __m128 h = _mm_mul_ps(t, _0_5);
+    t = _mm_rsqrt_ps(t);
+    t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
+    return v_float32x4(t);
+}
+
+inline v_float64x2 v_sqrt(const v_float64x2& x)
+{ return v_float64x2(_mm_sqrt_pd(x.val)); }
+
+inline v_float64x2 v_invsqrt(const v_float64x2& x)
+{
+    const __m128d v_1 = _mm_set1_pd(1.);
+    return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
+}
+
+#define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
+inline _Tpuvec v_abs(const _Tpsvec& x) \
+{ return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }
+
+OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)
+OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)
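+// branchless abs: f is the arithmetic sign fill (all-ones when negative),
+// s the logical sign bit (1 when negative); (x ^ f) + s negates negative lanes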
+inline v_uint32x4 v_abs(const v_int32x4& x)
+{
+    __m128i s = _mm_srli_epi32(x.val, 31);
+    __m128i f = _mm_srai_epi32(x.val, 31);
+    return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
+}
+inline v_float32x4 v_abs(const v_float32x4& x)
+{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
+inline v_float64x2 v_abs(const v_float64x2& x)
+{
+    return v_float64x2(_mm_and_pd(x.val,
+        _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
+}
+
+// TODO: exp, log, sin, cos
+
+#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
+inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(intrin(a.val, b.val)); \
+}
+
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
+
+inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
+{
+#if CV_SSE4_1
+    return v_int8x16(_mm_min_epi8(a.val, b.val));
+#else
+    __m128i delta = _mm_set1_epi8((char)-128);
+    return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
+                                                       _mm_xor_si128(b.val, delta))));
+#endif
+}
+inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
+{
+#if CV_SSE4_1
+    return v_int8x16(_mm_max_epi8(a.val, b.val));
+#else
+    __m128i delta = _mm_set1_epi8((char)-128);
+    return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
+                                                       _mm_xor_si128(b.val, delta))));
+#endif
+}
+inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
+{
+#if CV_SSE4_1
+    return v_uint16x8(_mm_min_epu16(a.val, b.val));
+#else
+    return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
+#endif
+}
+inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
+{
+#if CV_SSE4_1
+    return v_uint16x8(_mm_max_epu16(a.val, b.val));
+#else
+    return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
+#endif
+}
+inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
+{
+#if CV_SSE4_1
+    return v_uint32x4(_mm_min_epu32(a.val, b.val));
+#else
+    __m128i delta = _mm_set1_epi32((int)0x80000000);
+    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
+    return v_uint32x4(v_select_si128(mask, b.val, a.val));
+#endif
+}
+inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
+{
+#if CV_SSE4_1
+    return v_uint32x4(_mm_max_epu32(a.val, b.val));
+#else
+    __m128i delta = _mm_set1_epi32((int)0x80000000);
+    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
+    return v_uint32x4(v_select_si128(mask, a.val, b.val));
+#endif
+}
+inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
+{
+#if CV_SSE4_1
+    return v_int32x4(_mm_min_epi32(a.val, b.val));
+#else
+    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
+#endif
+}
+inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
+{
+#if CV_SSE4_1
+    return v_int32x4(_mm_max_epi32(a.val, b.val));
+#else
+    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
+#endif
+}
+
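+// Unsigned integer compares are built from signed ones: XOR-ing both operands
+// with the sign bit maps unsigned order onto signed order for _mm_cmpgt.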
+#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
+inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
+{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
+} \
+inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
+{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
+} \
+inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
+} \
+inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
+} \
+inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
+    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
+} \
+inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    __m128i smask = _mm_set1_##suffix(sbit); \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
+    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
+} \
+inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
+} \
+inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
+} \
+inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
+} \
+inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    __m128i not_mask = _mm_set1_epi32(-1); \
+    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
+}
+
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
+OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
+
+#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
+inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
+inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
+inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
+inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
+{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
+
+OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
+OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
+
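+// 64-bit integer equality is emulated through the double-precision compare,
+// since _mm_cmpeq_epi64 would require SSE4.1.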
+#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \
+inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
+{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
+inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
+{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }
+
+OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
+OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
+
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
+OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
+
+#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
+inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
+{ \
+    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
+} \
+inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
+{ \
+    __m128i smask = _mm_set1_epi32(smask32); \
+    __m128i a1 = _mm_xor_si128(a.val, smask); \
+    __m128i b1 = _mm_xor_si128(b.val, smask); \
+    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
+}
+
+OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
+OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)
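+
+// The 8/16-bit v_absdiff uses saturating subtraction: one of _mm_subs_epu(a,b)
+// and _mm_subs_epu(b,a) saturates to 0, so their sum is |a - b|. E.g. uchar
+// a = 10, b = 250: subs(10,250) = 0, subs(250,10) = 240 = |a - b|. Signed lanes
+// are first biased into unsigned range with the 0x80.. XOR mask.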
+
+inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
+{
+    return v_max(a, b) - v_min(a, b);
+}
+
+inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
+{
+    __m128i d = _mm_sub_epi32(a.val, b.val);
+    __m128i m = _mm_cmpgt_epi32(b.val, a.val);
+    return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
+}
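+
+// Sign trick above: m is all-ones exactly where b > a, and (d ^ m) - m equals
+// -d in that case (two's complement: ~d + 1) and d otherwise, i.e. |a - b|
+// without branches.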
+
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return a * b + c;
+}
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_fma(a, b, c);
+}
+
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+#if CV_FMA3
+    return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));
+#else
+    return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val));
+#endif
+}
+
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+#if CV_FMA3
+    return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));
+#else
+    return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val));
+#endif
+}
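+
+// With CV_FMA3 the float/double v_fma lowers to a single fused multiply-add
+// with one rounding step; the fallback performs mul then add with two
+// roundings, so the two paths can differ in the last ulp.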
+
+#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
+inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
+    return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
+} \
+inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    _Tpvec res = v_fma(a, a, b*b); \
+    return _Tpvec(_mm_sqrt_##suffix(res.val)); \
+} \
+inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return v_fma(a, a, b*b); \
+} \
+inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+{ \
+    return v_fma(a, b, c); \
+}
+
+OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
+OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
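+
+// Illustrative sketch of the helpers above (x, y, out are hypothetical float[4]
+// buffers):
+//   v_float32x4 a = v_load(x), b = v_load(y);
+//   v_store(out, v_magnitude(a, b));  // per lane sqrt(x*x + y*y), e.g. (3,4) -> 5
+//   v_store(out, v_muladd(a, b, a));  // fused under CV_FMA3, mul+add otherwise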
+
+#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
+inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
+} \
+inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(srai(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpuvec v_shl(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shl(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpuvec v_shr(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shr(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(srai(a.val, imm)); \
+}
+
+OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
+OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
+OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
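+
+// The imm-template forms v_shl<imm> / v_shr<imm> make the shift count a
+// compile-time constant, letting the compiler emit the immediate-operand shift
+// instructions; e.g. v_shr<1>(a) halves every unsigned lane.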
+
+namespace hal_sse_internal
+{
+    template <int imm,
+        bool is_invalid = ((imm < 0) || (imm > 16)),
+        bool is_first = (imm == 0),
+        bool is_half = (imm == 8),
+        bool is_second = (imm == 16),
+        bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
+    class v_sse_palignr_u8_class;
+
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, true, false, false, false, false>;
+
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, false, true, false, false, false>
+    {
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i&) const
+        {
+            return a;
+        }
+    };
+
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, false, false, true, false, false>
+    {
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i& b) const
+        {
+            return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);
+        }
+    };
+
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, false, false, false, true, false>
+    {
+    public:
+        inline __m128i operator()(const __m128i&, const __m128i& b) const
+        {
+            return b;
+        }
+    };
+
+    template <int imm>
+    class v_sse_palignr_u8_class<imm, false, false, false, false, true>
+    {
+#if CV_SSSE3
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i& b) const
+        {
+            return _mm_alignr_epi8(b, a, imm);
+        }
+#else
+    public:
+        inline __m128i operator()(const __m128i& a, const __m128i& b) const
+        {
+            enum { imm2 = (sizeof(__m128i) - imm) };
+            return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));
+        }
+#endif
+    };
+
+    template <int imm>
+    inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
+    {
+        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
+        return v_sse_palignr_u8_class<imm>()(a, b);
+    }
+}  // namespace hal_sse_internal
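+
+// v_sse_palignr_u8<imm> extracts 16 bytes starting at byte offset imm from the
+// concatenation (a, b); the specializations above pick the cheapest lowering
+// per imm and fall back to shift+OR where SSSE3's _mm_alignr_epi8 is missing.
+// Sketch: v_sse_palignr_u8<4>(a, b) yields bytes 4..19 of (a, b).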
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec &a)
+{
+    using namespace hal_sse_internal;
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
+        _mm_srli_si128(
+            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec &a)
+{
+    using namespace hal_sse_internal;
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
+        _mm_slli_si128(
+            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
+{
+    using namespace hal_sse_internal;
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
+        v_sse_palignr_u8<imm2>(
+            v_sse_reinterpret_as<__m128i>(a.val),
+            v_sse_reinterpret_as<__m128i>(b.val))));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
+{
+    using namespace hal_sse_internal;
+    enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
+        v_sse_palignr_u8<imm2>(
+            v_sse_reinterpret_as<__m128i>(b.val),
+            v_sse_reinterpret_as<__m128i>(a.val))));
+}
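+
+// Rotation semantics (illustrative, v_uint32x4): v_rotate_right<1>(a) yields
+// (a1 a2 a3 0), while the two-register form v_rotate_right<1>(a, b) fills the
+// vacated top lanes from b, yielding (a1 a2 a3 b0); v_rotate_left mirrors this
+// in the other direction.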
+
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+    return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
+                                     _mm_loadl_epi64((const __m128i*)ptr1))); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ _mm_store_si128((__m128i*)ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ _mm_stream_si128((__m128i*)ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+{ \
+    if ( mode == hal::STORE_UNALIGNED ) \
+        _mm_storeu_si128((__m128i*)ptr, a.val); \
+    else if ( mode == hal::STORE_ALIGNED_NOCACHE )  \
+        _mm_stream_si128((__m128i*)ptr, a.val); \
+    else \
+        _mm_store_si128((__m128i*)ptr, a.val); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }
+
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
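+
+// Illustrative round-trip with the ops above (src/dst are hypothetical uchar
+// buffers):
+//   v_uint8x16 v = v_load(src);                   // unaligned load
+//   v_store(dst, v);                              // unaligned store
+//   v_store(dst, v, hal::STORE_ALIGNED_NOCACHE);  // streaming store, dst aligned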
+
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(_mm_load_##suffix(ptr)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+    return _Tpvec(_mm_castsi128_##suffix( \
+        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
+                           _mm_loadl_epi64((const __m128i*)ptr1)))); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ _mm_storeu_##suffix(ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ _mm_store_##suffix(ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ _mm_stream_##suffix(ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
+{ \
+    if ( mode == hal::STORE_UNALIGNED ) \
+        _mm_storeu_##suffix(ptr, a.val); \
+    else if ( mode == hal::STORE_ALIGNED_NOCACHE )  \
+        _mm_stream_##suffix(ptr, a.val); \
+    else \
+        _mm_store_##suffix(ptr, a.val); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+    __m128i a1 = _mm_cast##suffix##_si128(a.val); \
+    _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
+}
+
+OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
+
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
+inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
+{ \
+    __m128i val = a.val; \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
+    return (scalartype)_mm_cvtsi128_si32(val); \
+} \
+inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
+{ \
+    __m128i val = a.val; \
+    __m128i smask = _mm_set1_epi16(sbit); \
+    val = _mm_xor_si128(val, smask); \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
+    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
+    return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
+}
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
+inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
+{ \
+    __m128i val = a.val; \
+    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
+    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
+    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
+    return (scalartype)_mm_cvtsi128_si32(val); \
+} \
+inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
+{ \
+    __m128i val = a.val; \
+    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
+    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
+    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
+    return (unsigned scalartype)_mm_cvtsi128_si32(val); \
+}
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)
+
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
+inline scalartype v_reduce_sum(const _Tpvec& a) \
+{ \
+    regtype val = a.val; \
+    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
+    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
+    return (scalartype)_mm_cvt##extract(val); \
+}
+
+#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
+inline scalartype v_reduce_##func(const _Tpvec& a) \
+{ \
+    scalartype CV_DECL_ALIGNED(16) buf[4]; \
+    v_store_aligned(buf, a); \
+    scalartype s0 = scalar_func(buf[0], buf[1]); \
+    scalartype s1 = scalar_func(buf[2], buf[3]); \
+    return scalar_func(s0, s1); \
+}
+
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
+
+inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
+                                 const v_float32x4& c, const v_float32x4& d)
+{
+#if CV_SSE3
+    __m128 ab = _mm_hadd_ps(a.val, b.val);
+    __m128 cd = _mm_hadd_ps(c.val, d.val);
+    return v_float32x4(_mm_hadd_ps(ab, cd));
+#else
+    __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
+    __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
+    return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
+#endif
+}
+
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
+OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
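+
+// Reduction strategies above: v_reduce_sum and the 16-bit min/max fold the
+// register against byte-shifted copies of itself (log2(n) steps), while the
+// 32-bit/float min/max variants spill to an aligned buffer and finish with
+// std::min/std::max; e.g. v_reduce_max over int32 lanes {7, 2, 9, 4} returns 9.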
+
+#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
+inline v_uint32x4 v_popcount(const _Tpvec& a) \
+{ \
+    __m128i m1 = _mm_set1_epi32(0x55555555); \
+    __m128i m2 = _mm_set1_epi32(0x33333333); \
+    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
+    __m128i p = a.val; \
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
+    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
+    p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
+    p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
+    return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
+}
+
+OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16)
+OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8)
+OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4)
+OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16)
+OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8)
+OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4)
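+
+// The popcount above is the classic SWAR bit count: pair sums (mask 0x5555...),
+// nibble sums (0x3333...), byte sums (0x0f0f...), then two byte-shifted adds
+// collapse each 32-bit lane to its total; e.g. a lane 0xF0F00000 yields 8.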
+
+#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
+inline int v_signmask(const _Tpvec& a) \
+{ \
+    return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \
+} \
+inline bool v_check_all(const _Tpvec& a) \
+{ return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
+inline bool v_check_any(const _Tpvec& a) \
+{ return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }
+
+#define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a)
+inline __m128i v_packq_epi32(__m128i a)
+{
+    __m128i b = _mm_packs_epi32(a, a);
+    return _mm_packs_epi16(b, b);
+}
+
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
+OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
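+
+// v_signmask gathers one sign bit per lane: wider lanes are first narrowed to
+// bytes (OPENCV_HAL_PACKS / v_packq_epi32) so a single _mm_movemask_epi8
+// suffices, and the AND keeps one bit per original lane; v_check_all and
+// v_check_any instead test the raw movemask against allmask.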
+
+#if CV_SSE4_1
+#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); \
+}
+
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, TBD, TBD, pd)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, TBD, TBD, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd)
+
+#else // CV_SSE4_1
+
+#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
+inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
+}
+
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
+// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
+OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
+#endif
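+
+// The pre-SSE4.1 v_select uses the xor-blend identity b ^ ((b ^ a) & mask):
+// where mask bits are set, b's bits cancel and a's remain; where clear, b is
+// returned unchanged. The mask is expected to be all-ones or all-zeros per
+// lane, as produced by the comparison operators above.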
+
+#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
+inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
+{ \
+    __m128i z = _mm_setzero_si128(); \
+    b0.val = _mm_unpacklo_##suffix(a.val, z); \
+    b1.val = _mm_unpackhi_##suffix(a.val, z); \
+} \
+inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
+{ \
+    __m128i z = _mm_setzero_si128(); \
+    return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
+} \
+inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
+{ \
+    b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
+    b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
+} \
+inline _Tpwsvec v_load_expand(const _Tps* ptr) \
+{ \
+    __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
+    return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
+}
+
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
+OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)
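+
+// Signed expansion above avoids a separate sign vector: unpacking a lane with
+// itself doubles its width with the value in the high half, and the arithmetic
+// right shift by the original lane width sign-extends it in place.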
+
+inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1)
+{
+    __m128i z = _mm_setzero_si128();
+    b0.val = _mm_unpacklo_epi32(a.val, z);
+    b1.val = _mm_unpackhi_epi32(a.val, z);
+}
+inline v_uint64x2 v_load_expand(const unsigned* ptr)
+{
+    __m128i z = _mm_setzero_si128();
+    return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z));
+}
+inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1)
+{
+    __m128i s = _mm_srai_epi32(a.val, 31);
+    b0.val = _mm_unpacklo_epi32(a.val, s);
+    b1.val = _mm_unpackhi_epi32(a.val, s);
+}
+inline v_int64x2 v_load_expand(const int* ptr)
+{
+    __m128i a = _mm_loadl_epi64((const __m128i*)ptr);
+    __m128i s = _mm_srai_epi32(a, 31);
+    return v_int64x2(_mm_unpacklo_epi32(a, s));
+}
+
+inline v_uint32x4 v_load_expand_q(const uchar* ptr)
+{
+    __m128i z = _mm_setzero_si128();
+    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
+    return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z));
+}
+
+inline v_int32x4 v_load_expand_q(const schar* ptr)
+{
+    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
+    a = _mm_unpacklo_epi8(a, a);
+    a = _mm_unpacklo_epi8(a, a);
+    return v_int32x4(_mm_srai_epi32(a, 24));
+}
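+
+// v_load_expand_q(schar*) sign-extends by replication: two self-unpacks place
+// each source byte in the top byte of a 32-bit lane, and the arithmetic shift
+// by 24 then propagates its sign, e.g. 0x80 -> -128.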
+
+#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
+{ \
+    b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
+    b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
+} \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
+    return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
+    return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
+{ \
+    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
+    c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
+    d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
+}
+
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
+OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
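+
+// v_zip interleaves two registers lane by lane, while v_combine_low/high and
+// v_recombine splice 64-bit halves; e.g. for 32-bit lanes,
+// v_combine_low(a, b) = (a0 a1 b0 b1).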
+
+template<int s, typename _Tpvec>
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
+{
+    return v_rotate_right<s>(a, b);
+}
+
+inline v_int32x4 v_round(const v_float32x4& a)
+{ return v_int32x4(_mm_cvtps_epi32(a.val)); }
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
+    __m128i a1 = _mm_cvtps_epi32(a.val);
+    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
+    return v_int32x4(_mm_add_epi32(a1, mask));
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{
+    __m128i a1 = _mm_cvtps_epi32(a.val);
+    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
+    return v_int32x4(_mm_sub_epi32(a1, mask));
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{ return v_int32x4(_mm_cvttps_epi32(a.val)); }
+
+inline v_int32x4 v_round(const v_float64x2& a)
+{ return v_int32x4(_mm_cvtpd_epi32(a.val)); }
+
+inline v_int32x4 v_floor(const v_float64x2& a)
+{
+    __m128i a1 = _mm_cvtpd_epi32(a.val);
+    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
+    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
+    return v_int32x4(_mm_add_epi32(a1, mask));
+}
+
+inline v_int32x4 v_ceil(const v_float64x2& a)
+{
+    __m128i a1 = _mm_cvtpd_epi32(a.val);
+    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
+    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
+    return v_int32x4(_mm_sub_epi32(a1, mask));
+}
+
+inline v_int32x4 v_trunc(const v_float64x2& a)
+{ return v_int32x4(_mm_cvttpd_epi32(a.val)); }
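+
+// v_floor corrects the round-to-nearest conversion after the fact: the compare
+// yields an all-ones (-1) lane wherever the conversion rounded above the input,
+// and adding that -1 steps the result back down; v_ceil mirrors this by
+// subtracting -1 (adding 1) where the conversion rounded below.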
+
+#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
+                           const _Tpvec& a2, const _Tpvec& a3, \
+                           _Tpvec& b0, _Tpvec& b1, \
+                           _Tpvec& b2, _Tpvec& b3) \
+{ \
+    __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
+    __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
+    __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
+    __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
+\
+    b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
+    b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
+    b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
+    b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
+}
+
+OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
+OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
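+
+// Standard two-stage 4x4 transpose: the element unpacks form 2x2 blocks
+// (a00 a10 a01 a11, ...), then the 64-bit unpacks gather full columns, so with
+// matrix rows in a0..a3 the outputs b0..b3 hold its columns.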
+
+// load deinterleave
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
+{
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+
+    __m128i t10 = _mm_unpacklo_epi8(t00, t01);
+    __m128i t11 = _mm_unpackhi_epi8(t00, t01);
+
+    __m128i t20 = _mm_unpacklo_epi8(t10, t11);
+    __m128i t21 = _mm_unpackhi_epi8(t10, t11);
+
+    __m128i t30 = _mm_unpacklo_epi8(t20, t21);
+    __m128i t31 = _mm_unpackhi_epi8(t20, t21);
+
+    a.val = _mm_unpacklo_epi8(t30, t31);
+    b.val = _mm_unpackhi_epi8(t30, t31);
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
+{
+#if CV_SSE4_1
+    const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
+    const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+    __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+    __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
+    __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
+    __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
+    __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
+    const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
+    const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
+    const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
+    a0 = _mm_shuffle_epi8(a0, sh_b);
+    b0 = _mm_shuffle_epi8(b0, sh_g);
+    c0 = _mm_shuffle_epi8(c0, sh_r);
+    a.val = a0;
+    b.val = b0;
+    c.val = c0;
+#elif CV_SSSE3
+    const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
+    const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
+    const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
+
+    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
+
+    __m128i s0 = _mm_shuffle_epi8(t0, m0);
+    __m128i s1 = _mm_shuffle_epi8(t1, m1);
+    __m128i s2 = _mm_shuffle_epi8(t2, m2);
+
+    t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5);
+    a.val = _mm_alignr_epi8(s2, t0, 5);
+
+    t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
+    b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);
+
+    t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
+    c.val = _mm_alignr_epi8(t2, s0, 11);
+#else
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));
+
+    __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
+    __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
+    __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));
+
+    __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
+    __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
+    __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));
+
+    __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
+    __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
+    __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));
+
+    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
+    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
+    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
+#endif
+}
+
+inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
+{
+    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
+    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
+    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
+    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...
+
+    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
+    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
+    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
+    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
+
+    u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
+    u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
+    u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
+    u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
+
+    v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
+    v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
+    v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
+    v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
+
+    a.val = _mm_unpacklo_epi8(v0, v1);
+    b.val = _mm_unpackhi_epi8(v0, v1);
+    c.val = _mm_unpacklo_epi8(v2, v3);
+    d.val = _mm_unpackhi_epi8(v2, v3);
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
+{
+    __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));     // a0 b0 a1 b1 a2 b2 a3 b3
+    __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
+
+    __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
+    __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
+    __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
+    __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
+
+    a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
+    b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
+{
+#if CV_SSE4_1
+    __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
+    __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));
+    __m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16));
+    __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24);
+    __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
+    __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
+
+    const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+    const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
+    const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+    a0 = _mm_shuffle_epi8(a0, sh_a);
+    b0 = _mm_shuffle_epi8(b0, sh_b);
+    c0 = _mm_shuffle_epi8(c0, sh_c);
+
+    a.val = a0;
+    b.val = b0;
+    c.val = c0;
+#else
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
+    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
+
+    __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
+    __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
+    __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));
+
+    __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
+    __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
+    __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));
+
+    a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
+    b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
+    c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
+#endif
+}
+
+inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
+{
+    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
+    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
+    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
+    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...
+
+    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
+    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
+    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
+    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...
+
+    u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
+    u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
+    u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
+    u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...
+
+    a.val = _mm_unpacklo_epi16(u0, u1);
+    b.val = _mm_unpackhi_epi16(u0, u1);
+    c.val = _mm_unpacklo_epi16(u2, u3);
+    d.val = _mm_unpackhi_epi16(u2, u3);
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
+{
+    __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));     // a0 b0 a1 b1
+    __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4)); // a2 b2 a3 b3
+
+    __m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2
+    __m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3
+
+    a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3
+    b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 b2 b3
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
+{
+    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
+    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));
+
+    __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
+    __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
+    __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));
+
+    a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
+    b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
+    c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
+}
+
+inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
+{
+    v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
+    v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4)));  // a1 b1 c1 d1
+    v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8)));  // a2 b2 c2 d2
+    v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
+
+    v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
+}
+
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
+{
+    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
+
+    __m128 u0 = _mm_loadu_ps(ptr);       // a0 b0 a1 b1
+    __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
+
+    a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
+    b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
+}
+
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
+{
+    __m128 t0 = _mm_loadu_ps(ptr + 0);
+    __m128 t1 = _mm_loadu_ps(ptr + 4);
+    __m128 t2 = _mm_loadu_ps(ptr + 8);
+
+    __m128 at12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 1, 0, 2));
+    a.val = _mm_shuffle_ps(t0, at12, _MM_SHUFFLE(2, 0, 3, 0));
+
+    __m128 bt01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 0, 0, 1));
+    __m128 bt12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 2, 0, 3));
+    b.val = _mm_shuffle_ps(bt01, bt12, _MM_SHUFFLE(2, 0, 2, 0));
+
+    __m128 ct01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 1, 0, 2));
+    c.val = _mm_shuffle_ps(ct01, t2, _MM_SHUFFLE(3, 0, 2, 0));
+}
+
+inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
+{
+    __m128 t0 = _mm_loadu_ps(ptr +  0);
+    __m128 t1 = _mm_loadu_ps(ptr +  4);
+    __m128 t2 = _mm_loadu_ps(ptr +  8);
+    __m128 t3 = _mm_loadu_ps(ptr + 12);
+    __m128 t02lo = _mm_unpacklo_ps(t0, t2);
+    __m128 t13lo = _mm_unpacklo_ps(t1, t3);
+    __m128 t02hi = _mm_unpackhi_ps(t0, t2);
+    __m128 t13hi = _mm_unpackhi_ps(t1, t3);
+    a.val = _mm_unpacklo_ps(t02lo, t13lo);
+    b.val = _mm_unpackhi_ps(t02lo, t13lo);
+    c.val = _mm_unpacklo_ps(t02hi, t13hi);
+    d.val = _mm_unpackhi_ps(t02hi, t13hi);
+}
+
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
+{
+    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
+    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
+
+    a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
+    b = v_uint64x2(_mm_unpackhi_epi64(t0, t1));
+}
+
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
+{
+    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0, b0
+    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1
+    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1
+
+    t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0
+
+    a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
+    b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
+    c = v_uint64x2(_mm_unpackhi_epi64(t1, t2));
+}
+
+inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
+                                v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
+{
+    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0
+    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0
+    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1
+    __m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1
+
+    a = v_uint64x2(_mm_unpacklo_epi64(t0, t2));
+    b = v_uint64x2(_mm_unpackhi_epi64(t0, t2));
+    c = v_uint64x2(_mm_unpacklo_epi64(t1, t3));
+    d = v_uint64x2(_mm_unpackhi_epi64(t1, t3));
+}
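+
+// All deinterleave overloads above follow the same butterfly pattern: repeated
+// unpacklo/unpackhi passes (or shuffles where SSSE3/SSE4.1 allow) bring
+// stride-n elements together. Illustrative use for a packed 3-channel uchar
+// buffer:
+//   v_uint8x16 c0, c1, c2;
+//   v_load_deinterleave(pix_ptr, c0, c1, c2);  // pix_ptr is a hypothetical uchar*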
+
+// store interleave
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
+    __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);
+
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 16), v1);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 16), v1);
+    }
+    else
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+    }
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                const v_uint8x16& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+#if CV_SSE4_1
+    const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
+    const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
+    const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
+    __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
+    __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
+    __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
+
+    const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
+    const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
+    __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
+    __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
+    __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
+#elif CV_SSSE3
+    const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
+    const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
+    const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
+
+    __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
+    t0 = _mm_alignr_epi8(c.val, t0, 5);
+    __m128i v0 = _mm_shuffle_epi8(t0, m0);
+
+    __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
+    t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
+    __m128i v1 = _mm_shuffle_epi8(t1, m1);
+
+    __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
+    t2 = _mm_alignr_epi8(t2, a.val, 11);
+    __m128i v2 = _mm_shuffle_epi8(t2, m2);
+#else
+    __m128i z = _mm_setzero_si128();
+    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
+    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
+    __m128i c0 = _mm_unpacklo_epi8(c.val, z);
+    __m128i c1 = _mm_unpackhi_epi8(c.val, z);
+
+    __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
+    __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
+    __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
+    __m128i p03 = _mm_unpackhi_epi16(ab1, c1);
+
+    __m128i p10 = _mm_unpacklo_epi32(p00, p01);
+    __m128i p11 = _mm_unpackhi_epi32(p00, p01);
+    __m128i p12 = _mm_unpacklo_epi32(p02, p03);
+    __m128i p13 = _mm_unpackhi_epi32(p02, p03);
+
+    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
+    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
+    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
+    __m128i p23 = _mm_unpackhi_epi64(p12, p13);
+
+    p20 = _mm_slli_si128(p20, 1);
+    p22 = _mm_slli_si128(p22, 1);
+
+    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
+    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
+    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
+    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);
+
+    __m128i p40 = _mm_unpacklo_epi64(p30, p31);
+    __m128i p41 = _mm_unpackhi_epi64(p30, p31);
+    __m128i p42 = _mm_unpacklo_epi64(p32, p33);
+    __m128i p43 = _mm_unpackhi_epi64(p32, p33);
+
+    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
+    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
+    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
+#endif
+
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 16), v1);
+        _mm_stream_si128((__m128i*)(ptr + 32), v2);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 16), v1);
+        _mm_store_si128((__m128i*)(ptr + 32), v2);
+    }
+    else
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 32), v2);
+    }
+}
+
+inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
+                                const v_uint8x16& c, const v_uint8x16& d,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    // a0 a1 a2 a3 ....
+    // b0 b1 b2 b3 ....
+    // c0 c1 c2 c3 ....
+    // d0 d1 d2 d3 ....
+    __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
+    __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
+    __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
+    __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
+
+    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
+    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
+    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
+    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...
+
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 16), v1);
+        _mm_stream_si128((__m128i*)(ptr + 32), v2);
+        _mm_stream_si128((__m128i*)(ptr + 48), v3);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 16), v1);
+        _mm_store_si128((__m128i*)(ptr + 32), v2);
+        _mm_store_si128((__m128i*)(ptr + 48), v3);
+    }
+    else
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 16), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 32), v2);
+        _mm_storeu_si128((__m128i*)(ptr + 48), v3);
+    }
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = _mm_unpacklo_epi16(a.val, b.val);
+    __m128i v1 = _mm_unpackhi_epi16(a.val, b.val);
+
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 8), v1);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 8), v1);
+    }
+    else
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 8), v1);
+    }
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
+                                const v_uint16x8& b, const v_uint16x8& c,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+#if CV_SSE4_1
+    const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
+    const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
+    const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
+    __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
+    __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
+    __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
+
+    __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
+    __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
+    __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
+#else
+    __m128i z = _mm_setzero_si128();
+    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
+    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
+    __m128i c0 = _mm_unpacklo_epi16(c.val, z);
+    __m128i c1 = _mm_unpackhi_epi16(c.val, z);
+
+    __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
+    __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
+    __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
+    __m128i p13 = _mm_unpackhi_epi32(ab1, c1);
+
+    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
+    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
+    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
+    __m128i p23 = _mm_unpackhi_epi64(p12, p13);
+
+    p20 = _mm_slli_si128(p20, 2);
+    p22 = _mm_slli_si128(p22, 2);
+
+    __m128i p30 = _mm_unpacklo_epi64(p20, p21);
+    __m128i p31 = _mm_unpackhi_epi64(p20, p21);
+    __m128i p32 = _mm_unpacklo_epi64(p22, p23);
+    __m128i p33 = _mm_unpackhi_epi64(p22, p23);
+
+    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
+    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
+    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
+#endif
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 8), v1);
+        _mm_stream_si128((__m128i*)(ptr + 16), v2);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 8), v1);
+        _mm_store_si128((__m128i*)(ptr + 16), v2);
+    }
+    else
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 8), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 16), v2);
+    }
+}
+
+inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
+                                const v_uint16x8& c, const v_uint16x8& d,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    // a0 a1 a2 a3 ....
+    // b0 b1 b2 b3 ....
+    // c0 c1 c2 c3 ....
+    // d0 d1 d2 d3 ....
+    __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
+    __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
+    __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
+    __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
+
+    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
+    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
+    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
+    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...
+
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 8), v1);
+        _mm_stream_si128((__m128i*)(ptr + 16), v2);
+        _mm_stream_si128((__m128i*)(ptr + 24), v3);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 8), v1);
+        _mm_store_si128((__m128i*)(ptr + 16), v2);
+        _mm_store_si128((__m128i*)(ptr + 24), v3);
+    }
+    else
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 8), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 16), v2);
+        _mm_storeu_si128((__m128i*)(ptr + 24), v3);
+    }
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                                hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
+    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val);
+
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 4), v1);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 4), v1);
+    }
+    else
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 4), v1);
+    }
+}
+
+inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                                const v_uint32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
+    v_transpose4x4(a, b, c, z, u0, u1, u2, u3);
+
+    __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
+    __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
+    __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));
+
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 4), v1);
+        _mm_stream_si128((__m128i*)(ptr + 8), v2);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 4), v1);
+        _mm_store_si128((__m128i*)(ptr + 8), v2);
+    }
+    else
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 4), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 8), v2);
+    }
+}
+
+inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
+                               const v_uint32x4& c, const v_uint32x4& d,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    v_uint32x4 v0, v1, v2, v3;
+    v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
+
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0.val);
+        _mm_stream_si128((__m128i*)(ptr + 4), v1.val);
+        _mm_stream_si128((__m128i*)(ptr + 8), v2.val);
+        _mm_stream_si128((__m128i*)(ptr + 12), v3.val);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128((__m128i*)(ptr), v0.val);
+        _mm_store_si128((__m128i*)(ptr + 4), v1.val);
+        _mm_store_si128((__m128i*)(ptr + 8), v2.val);
+        _mm_store_si128((__m128i*)(ptr + 12), v3.val);
+    }
+    else
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0.val);
+        _mm_storeu_si128((__m128i*)(ptr + 4), v1.val);
+        _mm_storeu_si128((__m128i*)(ptr + 8), v2.val);
+        _mm_storeu_si128((__m128i*)(ptr + 12), v3.val);
+    }
+}
+
+// 2-channel, float only
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128 v0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
+    __m128 v1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
+
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_ps(ptr, v0);
+        _mm_stream_ps(ptr + 4, v1);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_ps(ptr, v0);
+        _mm_store_ps(ptr + 4, v1);
+    }
+    else
+    {
+        _mm_storeu_ps(ptr, v0);
+        _mm_storeu_ps(ptr + 4, v1);
+    }
+}
+
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               const v_float32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
+    __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
+    __m128 v0 = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0));
+    __m128 u2 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1));
+    __m128 u3 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2));
+    __m128 v1 = _mm_shuffle_ps(u2, u3, _MM_SHUFFLE(2, 0, 2, 0));
+    __m128 u4 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2));
+    __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
+    __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));
+
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_ps(ptr, v0);
+        _mm_stream_ps(ptr + 4, v1);
+        _mm_stream_ps(ptr + 8, v2);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_ps(ptr, v0);
+        _mm_store_ps(ptr + 4, v1);
+        _mm_store_ps(ptr + 8, v2);
+    }
+    else
+    {
+        _mm_storeu_ps(ptr, v0);
+        _mm_storeu_ps(ptr + 4, v1);
+        _mm_storeu_ps(ptr + 8, v2);
+    }
+}
+
+inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
+                               const v_float32x4& c, const v_float32x4& d,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128 u0 = _mm_unpacklo_ps(a.val, c.val);
+    __m128 u1 = _mm_unpacklo_ps(b.val, d.val);
+    __m128 u2 = _mm_unpackhi_ps(a.val, c.val);
+    __m128 u3 = _mm_unpackhi_ps(b.val, d.val);
+    __m128 v0 = _mm_unpacklo_ps(u0, u1);
+    __m128 v2 = _mm_unpacklo_ps(u2, u3);
+    __m128 v1 = _mm_unpackhi_ps(u0, u1);
+    __m128 v3 = _mm_unpackhi_ps(u2, u3);
+
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_ps(ptr, v0);
+        _mm_stream_ps(ptr + 4, v1);
+        _mm_stream_ps(ptr + 8, v2);
+        _mm_stream_ps(ptr + 12, v3);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_ps(ptr, v0);
+        _mm_store_ps(ptr + 4, v1);
+        _mm_store_ps(ptr + 8, v2);
+        _mm_store_ps(ptr + 12, v3);
+    }
+    else
+    {
+        _mm_storeu_ps(ptr, v0);
+        _mm_storeu_ps(ptr + 4, v1);
+        _mm_storeu_ps(ptr + 8, v2);
+        _mm_storeu_ps(ptr + 12, v3);
+    }
+}
+
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
+    __m128i v1 = _mm_unpackhi_epi64(a.val, b.val);
+
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 2), v1);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 2), v1);
+    }
+    else
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 2), v1);
+    }
+}
+
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               const v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
+    __m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
+    __m128i v2 = _mm_unpackhi_epi64(b.val, c.val);
+
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 2), v1);
+        _mm_stream_si128((__m128i*)(ptr + 4), v2);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 2), v1);
+        _mm_store_si128((__m128i*)(ptr + 4), v2);
+    }
+    else
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 2), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 4), v2);
+    }
+}
+
+inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
+                               const v_uint64x2& c, const v_uint64x2& d,
+                               hal::StoreMode mode = hal::STORE_UNALIGNED)
+{
+    __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
+    __m128i v1 = _mm_unpacklo_epi64(c.val, d.val);
+    __m128i v2 = _mm_unpackhi_epi64(a.val, b.val);
+    __m128i v3 = _mm_unpackhi_epi64(c.val, d.val);
+
+    if ( mode == hal::STORE_ALIGNED_NOCACHE )
+    {
+        _mm_stream_si128((__m128i*)(ptr), v0);
+        _mm_stream_si128((__m128i*)(ptr + 2), v1);
+        _mm_stream_si128((__m128i*)(ptr + 4), v2);
+        _mm_stream_si128((__m128i*)(ptr + 6), v3);
+    }
+    else if ( mode == hal::STORE_ALIGNED )
+    {
+        _mm_store_si128((__m128i*)(ptr), v0);
+        _mm_store_si128((__m128i*)(ptr + 2), v1);
+        _mm_store_si128((__m128i*)(ptr + 4), v2);
+        _mm_store_si128((__m128i*)(ptr + 6), v3);
+    }
+    else
+    {
+        _mm_storeu_si128((__m128i*)(ptr), v0);
+        _mm_storeu_si128((__m128i*)(ptr + 2), v1);
+        _mm_storeu_si128((__m128i*)(ptr + 4), v2);
+        _mm_storeu_si128((__m128i*)(ptr + 6), v3);
+    }
+}
+
+#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
+{ \
+    _Tpvec1 a1, b1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
+{ \
+    _Tpvec1 a1, b1, c1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+} \
+inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
+{ \
+    _Tpvec1 a1, b1, c1, d1; \
+    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
+    a0 = v_reinterpret_as_##suffix0(a1); \
+    b0 = v_reinterpret_as_##suffix0(b1); \
+    c0 = v_reinterpret_as_##suffix0(c1); \
+    d0 = v_reinterpret_as_##suffix0(d1); \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, mode);      \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode);  \
+} \
+inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
+                                const _Tpvec0& c0, const _Tpvec0& d0, \
+                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
+{ \
+    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
+    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
+    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
+    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
+    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
+}
+
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
+OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
+
+inline v_float32x4 v_cvt_f32(const v_int32x4& a)
+{
+    return v_float32x4(_mm_cvtepi32_ps(a.val));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a)
+{
+    return v_float32x4(_mm_cvtpd_ps(a.val));
+}
+
+inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
+{
+    return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val)));
+}
+
+inline v_float64x2 v_cvt_f64(const v_int32x4& a)
+{
+    return v_float64x2(_mm_cvtepi32_pd(a.val));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
+{
+    return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val,8)));
+}
+
+inline v_float64x2 v_cvt_f64(const v_float32x4& a)
+{
+    return v_float64x2(_mm_cvtps_pd(a.val));
+}
+
+inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
+{
+    return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
+}
+
+#if CV_FP16
+inline v_float32x4 v128_load_fp16_f32(const short* ptr)
+{
+    return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
+}
+
+inline void v_store_fp16(short* ptr, const v_float32x4& a)
+{
+    __m128i fp16_value = _mm_cvtps_ph(a.val, 0);
+    _mm_storel_epi64((__m128i*)ptr, fp16_value);
+}
+#endif
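+
+// A minimal round-trip sketch (hypothetical usage; only valid when CV_FP16 is set):
+//   float src[4] = {1.f, 2.f, 3.f, 4.f};
+//   short dst[4];
+//   v_store_fp16(dst, v_load(src));                // 4 x float32 -> 4 x float16
+//   v_float32x4 back = v128_load_fp16_f32(dst);    // 4 x float16 -> 4 x float32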
+
+////////////// Lookup table access ////////////////////
+
+inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
+{
+    int CV_DECL_ALIGNED(32) idx[4];
+    v_store_aligned(idx, idxvec);
+    return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
+}
+
+inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
+{
+    int CV_DECL_ALIGNED(32) idx[4];
+    v_store_aligned(idx, idxvec);
+    return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    int idx[2];
+    v_store_low(idx, idxvec);
+    return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
+}
+
+// loads pairs from the table and deinterleaves them, e.g. returns:
+//   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
+//   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
+// note that the indices are indices of individual floats, not of float pairs.
+// in theory, this function can be used to implement bilinear interpolation,
+// when idxvec are the offsets within the image.
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    int CV_DECL_ALIGNED(32) idx[4];
+    v_store_aligned(idx, idxvec);
+    __m128 z = _mm_setzero_ps();
+    __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0]));
+    __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2]));
+    xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1]));
+    xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3]));
+    __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23);
+    __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23);
+    x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13));
+    y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13));
+}
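+
+// Illustrative use (a hedged sketch of one bilinear-interpolation step, assuming
+// 'row' is a const float* image row, 'idxvec' holds the left offsets of four
+// sample pairs and 'wx' holds the four x-weights):
+//   v_float32x4 x0, x1;
+//   v_lut_deinterleave(row, idxvec, x0, x1);      // x0 = row[idx], x1 = row[idx + 1]
+//   v_float32x4 val = v_muladd(x1 - x0, wx, x0);  // x0 + (x1 - x0) * wx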
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    int idx[2];
+    v_store_low(idx, idxvec);
+    __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
+    __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
+    x = v_float64x2(_mm_unpacklo_pd(xy0, xy1));
+    y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
+}
+
+
+////////////// FP16 support ///////////////////////////
+// Currently disabled (DM)
+#if 0
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+    const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000);
+    const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
+    const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000));
+    __m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr)); // h << 16
+    __m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask);
+    __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta); // ((h & 0x7fff) << 13) + delta
+    __m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf));
+
+    t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e)));
+    __m128i zmask = _mm_cmpeq_epi32(e, z);
+    __m128i ft = v_select_si128(zmask, zt, t);
+    return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign)));
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{
+    const __m128i signmask = _mm_set1_epi32(0x80000000);
+    const __m128i rval = _mm_set1_epi32(0x3f000000);
+
+    __m128i t = _mm_castps_si128(v.val);
+    __m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16);
+    t = _mm_andnot_si128(signmask, t);
+
+    __m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t);
+    __m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000));
+    __m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00));
+    __m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t);
+    __m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval)));
+    tt = _mm_sub_epi32(tt, rval);
+    __m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1));
+    __m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff));
+    nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13);
+    t = v_select_si128(tinymask, tt, nt);
+    t = v_select_si128(finitemask, t, naninf);
+    t = _mm_or_si128(t, sign);
+    t = _mm_packs_epi32(t, t);
+    _mm_storel_epi64((__m128i*)ptr, t);
+}
+#endif
+
+inline void v_cleanup() {}
+
+//! @name Check SIMD support
+//! @{
+//! @brief Check CPU capability of SIMD operation
+static inline bool hasSIMD128()
+{
+    return (CV_CPU_HAS_SUPPORT_SSE2) ? true : false;
+}
+
+//! @}
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}  // namespace cv
+
+#endif  // OPENCV_HAL_SSE_HPP
diff --git a/model-optimizer/ModelOptimizer b/model-optimizer/ModelOptimizer
deleted file mode 100644 (file)
index 436414f..0000000
+++ /dev/null
@@ -1,126 +0,0 @@
-#!/usr/bin/env python3
-
-"""
- Copyright (c) 2018 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-
-import argparse
-import logging as log
-import os
-import sys
-
-from mo.main import main
-from mo.utils.cli_parser import get_caffe_legacy_cli_parser
-from mo.utils.version import get_version
-from mo.utils.versions_checker import check_python_version
-
-
-def fill_missed(argv: argparse.Namespace):
-    missed_in_old = {
-        'log_level': 'INFO',  # could not be specified with legacy Model Optimizer
-        'output': '',
-        'input': '',
-        'input_shape': '',
-        'model_name': '',
-        'reverse_input_channels': False,
-        'silent': False
-    }
-
-    for (new_opt, val) in missed_in_old.items():
-        setattr(argv, new_opt, val)
-
-
-def fill_supported(argv: argparse.Namespace):
-    setattr(argv, 'scale_values', argv.scale)
-    setattr(argv, 'scale', argv.f)
-
-    mapping_dic = {
-        'version': 'version',
-        'framework': 'framework',
-        'input_proto': 'd',
-        'input_model': 'w',
-        'output_dir': 'o',
-        'batch': 'b',
-        'mean_values': 'ms',
-        'mean_file': 'mf',
-        'mean_file_offsets': 'mo',
-        'data_type': 'p',
-        'disable_fusing': 'fuse',  # special case - should be handled differently
-        'custom_mapping_file': 'k'
-    }
-
-    for (new_opt, old_opt) in mapping_dic.items():
-        setattr(argv, new_opt, getattr(argv, old_opt, 'NONE'))
-
-    # Old MO rule for fuse: ("0"|"false" to disable, "1"|"true" to enable)
-    argv.disable_fusing = argv.fuse == '0' or argv.fuse == 'false'
-
-    # mean file has higher priority
-    argv.mean_values = argv.mean_values or ()
-    if argv.mean_file and argv.mean_values:
-        argv.mean_values = ()
-
-    if argv.mean_values:
-        argv.mean_values = '({})'.format(argv.mean_values)
-
-    if argv.mean_file_offsets:
-        argv.mean_file_offsets = '({})'.format(argv.mean_file_offsets)
-
-    if argv.k:
-        argv.k = os.path.join(argv.k, 'CustomLayersMapping.xml')
-
-
-def process_legacy_params(argv: argparse.Namespace):
-    fill_supported(argv)
-    fill_missed(argv)
-
-    argv.framework = 'caffe'
-    argv.log_level = 'INFO'
-
-    if argv.version:
-        print('Version of Model Optimizer is: {}'.format(get_version()))
-        sys.exit(1)
-    elif not argv.w:
-        log.error('Path to binary weights file (.caffemodel) is required')
-        sys.exit(1)
-    elif not argv.d:
-        log.error('Path to model proto file (.prototxt) is required')
-        sys.exit(1)
-
-    sys.exit(main(argv))
-
-
-if __name__ == "__main__":
-    ret_code = check_python_version()
-    if ret_code:
-        sys.exit(ret_code)
-
-    argv = get_caffe_legacy_cli_parser().parse_args()
-
-    forever_legacy_group = [
-        '-ListA', '-ListF', '-ListN', '--network',  # general
-        '-l', '-nl',  # Learning phase
-        '-v', '-nv',  # Feedback phase
-        '-dm', '-dr', '-ds',  # Dumping files
-        '-q',  # Quantization
-        '-c', '--code-cfg',  # OVX specifics
-        '--hfuse',  # Fusing
-        '-mx'  # mixed precision
-    ]
-
-    print('[ WARNING ] The following options are ignored as legacy ones and not supported: {}'.
-          format(', '.join(['"{}"'.format(arg) for arg in forever_legacy_group])))
-
-    process_legacy_params(argv)
diff --git a/model-optimizer/extensions/back/ConvolutionReshaper.py b/model-optimizer/extensions/back/ConvolutionReshaper.py
new file mode 100644 (file)
index 0000000..155d1eb
--- /dev/null
@@ -0,0 +1,78 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+import numpy as np
+
+from mo.back.replacement import BackReplacementPattern
+from mo.ops.reshape import Reshape
+
+
+class ConvolutionReshaper(BackReplacementPattern):
+    """
+        Works around the absence of 1D Convolution support in Inference Engine by converting it to a 2D Convolution:
+            - updates shape-dependent Convolution parameters with a fake H dimension: dilation, kernel, pad, stride
+            - reshapes weights from [OIX] -> [OIYX] = [OI1X]
+            - inserts the fake H dimension by adding Reshapes before and after the Convolution: [NCW] -> [NCHW] = [NC1W]
+    """
+    enabled = True
+
+    @staticmethod
+    def pattern():
+        return dict(
+            nodes=[
+                ('conv', dict(type='Convolution'))
+            ],
+            edges=[]
+        )
+
+    def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
+        conv = match['conv']
+
+        assert len(conv.out_nodes()) == 1, "Convolution operation {} should have 1 output data node".format(conv.id)
+        out_data = conv.out_node()
+
+        assert out_data.has_valid('shape'), 'Output shape is undefined for {} in back phase'.format(conv.id)
+        out_shape = out_data.shape
+
+        if out_shape.size != 3:
+            return
+
+        assert len(conv.in_nodes()) >= 1, "Convolution operation {} should have at least 1 input data node".format(conv.id)
+        inp_data = conv.in_node()
+
+        assert inp_data.has_valid('shape'), 'Input shape is undefined for {} in back phase'.format(conv.id)
+        inp_shape = inp_data.shape
+        new_inp_shape = np.insert(inp_shape, 2, 1)
+
+        # setting to None to be overwritten by infer function
+        conv.kernel_spatial_idx = None
+        conv.spatial_dims = None
+
+        # inserting fake H dimension
+        conv.dilation = np.insert(conv.dilation, 2, 1)
+        conv.kernel_spatial = np.append([1], conv.kernel_spatial)
+        conv.pad = np.insert(conv.pad, 2, [0, 0], axis=0)
+        conv.stride = np.insert(conv.stride, 2, 1)
+
+        weights_index = len(conv.in_nodes()) - 2
+        weights_node = conv.in_node(weights_index)
+        weights_node.value = np.reshape(weights_node.value, np.insert(weights_node.value.shape, 2, 1))
+        weights_node.shape = np.array(weights_node.value.shape, dtype=np.int64)
+
+        conv.bracket_op_with_another_op(inp=inp_data, out=out_data, new_op_class=Reshape,
+                                        op_before_params={'dim': new_inp_shape},
+                                        op_after_params={'dim': out_shape})
diff --git a/model-optimizer/extensions/back/PermuteForReshape.py b/model-optimizer/extensions/back/PermuteForReshape.py
new file mode 100644 (file)
index 0000000..f0f14c4
--- /dev/null
@@ -0,0 +1,80 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+import numpy as np
+
+from extensions.back.ConvolutionReshaper import ConvolutionReshaper
+from extensions.back.TileReshaper import TileReshaper
+from mo.back.replacement import BackReplacementPattern
+from mo.front.common.layout import get_width_dim, get_height_dim, get_features_dim, indices_mapping
+from mo.ops.op import PermuteAttrs
+from mo.ops.permute import Permute
+
+
+class PermuteForReshape(BackReplacementPattern):
+    """
+       Fixes a problem with Reshapes that change the shape of a tensor from >= 4D
+       (where permutation works) to 3D (where permutation does not work, since the new layout is unknown).
+       Otherwise this leads to wrong shapes after permutation (since one part of the shapes is permuted while the other is not).
+    """
+    enabled = True
+
+    def run_before(self):
+        return [ConvolutionReshaper,
+                TileReshaper,
+                ]
+
+    @staticmethod
+    def pattern():
+        return dict(
+            nodes=[('reshape', {'kind': 'op', 'type': 'Reshape'})],
+            edges=[],
+        )
+
+    @staticmethod
+    def replace_pattern(graph: nx.MultiDiGraph, match: dict):
+        reshape = match['reshape']
+        assert len(reshape.in_nodes()) > 0
+        if graph.graph['layout'] == 'NCHW' or reshape.has_and_set('nchw_layout') or\
+                reshape.soft_get('correct_data_layout') is True:
+            return
+
+        input_node = reshape.in_node()
+        output_node = reshape.out_node()
+        input_shape = input_node.shape
+        output_shape = output_node.shape
+
+        if len(input_shape) >= 4 and len(output_shape) == 3:
+            # Check that we will permute some shapes in this Reshape by our permutation pass
+            layout = 'NCHW'
+            c_idx = get_features_dim(layout, len(input_shape))
+            hw_idx = [get_width_dim(layout, len(input_shape)), get_height_dim(layout, len(input_shape))]
+            if input_shape[c_idx] != 1 and np.any(input_shape[hw_idx] != [1, 1]):
+                # then the nhwc -> nchw permutation can change the shapes significantly;
+                # we need to wrap the node with NCHW -> NHWC Permutes and not touch it later
+                permutation = PermuteAttrs.get_nchw_to_nhwc_permutation(len(input_shape))
+                permutation_back = PermuteAttrs.get_nchw_to_nhwc_permutation(len(input_shape))
+
+                # 1. Insert input Permute
+                #    This Permute will permute input from original input layout to operation layout
+                edge_attrs = graph.get_edge_data(input_node.id, reshape.id)[0]
+                graph.remove_edge(input_node.id, reshape.id)
+
+                permute_op = Permute(graph, {'order': permutation.perm, 'name': reshape.name + '/Permute_'})
+                permute_data_node = permute_op.create_node_with_data([input_node])
+
+                graph.add_edge(permute_data_node.id, reshape.id, **edge_attrs)
\ No newline at end of file
diff --git a/model-optimizer/extensions/back/TileReshaper.py b/model-optimizer/extensions/back/TileReshaper.py
new file mode 100644 (file)
index 0000000..7c6e2d6
--- /dev/null
@@ -0,0 +1,74 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import networkx as nx
+import numpy as np
+
+from extensions.back.EltwiseBroadcast import EltwiseBroadcast
+from mo.back.replacement import BackReplacementPattern
+from mo.ops.reshape import Reshape
+
+
+class TileReshaper(BackReplacementPattern):
+    enabled = True
+
+    def run_after(self):
+        return [EltwiseBroadcast]
+
+    @staticmethod
+    def pattern():
+        return dict(
+            nodes=[
+                ('tile', dict(type='Tile'))
+            ],
+            edges=[]
+        )
+
+    @staticmethod
+    def replace_pattern(graph: nx.MultiDiGraph, match: dict):
+        """
+        Works around a Tile configuration that is not supported by Inference Engine (Tile is supported only for 2D and 4D tensors):
+        searches for Tiles with 3D shapes and wraps them with Reshapes.
+
+        Example: Tile (axis=1, tiles=16):
+            in_shape: [1,1,101]
+            out_shape: [1,16,101]
+
+        Old behaviour:
+            Tile -> [1,16,101]
+        New behaviour:
+            Reshape [1,1,101,1] -> Tile -> [1,16,101,1] -> Reshape [1,16,101]
+        """
+        tile = match['tile']
+
+        assert len(tile.out_nodes()) == 1, "Tile operation {} should have 1 output data node".format(tile.id)
+        out_data = tile.out_node()
+
+        assert out_data.has_valid('shape'), 'Output shape is undefined for {} in back phase'.format(tile.id)
+        out_shape = out_data.shape
+
+        if out_shape.size != 3:
+            return
+
+        assert len(tile.in_nodes()) == 1, "Tile operation {} should have 1 input data node".format(tile.id)
+        inp_data = tile.in_node()
+
+        assert inp_data.has_valid('shape'), 'Input shape is undefined for {} in back phase'.format(tile.id)
+        inp_shape = inp_data.shape
+        new_inp_shape = np.append(inp_shape, [1])
+
+        tile.bracket_op_with_another_op(inp=inp_data, out=out_data, new_op_class=Reshape,
+                                        op_before_params={'dim': new_inp_shape},
+                                        op_after_params={'dim': out_shape})
index 6f690c4..f657bc7 100644 (file)
@@ -25,7 +25,7 @@ class DisableUnsupportedNDOperations(BackReplacementPattern):
     """
         This pass disables ND Convolutions/Deconvolutions/Poolings
     """
-    enabled = True
+    enabled = False
 
     unsupported_operations = ['Convolution', 'Deconvolution', 'Pooling']
 
index f5945f0..72e4cb4 100644 (file)
@@ -1,5 +1,5 @@
 """
- Copyright (c) 2017-2018 Intel Corporation
+ Copyright (c) 2018 Intel Corporation
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
index d26ee5e..488e161 100644 (file)
@@ -1,5 +1,5 @@
 """
- Copyright (c) 2017-2018 Intel Corporation
+ Copyright (c) 2018 Intel Corporation
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 import networkx as nx
 
 from mo.back.replacement import BackReplacementPattern
-from mo.middle.passes.eliminate import remove_op_node
+from mo.middle.passes.eliminate import remove_op_node_with_data_node
 
 
 class RemoveLastSoftMaxPattern(BackReplacementPattern):
@@ -50,4 +50,4 @@ class RemoveLastSoftMaxPattern(BackReplacementPattern):
         child = softmax.out_node()
         if not child.has_and_set('is_output'):
             return
-        remove_op_node(graph, softmax)
+        remove_op_node_with_data_node(graph, softmax)
similarity index 89%
rename from model-optimizer/extensions/front/tf/Pack.py
rename to model-optimizer/extensions/front/Pack.py
index 916e03d..a7defba 100644 (file)
@@ -30,9 +30,9 @@ class Pack(FrontReplacementOp):
     def replace_op(self, graph: nx.MultiDiGraph, node: Node):
         expand_dims_nodes = list()
         expand_axis_node = Const(graph, dict(value=node.axis)).create_node([])
-        for ind, input_node in enumerate(node.in_nodes()):
+        for ind, edge_attrs in node.in_edges().items():
             expand_dims_nodes.append(ExpandDims(graph, dict(name=node.name + '/ExpandDims_')).
-                                     create_node([node.in_node(ind), expand_axis_node]))
+                                     create_node([(node.in_node(ind), edge_attrs['out']), expand_axis_node]))
 
         out_node = Concat(graph, dict(name=node.name + '/Concat_', axis=node.axis)).create_node(expand_dims_nodes)
         # Replace edge from out port 0 of the matched node with an edge from node out_node.id with port 0.
diff --git a/model-optimizer/extensions/front/caffe/axpy.py b/model-optimizer/extensions/front/caffe/axpy.py
new file mode 100644 (file)
index 0000000..e5f5759
--- /dev/null
@@ -0,0 +1,43 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+
+from mo.front.common.replacement import FrontReplacementOp
+from mo.graph.graph import Node
+from mo.ops.lin_op import Add
+from mo.ops.scale_shift import ScaleShiftOp
+
+
+class AxpyToEltwise(FrontReplacementOp):
+    """
+    Replaces Axpy layer with ScaleShift and Eltwise.
+    """
+    op = "Axpy"
+    enabled = True
+
+    def replace_op(self, graph: nx.MultiDiGraph, node: Node):
+        in_node_0 = node.in_node(0)
+        in_node_1 = node.in_node(1)
+        in_node_2 = node.in_node(2)
+
+        ss = ScaleShiftOp(graph, {'name': node.id + "/ScaleShift_", 'axis': 0})
+        scale_shift = ss.create_node(inputs=[in_node_1, in_node_0])
+
+        el = Add(graph, {'name': node.id + "/Add_"})
+        el_node = el.create_node(inputs=[scale_shift, in_node_2])
+
+        return [el_node.id]
diff --git a/model-optimizer/extensions/front/caffe/bn.py b/model-optimizer/extensions/front/caffe/bn.py
new file mode 100644 (file)
index 0000000..06ad486
--- /dev/null
@@ -0,0 +1,60 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+import numpy as np
+
+from mo.front.caffe.extractors.utils import embed_input
+from mo.front.common.replacement import FrontReplacementOp
+from mo.graph.graph import Node
+from mo.ops.scale_shift import ScaleShiftOp
+from mo.utils.error import Error
+
+
+class BNToScaleShift(FrontReplacementOp):
+    """
+    Replaces BN layer with ScaleShift.
+    """
+    op = "BN"
+    enabled = True
+
+    def replace_op(self, graph: nx.MultiDiGraph, node: Node):
+        attrs = {'name': node.id + "/ScaleShift_"}
+
+        param = graph.node[node.id]['pb'].bn_param
+        pb_model = graph.node[node.id]['model_pb']
+        blobs = pb_model.blobs
+
+        if len(blobs) != 4:
+            raise Error("Incorrect number of blobs in BN layer {}".format(node.id))
+
+        mean = np.array(blobs[0].data)
+        var = np.array(blobs[1].data)
+        beta = np.array(blobs[2].data)
+        gamma = np.array(blobs[3].data)
+
+        gamma = gamma + np.repeat(param.eps, gamma.shape)
+
+        scale = 1.0 / np.sqrt(gamma) * mean
+        shift = var - beta * scale
+
+        embed_input(attrs, 1, 'scale', scale, 'weights')
+        embed_input(attrs, 2, 'bias', shift, 'biases')
+
+        ss = ScaleShiftOp(graph, attrs)
+        scale_shift = ss.create_node([node.in_node(0)])
+
+        return [scale_shift.id]
index dd8fb96..296fcf3 100644 (file)
@@ -129,11 +129,18 @@ class DetectionOutputFrontExtractor(FrontExtractorOp):
             'pad_mode': pad_mode,
             'pad_value': ','.join(str(x) for x in param.save_output_param.resize_param.pad_value),
             'interp_mode': interp_mode,
-            'input_width': param.input_width,
-            'input_height': param.input_height,
-            'normalized': int(param.normalized)
         }
 
+        # these params can be omitted in caffe.proto and, as a consequence, in param,
+        # so check whether each one is set before copying it
+        fields = [field[0].name for field in param.ListFields()]
+        if 'input_width' in fields:
+            attrs['input_width'] = param.input_width
+        if 'input_height' in fields:
+            attrs['input_height'] = param.input_height
+        if 'normalized' in fields:
+            attrs['normalized'] = int(param.normalized)
+
         mapping_rule = merge_attrs(param, attrs)
 
         # force setting infer function because it doesn't exist in proto so merge_attrs will not set it
diff --git a/model-optimizer/extensions/front/caffe/flatten_ext.py b/model-optimizer/extensions/front/caffe/flatten_ext.py
new file mode 100644 (file)
index 0000000..a68d81c
--- /dev/null
@@ -0,0 +1,36 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.extractor import FrontExtractorOp
+from mo.ops.flatten import Flatten
+
+
+class FlattenFrontExtractor(FrontExtractorOp):
+    op = 'Flatten'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        proto_layer = node.pb
+        param = proto_layer.flatten_param
+
+        attrs = {
+            'axis': param.axis,
+            'end_axis': param.end_axis,
+        }
+
+        Flatten.update_node_stat(node, attrs)
+        return __class__.enabled
index b1d5ba7..ae8a8da 100644 (file)
@@ -39,6 +39,9 @@ class InterpFrontExtractor(FrontExtractorOp):
 
         mapping_rule = merge_attrs(param, update_attrs)
 
+        # in Caffe there can be 2 inputs; the shape should be taken from the shape of the second input
+        mapping_rule['parse_2nd_input'] = 'shape'
+
         # update the attributes of the node
         Op.get_op_class_by_name(__class__.op).update_node_stat(node, mapping_rule)
         return __class__.enabled
index ca584f4..96540a1 100644 (file)
@@ -31,6 +31,7 @@ class PoolingFrontExtractor(FrontExtractorOp):
         param = proto_layer.pooling_param
 
         method = 'max'
+        exclude_pad = 'true'
         kernel = [0, 0]
         stride = [1, 1]
         padding = [0, 0]
@@ -45,8 +46,10 @@ class PoolingFrontExtractor(FrontExtractorOp):
 
         if param.pool == 0:
             method = 'max'
+            exclude_pad = 'true'
         elif param.pool == 1:
             method = 'avg'
+            exclude_pad = 'false'
         else:
             raise ValueError('Unknown Pooling Method!')
 
@@ -64,7 +67,7 @@ class PoolingFrontExtractor(FrontExtractorOp):
             'pad': np.array([[0, 0], [0, 0], [padding[1], padding[1]], [padding[0], padding[0]]], dtype=np.int64),
             'pad_spatial_shape': np.array([[padding[1], padding[1]], [padding[0], padding[0]]], dtype=np.int64),
             'pool_method': method,
-            'exclude_pad': 'false',
+            'exclude_pad': exclude_pad,
             'global_pool': global_pooling,
             'output_spatial_shape': None,
             'rounding_type': rt
index 644c374..ae87dc4 100644 (file)
@@ -38,8 +38,6 @@ class PriorBoxFrontExtractor(FrontExtractorOp):
             'aspect_ratio': np.array(param.aspect_ratio),
             'min_size': np.array(param.min_size),
             'max_size': np.array(param.max_size),
-            'width': list(param.width),
-            'height': list(param.height),
             'flip': int(param.flip),
             'clip': int(param.clip),
             'variance': list(variance),
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-
-from mo.front.caffe.extractors.utils import weights_biases
+from mo.front.caffe.collect_attributes import collect_attributes
 from mo.front.common.extractors.utils import layout_attrs
 from mo.front.extractor import FrontExtractorOp
 from mo.ops.op import Op
 
 
-class InnerProductFrontExtractor(FrontExtractorOp):
-    op = 'fullyconnected'
+class ShuffleChannelFrontExtractor(FrontExtractorOp):
+    op = 'ShuffleChannel'
     enabled = True
 
     @staticmethod
     def extract(node):
-        mapping_rule = {
-            'out-size': node.pb.num_output
-        }
+        mapping_rule = collect_attributes(node.pb.shuffle_channel_param)
         mapping_rule.update(layout_attrs())
-        mapping_rule.update(weights_biases(node.pb.bias_term, node.pb))
-        Op.get_op_class_by_name('FullyConnected').update_node_stat(node, mapping_rule)
+
+        # update the attributes of the node
+        Op.get_op_class_by_name(__class__.op).update_node_stat(node, mapping_rule)
         return __class__.enabled
diff --git a/model-optimizer/extensions/front/caffe/softmax_ext.py b/model-optimizer/extensions/front/caffe/softmax_ext.py
new file mode 100644 (file)
index 0000000..6bb8d74
--- /dev/null
@@ -0,0 +1,36 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.extractor import FrontExtractorOp
+from mo.ops.softmax import Softmax
+
+
+class SoftmaxFrontExtractor(FrontExtractorOp):
+    op = 'Softmax'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        proto_layer = node.pb
+        param = proto_layer.softmax_param
+
+        attrs = {
+            'axis': param.axis
+        }
+
+        # update the attributes of the node
+        Softmax.update_node_stat(node, attrs)
+        return __class__.enabled
diff --git a/model-optimizer/extensions/front/caffe/split_to_identity.py b/model-optimizer/extensions/front/caffe/split_to_identity.py
new file mode 100644 (file)
index 0000000..d46c1c3
--- /dev/null
@@ -0,0 +1,38 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+
+from mo.front.common.replacement import FrontReplacementOp
+
+
+class SplitToIdentity(FrontReplacementOp):
+    """
+    The Split layer in Caffe copies the input blob to a number of output layers, while the Split layer in Inference
+    Engine divides the input blob into several pieces. The Caffe Split layer is redundant because Inference Engine
+    takes care of creating the intermediate blobs when necessary.
+
+    The replacer changes the 'op' attribute of the node to 'Identity' and sets all 'out' edge attributes to 0, so the
+    Identity operations are removed later in the pipeline.
+    """
+    op = "Split"
+    enabled = True
+
+    def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict):
+        split_node = match['op']
+        split_node.op = 'Identity'
+        for u, v, edge_attrs in split_node.graph.out_edges(split_node.id, data=True):
+            edge_attrs['out'] = 0
diff --git a/model-optimizer/extensions/front/kaldi/add_reshape_around_convolution.py b/model-optimizer/extensions/front/kaldi/add_reshape_around_convolution.py
new file mode 100644 (file)
index 0000000..02c0e0f
--- /dev/null
@@ -0,0 +1,62 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import networkx as nx
+
+from mo.front.common.replacement import FrontReplacementOp
+from mo.graph.graph import Node
+from mo.ops.convolution import Convolution
+from mo.ops.reshape import Reshape
+
+
+class ReplaceConvolutionReshape(FrontReplacementOp):
+    """
+       This pass adds Reshapes around a Convolution layer to reshape from NH to NCHW.
+       For example:
+           Suppose we have the following graph:
+
+           Prev_Layer [N, H] -> Convolution [N, C, H, W] -> Next_Layer [N, H]
+
+           In this case the Convolution takes only [N, H] from the input tensor in the 3rd dim,
+           so this pass converts the graph to the following one:
+
+           Prev_Layer [N, H] -> Reshape [N, 1, H, 1] -> Convolution [N, C=1, H, W=1] -> Reshape [N, 1, H, 1] -> Next_Layer [N, H]
+
+   """
+    op = "Convolution"
+    enabled = True
+
+    def replace_op(self, graph: nx.MultiDiGraph, node: Node):
+        input_node = node.in_node(0)
+        port = graph.get_edge_data(input_node.id, node.id)[0]['out']
+        input_reshape_node = Reshape(graph,
+                                     {
+                                         'name': '/Reshape/' + node.name,
+                                         'axis': 1,
+                                         'infer': Reshape.kaldi_infer
+                                     }).create_node([(input_node, port)])
+
+        convolution_node = Convolution(graph,
+                                       node.attrs()
+                                       ).create_node([input_reshape_node])
+
+        output_reshape_node = Reshape(graph,
+                                      {
+                                          'name': node.name + '/Reshape/',
+                                          'axis': 1,
+                                          'infer': Reshape.kaldi_infer
+                                      }).create_node([convolution_node])
+
+        return [output_reshape_node.id]
diff --git a/model-optimizer/extensions/front/kaldi/add_reshape_around_pooling.py b/model-optimizer/extensions/front/kaldi/add_reshape_around_pooling.py
new file mode 100644 (file)
index 0000000..b7326ad
--- /dev/null
@@ -0,0 +1,59 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+
+from mo.front.common.replacement import FrontReplacementOp
+from mo.graph.graph import Node
+from mo.ops.pooling import Pooling
+from mo.ops.reshape import Reshape
+
+
+class ReplacePoolingReshape(FrontReplacementOp):
+    """
+        This pass adds Reshapes around a Pooling layer to reshape from NH to NCHW.
+        For example:
+            Suppose we have the following graph:
+
+            Prev_Layer [N, H] -> Pooling [N, C, H, W] -> Next_Layer [N, H]
+
+            In this case the Pooling takes only [N, H] from the input tensor in the 3rd dim,
+            so this pass converts the graph to the following one:
+
+            Prev_Layer [N, H] -> Reshape [N, 1, H, 1] -> Pooling [N, C=1, H, W=1] -> Reshape [N, 1, H, 1] -> Next_Layer [N, H]
+
+    """
+    op = "Pooling"
+    enabled = True
+
+    def replace_op(self, graph: nx.MultiDiGraph, node: Node) -> list:
+        input_node = node.in_node(0)
+
+        input_reshape_node = Reshape(graph,
+                                     {
+                                         'name': 'Reshape/' + node.name,
+                                         'infer': Reshape.kaldi_infer
+                                     }).create_node([input_node])
+
+        pooling_node = Pooling(graph, graph.nodes[node.id]).create_node([input_reshape_node])
+
+        output_reshape_node = Reshape(graph,
+                                      {
+                                          'name': node.name + '/Reshape/',
+                                          'infer': Reshape.kaldi_infer
+                                      }).create_node([pooling_node])
+
+        return [output_reshape_node.id]
diff --git a/model-optimizer/extensions/front/kaldi/add_reshape_for_conv.py b/model-optimizer/extensions/front/kaldi/add_reshape_for_conv.py
deleted file mode 100644 (file)
index 58ccf0b..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-"""
- Copyright (c) 2017-2018 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-
-import networkx as nx
-
-from extensions.front.kaldi.replace_lstm_node_pattern import create_node
-from mo.front.common.replacement import FrontReplacementOp
-from mo.graph.graph import Node
-
-
-class ReplaceConvolutionReshape(FrontReplacementOp):
-    op = "Convolution"
-    enabled = True
-
-    def replace_op(self, graph: nx.MultiDiGraph, node: Node):
-        input_nodes = node.in_nodes()
-
-        conv_attrs = graph.node[node.id]['pb'].__dict__
-        input_reshape = create_node(graph, 'Reshape_Convolution', {'type': 'Reshape',
-                                                                   'axis': 1,
-                                                                   'num_axes': -1,
-                                                                   'dim': None},
-                                    tuple(n.id for i, n in input_nodes.items()))
-        convolution = create_node(graph, 'Convolution', conv_attrs, tuple([input_reshape.id]))
-        output_reshape = create_node(graph, 'Convolution_Reshape', {'type': 'Reshape',
-                                                                    'axis': 1,
-                                                                    'num_axes': -1,
-                                                                    'dim': None},
-                                     tuple([convolution.id]))
-
-        return [output_reshape.id]
diff --git a/model-optimizer/extensions/front/kaldi/add_reshape_for_pooling.py b/model-optimizer/extensions/front/kaldi/add_reshape_for_pooling.py
deleted file mode 100644 (file)
index d62dedd..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-"""
- Copyright (c) 2017-2018 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-import copy
-
-import networkx as nx
-import numpy as np
-
-from extensions.front.kaldi.replace_lstm_node_pattern import create_node
-from mo.front.common.replacement import FrontReplacementOp
-from mo.graph.graph import Node
-
-
-class ReplacePoolingReshape(FrontReplacementOp):
-    op = "Pooling"
-    enabled = True
-
-    def replace_op(self, graph: nx.MultiDiGraph, node: Node):
-        input_nodes = node.in_nodes()
-
-        pool_attrs = graph.node[node.id]['pb'].__dict__
-        input_reshape = create_node(graph, 'Reshape_Pooling', {'type': 'Reshape',
-                                                               'axis': 1,
-                                                               'num_axes': -1,
-                                                               'dim': None},
-                                    tuple(n.id for i, n in input_nodes.items()))
-
-        pooling = create_node(graph, 'pooling', pool_attrs, tuple([input_reshape.id]))
-
-        output_reshape = create_node(graph, 'Pooling_Reshape', {
-            'type': 'Reshape',
-            'axis': 1,
-            'num_axes': -1,
-            'dim': None
-        }, tuple([pooling.id]))
-
-        return [output_reshape.id]
index b76f540..a5c9a8c 100644 (file)
@@ -19,7 +19,7 @@ import numpy as np
 
 from extensions.front.kaldi.fuse_repeated_reshape import FuseRepeatedReshapes
 from mo.front.common.replacement import FrontReplacementPattern
-from mo.middle.passes.eliminate import remove_op_node
+from mo.middle.passes.eliminate import remove_op_node_with_data_node
 
 
 class EliminateRedundantReshape(FrontReplacementPattern):
@@ -46,4 +46,4 @@ class EliminateRedundantReshape(FrontReplacementPattern):
         out_node = reshape_node.out_node()
         if not np.array_equal(in_node.shape, out_node.shape):
             return False
-        remove_op_node(graph, reshape_node)
+        remove_op_node_with_data_node(graph, reshape_node)
index 006edc6..9a8a984 100644 (file)
@@ -17,7 +17,7 @@
 import networkx as nx
 
 from mo.front.common.replacement import FrontReplacementPattern
-from mo.middle.passes.eliminate import remove_op_node
+from mo.middle.passes.eliminate import remove_op_node_with_data_node
 
 
 class FuseRepeatedReshapes(FrontReplacementPattern):
@@ -43,4 +43,4 @@ class FuseRepeatedReshapes(FrontReplacementPattern):
         if (node.has_valid('type') and node.type == 'Reshape' and
                 len(node.out_nodes()) == 1 and node.out_node().has_valid('kind') and node.out_node().kind == 'data' and
                 len(node.out_node().out_nodes()) == 1):
-            remove_op_node(graph, node)
+            remove_op_node_with_data_node(graph, node)
index e9bd31f..bfba4c4 100644 (file)
@@ -1,5 +1,5 @@
 """
- Copyright (c) 2017-2018 Intel Corporation
+ Copyright (c) 2018 Intel Corporation
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-import copy
+import numpy as np
 
 import networkx as nx
 
+from mo.front.caffe.extractors.utils import embed_input
 from mo.front.common.replacement import FrontReplacementOp
-from mo.front.kaldi.extractor import common_kaldi_fields
-from mo.front.kaldi.utils import KaldiNode
-from mo.graph.graph import Node, unique_id as unique_node_id
+from mo.graph.graph import Node
+from mo.ops.activation import Activation
+from mo.ops.clamp import Clamp
+from mo.ops.eltwise import Eltwise
+from mo.ops.inner_product import InnerProduct
+from mo.ops.memory import Memory
+from mo.ops.scale_shift import ScaleShiftOp
+from mo.ops.split import Split
 
 
 def unique_id(prefix: str = 'id') -> str:
@@ -40,90 +46,50 @@ def unique_id(prefix: str = 'id') -> str:
 unique_id.names = []
 
 
-def create_node(graph: nx.MultiDiGraph, name: str, attrs: dict, inputs: tuple = (), out_indexes: tuple = ([0]),
-                weights=None, biases=None):
-    """
-    Create node with name 'name' and attributes from 'attrs'.
-    Incoming edges for the node creates from nodes with id from 'inputs'
-    Outgoing edges for the node creates to nodes with id from 'out_indexes'
-    :param graph: graph to operate on.
-    :param name: name how to save added node.
-    :param attrs: optional attributes to be set. Attributes of the node
-    :param inputs: tuple of ids inputs nodes
-    :param out_indexes: tuple of ids outputs nodes
-    :param weights: np.array of weights
-    :param biases: np.array of biases
-    :return:
-    """
-    unique_name = unique_node_id(graph, '{}_'.format(name))
-    layer = KaldiNode(unique_name)
-    layer.set_weight(weights)
-    layer.set_bias(biases)
-    layer.set_attrs(attrs)
-
-    graph.add_node(unique_name, pb=layer, kind='op')
-    new_graph_node = Node(graph, unique_name)
-    graph.node[unique_name].update(common_kaldi_fields(new_graph_node))
-
-    edge_attrs = {
-        'out': 0,
-        'in': 0,
-        'name': layer.name,
-        'fw_tensor_debug_info': [('', layer.name)],  # debug anchor for a framework tensor name and port
-        'in_attrs': ['in', 'name'],
-        'out_attrs': ['out', 'name'],
-        'data_attrs': ['fw_tensor_debug_info']
-    }
-
-    edges = []
-    for index, noe_id in enumerate(inputs):
-        attrs = copy.deepcopy(edge_attrs)
-        attrs['fw_tensor_debug_info'] = [(Node(graph, noe_id).soft_get('name'), None)]
-        if index < len(out_indexes):
-            attrs['out'] = out_indexes[index]
-        attrs['in'] = index
-        edges.append((noe_id, new_graph_node.id, attrs))
-
-    graph.add_edges_from(edges)
-
-    return new_graph_node
-
-
 class ReplaceLSTMNodePattern(FrontReplacementOp):
-    op = "LSTMProjectedStreams"
+    op = "LSTMCell"
     enabled = True
 
     def replace_op(self, graph: nx.MultiDiGraph, node: Node):
-        input_node = node.in_nodes()[0]
-        out_node = node.out_node()
+        input_node = node.in_node()
 
         memory_pair_input = unique_id('id')
         memory_pair_output = unique_id('id')
-        # Input -> FullyConnected
-        fc_layer_after_input = create_node(graph, 'input_fullyconnected',
-                                           dict(type='FullyConnected',
-                                                num_output=node.pb.gifo_x_weights_shape[0],
-                                                bias_term=True),
-                                           tuple([input_node.id]),
-                                           weights=node.pb.gifo_x_weights,
-                                           biases=node.pb.gifo_biases)
 
-        prev_lstm_output_node = create_node(graph, 'prev_memory_output',
-                                            dict(type='Memory', id=memory_pair_input, index=1, size=2))
+        # Input -> FullyConnected
+        fc_layer_after_input_attrs = {'name': 'input_fullyconnected',
+                                      'num_output': node.gifo_x_weights_shape[0],
+                                      'bias_term': True
+                                      }
+
+        embed_input(fc_layer_after_input_attrs, 1, 'weights', node.gifo_x_weights)
+        embed_input(fc_layer_after_input_attrs, 2, 'biases', node.gifo_biases)
+        fc_layer_after_input = InnerProduct(graph, fc_layer_after_input_attrs).create_node([input_node])
+
+        prev_lstm_output = Memory(graph, {'name': 'prev_memory_output',
+                                          'id': memory_pair_input,
+                                          'index': 1,
+                                          'size': 2,
+                                          'shape': np.array([node.gifo_r_weights_shape[1]], dtype=np.int64)
+                                          }).create_node()
 
         # *Memory(output) -> FullyConnected
-        fc_layer_from_prev_state = create_node(graph, 'prev_memory_output_fullyconnected',
-                                               dict(type='FullyConnected', num_output=node.pb.gifo_r_weights_shape[0],
-                                                    bias_term=False),
-                                               tuple([prev_lstm_output_node.id]),
-                                               weights=node.pb.gifo_r_weights)
+        fc_layer_from_prev_state_attrs = {'name': 'prev_memory_output_fullyconnected',
+                                          'num_output': node.gifo_r_weights_shape[0],
+                                          'bias_term': False
+                                          }
+
+        embed_input(fc_layer_from_prev_state_attrs, 1, 'weights', node.gifo_r_weights)
+        fc_layer_from_prev_state = InnerProduct(graph, fc_layer_from_prev_state_attrs).create_node(
+            [prev_lstm_output])
 
         # Memory -> FullyConnected  \
         #                           *Eltwise(sum)
         # Input -> FullyConnected   /
-        join_input_prev_state_sum_node = create_node(graph, 'join_input_eltwise',
-                                                     dict(type='Eltwise', operation='sum'),
-                                                     tuple([fc_layer_from_prev_state.id, fc_layer_after_input.id]))
+        join_input_prev_state_sum = Eltwise(graph, {'name': 'join_input_eltwise',
+                                                    'operation': 'sum'
+                                                    }).create_node([fc_layer_from_prev_state,
+                                                                    fc_layer_after_input])
 
         # *Eltwise(sum) -> Split
         # it is split into 4 nodes: Act, Eltw*3
@@ -134,154 +100,151 @@ class ReplaceLSTMNodePattern(FrontReplacementOp):
         #     |\
         #     | \__(3)Eltwise(sum)
         #     |____(4)Eltwise(sum)
-        split_joined_input = create_node(graph, 'join_input_split',
-                                         dict(type='Split', axis=None, num_split=4),
-
-                                         tuple([join_input_prev_state_sum_node.id]))
-
-        prev_lstm_state_node = create_node(graph, 'prev_memory_state',
-                                           dict(type='Memory', id=memory_pair_output, index=1, size=2))
+        split_joined_input = Split(graph, {'name': 'join_input_split',
+                                           'axis': 1,
+                                           'num_split': 4
+                                           }).create_node([join_input_prev_state_sum])
+
+        prev_lstm_state = Memory(graph, {'name': 'prev_memory_state',
+                                         'id': memory_pair_output,
+                                         'index': 1,
+                                         'size': 2,
+                                         'shape': np.array([node.input_gate_weights.shape[0]], dtype=np.int64)
+                                         }).create_node()
 
         # *Memory(state) -> *ScaleShift(input)
-        state_input_scaleshift_node = create_node(graph, 'input_scaleshift',
-                                                  dict(type='ScaleShift', bias_term=False),
-                                                  tuple([prev_lstm_state_node.id]),
-                                                  weights=node.pb.input_gate_weights)
+        state_input_scaleshift_attrs = {'name': 'input_scaleshift',
+                                        'bias_term': False
+                                        }
+        embed_input(state_input_scaleshift_attrs, 1, 'weights', node.input_gate_weights)
+        state_input_scaleshift = ScaleShiftOp(graph, state_input_scaleshift_attrs).create_node([prev_lstm_state])
 
         # *Memory(state) -> *ScaleShift(forget)
-        state_forget_scaleshift_node = create_node(graph, 'forget_scaleshift',
-                                                   dict(type='ScaleShift', bias_term=False),
-                                                   tuple([prev_lstm_state_node.id]),
-                                                   weights=node.pb.forget_gate_weights)
+        state_forget_scaleshift_attrs = {'name': 'forget_scaleshift',
+                                         'bias_term': False
+                                         }
+        embed_input(state_forget_scaleshift_attrs, 1, 'weights', node.forget_gate_weights)
+        state_forget_scaleshift = ScaleShiftOp(graph, state_forget_scaleshift_attrs).create_node([prev_lstm_state])
 
         # Split                                 \
         #                                       (2)Eltwise(sum)
         # Memory(state) -> *ScaleShift(input)  /
-        join_prev_lstm_input_joined_input_sum_node = create_node(graph, 'join_prev_lstm_input_joined_input_eltwise',
-                                                                 dict(type='Eltwise', operation='sum'),
-                                                                 tuple([
-                                                                     split_joined_input.id,
-                                                                     state_input_scaleshift_node.id
-                                                                 ]), out_indexes=(1, 0))
-
+        join_prev_lstm_input_joined_input_sum = Eltwise(graph, {'name': 'join_prev_lstm_input_joined_input_eltwise',
+                                                                'operation': 'sum'
+                                                                }).create_node([(split_joined_input, 1),
+                                                                                state_input_scaleshift
+                                                                                ])
         # Split                                 \
         #                                       (3)Eltwise(sum)
         # Memory(state) -> *ScaleShift(forget)  /
-        join_prev_lstm_input_joined_forget_sum_node = create_node(graph, 'join_prev_lstm_input_joined_forget_sum',
-                                                                  dict(type='Eltwise', operation='sum'),
-                                                                  tuple([
-                                                                      split_joined_input.id,
-                                                                      state_forget_scaleshift_node.id
-                                                                  ]),
-                                                                  out_indexes=(2, 0))
+        join_prev_lstm_input_joined_forget_sum = Eltwise(graph, {'name': 'join_prev_lstm_input_joined_forget_sum',
+                                                                 'operation': 'sum'
+                                                                 }).create_node([(split_joined_input, 2),
+                                                                                 state_forget_scaleshift
+                                                                                 ])
 
         # Split -> Tanh
-        remember_tahn = create_node(graph, 'remember_tahn',
-                                    dict(type='Activation', operation='tanh'),
-                                    tuple([split_joined_input.id]), out_indexes=(0,))
+        remember_tahn = Activation(graph, {'name': 'remember_tahn',
+                                           'operation': 'tanh'
+                                           }).create_node([(split_joined_input, 0)])
 
         # Split -> (2)Eltwise(sum) -> *Sigmoid
-        remember_sigmoid = create_node(graph, 'remember_sigmoid',
-                                       dict(type='Activation', operation='sigmoid'),
-                                       tuple([join_prev_lstm_input_joined_input_sum_node.id]))
+        remember_sigmoid = Activation(graph, {'name': 'remember_sigmoid',
+                                              'operation': 'sigmoid'
+                                              }).create_node(
+            [join_prev_lstm_input_joined_input_sum])
 
         # Split -> (3)Eltwise(sum) -> **Sigmoid
-        forget_sigmoid = create_node(graph, 'forget_sigmoid',
-                                     dict(type='Activation', operation='sigmoid'),
-                                     tuple([join_prev_lstm_input_joined_forget_sum_node.id]))
+        forget_sigmoid = Activation(graph, {'name': 'forget_sigmoid',
+                                            'operation': 'sigmoid'
+                                            }).create_node(
+            [join_prev_lstm_input_joined_forget_sum])
 
         # *Memory(state)                        \
         #                                       (6)Eltwise(mul)
         # Split -> (3)Eltwise(sum) -> **Sigmoid /
-        join_forget_prev_state_mul_node = create_node(graph, 'join_forget_prev_state_mul',
-                                                      dict(type='Eltwise', operation='mul'),
-                                                      tuple([
-                                                          forget_sigmoid.id,
-                                                          prev_lstm_state_node.id
-                                                      ]))
+        join_forget_prev_state_mul = Eltwise(graph, {'name': 'join_forget_prev_state_mul',
+                                                     'operation': 'mul'
+                                                     }).create_node(
+            [forget_sigmoid, prev_lstm_state])
 
         # Split -> Tahn                         \
         #                                       (5)Eltwise(mul)
         # Split -> (2)Eltwise(sum) -> *Sigmoid   /
-        join_remember_candidates_mul_node = create_node(graph, 'join_remember_candidates_mul',
-                                                        dict(type='Eltwise', operation='mul'),
-                                                        tuple([
-                                                            remember_tahn.id,
-                                                            remember_sigmoid.id,
-                                                        ]))
+        join_remember_candidates_mul = Eltwise(graph, {'name': 'join_remember_candidates_mul',
+                                                       'operation': 'mul'
+                                                       }).create_node(
+            [remember_tahn, remember_sigmoid])
 
         # (5)Eltwise(mul)  \
         #               (7)Eltwise(sum)
         # (6)Eltwise(mul)   /
-        join_forget_remember_sum_node = create_node(graph, 'join_forget_remember_sum',
-                                                    dict(type='Eltwise', operation='sum'),
-                                                    tuple([
-                                                        join_forget_prev_state_mul_node.id,
-                                                        join_remember_candidates_mul_node.id,
-                                                    ]))
+        join_forget_remember_sum = Eltwise(graph, {'name': 'join_forget_remember_sum',
+                                                   'operation': 'sum'
+                                                   }).create_node(
+            [join_forget_prev_state_mul, join_remember_candidates_mul])
 
         # (7)Eltwise(sum) -> Clamp
-        join_forget_clamp_node = create_node(graph, 'join_forget_clamp',
-                                             dict(type='Clamp', max=node.pb.clip_value, min=-node.pb.clip_value),
-                                             tuple([join_forget_remember_sum_node.id]))
-
+        join_forget_clamp = Clamp(graph, {'name': 'join_forget_clamp',
+                                          'max': node.clip_value,
+                                          'min': -node.clip_value
+                                          }).create_node(
+            [join_forget_remember_sum])
         # Clamp -> (2)Memory(state)
-        next_lstm_state_node = create_node(graph, 'next_lstm_state',
-                                           dict(type='Memory', id=memory_pair_output, index=0, size=2),
-                                           tuple([join_forget_clamp_node.id]))
+        Memory(graph, {'name': 'next_lstm_state',
+                       'id': memory_pair_output,
+                       'index': 0,
+                       'size': 2,
+                       'shape': np.array([node.input_gate_weights.shape[0]], dtype=np.int64)
+                       }).create_node([join_forget_clamp])
 
         # Clamp -> (2)Tahn
-        state_filtered_tahn_node = create_node(graph, 'state_filtered_tahn',
-                                               dict(type='Activation', operation='tanh'),
-                                               tuple([join_forget_clamp_node.id]))
+        state_filtered_tahn = Activation(graph, {'name': 'state_filtered_tahn',
+                                                 'operation': 'tanh'
+                                                 }).create_node([join_forget_clamp])
 
         # Clamp -> (2)ScaleShift
-        clamp_scaleshift_node = create_node(graph, 'clamp_scaleshift',
-                                            dict(type='ScaleShift', bias_term=False),
-                                            tuple([join_forget_clamp_node.id]),
-                                            weights=node.pb.output_gate_weights)
+        clamp_scaleshift_attrs = {'name': 'clamp_scaleshift',
+                                  'bias_term': False}
+        embed_input(clamp_scaleshift_attrs, 1, 'weights', node.output_gate_weights)
+        clamp_scaleshift = ScaleShiftOp(graph, clamp_scaleshift_attrs).create_node([join_forget_clamp])
 
         # Split                 \
         #                       (4)Eltwise(sum)
         # Clamp -> (2)ScaleShift /
-        join_next_lstm_input_joined_input_sum_node = create_node(graph, 'join_next_lstm_input_joined_input_sum',
-                                                                 dict(type='Eltwise', operation='sum'),
-                                                                 tuple([
-                                                                     split_joined_input.id,
-                                                                     clamp_scaleshift_node.id
-                                                                 ]),
-                                                                 out_indexes=(3, 0))
+        join_next_lstm_input_joined_input_sum = Eltwise(graph, {'name': 'join_next_lstm_input_joined_input_sum',
+                                                                'operation': 'sum'
+                                                                }).create_node([(split_joined_input, 3), clamp_scaleshift])
 
         # (4)Eltwise(sum) -> (3)Sigmoid
-        output_sigmoid = create_node(graph, 'output_sigmoid',
-                                     dict(type='Activation', operation='sigmoid'),
-                                     tuple([join_next_lstm_input_joined_input_sum_node.id]))
+        output_sigmoid = Activation(graph, {'name': 'output_sigmoid',
+                                            'operation': 'sigmoid'
+                                            }).create_node(
+            [join_next_lstm_input_joined_input_sum])
 
         # (4)Eltwise(sum) -> (3)Sigmoid         \
         #                                       (5)Eltwise(mul)
         # Clamp -> (2)Tahn                      /
-        joined_output_mul_node = create_node(graph, 'joined_output_mul',
-                                             dict(type='Eltwise', operation='mul'),
-                                             tuple([
-                                                 state_filtered_tahn_node.id,
-                                                 output_sigmoid.id
-                                             ]))
+        joined_output_mul = Eltwise(graph, {'name': 'joined_output_mul',
+                                            'operation': 'mul'
+                                            }).create_node([state_filtered_tahn, output_sigmoid])
 
         # (5)Eltwise(mul) -> (3)FullyConnected
-        fc_output_node = create_node(graph, 'FullyConnected',
-                                     dict(type='FullyConnected', num_output=node.pb.projection_weights_shape[0],
-                                          bias_term=False),
-                                     tuple([joined_output_mul_node.id]),
-                                     weights=node.pb.projection_weights)
+        fc_output_attrs = {'name': 'FullyConnected',
+                           'num_output': node.projection_weights_shape[0],
+                           'bias_term': False}
+        embed_input(fc_output_attrs, 1, 'weights', node.projection_weights)
+        fc_output = InnerProduct(graph, fc_output_attrs).create_node([joined_output_mul])
 
         #                   / (2)Memory(output)
         # (3)FullyConnected
         #                   \ Output (any next node) (edge created automatically after replacement)
-        create_node(graph, 'next_lstm_output',
-                    dict(type='Memory', id=memory_pair_input, index=0, size=2),
-                    tuple([fc_output_node.id]))
-
-        graph.remove_edges_from([input_node.id, node.id])
-        graph.remove_edges_from([node.id, out_node.id])
-
-        return [fc_output_node.id]
+        Memory(graph, {'name': 'next_lstm_output',
+                       'id': memory_pair_input,
+                       'index': 0,
+                       'size': 2,
+                       'shape': np.array([node.gifo_r_weights_shape[1]], dtype=np.int64)
+                       }).create_node([fc_output])
+
+        return [fc_output.id]
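
For reference, the subgraph assembled above implements one step of a Kaldi projected LSTM with peephole connections and cell clipping. The NumPy sketch below is an illustration only (it is not part of the patch, and the weight names Wx, bx, Wr, w_i, w_f, w_o, Wp are hypothetical); it follows the gifo blob layout the replacement assumes:

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_projected_step(x, y_prev, c_prev, Wx, bx, Wr, w_i, w_f, w_o, Wp, clip):
        # input_fullyconnected + prev_memory_output_fullyconnected -> join_input_eltwise
        joined = Wx @ x + bx + Wr @ y_prev
        g, i, f, o = np.split(joined, 4)                    # join_input_split (gifo order)
        c_tilde = np.tanh(g)                                # remember_tahn
        i = sigmoid(i + w_i * c_prev)                       # input peephole -> remember_sigmoid
        f = sigmoid(f + w_f * c_prev)                       # forget peephole -> forget_sigmoid
        c = np.clip(f * c_prev + i * c_tilde, -clip, clip)  # join_forget_clamp / next_lstm_state
        o = sigmoid(o + w_o * c)                            # output peephole -> output_sigmoid
        y = Wp @ (o * np.tanh(c))                           # joined_output_mul -> projection FC
        return y, c                                         # y also feeds next_lstm_output
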
diff --git a/model-optimizer/extensions/front/kaldi/replace_splice_node_pattern.py b/model-optimizer/extensions/front/kaldi/replace_splice_node_pattern.py
new file mode 100644 (file)
index 0000000..360a225
--- /dev/null
@@ -0,0 +1,84 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import numpy as np
+
+import networkx as nx
+
+from extensions.front.kaldi.replace_lstm_node_pattern import unique_id
+from mo.front.common.partial_infer.utils import int64_array
+from mo.front.common.replacement import FrontReplacementOp
+from mo.graph.graph import Node
+from mo.ops.concat import Concat
+from mo.ops.crop import Crop
+from mo.ops.memory import Memory
+
+
+class ReplaceSpliceNodePattern(FrontReplacementOp):
+    """
+       This pass decomposes a Splice layer into a sequence of Crop, Concat and Memory layers.
+       For example:
+           Suppose we have the following graph:
+
+           Input (N, H) -> Splice -> Next_Layer (N, k*H)
+
+           where (N, k*H) is the real input of the subsequent topology.
+           Splice accumulates the next (k-1)/2 and the previous (k-1)/2 input frames.
+
+           This pass converts the graph to the following one:
+
+                                    Input [N, H]                  __
+                                                \               /
+                                                 Concat [N, k*H]
+                                                /               \
+           Memory [N, k*H] -> Crop [N, (k-1)*H]                 Memory [N, k*H]
+
+   """
+    op = "Splice"
+    enabled = True
+
+    def replace_op(self, graph: nx.MultiDiGraph, node: Node):
+        input_node = node.in_nodes()[0]
+        memory_pair_id = unique_id('id')
+        # Memory(in)
+        input_memory = Memory(graph, {'name': 'prev_splice_memory',
+                                      'id': memory_pair_id,
+                                      'index': 1,
+                                      'size': 2,
+                                      'shape': np.array([input_node.shape[1] * len(node.context)],
+                                                        dtype=np.int64)}).create_node()
+        # Memory(in)  \
+        #             Crop
+        # Input(temp) /
+        crop = Crop(graph, {'name': 'Splice_Crop',
+                            'axis': np.array([1], dtype=np.int64),
+                            'offset': np.array([input_node.shape[1]], dtype=np.int64),
+                            'dim': np.array([input_node.shape[1] * (len(node.context) - 1)],
+                                            dtype=np.int64)}).create_node([input_memory])
+
+        # Crop   \
+        #         Concat
+        # Input  /
+        concat_node = Concat(graph, {'name': 'Splice_Concat',
+                                     'axis': 1}).create_node([crop, input_node])
+
+        # Concat -> Memory(out)
+        Memory(graph, {'name': 'out_splice_memory',
+                       'id': memory_pair_id,
+                       'index': 0,
+                       'size': 2,
+                       'shape': np.array([input_node.shape[1] * len(node.context)],
+                                         dtype=np.int64)}).create_node([concat_node])
+        return [concat_node.id]
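
A note on the Memory pair created above: both layers share one id, the layer with index=1 reads the state saved on the previous inference request, and the layer with index=0 writes the new state. A minimal NumPy sketch of the resulting behaviour (an illustration under these assumptions, not Inference Engine code):

    import numpy as np

    class MemoryPair:
        def __init__(self, shape):
            self.state = np.zeros(shape)    # persists between inference requests

        def read(self):                     # Memory with index=1 ('prev_splice_memory')
            return self.state

        def write(self, value):             # Memory with index=0 ('out_splice_memory')
            self.state = value

    def splice_step(pair, frame):
        h = frame.shape[1]
        prev = pair.read()                              # [N, k*H]
        crop = prev[:, h:]                              # Crop: drop the oldest H columns
        joined = np.concatenate([crop, frame], axis=1)  # Concat: back to [N, k*H]
        pair.write(joined)
        return joined

    pair = MemoryPair((1, 3 * 4))                       # N=1, context length k=3, H=4
    out = splice_step(pair, np.ones((1, 4)))            # out.shape == (1, 12)
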
diff --git a/model-optimizer/extensions/front/mxnet/RNN_ext.py b/model-optimizer/extensions/front/mxnet/RNN_ext.py
new file mode 100644 (file)
index 0000000..1ae8e31
--- /dev/null
@@ -0,0 +1,62 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
+from mo.front.extractor import FrontExtractorOp
+from extensions.ops.lstm_sequence import LSTMSequence
+from mo.utils.error import Error
+from mo.utils.utils import refer_to_faq_msg
+
+
+class RNNFrontExtractor(FrontExtractorOp):
+    op = 'RNN'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = get_mxnet_layer_attrs(node.symbol_dict)
+        mode = attrs.str('mode', None)
+        state_size = attrs.int('state_size', None)
+        bidirectional = attrs.bool('bidirectional', False)
+        num_layers = attrs.int('num_layers', 1)
+
+        node_attrs = {
+            'batch_dim': 1,
+            'sequence_dim': 0,
+            'blobs_wrb': False,
+            'hidden_size': state_size,
+            'has_num_directions': bidirectional,
+            'format': 'mxnet',
+        }
+
+        if bidirectional:
+            raise Error(
+                "Operation RNN with bidirectional not supported. num_directions = 1 is supported only " +
+                refer_to_faq_msg(86))
+
+        if num_layers > 1:
+            raise Error(
+                "Operation RNN with num_layers more then one not supported. num_layers = 1 is supported only " +
+                refer_to_faq_msg(86))
+
+        if mode == 'lstm':
+            LSTMSequence.update_node_stat(node, node_attrs)
+        else:
+            raise Error(
+                "Operation RNN with mode '{}' not supported. Please register RNN as custom op. " +
+                refer_to_faq_msg(86),
+                mode)
+        return __class__.enabled
diff --git a/model-optimizer/extensions/front/mxnet/block_grad_ext.py b/model-optimizer/extensions/front/mxnet/block_grad_ext.py
new file mode 100644 (file)
index 0000000..0d5946e
--- /dev/null
@@ -0,0 +1,29 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from extensions.ops.identity import IdentityOp
+from mo.front.extractor import FrontExtractorOp
+from mo.graph.graph import Node
+
+
+class BlockGradExt(FrontExtractorOp):
+    op = 'BlockGrad'
+    enabled = True
+
+    @staticmethod
+    def extract(node: Node):
+        IdentityOp.update_node_stat(node, {})
+        return __class__.enabled
index 26aa839..6463bfb 100644 (file)
 
 import numpy as np
 
-from mo.front.common.extractors.utils import layout_attrs
 from mo.front.extractor import FrontExtractorOp
 from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.ops.convolution import Convolution
-
+from mo.front.common.extractors.utils import layout_attrs
+
 
 class ConvFrontExtractor(FrontExtractorOp):
     op = 'Convolution'
@@ -31,33 +30,39 @@ class ConvFrontExtractor(FrontExtractorOp):
         attr = get_mxnet_layer_attrs(node.symbol_dict)
 
         kernel = attr.tuple("kernel", int, None)
-        stride = attr.tuple("stride", int, (1, 1))
-        padding = attr.tuple("pad", int, (0, 0))
-        dilate = attr.tuple("dilate", int, (1, 1))
+        stride = attr.tuple("stride", int, tuple(np.ones(len(kernel), dtype=np.int64)))
+        padding = attr.tuple("pad", int, tuple(np.zeros(len(kernel), dtype=np.int64)))
+        dilate = attr.tuple("dilate", int, tuple(np.ones(len(kernel), dtype=np.int64)))
         group = attr.int("num_group", 1)
         output = attr.int("num_filter", None)
         bias_term = attr.str("no_bias", 'False') == 'False'
 
+        final_dilations = np.array([1, 1, *[d for d in dilate]], dtype=np.int64) if dilate is not None else None
+
         node_attrs = {
-            'op': 'Conv2D',
+            'op': __class__.op,
             'bias_addable': True,
             'bias_term': bias_term,
-            'pad': np.array([[0, 0], [0, 0], [padding[0], padding[0]], [padding[1], padding[1]]], dtype=np.int64),
-            'pad_spatial_shape': np.array([[padding[0], padding[0]], [padding[1], padding[1]]], dtype=np.int64),
-            'dilation': np.array([1, 1, dilate[0], dilate[1]], dtype=np.int64),
+            'pad': np.array([[0, 0], [0, 0], *[[pad, pad] for pad in padding]], dtype=np.int64),
+            'pad_spatial_shape': np.array([[pad, pad] for pad in padding], dtype=np.int64),
+            'dilation': final_dilations,
             'output_spatial_shape': None,
             'output_shape': None,
-            'stride': np.array([1, 1, stride[0], stride[1]], dtype=np.int64),
+            'stride': np.array([1, 1, *[s for s in stride]], dtype=np.int64),
             'group': group,
             'output': output,
-            'kernel_spatial': np.array([kernel[0], kernel[1]], dtype=np.int64),
+            'kernel_spatial': np.array([k for k in kernel], dtype=np.int64),
 
             'input_feature_channel': 1,
             'output_feature_channel': 0,
-            'kernel_spatial_idx': np.array([2, 3], dtype=np.int64),
+            'kernel_spatial_idx': None,
             'reshape_kernel': True,
+
+            'spatial_dims': None,
+            'channel_dims': np.array([1], dtype=np.int64),
+            'batch_dims': np.array([0], dtype=np.int64),
+            'layout': 'NCHW',
         }
-        node_attrs.update(layout_attrs())
 
         # update the attributes of the node
         Convolution.update_node_stat(node, node_attrs)
@@ -75,16 +80,16 @@ class DeconvFrontExtractor(FrontExtractorOp):
                                      (kernel_shape[node.spatial_dims] - 1) * node.dilation[node.spatial_dims]
         padding[node.spatial_dims] = padding[node.spatial_dims] - node.output_spatial_shape;
         padding[node.spatial_dims] = (padding[node.spatial_dims] + 1) / 2
-        return np.array([[0, 0], [0, 0], [padding[2], padding[2]], [padding[3], padding[3]]], dtype=np.int64)
+        return np.array([[0, 0], [0, 0], *[[pad, pad] for pad in padding[2:]]], dtype=np.int64)
 
     @staticmethod
     def extract(node):
         attr = get_mxnet_layer_attrs(node.symbol_dict)
 
         kernel = attr.tuple("kernel", int, None)
-        stride = attr.tuple("stride", int, (1, 1))
-        padding = attr.tuple("pad", int, (0, 0))
-        dilate = attr.tuple("dilate", int, (1, 1))
+        stride = attr.tuple("stride", int, tuple(np.ones(len(kernel), dtype=np.int64)))
+        padding = attr.tuple("pad", int, tuple(np.zeros(len(kernel), dtype=np.int64)))
+        dilate = attr.tuple("dilate", int, tuple(np.ones(len(kernel), dtype=np.int64)))
         group = attr.int("num_group", 1)
         output = attr.int("num_filter", None)
         bias_term = attr.str("no_bias", 'True') == 'False'
@@ -92,27 +97,32 @@ class DeconvFrontExtractor(FrontExtractorOp):
         if target_shape:
             target_shape = np.array(target_shape, dtype=np.int64)
 
+        final_dilations = np.array([1, 1, *[d for d in dilate]], dtype=np.int64) if dilate is not None else None
         node_attrs = {
+            'op': __class__.op,
             'type': 'Deconvolution',
-            'op': 'Deconv2D',
             'bias_addable': True,
             'bias_term': bias_term,
-            'pad': np.array([[0, 0], [0, 0], [padding[0], padding[0]], [padding[1], padding[1]]], dtype=np.int64),
-            'pad_spatial_shape': np.array([[padding[0], padding[0]], [padding[1], padding[1]]], dtype=np.int64),
-            'dilation': np.array([1, 1, dilate[0], dilate[1]], dtype=np.int64),
+            'pad': np.array([[0, 0], [0, 0], *[[pad, pad] for pad in padding]], dtype=np.int64),
+            'pad_spatial_shape': np.array([[pad, pad] for pad in padding], dtype=np.int64),
+            'dilation': final_dilations,
             'output_spatial_shape': target_shape,
             'output_shape': None,
-            'stride': np.array([1, 1, stride[0], stride[1]], dtype=np.int64),
+            'stride': np.array([1, 1, *[s for s in stride]], dtype=np.int64),
             'group': group,
             'output': output,
-            'kernel_spatial': np.array([kernel[0], kernel[1]], dtype=np.int64),
-            'input_feature_channel': 0,
-            'output_feature_channel': 1,
-            'kernel_spatial_idx': np.array([2, 3], dtype=np.int64),
+            'kernel_spatial': np.array([k for k in kernel], dtype=np.int64),
+            'input_feature_channel': 1,
+            'output_feature_channel': 0,
+            'kernel_spatial_idx': None,
             'reshape_kernel': True,
-            'get_pad': DeconvFrontExtractor.get_pad
+
+            'spatial_dims': None,
+            'channel_dims': np.array([1], dtype=np.int64),
+            'batch_dims': np.array([0], dtype=np.int64),
+            'layout': 'NCHW',
+            'get_pad': DeconvFrontExtractor.get_pad,
         }
-        node_attrs.update(layout_attrs())
 
         # update the attributes of the node
         Convolution.update_node_stat(node, node_attrs)
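
Both hunks above replace the hard-coded 2-D attribute construction with tuple unpacking, so kernels of any rank get uniform stride/pad/dilation attributes. A quick sketch of the resulting arrays for a hypothetical 3-D kernel:

    import numpy as np

    kernel  = (3, 3, 3)                                     # e.g. a Conv3D kernel
    stride  = tuple(np.ones(len(kernel), dtype=np.int64))   # default stride (1, 1, 1)
    padding = tuple(np.zeros(len(kernel), dtype=np.int64))  # default pad (0, 0, 0)

    stride_attr = np.array([1, 1, *stride], dtype=np.int64)  # batch/channel dims prepended
    pad_attr = np.array([[0, 0], [0, 0], *[[p, p] for p in padding]], dtype=np.int64)

    print(stride_attr)       # [1 1 1 1 1]
    print(pad_attr.shape)    # (5, 2): one [begin, end] pair per NCDHW axis
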
diff --git a/model-optimizer/extensions/front/mxnet/copy_ext.py b/model-optimizer/extensions/front/mxnet/copy_ext.py
new file mode 100644 (file)
index 0000000..cc06a54
--- /dev/null
@@ -0,0 +1,29 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from extensions.ops.identity import IdentityOp
+from mo.front.extractor import FrontExtractorOp
+from mo.graph.graph import Node
+
+
+class CopyExt(FrontExtractorOp):
+    op = '_copy'
+    enabled = True
+
+    @staticmethod
+    def extract(node: Node):
+        IdentityOp.update_node_stat(node, {})
+        return __class__.enabled
diff --git a/model-optimizer/extensions/front/mxnet/dropout_ext.py b/model-optimizer/extensions/front/mxnet/dropout_ext.py
new file mode 100644 (file)
index 0000000..ee16973
--- /dev/null
@@ -0,0 +1,29 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from extensions.ops.identity import IdentityOp
+from mo.front.extractor import FrontExtractorOp
+from mo.graph.graph import Node
+
+
+class DropoutExt(FrontExtractorOp):
+    op = 'Dropout'
+    enabled = True
+
+    @staticmethod
+    def extract(node: Node):
+        IdentityOp.update_node_stat(node, {})
+        return __class__.enabled
 """
 
 from mo.front.extractor import FrontExtractorOp
-from mo.ops.op import Op
+from mo.ops.flatten import Flatten
 
 
-class ClampFrontExtractor(FrontExtractorOp):
-    op = 'clamp'
+class FlattenFrontExtractor(FrontExtractorOp):
+    op = 'Flatten'
     enabled = True
 
     @staticmethod
     def extract(node):
-        mapping_rule = {
-            'min': node.pb.min,
-            'max': node.pb.max,
+        attrs = {
+            'axis': 1,
+            'end_axis': -1,
         }
 
-        Op.get_op_class_by_name('Clamp').update_node_stat(node, mapping_rule)
+        Flatten.update_node_stat(node, attrs)
         return __class__.enabled
diff --git a/model-optimizer/extensions/front/mxnet/instance_norm_ext.py b/model-optimizer/extensions/front/mxnet/instance_norm_ext.py
new file mode 100644 (file)
index 0000000..26fe674
--- /dev/null
@@ -0,0 +1,35 @@
+"""
+ Copyright (c) 2017-2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.graph.graph import Node
+from extensions.ops.instance_normalization import InstanceNormalization
+from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
+
+
+class InstanceNormFrontExtractor(FrontExtractorOp):
+    op = 'InstanceNorm'
+    enabled = True
+
+    @staticmethod
+    def extract(node: Node):
+        attr = get_mxnet_layer_attrs(node.symbol_dict)
+        node_attrs = {
+            'epsilon': attr.float('eps', 0.001)
+        }
+
+        InstanceNormalization.update_node_stat(node, node_attrs)
+        return __class__.enabled
diff --git a/model-optimizer/extensions/front/mxnet/max_ext.py b/model-optimizer/extensions/front/mxnet/max_ext.py
new file mode 100644 (file)
index 0000000..3db428c
--- /dev/null
@@ -0,0 +1,36 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
+from mo.ops.reduce import Reduce
+
+
+class MaxFrontExtractor(FrontExtractorOp):
+    op = 'max'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = get_mxnet_layer_attrs(node.symbol_dict)
+        data = {
+            'axis': [attrs.int('axis', 0)],
+            'reduce_type': 'max',
+            'keep_dims': False
+        }
+        # update the attributes of the node
+        Reduce.update_node_stat(node, data)
+        return __class__.enabled
diff --git a/model-optimizer/extensions/front/mxnet/maximum_ext.py b/model-optimizer/extensions/front/mxnet/maximum_ext.py
new file mode 100644 (file)
index 0000000..573a2dd
--- /dev/null
@@ -0,0 +1,28 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.extractor import FrontExtractorOp
+from mo.ops.eltwise import Eltwise
+
+
+class MaximumFrontExtractor(FrontExtractorOp):
+    op = '_maximum'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        Eltwise.update_node_stat(node, {'operation': 'max'})
+        return __class__.enabled
@@ -18,15 +18,11 @@ from mo.front.extractor import FrontExtractorOp
 from mo.ops.op import Op
 
 
-class ActivationFrontExtractor(FrontExtractorOp):
-    op = 'activation'
+class MinimumFrontExtractor(FrontExtractorOp):
+    op = '_minimum'
     enabled = True
 
     @staticmethod
     def extract(node):
-        mapping_rule = {
-           'operation': node.pb.operation
-        }
-
-        Op.get_op_class_by_name('Activation').update_node_stat(node, mapping_rule)
+        Op.update_node_stat(node, {'op': 'Minimum'})
         return __class__.enabled
index 69c8617..6a2452f 100644 (file)
@@ -16,7 +16,6 @@
 
 import numpy as np
 
-from mo.front.common.extractors.utils import layout_attrs
 from mo.front.extractor import FrontExtractorOp
 from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.ops.pooling import Pooling
@@ -31,24 +30,26 @@ class PoolingFrontExtractor(FrontExtractorOp):
         attrs = get_mxnet_layer_attrs(node.symbol_dict)
 
         kernel = attrs.tuple("kernel", int, None)
-        stride = attrs.tuple("stride", int, (1, 1))
-        padding = attrs.tuple("pad", int, (0, 0))
+        stride = attrs.tuple("stride", int, tuple(np.ones(len(kernel), dtype=np.int64)))
+        padding = attrs.tuple("pad", int, tuple(np.zeros(len(kernel), dtype=np.int64)))
         method = attrs.str("pool_type", None)
         rt = 'floor'
 
         data = {
-            'window': np.array([1, 1, kernel[0], kernel[1]], dtype=np.int64),
-            'stride': np.array([1, 1, stride[0], stride[1]], dtype=np.int64),
-            'pad': np.array([[0, 0], [0, 0], [padding[0], padding[0]], [padding[1], padding[1]]], dtype=np.int64),
-            'pad_spatial_shape': np.array([[padding[0], padding[0]], [padding[1], padding[1]]], dtype=np.int64),
+            'window': np.array([1, 1, *[k for k in kernel]], dtype=np.int64),
+            'stride': np.array([1, 1, *[s for s in stride]], dtype=np.int64),
+            'pad': np.array([[0, 0], [0, 0], *[[pad, pad] for pad in padding]], dtype=np.int64),
+            'pad_spatial_shape': np.array([[pad, pad] for pad in padding], dtype=np.int64),
             'pool_method': method,
             'exclude_pad': 'false',
             'output_spatial_shape': None,
+            'spatial_dims': None,
+            'channel_dims': np.array([1], dtype=np.int64),
+            'batch_dims': np.array([0], dtype=np.int64),
+            'layout': 'NCHW',
             'rounding_type': rt,
         }
 
-        data.update(layout_attrs())
-
         pooling_conv = attrs.str("pooling_convention", 'valid')
         if pooling_conv:
             data["pooling_convention"] = pooling_conv
diff --git a/model-optimizer/extensions/front/mxnet/reshape_ext.py b/model-optimizer/extensions/front/mxnet/reshape_ext.py
new file mode 100644 (file)
index 0000000..32251fe
--- /dev/null
@@ -0,0 +1,44 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import logging as log
+import numpy as np
+
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
+from mo.front.extractor import FrontExtractorOp
+from mo.ops.reshape import Reshape
+
+
+class ReshapeFrontExtractor(FrontExtractorOp):
+    op = 'Reshape'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = get_mxnet_layer_attrs(node.symbol_dict)
+        dim = attrs.tuple("shape", int, None)
+        update_attrs = {
+            'dim': np.array(dim)
+        }
+        for d in dim:
+            if d in [-2, -3, -4]:
+                log.error('The attribute "shape" of the operation "{}" contains value "{}" which is not supported.'.
+                          format(node.soft_get('name'), d))
+                return False
+
+        # update the attributes of the node
+        Reshape.update_node_stat(node, update_attrs)
+        return __class__.enabled
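
MXNet's Reshape accepts special shape values: 0 keeps the corresponding input dimension, -1 infers one dimension, and -2/-3/-4 copy, merge or split dimensions. The extractor above passes 0 and -1 through but rejects the last three. A small illustration of the check:

    UNSUPPORTED = {-2, -3, -4}

    for shape in [(0, -1, 256), (-2, 0), (64, -4, 4, -1)]:
        rejected = [d for d in shape if d in UNSUPPORTED]
        print(shape, 'rejected' if rejected else 'accepted', rejected or '')
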
diff --git a/model-optimizer/extensions/front/mxnet/rnn_param_concat.py b/model-optimizer/extensions/front/mxnet/rnn_param_concat.py
new file mode 100644 (file)
index 0000000..8b21e7e
--- /dev/null
@@ -0,0 +1,35 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
+from mo.ops.concat import Concat
+
+
+class RNNParamConcatFrontExtractor(FrontExtractorOp):
+    op = '_rnn_param_concat'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = get_mxnet_layer_attrs(node.symbol_dict)
+        data = {
+            'axis': attrs.int("dim", 1),
+        }
+
+        # update the attributes of the node
+        Concat.update_node_stat(node, data)
+        return __class__.enabled
diff --git a/model-optimizer/extensions/front/mxnet/slice_channel_ext.py b/model-optimizer/extensions/front/mxnet/slice_channel_ext.py
new file mode 100644 (file)
index 0000000..95b1cd8
--- /dev/null
@@ -0,0 +1,41 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import numpy as np
+
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
+from mo.front.extractor import FrontExtractorOp
+from mo.ops.split import Split
+
+
+class SliceChannelFrontExtractor(FrontExtractorOp):
+    op = 'SliceChannel'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = get_mxnet_layer_attrs(node.symbol_dict)
+        axis = attrs.int("axis", 1)
+        num_outputs = attrs.int("num_outputs", 0)
+
+        node_attrs = {
+            'axis': axis,
+            'num_split': num_outputs
+        }
+
+        # update the attributes of the node
+        Split.update_node_stat(node, node_attrs)
+        return __class__.enabled
index f39b473..10991ea 100644 (file)
@@ -28,7 +28,7 @@ class SoftmaxFrontReplacementSubgraph(FrontReplacementSubgraph):
     def pattern(self):
         return dict(
             nodes=[
-                ('softmax', dict(op='Softmax'))
+                ('softmax', dict(type='SoftMax'))
             ],
             edges=[]
         )
index 13f0ba6..2dbb114 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import numpy as np
-
 from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.front.extractor import FrontExtractorOp
 from mo.ops.softmax import Softmax
index 794b046..c2071da 100644 (file)
@@ -28,7 +28,7 @@ class SoftmaxFrontExtractor(FrontExtractorOp):
         attrs = get_mxnet_layer_attrs(node.symbol_dict)
 
         update_attrs = {
-            'type': 'Softmax',
+            'type': 'SoftMax',
             'axis': attrs.int("axis", -1),
             'temperature': attrs.float('temperature', 1.0)
         }
index fccaaaf..d26b544 100644 (file)
@@ -32,7 +32,7 @@ class SsdPatternFlattenSoftmaxActivation(FrontReplacementSubgraph):
     def pattern(self):
         return dict(
             nodes=[
-                ('softmax_activation', dict(op='Softmax')),
+                ('softmax_activation', dict(op='SoftMax')),
                 ('multi_box_detection', dict(op='_contrib_MultiBoxDetection'))
             ],
             edges=[
index edf7944..5686dc2 100644 (file)
@@ -18,7 +18,7 @@ import networkx as nx
 
 from extensions.front.mxnet.ssd_pattern_remove_reshape import SsdPatternRemoveReshape
 from mo.front.common.replacement import FrontReplacementSubgraph
-from mo.middle.passes.eliminate import remove_node_from_graph
+from mo.graph.graph import erase_node
 
 
 class SsdPatternRemoveFlatten(FrontReplacementSubgraph):
@@ -50,4 +50,4 @@ class SsdPatternRemoveFlatten(FrontReplacementSubgraph):
          match : dict
            Patterns which were found in graph structure.
         """
-        remove_node_from_graph(graph, match['multi_box_prior'], match['flatten'])
+        erase_node(match['flatten'])
index 2d987d5..cf12e19 100644 (file)
@@ -18,7 +18,7 @@ import networkx as nx
 
 from mo.front.common.replacement import FrontReplacementSubgraph
 from mo.front.mxnet.extractors.utils import get_json_layer_attrs
-from mo.middle.passes.eliminate import remove_node_from_graph
+from mo.graph.graph import erase_node
 
 
 class SsdPatternRemoveReshape(FrontReplacementSubgraph):
@@ -49,7 +49,7 @@ class SsdPatternRemoveReshape(FrontReplacementSubgraph):
          match : dict
            Patterns which were found in graph structure.
         """
-        remove_node_from_graph(graph, match['concat'], match['reshape'])
+        erase_node(match['reshape'])
 
         # concat should be performed for the third axis
         concat_node = match['concat']
index b5299f8..a3af10c 100644 (file)
@@ -33,7 +33,7 @@ class SsdPatternRemoveTranspose(FrontReplacementSubgraph):
         return dict(
             nodes=[
                 ('transpose', dict(op='transpose')),
-                ('softmax_activation', dict(op='Softmax')),
+                ('softmax_activation', dict(op='SoftMax')),
                 ('multi_box_detection', dict(op='_contrib_MultiBoxDetection'))
             ],
             edges=[
diff --git a/model-optimizer/extensions/front/mxnet/stack_ext.py b/model-optimizer/extensions/front/mxnet/stack_ext.py
new file mode 100644 (file)
index 0000000..6b5b79b
--- /dev/null
@@ -0,0 +1,37 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
+from extensions.ops.pack import PackOp
+
+
+class StackFrontExtractor(FrontExtractorOp):
+    op = 'stack'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = get_mxnet_layer_attrs(node.symbol_dict)
+
+        update_attrs = {
+            'axis': attrs.int('axis', 0)
+        }
+
+        # update the attributes of the node
+        PackOp.update_node_stat(node, update_attrs)
+
+        return __class__.enabled
diff --git a/model-optimizer/extensions/front/mxnet/swapaxes_ext.py b/model-optimizer/extensions/front/mxnet/swapaxes_ext.py
new file mode 100644 (file)
index 0000000..1b34f09
--- /dev/null
@@ -0,0 +1,39 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
+from mo.front.extractor import FrontExtractorOp
+from extensions.ops.swapaxes import SwapAxes
+
+
+class SwapAxesFrontExtractor(FrontExtractorOp):
+    op = 'SwapAxis'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = get_mxnet_layer_attrs(node.symbol_dict)
+        dim1 = attrs.int("dim1", 0)
+        dim2 = attrs.int("dim2", 0)
+
+        update_attrs = {
+            'dim1': dim1,
+            'dim2': dim2,
+        }
+
+        # update the attributes of the node
+        SwapAxes.update_node_stat(node, update_attrs)
+        return __class__.enabled
diff --git a/model-optimizer/extensions/front/mxnet/up_sampling_ext.py b/model-optimizer/extensions/front/mxnet/up_sampling_ext.py
new file mode 100644 (file)
index 0000000..a4284b1
--- /dev/null
@@ -0,0 +1,38 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
+from mo.front.extractor import FrontExtractorOp
+from extensions.ops.resample import ResampleOp
+
+
+class UpSamplingFrontExtractor(FrontExtractorOp):
+    op = 'UpSampling'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = get_mxnet_layer_attrs(node.symbol_dict)
+
+        node_attrs = {
+            'type': 'Resample',
+            'factor': attrs.int("scale", 1),
+            'resample_type': 'caffe.ResampleParameter.NEAREST',
+            'antialias': 0
+        }
+        # update the attributes of the node
+        ResampleOp.update_node_stat(node, node_attrs)
+        return __class__.enabled
diff --git a/model-optimizer/extensions/front/mxnet/zeros_ext.py b/model-optimizer/extensions/front/mxnet/zeros_ext.py
new file mode 100644 (file)
index 0000000..00923d2
--- /dev/null
@@ -0,0 +1,43 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import numpy as np
+
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
+from mo.front.extractor import FrontExtractorOp
+from mo.ops.const import Const
+
+
+class ZerosFrontExtractor(FrontExtractorOp):
+    op = '_zeros'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = get_mxnet_layer_attrs(node.symbol_dict)
+        shape = list(attrs.tuple('shape', int, None))
+        for i, s in enumerate(shape):
+            if s == 0:
+                shape[i] = 1
+
+        update_attrs = {
+            'shape': np.array(shape, dtype=np.int64),
+            'value': np.zeros(shape),
+        }
+
+        # update the attributes of the node
+        Const.update_node_stat(node, update_attrs)
+        return __class__.enabled
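
For illustration, the constant the extractor above would build for a hypothetical _zeros symbol with shape (0, 256); zero dimensions are promoted to 1 before the payload is materialized:

    import numpy as np

    shape = [1 if s == 0 else s for s in (0, 256)]   # -> [1, 256]
    value = np.zeros(shape)                          # payload of the resulting Const op
    print(shape, value.shape)                        # [1, 256] (1, 256)
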
index 095e6d6..262a469 100644 (file)
@@ -38,17 +38,17 @@ class ConvFrontExtractor(FrontExtractorOp):
         if pads is not None:
             pads = pads.reshape([2, -1])
             pads = np.transpose(pads)
-            final_pad = np.array([[0, 0], [0, 0], *[p for p in reversed(pads)]], dtype=np.int64)
+            final_pad = np.array([[0, 0], [0, 0], *pads], dtype=np.int64)
 
         # Extract dilations attribute
         # In case if dilations is not specified it will be set in default (1) in infer function
         dilations = onnx_attr(node, 'dilations', 'ints', default=None, dst_type=lambda x: np.array(x, dtype=np.int64))
-        final_dilations = np.array([1, 1, *[d for d in dilations]], dtype=np.int64) if dilations is not None else None
+        final_dilations = np.array([1, 1, *dilations], dtype=np.int64) if dilations is not None else None
 
         # Extract dilations attribute
         # In case if dilations is not specified it will be set in default (1) in infer function
         strides = onnx_attr(node, 'strides', 'ints', default=None, dst_type=lambda x: np.array(x, dtype=np.int64))
-        final_strides = np.array([1, 1, *[s for s in strides]], dtype=np.int64) if strides is not None else None
+        final_strides = np.array([1, 1, *strides], dtype=np.int64) if strides is not None else None
 
         kernel_shape = onnx_attr(node, 'kernel_shape', 'ints', default=None)
         auto_pad = onnx_attr(node, 'auto_pad', 's', default=None, dst_type=get_onnx_autopad)
@@ -60,14 +60,14 @@ class ConvFrontExtractor(FrontExtractorOp):
             'bias_addable': True,
             'bias_term': None,
             'pad': final_pad,
-            'pad_spatial_shape': np.array([pad for pad in pads], dtype=np.int64) if pads is not None else None,
+            'pad_spatial_shape': np.array(pads, dtype=np.int64) if pads is not None else None,
             'dilation': final_dilations,
             'output_spatial_shape': None,
             'output_shape': None,
             'stride': final_strides,
             'group': group,
             'output': None,
-            'kernel_spatial': np.array([x for x in kernel_shape], dtype=np.int64) if kernel_shape is not None else None,
+            'kernel_spatial': np.array(kernel_shape, dtype=np.int64) if kernel_shape is not None else None,
 
             'input_feature_channel': 1,
             'output_feature_channel': 0,
index 1296e5b..11aaa1b 100644 (file)
@@ -15,9 +15,8 @@
 """
 
 from mo.front.extractor import FrontExtractorOp
-from mo.ops.op import Op
-
 from mo.front.onnx.extractors.utils import onnx_attr
+from mo.ops.flatten_onnx import FlattenONNX
 
 
 class FlattenFrontExtractor(FrontExtractorOp):
@@ -31,6 +30,5 @@ class FlattenFrontExtractor(FrontExtractorOp):
             'axis': axis
         }
 
-        # update the attributes of the node
-        Op.get_op_class_by_name(__class__.op).update_node_stat(node, attrs)
-        return __class__.enabled
\ No newline at end of file
+        FlattenONNX.update_node_stat(node, attrs)
+        return __class__.enabled
index 3bd6e63..1484bc8 100644 (file)
@@ -16,7 +16,7 @@
 
 import numpy as np
 
-from extensions.ops.take import Take
+from extensions.ops.gather import Gather
 from mo.front.extractor import FrontExtractorOp
 from mo.front.onnx.extractors.utils import onnx_attr
 
@@ -32,5 +32,5 @@ class GatherFrontExtractor(FrontExtractorOp):
             'axis': np.array(onnx_attr(node, 'axis', 'i', default=0), dtype=np.int64)
         }
 
-        Take.update_node_stat(node, attrs)
+        Gather.update_node_stat(node, attrs)
         return __class__.enabled
index 34267a1..20bc8ba 100644 (file)
@@ -20,6 +20,7 @@ import numpy as np
 from extensions.ops.lstm_sequence import LSTMSequence
 from mo.front.extractor import FrontExtractorOp
 from mo.front.onnx.extractors.utils import onnx_attr
+from mo.ops.op import Op
 
 
 class LSTMFrontExtractor(FrontExtractorOp):
@@ -29,11 +30,25 @@ class LSTMFrontExtractor(FrontExtractorOp):
     @staticmethod
     def extract(node):
 
+        def split_helper(node, index: int, direction: str):
+            return Op._create_data_node(
+                node.graph,
+                name=node.name + '/SplittedBiLSTM/{}/'.format(direction),
+                attrs={'value': node.value[index], 'shape': np.array(node.value[index].shape, dtype=np.int64)}
+            )
+
         attrs = {
             'hidden_size': np.array(onnx_attr(node, 'hidden_size', 'i'), dtype=np.int64),
             'batch_dim': 1,
             'sequence_dim': 0,
             'blobs_wrb': True,
+            'has_num_directions': True,
+            'direction': onnx_attr(node, 'direction', 's', b'forward').decode().lower(),
+            'format': 'onnx',
+            'blob_bidirectional_split': lambda node: (
+                split_helper(node, 0, 'forward'),
+                split_helper(node, 1, 'reverse')
+            )
         }
 
         LSTMSequence.update_node_stat(node, attrs)
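
The `blob_bidirectional_split` lambda above relies on ONNX packing bidirectional LSTM blobs with `num_directions` as the leading axis, so index 0 holds the forward direction and index 1 the reverse one. A minimal sketch of that layout, with made-up tensor sizes:

    import numpy as np

    num_directions, four_h, input_size = 2, 16, 3       # illustrative sizes only
    w = np.zeros((num_directions, four_h, input_size))  # ONNX W blob, num_directions first

    w_forward, w_reverse = w[0], w[1]                   # what split_helper wraps into data nodes
    assert w_forward.shape == (four_h, input_size)
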
index 01f5430..38b3189 100644 (file)
@@ -24,5 +24,5 @@ class MatMulFrontExtractor(FrontExtractorOp):
 
     @staticmethod
     def extract(node):
-        InnerProduct.update_node_stat(node, attrs={'infer': onnx_matmul_infer})
+        InnerProduct.update_node_stat(node, attrs={'op': 'MatMul', 'infer': onnx_matmul_infer, 'type': None})
         return InnerProduct.enabled
diff --git a/model-optimizer/extensions/front/onnx/neg_ext.py b/model-optimizer/extensions/front/onnx/neg_ext.py
new file mode 100644 (file)
index 0000000..939c167
--- /dev/null
@@ -0,0 +1,34 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import numpy as np
+
+from mo.front.extractor import FrontExtractorOp
+from mo.front.onnx.extractors.utils import onnx_attr
+
+
+class NegFrontExtractor(FrontExtractorOp):
+    # The Neg operation is transformed to ImageScaler and further converted to Mul
+    op = 'Neg'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        node['scale'] = np.array(-1, dtype=np.int64)
+        node['bias'] = np.array(0, dtype=np.int64)
+        node['op'] = 'ImageScaler'
+
+        return __class__.enabled
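
Rewriting `Neg` as an `ImageScaler` works because the scaler computes `scale * x + bias` elementwise, so `scale = -1`, `bias = 0` is exactly negation; the pure multiplication then folds naturally into a `Mul`. A quick NumPy check of that identity:

    import numpy as np

    x = np.array([1.5, -2.0, 0.0])
    scale, bias = -1, 0                              # attributes set by the extractor above
    assert np.array_equal(scale * x + bias, -x)      # ImageScaler(scale=-1, bias=0) == Neg
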
index dad94c0..449949f 100644 (file)
  limitations under the License.
 """
 
-import logging as log
+import numpy as np
 
-from mo.ops.pad import Pad
 from mo.front.extractor import FrontExtractorOp
 from mo.front.onnx.extractors.utils import onnx_attr
-from mo.utils.error import Error
-
-import numpy as np
+from mo.ops.pad import Pad
 
 
 class PadFrontExtractor(FrontExtractorOp):
@@ -36,21 +33,12 @@ class PadFrontExtractor(FrontExtractorOp):
 
         assert pads is not None
 
-        if mode.lower() != 'constant':
-            log.error('Pad.mode != constant for node {}. It is not supported. '
-                'Model conversion is not aborted but the final IR will be not correct.'.format(node.name))
-
-        if value != 0:
-            log.error('Pad.value == {} != 0 for node {}. It is not supported. '
-                'MOdel conversion is not aborted but the final IR will be not correct.'.format(value, node.name))
-
         # MO Pad op and ONNX Pad op have different format for pads values
         # MO Pad has Dx2 where D is the total number of dimensions
         # ONNX Pad pads flat layout, so
         # need to reshape and transpose
 
-        pads = pads.reshape([2,-1])
-        pads = np.transpose(pads)
+        pads = np.transpose(pads.reshape([2, -1]))
 
         Pad.update_node_stat(node, {'mode': mode, 'pads': pads, 'fill_value': value})
         return __class__.enabled
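
The reshape-and-transpose above converts the flat ONNX pads layout `[d1_begin, d2_begin, ..., d1_end, d2_end, ...]` into the Dx2 layout `[[d1_begin, d1_end], ...]` that the MO Pad op expects. A worked example for a 2D pad:

    import numpy as np

    pads = np.array([1, 2, 3, 4])                    # ONNX flat layout: [h_begin, w_begin, h_end, w_end]
    pads = np.transpose(pads.reshape([2, -1]))       # same two steps as in the extractor
    print(pads)                                      # [[1 3]
                                                     #  [2 4]] i.e. [[h_begin, h_end], [w_begin, w_end]]
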
index 04a56f6..17c894c 100644 (file)
@@ -57,7 +57,21 @@ class GlobalAveragePoolFrontExtractor(FrontExtractorOp):
         attrs = common_onnx_pool_extractor(node)
         attrs.update({'pooling_convention': 'full',
                       'global_pool': True,
-                      'window': np.array([1, 1, 0, 0], dtype=np.int64)
+                     })
+
+        Pooling.update_node_stat(node, attrs)
+        return __class__.enabled
+
+
+class GlobalMaxPoolFrontExtractor(FrontExtractorOp):
+    op = 'GlobalMaxPool'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = common_onnx_pool_extractor(node)
+        attrs.update({'pooling_convention': 'full',
+                      'global_pool': True,
                      })
 
         Pooling.update_node_stat(node, attrs)
@@ -94,7 +108,7 @@ def common_onnx_pool_extractor(node):
     exclude_pad = onnx_attr(node, 'count_include_pad', 'i', default=0) == 0
 
     global_pooling = 0
-    if node.op == 'MaxPool':
+    if node.op in ['MaxPool', 'GlobalMaxPool']:
         method = 'max'
     elif node.op in ['AveragePool', 'GlobalAveragePool']:
         method = 'avg'
diff --git a/model-optimizer/extensions/front/onnx/reduce_sum_ext.py b/model-optimizer/extensions/front/onnx/reduce_sum_ext.py
new file mode 100644 (file)
index 0000000..8886eab
--- /dev/null
@@ -0,0 +1,32 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import numpy as np
+from mo.front.extractor import FrontExtractorOp
+from mo.front.onnx.extractors.utils import onnx_attr
+from mo.graph.graph import Node
+from mo.ops.reduce import Reduce
+
+
+class ReduceSumFrontExtractor(FrontExtractorOp):
+    op = 'ReduceSum'
+    enabled = True
+
+    @staticmethod
+    def extract(node: Node):
+        axis = onnx_attr(node, 'axes', 'ints', default=None, dst_type=lambda x: np.array(x, dtype=np.int64))
+        keep_dims = onnx_attr(node, 'keepdims', 'i', default=True)
+        Reduce.update_node_stat(node, {'axis': axis, 'keep_dims': keep_dims, 'reduce_type': 'sum'})
+        return __class__.enabled
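
For reference, a minimal NumPy equivalent of what the downstream Reduce op computes with the `axis` and `keep_dims` attributes extracted above (sizes are illustrative):

    import numpy as np

    data = np.arange(6, dtype=np.float32).reshape(2, 3)
    axis, keep_dims = (1,), True                     # as extracted from 'axes' and 'keepdims'
    print(np.sum(data, axis=axis, keepdims=keep_dims))   # [[ 3.]
                                                         #  [12.]]
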
  limitations under the License.
 """
 
-from mo.front.caffe.extractors.utils import weights_biases
 from mo.front.extractor import FrontExtractorOp
-from mo.ops.op import Op
+from mo.front.onnx.extractors.utils import onnx_attr
+from mo.ops.softmax import Softmax
 
 
-class ScaleShiftFrontExtractor(FrontExtractorOp):
-    op = 'scaleshift'
+class SoftmaxFrontExtractor(FrontExtractorOp):
+    op = 'Softmax'
     enabled = True
 
     @staticmethod
     def extract(node):
-        mapping_rule = {}
-        mapping_rule.update(weights_biases(node.pb.bias_term, node.pb))
+        axis = onnx_attr(node, 'axis', 'i', default=1)
+
+        attrs = {
+            'axis': axis
+        }
+
         # update the attributes of the node
-        Op.get_op_class_by_name('ScaleShift').update_node_stat(node, mapping_rule)
+        Softmax.update_node_stat(node, attrs)
         return __class__.enabled
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-from mo.front.common.partial_infer.slice import caffe_slice_infer
+import numpy as np
+
+from extensions.ops.splitv import SplitV
+from mo.front.common.partial_infer.utils import int64_array
 from mo.front.extractor import FrontExtractorOp
-from mo.ops.op import Op
+from mo.front.onnx.extractors.utils import onnx_attr
 
 
 class SplitFrontExtractor(FrontExtractorOp):
-    op = 'split'
+    op = 'Split'
     enabled = True
 
     @staticmethod
     def extract(node):
-        mapping_rule = {
-            'axis': node.pb.axis if node.pb.axis else 1,
-            'num_split': node.pb.num_split,
+        attrs = {
+            'size_splits': onnx_attr(node, 'split', 'ints', default=None, dst_type=int64_array),
+            'axis': onnx_attr(node, 'axis', 'i', default=0, dst_type=np.int64)
         }
-        Op.get_op_class_by_name('Split').update_node_stat(node, mapping_rule)
+        # update the attributes of the node
+        SplitV.update_node_stat(node, attrs)
         return __class__.enabled
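
Mapping ONNX `Split` onto `SplitV` preserves semantics because SplitV takes explicit per-output sizes; when the ONNX `split` attribute is absent, `size_splits` stays None and the axis is divided evenly. A NumPy sketch of the sized case:

    import numpy as np

    data = np.arange(10)
    size_splits, axis = [3, 3, 4], 0                 # 'split' attribute and 'axis' as extracted
    pieces = np.split(data, np.cumsum(size_splits)[:-1], axis=axis)
    print([p.tolist() for p in pieces])              # [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]
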
diff --git a/model-optimizer/extensions/front/tf/BlockLSTM.py b/model-optimizer/extensions/front/tf/BlockLSTM.py
new file mode 100644 (file)
index 0000000..3e1bed4
--- /dev/null
@@ -0,0 +1,128 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import logging as log
+
+import networkx as nx
+
+from mo.front.common.replacement import FrontReplacementOp
+from mo.graph.graph import Node
+from mo.utils.error import Error
+
+
+class BlockLSTM(FrontReplacementOp):
+    """
+    We prepare the TensorFlow BlockLSTM op to be replaced with the LSTMSequence op, which will later be repacked into a TensorIterator
+
+    TensorFlow BlockLSTM op description:
+
+        Op parameters:
+         cell_clip:    Value to clip the 'cs' value to.
+         use_peephole: Whether to use peephole weights.
+         forget_bias:  The forget gate bias.
+
+        Inputs:
+         0: seq_len_max:  Maximum time length actually used by this input. Outputs are padded with 0s beyond this length
+         1: x:            The sequence input to the LSTM, shape (timelen, batch_size, num_inputs)
+         2: cs_prev:      Value of the initial cell state
+         3: h_prev:       Initial output of cell (to be used for peephole)
+         4: w:            The weight matrix
+         5: wci:          The weight matrix for input gate peephole connection
+         6: wcf:          The weight matrix for forget gate peephole connection
+         7: wco:          The weight matrix for output gate peephole connection
+         8: b:            The bias vector
+
+        Outputs:
+         0: i:            The input gate                    over the whole time sequence
+         1: cs:           The cell state before the tanh    over the whole time sequence
+         2: f:            The forget gate                   over the whole time sequence
+         3: o:            The output gate                   over the whole time sequence
+         4: ci:           The cell input                    over the whole time sequence
+         5: co:           The cell after the tanh           over the whole time sequence
+         6: h:            The output h vector               over the whole time sequence
+
+    Limitations:
+    - peephole connections are not supported, so we check `use_peephole` != True and cut `wci`, `wco`, `wcf` off
+    - the cell_clip parameter is not supported, so we check `cell_clip == -1`, which means clipping is disabled
+    """
+    op = "BlockLSTM"
+    enabled = True
+
+    def nodes_to_remove(self, graph: nx.MultiDiGraph, match: dict):
+        # do not remove matched node
+        return []
+
+    def replace_op(self, graph: nx.MultiDiGraph, node: Node):
+        if node.use_peephole:
+            raise Error("BlockLSTM operation is not supported with `use_peephole`==True. Node: {}"
+                        "".format(node.soft_get('name')))
+
+        if node.cell_clip != -1:
+            raise Error("Clipping is not supported for BlockLSTM operation. `cell_clip`={!s} for node: {}"
+                        "".format(node.cell_clip, node.soft_get('name')))
+
+        log.debug("Start BlockLSTM->LSTMSequence translation for node: {} with parameters:\n"
+                  "`cell_clip`={!s}, `use_peephole`=={!s}, `forget_bias`={!s}\n"
+                  "inputs: {},\noutputs:{}".format(node.soft_get('name'), node.cell_clip, node.use_peephole,
+                                                   node.forget_bias, {p: i.id for p, i in node.in_nodes().items()},
+                                                   {p: o.id for p, o in node.out_nodes().items()}))
+
+        log.debug("Cutting all inputs for peephole connection (5, 6, 7 input ports) off, as `use_peephole`=False")
+        for p in [5, 6, 7]:
+            if p in node.in_nodes():
+                graph.remove_edge(node.in_node(p).id, node.id)
+
+        log.debug("Cutting seq_len_max input off")
+        graph.remove_edge(node.in_node(0).id, node.id)
+
+        """
+        Reconnecting input edges of LSTMSequence:
+        TF input edges:             Description:                 MO input edges:
+              1                          input                        0
+              4                         weights                       1
+              8                         biases                        2
+              3               h_prev: initial output of cell          3
+              2               cs_prev: initial cell state             4
+        """
+        inputs = node.in_edges()
+        assert 1 in inputs, "Sequence input to the BlockLSTM is required (1 port). Node {}".format(node.id)
+        assert 2 in inputs, "Value of the initial cell state is required (2 port). Node {}".format(node.id)
+        assert 3 in inputs, "Initial output of cell is a required input to BlockLSTM (3 port). Node {}".format(node.id)
+        assert 4 in inputs, "The weight matrix is a required input to BlockLSTM (4 port). Node {}".format(node.id)
+        assert 8 in inputs, "The bias vector is a required input to BlockLSTM (8 port). Node {}".format(node.id)
+
+        inputs[3]['in'] = 3
+        inputs[1]['in'] = 0
+        inputs[4]['in'] = 1
+        inputs[2]['in'] = 4
+        inputs[8]['in'] = 2
+
+        log.debug("Checking for unsupported outputs usage (output ports: 0, 2, 3, 4, 5)")
+        for port, input_data in node.out_nodes().items():
+            if port in [0, 2, 3, 4, 5]:
+                raise Error("Output port {} of BlockLSTM node {} is not supported".format(node.id, port))
+
+        """
+        Reconnecting output edges of LSTMSequence:
+        TF output edges:             Description:                 MO output edges:
+              6                     output h vector                     0
+              1                   cell state before the tanh            1
+        """
+
+        outputs = node.out_edges()
+        if 6 in outputs:
+            outputs[6]['out'] = 0
+
+        # do not replace any output edge
+        return []
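
The edge rewiring above is a fixed port permutation taken from the tables in the docstring. A minimal sketch of that remapping over a plain dictionary (a hypothetical stand-in for the `node.in_edges()` structure):

    # TF input port -> MO input port, per the docstring table above
    tf_to_mo_in_ports = {1: 0, 4: 1, 8: 2, 3: 3, 2: 4}

    edges = {p: {'in': p} for p in [1, 2, 3, 4, 8]}  # illustrative stand-in for node.in_edges()
    for tf_port, mo_port in tf_to_mo_in_ports.items():
        edges[tf_port]['in'] = mo_port

    assert edges[1]['in'] == 0 and edges[8]['in'] == 2
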
diff --git a/model-optimizer/extensions/front/tf/BlockLSTM_ext.py b/model-optimizer/extensions/front/tf/BlockLSTM_ext.py
new file mode 100644 (file)
index 0000000..feddc17
--- /dev/null
@@ -0,0 +1,33 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from extensions.ops.BlockLSTM import BlockLSTM
+from mo.front.extractor import FrontExtractorOp
+
+
+class BlockLSTMExtractor(FrontExtractorOp):
+    op = 'BlockLSTM'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = {
+            'use_peephole': node.pb.attr['use_peephole'].b,
+            'cell_clip': node.pb.attr['cell_clip'].f,
+            'forget_bias': node.pb.attr['forget_bias'].f,
+        }
+        BlockLSTM.update_node_stat(node, attrs)
+        return __class__.enabled
diff --git a/model-optimizer/extensions/front/tf/CTCGreedyDecoder.py b/model-optimizer/extensions/front/tf/CTCGreedyDecoder.py
new file mode 100644 (file)
index 0000000..e36bf50
--- /dev/null
@@ -0,0 +1,82 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+import numpy as np
+
+from mo.front.common.replacement import FrontReplacementSubgraph
+from mo.graph.graph import replace_node, Node
+from mo.utils.error import Error
+
+
+class CTCGreedyDecoderReplacement(FrontReplacementSubgraph):
+    """
+    The TF implementation of the CTCGreedyDecoder produces a tuple with two tensors. The first element in the tuple is
+    the SparseTensor, which is converted to a regular tensor with the SparseToDense operation. This replacer matches
+    the CTCGreedyDecoder and SparseToDense operations and removes both the SparseToDense and the Cast operation feeding
+    it, because the Inference Engine implementation of the CTCGreedyDecoder produces a regular tensor as output.
+
+    The second input to the CTCGreedyDecoder in the TensorFlow is a 1D tensor with sequence lengths. In the Inference
+    Engine the second input to the CTCGreedyDecoder is a 2D tensor where the first element in each row is equal to 0
+    and all others are equal to 1. The length of the row is equal to the sequence length. The replacer modifies the
+    second input to be compatible with the Inference Engine CTCGreedyDecoder layer implementation.
+    """
+    enabled = True
+
+    @staticmethod
+    def pattern(**kwargs):
+        return dict(
+            nodes=[
+                ('decoder', dict(op='CTCGreedyDecoder')),
+                ('cast', dict(op='Cast')),
+                ('sparse_to_dense', dict(op='SparseToDense')),
+            ],
+            edges=[
+                ('decoder', 'sparse_to_dense', {'out': 0}),
+                ('decoder', 'cast', {'out': 1}),
+                ('cast', 'sparse_to_dense', {'out': 0}),
+            ]
+        )
+
+    def nodes_to_remove(self, graph: nx.MultiDiGraph, match: dict):
+        return [match['cast'].id, match['sparse_to_dense'].id]
+
+    def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict):
+        decoder_node = match['decoder']
+        graph.remove_edge(decoder_node.id, match['sparse_to_dense'].id)
+        graph.remove_edge(decoder_node.id, match['cast'].id)
+        replace_node(match['sparse_to_dense'], decoder_node)
+
+        # update the TensorFlow infer function for the CTCGreedyDecoder to make necessary changes with the second input
+        decoder_node['old_infer'] = decoder_node.infer
+        decoder_node.infer = __class__.tf_greedy_decoder_infer
+        return {}
+
+    @staticmethod
+    def tf_greedy_decoder_infer(node: Node):
+        sequence_length_node = node.in_node(1)
+        if sequence_length_node.value is None:
+            raise Error('The second input to the CTCGreedyDecoder node "{}" is not constant. This case is not '
+                        'supported with the Inference Engine.'.format(node.soft_get('name')))
+        # the batch size is the dimension with index 1 for the layer CTCGreedyDecoder
+        new_value = np.ones([node.in_node(0).shape[1], sequence_length_node.value[0]])
+        new_value[:, 0] = 0
+        new_value = np.transpose(new_value)
+        sequence_length_node.value = new_value
+        sequence_length_node.shape = sequence_length_node.value.shape
+
+        node.old_infer(node)
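
The infer override above rebuilds the second input into the `T x N` tensor the Inference Engine CTCGreedyDecoder expects: zeros at the first time step, ones elsewhere. A standalone reproduction with a toy batch of 2 and sequence length 3:

    import numpy as np

    batch_size, seq_len = 2, 3                       # illustrative sizes
    new_value = np.ones([batch_size, seq_len])
    new_value[:, 0] = 0
    new_value = np.transpose(new_value)              # shape (seq_len, batch_size)
    print(new_value)                                 # [[0. 0.]
                                                     #  [1. 1.]
                                                     #  [1. 1.]]
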
diff --git a/model-optimizer/extensions/front/tf/CTCGreedyDecoder_ext.py b/model-optimizer/extensions/front/tf/CTCGreedyDecoder_ext.py
new file mode 100644 (file)
index 0000000..89986e4
--- /dev/null
@@ -0,0 +1,30 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+from extensions.ops.ctc_greedy_decoder import CTCGreedyDecoderOp
+from mo.front.extractor import FrontExtractorOp
+
+
+class CTCCGreedyDecoderFrontExtractor(FrontExtractorOp):
+    op = 'CTCGreedyDecoder'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = {
+            'ctc_merge_repeated': int(node.pb.attr['merge_repeated'].b),
+        }
+        CTCGreedyDecoderOp.update_node_stat(node, attrs)
+        return __class__.enabled
index 99f57ac..c62f9f6 100644 (file)
@@ -23,7 +23,7 @@ import numpy as np
 from extensions.front.standalone_const_eraser import StandaloneConstEraser
 from extensions.front.sub import Sub
 from extensions.front.tf.CropAndResizeReplacement import CropAndResizeReplacement
-from extensions.front.tf.Pack import Pack
+from extensions.front.Pack import Pack
 from extensions.front.tf.Unpack import Unpack
 from extensions.ops.DetectionOutput import DetectionOutput
 from extensions.ops.priorbox_clustered import PriorBoxClusteredOp
@@ -156,7 +156,7 @@ def _relax_reshape_nodes(graph: nx.MultiDiGraph, pipeline_config: PipelineConfig
         old_reshape_node = _skip_node_of_type(input_node.out_node(), ['Identity'])
         assert (old_reshape_node.op == 'Reshape')
         reshape_size_node = Const(graph, {'value': np.array([0, -1, 1, 4])}).create_node([])
-        new_reshape_op = Reshape(graph, {'name': input_node.id + '/Reshape'})
+        new_reshape_op = Reshape(graph, {'name': input_node.id + '/Reshape', 'correct_data_layout': True})
         new_reshape_node = new_reshape_op.create_node([input_node, reshape_size_node])
         replace_node(old_reshape_node, new_reshape_node)
 
@@ -166,7 +166,7 @@ def _relax_reshape_nodes(graph: nx.MultiDiGraph, pipeline_config: PipelineConfig
         old_reshape_node = _skip_node_of_type(input_node.out_node(), ['Identity'])
         assert (old_reshape_node.op == 'Reshape')
         reshape_size_node_2 = Const(graph, {'value': np.array([0, -1, num_classes + 1])}).create_node([])
-        new_reshape_op_2 = Reshape(graph, {'name': input_node.id + '/Reshape'})
+        new_reshape_op_2 = Reshape(graph, {'name': input_node.id + '/Reshape', 'correct_data_layout': True})
         new_reshape_node_2 = new_reshape_op_2.create_node([input_node, reshape_size_node_2])
         replace_node(old_reshape_node, new_reshape_node_2)
 
@@ -475,6 +475,12 @@ class ObjectDetectionAPIDetectionOutputReplacement(FrontReplacementFromConfigFil
         # only one output edge match
         return {match.output_node(0)[0].id: new_sub_graph['detection_output_node'].id}
 
+    @staticmethod
+    def skip_nodes_by_condition(current_node: Node, condition: callable):
+        while condition(current_node):
+            current_node = current_node.in_node()
+        return current_node
+
     def generate_sub_graph(self, graph: nx.MultiDiGraph, match: SubgraphMatch):
         argv = graph.graph['cmd_params']
         if argv.tensorflow_object_detection_api_pipeline_config is None:
@@ -500,9 +506,14 @@ class ObjectDetectionAPIDetectionOutputReplacement(FrontReplacementFromConfigFil
         fake_background_locs_const_op = Const(graph, dict(value=fake_background_locs_blob))
         fake_background_locs_const_node = fake_background_locs_const_op.create_node([])
 
+        # Workaround for the PermuteForReshape pass.
+        # We look for the first non-Reshape node before match.single_input_node(0)[0].in_node(0)
+        # and add the reshape_loc node after it.
+        current_node = self.skip_nodes_by_condition(match.single_input_node(0)[0].in_node(0),
+                                                    lambda x: x['kind'] == 'op' and x.soft_get('type') == 'Reshape')
+
         reshape_loc_op = Reshape(graph, dict(dim=np.array([first_stage_max_proposals, num_classes, 4])))
-        reshape_loc_node = reshape_loc_op.create_node([match.single_input_node(0)[0].in_node(0)],
-                                                      dict(name='reshape_loc'))
+        reshape_loc_node = reshape_loc_op.create_node([current_node], dict(name='reshape_loc'))
 
         concat_loc_op = Concat(graph, dict(axis=1))
         concat_loc_node = concat_loc_op.create_node([fake_background_locs_const_node, reshape_loc_node],
@@ -584,7 +595,6 @@ class ObjectDetectionAPIDetectionOutputReplacement(FrontReplacementFromConfigFil
                  top_k=_value_or_raise(match, pipeline_config, 'postprocessing_max_detections_per_class'),
                  keep_top_k=_value_or_raise(match, pipeline_config, 'postprocessing_max_total_detections'),
                  nms_threshold=_value_or_raise(match, pipeline_config, 'postprocessing_iou_threshold')))
-        PermuteAttrs.set_permutation(reshape_priors_node, detection_output_node, None)
         # sets specific name to the node so we can find it in other replacers
         detection_output_node.name = 'detection_output'
 
@@ -737,7 +747,8 @@ class ObjectDetectionAPIProposalReplacement(FrontReplacementFromConfigFileSubGra
 
         reshape_classes_op = Reshape(graph, dict(dim=np.array([0, -1, 2])))
         reshape_classes_node = reshape_classes_op.create_node([permute_predictions_node],
-                                                              dict(name='reshape_FirstStageBoxPredictor_class'))
+                                                              dict(name='reshape_FirstStageBoxPredictor_class',
+                                                                   nchw_layout=True))
 
         softmax_conf_op = Softmax(graph, dict(axis=2))
         softmax_conf_node = softmax_conf_op.create_node([reshape_classes_node],
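
The `skip_nodes_by_condition` helper added above in this file is a generic upward walk: it follows `in_node()` while the predicate holds and returns the first node that breaks it. A graph-free sketch of the same control flow (the `FakeNode` class is purely illustrative):

    class FakeNode:
        """Stand-in for a graph node; only mimics in_node()."""
        def __init__(self, name, parent=None):
            self.name, self.parent = name, parent

        def in_node(self):
            return self.parent

    def skip_nodes_by_condition(current_node, condition):
        while condition(current_node):
            current_node = current_node.in_node()
        return current_node

    conv = FakeNode('conv')
    chain = FakeNode('reshape_2', FakeNode('reshape_1', conv))
    found = skip_nodes_by_condition(chain, lambda n: n.name.startswith('reshape'))
    print(found.name)                                # conv
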
diff --git a/model-optimizer/extensions/front/tf/RetinaNetFilteredDetectionsReplacement.py b/model-optimizer/extensions/front/tf/RetinaNetFilteredDetectionsReplacement.py
new file mode 100644 (file)
index 0000000..a46bb50
--- /dev/null
@@ -0,0 +1,120 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+import numpy as np
+
+from extensions.ops.DetectionOutput import DetectionOutput
+from extensions.ops.splitv import SplitV
+from mo.front.subgraph_matcher import SubgraphMatch
+from mo.front.tf.replacement import FrontReplacementFromConfigFileSubGraph
+from mo.graph.graph import Node
+from mo.ops.concat import Concat
+from mo.ops.const import Const
+from mo.ops.eltwise import Eltwise
+from mo.ops.power import Power
+from mo.ops.reshape import Reshape
+
+
+class RetinaNetFilteredDetectionsReplacement(FrontReplacementFromConfigFileSubGraph):
+    """
+    The class replaces the sub-graph that performs boxes post-processing and NMS with the DetectionOutput layer.
+
+    The post-processing in the RetinaNet topology is performed differently from the DetectionOutput layer implementation
+    in the Inference Engine. The former calculates (d_x1, d_y1, d_x2, d_y2) as factors of the prior box width and
+    height. The DetectionOutput with "code_type" equal to "caffe.PriorBoxParameter.CORNER" just adds predicted deltas
+    to the prior box coordinates. This replacer adds nodes which calculate prior box widths and heights, apply variances
+    to the predicted box coordinates and multiply them. With this approach the DetectionOutput layer with "code_type"
+    equal to "caffe.PriorBoxParameter.CORNER" produces the same result as the post-processing in the original topology.
+    """
+    replacement_id = 'RetinaNetFilteredDetectionsReplacement'
+
+    @staticmethod
+    def _create_sub(graph: nx.MultiDiGraph, input_1: Node, port_1: int, input_2: Node, port_2: int):
+        negate = Power(graph, dict(scale=-1, name=input_2.name + '/negate_'))
+        add = Eltwise(graph, dict(operation='sum', name=input_1.name + '/add_'))
+        out_node = add.create_node([(input_1, port_1), negate.create_node([(input_2, port_2)])])
+        return out_node
+
+    def output_edges_match(self, graph: nx.DiGraph, match: SubgraphMatch, new_sub_graph: dict):
+        return {match.output_node(0)[0].id: new_sub_graph['detection_output_node'].id}
+
+    def nodes_to_remove(self, graph: nx.MultiDiGraph, match: SubgraphMatch):
+        new_nodes_to_remove = match.matched_nodes_names()
+        new_nodes_to_remove.remove(match.single_input_node(0)[0].id)
+        new_nodes_to_remove.remove(match.single_input_node(1)[0].id)
+        new_nodes_to_remove.remove(match.single_input_node(2)[0].id)
+        return new_nodes_to_remove
+
+    def generate_sub_graph(self, graph: nx.MultiDiGraph, match: SubgraphMatch):
+        reshape_classes_op = Reshape(graph, {'dim': np.array([0, -1])})
+        reshape_classes_node = reshape_classes_op.create_node([match.single_input_node(1)[0]],
+                                                              dict(name='do_reshape_classes'))
+
+        priors_node = match.single_input_node(2)[0]
+
+        placeholder = [Node(graph, node_id) for node_id in graph.nodes() if Node(graph, node_id).op == 'Placeholder'][0]
+        im_height = placeholder.shape[1]
+        im_width = placeholder.shape[2]
+
+        # scale prior boxes to the [0, 1] interval
+        priors_scale_const_node = Const(graph, {'value': np.array([1 / im_width,
+                                                                   1 / im_height,
+                                                                   1 / im_width,
+                                                                   1 / im_height])}).create_node([])
+        priors_scale_node = Eltwise(graph, {'name': 'scale_priors', 'operation': 'mul'}).create_node(
+            [priors_node, priors_scale_const_node])
+
+        # calculate prior boxes widths and heights
+        split_node = SplitV(graph, {'axis': 2, 'size_splits': [1, 1, 1, 1]}).create_node([priors_scale_node])
+        priors_width_node = __class__._create_sub(graph, split_node, 2, split_node, 0)
+        priors_height_node = __class__._create_sub(graph, split_node, 3, split_node, 1)
+
+        # concat widths and heights into a single tensor and multiply by the box coordinate regression values
+        concat_width_height_node = Concat(graph, {'name': 'concat_priors_width_height', 'axis': -1}).create_node(
+            [priors_width_node, priors_height_node, priors_width_node, priors_height_node])
+        applied_width_height_regressions_node = Eltwise(graph, {'name': 'final_regressions', 'operation': 'mul'}). \
+            create_node([concat_width_height_node, match.single_input_node(0)[0]])
+
+        # reshape to 2D tensor as Inference Engine Detection Output layer expects
+        reshape_regression_op = Reshape(graph, {'dim': np.array([0, -1])})
+        reshape_regression_node = reshape_regression_op.create_node([applied_width_height_regressions_node],
+                                                                    {'name': 'reshape_regression'})
+
+        detection_output_op = DetectionOutput(graph, match.custom_replacement_desc.custom_attributes)
+        detection_output_op.attrs['old_infer'] = detection_output_op.attrs['infer']
+        detection_output_op.attrs['infer'] = __class__.do_infer
+        detection_output_node = detection_output_op.create_node(
+            [reshape_regression_node, reshape_classes_node, priors_scale_node],
+            dict(name=detection_output_op.attrs['type'], clip=1, normalized=1, variance_encoded_in_target=0))
+
+        return {'detection_output_node': detection_output_node}
+
+    @staticmethod
+    def do_infer(node):
+        # append variances to the tensor with boxes regressions
+        prior_boxes = node.in_node(2).value
+        assert prior_boxes is not None, "The prior boxes are not constants"
+        if prior_boxes is not None:
+            variances = np.tile(node.variance, [prior_boxes.shape[-2], 1])
+            prior_boxes = prior_boxes.reshape([-1, 4])
+            prior_boxes = np.concatenate((prior_boxes, variances), 0)
+            # add another dimension, as the prior boxes are expected to be a 3D tensor
+            prior_boxes = prior_boxes.reshape((1, 2, -1))
+            node.in_node(2).shape = np.array(prior_boxes.shape, dtype=np.int64)
+            node.in_node(2).value = prior_boxes
+
+        node.old_infer(node)
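
The `do_infer` override packs per-coordinate variances next to the prior boxes, giving the `(1, 2, 4*N)` layout DetectionOutput expects when `variance_encoded_in_target=0`. A shape-only sketch with made-up sizes:

    import numpy as np

    n_priors = 5
    prior_boxes = np.zeros((n_priors, 4))            # [x1, y1, x2, y2] per prior; values omitted
    variance = np.array([0.2, 0.2, 0.2, 0.2])        # matches custom_attributes in retinanet.json

    variances = np.tile(variance, [n_priors, 1])
    packed = np.concatenate((prior_boxes.reshape([-1, 4]), variances), 0).reshape((1, 2, -1))
    print(packed.shape)                              # (1, 2, 20): row 0 boxes, row 1 variances
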
index ac9e753..278998c 100644 (file)
@@ -61,8 +61,6 @@ class SSDToolboxDetectionOutputReplacement(FrontReplacementFromConfigFileSubGrap
                                                             dict(name='DetectionOutput_Reshape_priors_'))
         # create Detection Output node with three inputs: locations, confidences and prior boxes
         detection_output_op = DetectionOutput(graph, match.custom_replacement_desc.custom_attributes)
-        detection_output_op.attrs['old_infer'] = detection_output_op.attrs['infer']
-        detection_output_op.attrs['infer'] = __class__.do_infer
         detection_output_node = detection_output_op.create_node(
             [reshape_loc_node, reshape_conf_node, reshape_priors_node],
             dict(name=detection_output_op.attrs['type'] + '_'))
@@ -72,12 +70,3 @@ class SSDToolboxDetectionOutputReplacement(FrontReplacementFromConfigFileSubGrap
         output_op = Output(graph)
         output_op.create_node([detection_output_node], dict(name='sink_'))
         return {}
-
-    @staticmethod
-    def do_infer(node: Node):
-        """
-        This infer function is used to set attribute 'force_precision' in the data node of the prior boxes because
-        it should be in FP32 even if the model has been created in the FP16 or another format.
-        """
-        node.in_node(2)['force_precision'] = 'FP32'
-        node.old_infer(node)
diff --git a/model-optimizer/extensions/front/tf/assign_elimination.py b/model-optimizer/extensions/front/tf/assign_elimination.py
new file mode 100644 (file)
index 0000000..2a6dc07
--- /dev/null
@@ -0,0 +1,67 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import logging as log
+
+import networkx as nx
+
+from mo.front.common.replacement import FrontReplacementOp
+from mo.utils.error import Error
+
+
+class AssignElimination(FrontReplacementOp):
+    op = "Assign"
+    enabled = True
+
+    def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict):
+        node = match['op']
+        # here we request all data flow output edges (control flow edges will not be listed)
+        out_edges = node.out_edges()
+        if len(out_edges) == 0:
+            graph.remove_node(node.id)
+            log.debug('Assign op was removed {}'.format(node.id))
+        else:
+            raise Error('Data flow edge coming out of Assign node {}'.format(node.id))
+
+
+class AssignSubElimination(FrontReplacementOp):
+    op = "AssignSub"
+    enabled = True
+
+    def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict):
+        node = match['op']
+        # here we request all data flow output edges (control flow edges will not be listed)
+        out_edges = node.out_edges()
+        if len(out_edges) == 0:
+            graph.remove_node(node.id)
+            log.debug('AssignSub op was removed {}'.format(node.id))
+        else:
+            raise Error('Data flow edge coming out of AssignSub node {}'.format(node.id))
+
+
+class AssignAddElimination(FrontReplacementOp):
+    op = "AssignAdd"
+    enabled = True
+
+    def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict):
+        node = match['op']
+        # here we request all data flow output edges (control flow edges will not be listed)
+        out_edges = node.out_edges()
+        if len(out_edges) == 0:
+            graph.remove_node(node.id)
+            log.debug('AssignAdd op was removed {}'.format(node.id))
+        else:
+            raise Error('Data flow edge coming out of AssignAdd node {}'.format(node.id))
index c744148..37391ae 100644 (file)
@@ -19,6 +19,7 @@ import networkx as nx
 from extensions.ops.lstm_cell import LSTMCell
 from mo.front.common.replacement import FrontReplacementSubgraph
 from mo.graph.graph import Node, replace_node, get_inputs_with_ports
+from mo.ops.output import Output
 
 
 class BasicLSTMCell(FrontReplacementSubgraph):
@@ -173,6 +174,14 @@ class BasicLSTMCell(FrontReplacementSubgraph):
         for i, output in enumerate(__class__.outputs):
             replace_node(match[output], lstm_node, i)
 
+        # According to the LSTMCell specification, this layer MUST have 2 outputs,
+        # so we need to create fake consumers for LSTMCell
+        # when some of its outputs are not connected.
+        for i in [0, 1]:
+            if i not in lstm_node.out_nodes():
+                fake_output_node = Output(graph, dict(name=lstm_node.name + "/Output_{}".format(i)))
+                fake_output_node.create_node(inputs=[lstm_node], edge_attrs={'out': i, 'in': 0})
+
         lstm_node['tf'] = True
         lstm_node['extra_inputs'] = {name: match[name].id for name in __class__.extra_inputs}
         lstm_node['inputs'] = {name: match[name].id for name in __class__.inputs}
index 67de856..8838cd5 100644 (file)
@@ -48,8 +48,8 @@ class Conv3DBackpropInputV2InputFrontExtractor(FrontExtractorOp):
     def extract(node):
         attrs = tf_create_attrs(node, 4, 3)
         attrs.update({'op': __class__.op,
-                      'get_weights_permute': PermuteAttrs.Permutation(perm=int64_array([3, 4, 0, 1, 2]),
-                                                                      inv=int64_array([2, 3, 4, 0, 1]))
+                      'get_weights_permute': PermuteAttrs.Permutation(perm=int64_array([4, 3, 0, 1, 2]),
+                                                                      inv=int64_array([2, 3, 4, 1, 0]))
                       })
 
         # update the attributes of the node
diff --git a/model-optimizer/extensions/front/tf/fake_const.py b/model-optimizer/extensions/front/tf/fake_const.py
new file mode 100644 (file)
index 0000000..2a487ef
--- /dev/null
@@ -0,0 +1,43 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import logging as log
+
+import networkx as nx
+
+from mo.front.common.replacement import FrontReplacementOp
+from mo.front.tf.extractors.utils import tf_dtype_extractor
+from mo.graph.graph import Node
+from mo.ops.const import Const
+
+
+class FakeConstToConst(FrontReplacementOp):
+    op = "FakeConst"
+    enabled = True
+
+    def replace_sub_graph(self, graph: nx.MultiDiGraph, match: dict):
+        node = match['op']
+        if not node.has_valid('value'):
+            log.debug("No value in FakeConst node {}".format(node.id))
+            return
+        node_value = node.value
+        extracted_attrs = {
+            'data_type': tf_dtype_extractor(node.pb.attr['dtype'].type),
+            'shape': node_value.shape,
+            'value': node_value
+        }
+        Const.update_node_stat(node, extracted_attrs)
+        log.debug('FakeConst op was translated to Const op with shape = {} and value.shape = {}'
+                  ''.format(extracted_attrs['shape'], extracted_attrs['value'].shape))
index ac94e1d..5a2b591 100644 (file)
@@ -13,8 +13,6 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-import logging as log
-
 import numpy as np
 
 from mo.front.extractor import FrontExtractorOp
@@ -29,14 +27,16 @@ class FIFOQueueV2Extractor(FrontExtractorOp):
     @staticmethod
     def extract(node):
         shapes = node.pb.attr['shapes'].list.shape
-        if len(shapes) != 2:
-            log.error("FIFOQueueV2 is supported with exactly 2 outputs")
-            return False
         tf_types = node.pb.attr['component_types'].list.type
         extracted_types = []
         for t in tf_types:
             extracted_types.append(tf_dtype_extractor(t))
-        shape = shapes[0].dim
-        new_shape = np.array([1, shape[0].size, shape[1].size, shape[2].size], dtype=np.int64)
-        Op.update_node_stat(node, {'shape': new_shape, 'types': extracted_types})
+        result_shapes = []
+        for shape_pb in shapes:
+            shape = shape_pb.dim
+            if len(shape) == 3:
+                result_shapes.append(np.array([1, shape[0].size, shape[1].size, shape[2].size], dtype=np.int64))
+            else:
+                result_shapes.append(np.array(shape, dtype=np.int64))
+        Op.update_node_stat(node, {'shapes': result_shapes, 'types': extracted_types})
         return __class__.enabled
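
The updated extractor records one shape per queue component instead of assuming exactly two outputs, prepending a batch dimension of 1 to 3-D component shapes. A sketch of that normalization over plain tuples (protobuf shape objects replaced for illustration):

    import numpy as np

    component_shapes = [(224, 224, 3), (10,)]        # stand-ins for shapes[i].dim sizes
    result_shapes = []
    for shape in component_shapes:
        if len(shape) == 3:                          # HWC -> prepend batch 1, as above
            result_shapes.append(np.array([1, *shape], dtype=np.int64))
        else:
            result_shapes.append(np.array(shape, dtype=np.int64))
    print([s.tolist() for s in result_shapes])       # [[1, 224, 224, 3], [10]]
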
index 6eebe8c..576dcf1 100644 (file)
@@ -19,7 +19,7 @@ import networkx as nx
 import numpy as np
 
 from mo.front.common.replacement import FrontReplacementSubgraph
-from mo.graph.graph import create_edge, erase_node
+from mo.graph.graph import create_edge, erase_node, Node
 from mo.ops.input import Input
 
 
@@ -61,7 +61,7 @@ class FIFOQueue(FrontReplacementSubgraph):
             there is no label_batch node
         """
         true_placeholder_shape = match['placeholder'].shape
-        placeholder_shape = match['fifo_queue'].shape
+        placeholder_shape = match['fifo_queue'].shapes[0]
         assert true_placeholder_shape.ndim <= 1
         if true_placeholder_shape.ndim == 1 and len(true_placeholder_shape) > 1:
             log.warning(
@@ -82,3 +82,37 @@ class FIFOQueue(FrontReplacementSubgraph):
         create_edge(placeholder, match['image_batch'])
         log.info("FIFOQueueV2 pattern was detected. New shape of placeholder {} is {}. Use -b to set batch size if "
                  "needed".format(placeholder.id, placeholder['shape']))
+
+
+class QueueDequeueManyV2(FrontReplacementSubgraph):
+    """
+    Replaces the combination of the FIFOQueueV2 + QueueDequeueManyV2 operations with a number of Placeholders.
+    """
+    enabled = True
+
+    @staticmethod
+    def pattern(**kwargs):
+        return dict(
+            nodes=[
+                ('fifo_queue', dict(op='FIFOQueueV2')),
+                ('queue_deque', dict(op='QueueDequeueManyV2')),
+            ],
+            edges=[
+                ('fifo_queue', 'queue_deque', {'out': 0}),
+            ]
+        )
+
+    @staticmethod
+    def replace_sub_graph(graph: nx.MultiDiGraph, match: dict, **kwargs):
+        inputs_dict = {}
+        for u, v, edge_attrs in graph.out_edges(match['queue_deque'].id, data=True):
+            out_port = edge_attrs['out']
+            shape = match['fifo_queue'].shapes[out_port]
+            if out_port not in inputs_dict:
+                input_op = Input(graph, {'shape': shape.copy()})
+                inputs_dict[out_port] = input_op.create_node([])
+            create_edge(inputs_dict[out_port], Node(graph, v), edge_attrs['out'], edge_attrs['in'], edge_attrs)
+
+        graph.remove_node(match['queue_deque'].id)
+        graph.remove_node(match['fifo_queue'].id)
+
diff --git a/model-optimizer/extensions/front/tf/gather_ext.py b/model-optimizer/extensions/front/tf/gather_ext.py
new file mode 100644 (file)
index 0000000..0cb924f
--- /dev/null
@@ -0,0 +1,59 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import numpy as np
+
+from extensions.ops.gather import Gather
+from mo.front.extractor import FrontExtractorOp
+
+
+class GatherFrontExtractor(FrontExtractorOp):
+    op = 'Gather'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = {}
+
+        Gather.update_node_stat(node, attrs)
+
+        return __class__.enabled
+
+
+class ResourceGatherFrontExtractor(FrontExtractorOp):
+    op = 'ResourceGather'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = {}
+
+        Gather.update_node_stat(node, attrs)
+
+        return __class__.enabled
+
+
+class GatherV2FrontExtractor(FrontExtractorOp):
+    op = 'GatherV2'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = {}
+
+        Gather.update_node_stat(node, attrs)
+
+        return __class__.enabled
 """
 
 from mo.front.extractor import FrontExtractorOp
-from mo.ops.op import Op
+from mo.ops.reduce import Reduce
 
 
-class MemoryFrontExtractor(FrontExtractorOp):
-    op = 'memory'
+class MaxFrontExtractor(FrontExtractorOp):
+    op = 'Max'
     enabled = True
 
     @staticmethod
     def extract(node):
-        mapping_rule = {
-            'id': node.pb.id,
-            'index': node.pb.index,
-            'size': node.pb.size
+        data = {
+            'reduce_type': 'max',
+            'keep_dims': node.pb.attr['keep_dims'].b
         }
-        Op.get_op_class_by_name('Memory').update_node_stat(node, mapping_rule)
+        Reduce.update_node_stat(node, data)
         return __class__.enabled
diff --git a/model-optimizer/extensions/front/tf/mvn_unrolled.py b/model-optimizer/extensions/front/tf/mvn_unrolled.py
new file mode 100644 (file)
index 0000000..a73ed49
--- /dev/null
@@ -0,0 +1,106 @@
+"""
+ Copyright (c) 2017-2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import logging as log
+
+import networkx as nx
+
+from extensions.front.squared_difference import SquaredDifference
+from extensions.front.sub import Sub
+from mo.front.common.replacement import FrontReplacementSubgraph
+from mo.graph.graph import Node, replace_node
+from mo.ops.div import Div
+from mo.ops.op import Op
+
+
+class MVNUnrolled(FrontReplacementSubgraph):
+    enabled = True
+
+    def run_before(self):
+        return [SquaredDifference, Div, Sub]
+
+    def pattern(self):
+        log.debug('Enabled MVN replacement')
+        return dict(
+            nodes=[
+                ('mean', dict(kind='op', op='Mean')),
+                ('stop_grad', dict(kind='op', op='StopGradient')),
+                ('sqdiff', dict(kind='op', op='SquaredDifference')),
+                ('variance', dict(kind='op', op='Mean')),
+                ('add', dict(kind='op', op='Add')),
+                ('pow', dict(kind='op', op='Pow')),
+                ('sub', dict(kind='op', op='Sub')),
+                ('truediv', dict(kind='op', op='Div')),
+            ],
+            edges=[
+                ('mean', 'stop_grad', {'in': 0}),
+                ('stop_grad', 'sqdiff', {'in': 1}),
+                ('sqdiff', 'variance', {'in': 0}),
+                ('mean', 'sub', {'in': 1}),
+                ('variance', 'add'),
+                ('add', 'pow', {'in': 0}),
+                ('pow', 'truediv', {'in': 1}),
+                ('sub', 'truediv', {'in': 0}),
+            ])
+
+    @staticmethod
+    def replace_sub_graph(graph: nx.MultiDiGraph, match: dict):
+        MVN = Op.get_op_class_by_name('MVN')
+
+        mvn = MVN(graph, dict(
+            name=match['truediv'].name + '/MVN_',
+            required_reduction_indices=[1, 2] if graph.graph['layout'] == 'NHWC' else [2, 3]
+        ))
+        mvn.attrs['old_infer'] = mvn.attrs['infer']
+        mvn.attrs['infer'] = __class__.infer
+
+        mean_reduction = match['mean'].in_node(1)
+        variance_reduction = match['variance'].in_node(1)
+        pow2 = match['pow'].in_node(1)
+        eps = match['add'].in_node(0 if match['add'].in_node(0).id != match['variance'].id else 1)
+
+        new_subgraph = mvn.create_node([match['mean'].in_node(0), mean_reduction, variance_reduction, pow2, eps])
+
+        replace_node(match['truediv'], new_subgraph)
+
+    @staticmethod
+    def infer(node: Node):
+        if not (node.in_node(1).has_valid('value') and node.in_node(2).has_valid('value')):
+            log.warning('Reduction indices for mean and variance for MVN node {} are not constants'.format(node.name))
+            return
+
+        if not (all(node.in_node(1).value == node.required_reduction_indices) and
+                    all(node.in_node(2).value == node.required_reduction_indices)):
+            log.warning('Reduction indices for mean {} and variance {} do not match required ones {}'.format(
+                node.in_node(1).value,
+                node.in_node(2).value,
+                node.required_reduction_indices
+            ))
+            return
+        
+        if not (node.in_node(3).has_valid('value') and node.in_node(4).has_valid('value')):
+            log.warning('Power or/and epsilon values for MVN node {} are not constants'.format(node.name))
+            return
+
+        if node.in_node(3).value != 0.5:
+            log.warning('Power for MVN node {} ({}) is not equal to 0.5'.format(node.name, node.in_node(3).value))
+            return
+
+        node['eps'] = node.in_node(4).value
+
+        for i in range(1, 5):
+            node.graph.remove_edge(node.in_node(i).id, node.id)
+        node.old_infer(node)
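
The unrolled subgraph matched above is classic mean-variance normalization, `(x - mean) / (variance + eps) ** 0.5`; the `pow == 0.5` check guarantees the denominator is a square root. A NumPy rendering with NHWC reduction indices `[1, 2]`, as in `required_reduction_indices` (sizes are illustrative):

    import numpy as np

    x = np.random.rand(1, 8, 8, 3).astype(np.float32)    # NHWC input
    axes, eps = (1, 2), 1e-9                             # reduction indices [1, 2] and epsilon

    mean = x.mean(axis=axes, keepdims=True)
    variance = ((x - mean) ** 2).mean(axis=axes, keepdims=True)
    mvn = (x - mean) / (variance + eps) ** 0.5           # pow == 0.5 is the square root the check enforces
    print(mvn.shape)                                     # (1, 8, 8, 3)
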
index d643998..23b1f45 100644 (file)
@@ -18,7 +18,7 @@ import logging as log
 
 import networkx as nx
 
-from extensions.front.tf.Pack import Pack
+from extensions.front.Pack import Pack
 from extensions.ops.resample import ResampleOp
 from mo.front.common.replacement import FrontReplacementSubgraph
 from mo.graph.graph import replace_node
index 9486e11..ceb385c 100644 (file)
@@ -13,9 +13,8 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-import networkx as nx
 
-from mo.front.common.replacement import FrontReplacementOp
+from mo.front.common.partial_infer.elemental import copy_shape_infer
 from mo.front.extractor import FrontExtractorOp
 from mo.graph.graph import Node
 
@@ -27,4 +26,5 @@ class NextIterationExtractor(FrontExtractorOp):
     @staticmethod
     def extract(node: Node):
         node['is_cyclic'] = True
+        node['infer'] = copy_shape_infer
         return __class__.enabled
index 859c2be..542d9aa 100644 (file)
@@ -26,3 +26,22 @@ class PadFrontExtractor(FrontExtractorOp):
     def extract(node):
         Pad.update_node_stat(node)
         return __class__.enabled
+
+
+class PadV2FrontExtractor(FrontExtractorOp):
+    op = 'PadV2'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        return __class__.enabled
+
+
+class MirrorPadFrontExtractor(FrontExtractorOp):
+    op = 'MirrorPad'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        Pad.update_node_stat(node, {'mode': node.pb.attr['mode'].s.decode('utf-8').lower()})
+        return __class__.enabled
index 56ab93b..772747c 100644 (file)
@@ -61,6 +61,19 @@ class MaxPool3DFrontExtractor(FrontExtractorOp):
         return __class__.enabled
 
 
+class AvgPool3DFrontExtractor(FrontExtractorOp):
+    op = 'AvgPool3D'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        attrs = create_pooling_attrs(node, 'avg')
+        attrs.update({'op': __class__.op})
+        # update the attributes of the node
+        Pooling.update_node_stat(node, attrs)
+        return __class__.enabled
+
+
 def create_pooling_attrs(node, pool_method):
     data_format = node.pb.attr["data_format"]
 
diff --git a/model-optimizer/extensions/front/tf/rank_ext.py b/model-optimizer/extensions/front/tf/rank_ext.py
new file mode 100644 (file)
index 0000000..71ca94d
--- /dev/null
@@ -0,0 +1,28 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.extractor import FrontExtractorOp
+from extensions.ops.rank import Rank
+
+
+class RankFrontExtractor(FrontExtractorOp):
+    op = 'Rank'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        Rank.update_node_stat(node)
+        return __class__.enabled
diff --git a/model-optimizer/extensions/front/tf/retinanet.json b/model-optimizer/extensions/front/tf/retinanet.json
new file mode 100644 (file)
index 0000000..4687e6a
--- /dev/null
@@ -0,0 +1,31 @@
+[
+    {
+        "custom_attributes": {
+            "code_type": "caffe.PriorBoxParameter.CORNER",
+            "pad_mode": "caffe.ResizeParameter.CONSTANT",
+            "resize_mode": "caffe.ResizeParameter.WARP",
+            "confidence_threshold": 0.05,
+            "top_k": 6000,
+            "keep_top_k": 300,
+            "nms_threshold": 0.5,
+            "variance": [0.2, 0.2, 0.2, 0.2]
+        },
+        "include_inputs_to_sub_graph": true,
+        "include_outputs_to_sub_graph": true,
+        "id": "RetinaNetFilteredDetectionsReplacement",
+        "instances": {
+            "end_points": [
+                "filtered_detections/map/TensorArrayStack/TensorArrayGatherV3",
+                "filtered_detections/map/TensorArrayStack_1/TensorArrayGatherV3",
+                "filtered_detections/map/TensorArrayStack_2/TensorArrayGatherV3"
+            ],
+            "start_points": [
+                "regression/concat",
+                "classification/concat",
+                "anchors/concat",
+                "clipped_boxes/Shape"
+            ]
+        },
+        "match_kind": "points"
+    }
+]
\ No newline at end of file
diff --git a/model-optimizer/extensions/front/tf/reverse_sequence.py b/model-optimizer/extensions/front/tf/reverse_sequence.py
new file mode 100644 (file)
index 0000000..2c6491f
--- /dev/null
@@ -0,0 +1,31 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.extractor import FrontExtractorOp
+from extensions.ops.reverse_sequence import ReverseSequence
+
+
+class ReverseSequenceFrontExtractor(FrontExtractorOp):
+    op = 'ReverseSequence'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        ReverseSequence.update_node_stat(node, {
+            'seq_dim': node.pb.attr['seq_dim'],
+            'batch_dim': node.pb.attr['batch_dim'],
+        })
+        return __class__.enabled
diff --git a/model-optimizer/extensions/front/tf/reverse_v2.py b/model-optimizer/extensions/front/tf/reverse_v2.py
new file mode 100644 (file)
index 0000000..6254d23
--- /dev/null
@@ -0,0 +1,28 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.extractor import FrontExtractorOp
+from extensions.ops.reverse_sequence import ReverseSequence
+
+
+class ReverseV2FrontExtractor(FrontExtractorOp):
+    op = 'ReverseV2'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        ReverseSequence.update_node_stat(node)
+        return __class__.enabled
diff --git a/model-optimizer/extensions/front/tf/softmax_ext.py b/model-optimizer/extensions/front/tf/softmax_ext.py
new file mode 100644 (file)
index 0000000..8891b5f
--- /dev/null
@@ -0,0 +1,32 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.extractor import FrontExtractorOp
+from mo.ops.softmax import Softmax
+
+
+class SoftmaxFrontExtractor(FrontExtractorOp):
+    op = 'Softmax'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        # the default value for the TF Softmax is -1
+        axis = -1
+        if 'axis' in node.pb.attr:
+            axis = node.pb.attr['axis'].i
+        Softmax.update_node_stat(node, {'axis': axis})
+        return __class__.enabled
 """
 
 from mo.front.extractor import FrontExtractorOp
-from mo.ops.op import Op
+from mo.ops.power import Power
 
 
-class EltwiseFrontExtractor(FrontExtractorOp):
-    op = 'eltwise'
+class SqrtExtractor(FrontExtractorOp):
+    op = 'Sqrt'
     enabled = True
 
     @staticmethod
     def extract(node):
-        mapping_rule = {
-            'operation': node.pb.operation,
-        }
         # update the attributes of the node
-        Op.get_op_class_by_name('Eltwise').update_node_stat(node, mapping_rule)
+        Power.update_node_stat(node, {'power': 1 / 2, 'op': SqrtExtractor.op})
         return __class__.enabled
-
diff --git a/model-optimizer/extensions/front/tf/square_ext.py b/model-optimizer/extensions/front/tf/square_ext.py
new file mode 100644 (file)
index 0000000..6a3e939
--- /dev/null
@@ -0,0 +1,29 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.extractor import FrontExtractorOp
+from mo.ops.power import Power
+
+
+class SquareExtractor(FrontExtractorOp):
+    op = 'Square'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        # update the attributes of the node
+        Power.update_node_stat(node, {'power': 2, 'op': SquareExtractor.op})
+        return __class__.enabled
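
Both extractors above lean on the generic Power operation; assuming the usual elementwise semantics `(scale * x + shift) ** power` with scale=1 and shift=0 by default, a NumPy sketch of how Sqrt and Square reduce to it:

    import numpy as np

    def power_op(x, power=1.0, scale=1.0, shift=0.0):
        # generic elementwise power, as reused by the Sqrt and Square extractors
        return (scale * x + shift) ** power

    x = np.array([1.0, 4.0, 9.0])
    assert np.allclose(power_op(x, power=1 / 2), np.sqrt(x))  # Sqrt: power = 0.5
    assert np.allclose(power_op(x, power=2), np.square(x))    # Square: power = 2
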
diff --git a/model-optimizer/extensions/front/tf/stop_gradient_ext.py b/model-optimizer/extensions/front/tf/stop_gradient_ext.py
new file mode 100644 (file)
index 0000000..fd166a7
--- /dev/null
@@ -0,0 +1,29 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.extractor import FrontExtractorOp
+from mo.graph.graph import Node
+from extensions.ops.stop_gradient import StopGradientOp
+
+
+class StopGradientExtractor(FrontExtractorOp):
+    op = 'StopGradient'
+    enabled = True
+
+    @staticmethod
+    def extract(node: Node):
+        StopGradientOp.update_node_stat(node, {})
+        return __class__.enabled
\ No newline at end of file
diff --git a/model-optimizer/extensions/front/tf/variable_ext.py b/model-optimizer/extensions/front/tf/variable_ext.py
new file mode 100644 (file)
index 0000000..7f4c270
--- /dev/null
@@ -0,0 +1,37 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+from mo.front.extractor import FrontExtractorOp
+from mo.ops.op import Op
+
+
+class VariableExtractor(FrontExtractorOp):
+    op = 'Variable'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        Op.update_node_stat(node, {'op': 'FakeConst'})
+        return __class__.enabled
+
+
+class VariableV2Extractor(FrontExtractorOp):
+    op = 'VariableV2'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        Op.update_node_stat(node, {'op': 'FakeConst'})
+        return __class__.enabled
  limitations under the License.
 """
 
-import numpy as np
+import networkx as nx
+from mo.middle.replacement import MiddleReplacementPattern
 
-from mo.front.common.partial_infer.flatten import flatten_infer
 
-
-def flatten_ext(attrs):
-    node_attrs = {
-        'type': 'Flatten',
-        'axis': 1,
-        'num_axes': 0,
-        'infer': flatten_infer
-    }
-    return node_attrs
+class AddIsCyclicAttribute(MiddleReplacementPattern):
+    @staticmethod
+    def find_and_replace_pattern(graph: nx.MultiDiGraph):
+        is_acyclic = nx.is_directed_acyclic_graph(graph)
+        graph.graph['is_cyclic'] = not is_acyclic
\ No newline at end of file
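
The pass above only records whether the graph contains a cycle; a minimal usage sketch with a toy networkx graph (not a real MO graph):

    import networkx as nx

    g = nx.MultiDiGraph()
    g.add_edges_from([('a', 'b'), ('b', 'c'), ('c', 'a')])  # a three-node loop
    g.graph['is_cyclic'] = not nx.is_directed_acyclic_graph(g)
    assert g.graph['is_cyclic'] is True
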
diff --git a/model-optimizer/extensions/middle/BlockLSTMtoLSTMSequence.py b/model-optimizer/extensions/middle/BlockLSTMtoLSTMSequence.py
new file mode 100644 (file)
index 0000000..9835442
--- /dev/null
@@ -0,0 +1,259 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+import numpy as np
+
+from extensions.middle.FusePermutesSequence import FusePermutesSequence
+from extensions.middle.lstm_sequence_normalize import LSTMSequenceNormalize
+from extensions.middle.lstm_sequence_tensor_iterator import LSTMSequenceTensorIterator
+from mo.middle.replacement import MiddleReplacementPattern
+from mo.utils.error import Error
+
+
+class BlockLSTMtoLSTMSequence(MiddleReplacementPattern):
+    """
+    The MO virtual operation LSTMSequence, which converts to an IE TensorIterator with an LSTMCell inside, supports 3 outputs:
+    0: concatenated hidden states over the whole time sequence,
+    1: last hidden state,
+    2: last cell state.
+
+    The replacer performs several tasks:
+    1. Checks whether the current BlockLSTM can be translated to IR (IE does not support the concatenated cell state
+    output which can be produced by BlockLSTM)
+    2. Searches for the sub-graph that takes the last cell state out of the unsupported concatenated cell state output.
+    We cut this sub-graph off if there are no other consumers of the concatenated cell state output, and we connect
+    BlockLSTM to the consumers of this sub-graph via the port producing the last cell state output
+    3. (Optional; resolved by multiple checks) We cut off the same kind of sub-graph (as in 2) on the concatenated
+    hidden states output for better performance
+    """
+    enabled = True
+
+    def run_before(self):
+        return [FusePermutesSequence, LSTMSequenceTensorIterator]
+
+    def pattern(self):
+        return dict(
+            nodes=[
+                ('BlockLSTM', dict(op='BlockLSTM')),
+
+                # 0 port: output h vector over the whole time sequence
+                ('concatenated_hidden_states', (dict(kind='data'))),
+
+                ('mul', dict(op='Mul')),
+                ('mul_data', dict(kind='data')),
+                ('after_mul_op_to_the_rest_of_model', dict(kind='op')),
+                ('concat_0', dict(op='ConcatV2')),
+                ('concat_0_data', dict(kind='data')),
+                ('reshape_0', dict(op='Reshape')),
+                ('reshape_0_data', dict(kind='data')),
+                ('gather_0', dict(op='Gather')),
+                ('gather_0_data', dict(kind='data')),
+
+                # 1 port: cell state before the tanh over the whole time sequence
+                ('concatenated_cell_states_data', (dict(kind='data'))),
+
+                ('concat_1', dict(op='ConcatV2')),
+                ('concat_1_data', dict(kind='data')),
+                ('reshape_1', dict(op='Reshape')),
+                ('reshape_1_data', dict(kind='data')),
+                ('gather_1', dict(op='Gather')),
+                ('gather_1_data', dict(kind='data')),
+            ],
+            edges=[
+                ('BlockLSTM', 'concatenated_hidden_states', {'out': 0}),
+                ('concatenated_hidden_states', 'mul'),
+                ('mul', 'mul_data'),
+                ('mul_data', 'after_mul_op_to_the_rest_of_model'),
+                ('mul_data', 'concat_0'),
+                ('concat_0', 'concat_0_data'),
+                ('concat_0_data', 'reshape_0'),
+                ('reshape_0', 'reshape_0_data'),
+                ('reshape_0_data', 'gather_0'),
+                ('gather_0', 'gather_0_data'),
+
+                ('BlockLSTM', 'concatenated_cell_states_data', {'out': 1}),
+                ('concatenated_cell_states_data', 'concat_1', {'in': 1}),
+                ('concat_1', 'concat_1_data'),
+                ('concat_1_data', 'reshape_1'),
+                ('reshape_1', 'reshape_1_data'),
+                ('reshape_1_data', 'gather_1'),
+                ('gather_1', 'gather_1_data')
+            ]
+        )
+
+    @staticmethod
+    def replace_pattern(graph: nx.MultiDiGraph, match: dict):
+        time_len = match['concatenated_hidden_states'].shape[0]
+        """
+        Working with the concatenated_cell_states_data part first, because the IE TensorIterator primitive doesn't have
+        a concatenated cell states output, and if we cannot collapse it, then we do not support this type of BlockLSTM
+
+        We simplify the sub-graph below by taking another output of BlockLSTM:
+        concatenated cell states over the whole time sequence -> last cell state
+
+        BlockLSTM
+           || out 1 (concatenated cell states coming out of BlockLSTM)
+           \/  in 1
+        ConcatV2
+           || (concatenation with initial state or another unused data)
+           \/
+        Reshape
+           ||
+           \/
+         Gather (takes the last cell state from the previous BlockLSTM if the Gather index == time_len)
+        """
+        # check that there are no other consumers of concatenated_cell_states_data data flow
+        valid_output_names = ['concat_1', 'concat_1_data', 'reshape_1', 'reshape_1_data', 'gather_1', 'gather_1_data']
+        valid_output_node_ids = [match[name].id for name in valid_output_names]
+        node_names_to_check_outputs = ['concatenated_cell_states_data', 'concat_1_data', 'reshape_1_data']
+        for name in node_names_to_check_outputs:
+            for node in match[name].out_nodes():
+                if node.id not in valid_output_node_ids:
+                    raise Error("BlockLSTM node {} has output which contains concatenated cell states over the whole "
+                                "time sequence. It is not replaceable by another output and is not supported "
+                                "originally".format(match['BlockLSTM'].id))
+
+        # check that we really take the last cell state data by Gather
+        gather_indexes = match['gather_1'].in_node(1).value
+        if len(gather_indexes) == 1:
+            gather_index = gather_indexes[0]
+        else:
+            raise Error("BlockLSTM node {} has output which contains concatenated cell states over the whole "
+                        "time sequence. It is not replaceable by another output and is not supported "
+                        "originally".format(match['BlockLSTM'].id))
+        if gather_index != time_len:
+            raise Error("BlockLSTM node {} has output which contains concatenated cell states over the whole "
+                        "time sequence. It is not replaceable by another output and is not supported "
+                        "originally".format(match['BlockLSTM'].id))
+
+        """
+        Stages #1 and #2 from the class description have passed. It means that we can translate the rest of the pattern
+        to LSTMSequence even without the following optimizations
+        """
+
+        node = match['BlockLSTM']
+        weights_node = node.in_node(1)
+        biases_node = node.in_node(2)
+        shift_const = node.forget_bias
+
+        # Assign a temporary shape to them for easier manipulation
+        # TF stores weights in IO order
+        input_size = node.in_node(0).shape[-1]
+        hidden_size = node.in_node(3).shape[-1]
+        weights = weights_node.value
+        biases = biases_node.value
+        assert weights.shape[0] == input_size + hidden_size, "weights.shape={} input_size={} hidden_size={}".format(weights.shape, input_size, hidden_size)
+        assert weights.shape[1] == biases.shape[0] == 4 * hidden_size, "weights.shape={} biases.shape={} hidden_size={}".format(weights.shape, biases.shape, hidden_size)
+
+        weights = weights.reshape([
+            weights.shape[0],
+            4,  # gates
+            hidden_size
+        ])
+
+        biases = biases.reshape([
+            4,  # gates
+            hidden_size
+        ])
+
+        # Reorder gates icfo --> fico for both weights and biases
+        gate_reorder = [2, 0, 1, 3]
+        weights = np.take(weights, gate_reorder, axis=1)
+        biases = np.take(biases, gate_reorder, axis=0)
+
+        # shift_const.value should be added to the first quarter of the biases (the f-gate, index 0 after reordering)
+        # Note: if this code is moved up before the gate reordering, the addition
+        # should be applied at a different offset
+        biases[0] += shift_const
+
+        # Return to the original shapes
+        weights = weights.reshape([weights.shape[0], -1])
+        biases = biases.flatten()
+
+        # TF stores weights in IO, but IE requires it in OI: transpose
+        weights = weights.transpose()
+
+        weights_node.value = weights
+        weights_node.shape = np.array(weights.shape, dtype=np.int64)
+        biases_node.value = biases
+        biases_node.shape = np.array(biases.shape, dtype=np.int64)
+
+        attrs = dict(graph.get_edge_data(match['gather_1'].id, match['gather_1_data'].id)[0])
+        attrs.update({'out': 2})
+        graph.remove_edge(match['BlockLSTM'].id, match['concatenated_cell_states_data'].id)
+        graph.remove_edge(match['gather_1'].id, match['gather_1_data'].id)
+
+        graph.add_edge(match['BlockLSTM'].id, match['gather_1_data'].id, **attrs)
+
+        match['BlockLSTM'].op = 'LSTMSequence'
+        match['BlockLSTM']['sequence_dim'] = 0  # TF reference
+        match['BlockLSTM']['batch_dim'] = 1  # TF reference
+        match['BlockLSTM']['direction'] = 'forward'  # TF reference
+        match['BlockLSTM']['hidden_size'] = match['concatenated_hidden_states'].shape[-1]
+        match['BlockLSTM']['format'] = 'tf'
+
+        """
+        Optional #3 optimization from class description following
+        """
+        data_to_mul = [n for n in match['mul'].in_nodes().values() if n.id != match['concatenated_hidden_states'].id]
+        if len(data_to_mul) != 1:
+            return  # unexpected type of mul
+        data_to_mul = data_to_mul[0]
+        if not data_to_mul.has_valid('value'):
+            return  # unexpected type of mul
+        data_to_mul_value = data_to_mul.value
+        if not np.all(data_to_mul_value == 1):
+            return  # unexpected type of mul
+
+        # remove useless mul
+        attrs = dict(graph.get_edge_data(match['BlockLSTM'].id, match['concatenated_hidden_states'].id)[0])
+        graph.remove_edge(match['BlockLSTM'].id, match['concatenated_hidden_states'].id)
+        graph.remove_edge(match['mul'].id, match['mul_data'].id)
+        graph.add_edge(match['BlockLSTM'].id, match['mul_data'].id, **attrs)
+
+        # find true usages of concatenated hidden states data (not last hidden state)
+        valid_output_names = ['mul_data', 'concat_0', 'concat_0_data', 'reshape_0', 'reshape_0_data', 'gather_0',
+                              'gather_0_data']
+        valid_output_node_ids = [match[name].id for name in valid_output_names]
+        node_names_to_check_outputs = ['mul_data', 'concat_0_data', 'reshape_0_data']
+
+        list_of_concatenated_hidden_states_children_node_ids = []
+        for name in node_names_to_check_outputs:
+            for node in match[name].out_nodes():
+                if node.id not in valid_output_node_ids:
+                    list_of_concatenated_hidden_states_children_node_ids.append(node.id)
+
+        if len(list_of_concatenated_hidden_states_children_node_ids) != 1:
+            return  # unsupported placement of the pattern
+        concatenated_child_node_id = list_of_concatenated_hidden_states_children_node_ids[0]
+        if concatenated_child_node_id != match['after_mul_op_to_the_rest_of_model'].id:
+            return  # unsupported placement of the pattern
+
+        gather_indexes = match['gather_0'].in_node(1).value
+        if len(gather_indexes) == 1:
+            gather_index = gather_indexes[0]
+        else:
+            return  # we have to translate this type of BlockLSTM to LSTMSequence / TensorIterator as is
+        if gather_index != time_len:
+            return  # we have to translate this type of BlockLSTM to LSTMSequence / TensorIterator as is
+
+        attrs = dict(graph.get_edge_data(match['gather_0'].id, match['gather_0_data'].id)[0])
+        attrs.update({'out': 1})
+        graph.remove_edge(match['mul_data'].id, match['concat_0'].id)
+        graph.remove_edge(match['gather_0'].id, match['gather_0_data'].id)
+
+        graph.add_edge(match['BlockLSTM'].id, match['gather_0_data'].id, **attrs)
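
To make the weights surgery above easier to follow, here is a standalone NumPy sketch of the icfo -> fico gate reordering, the forget-gate bias shift, and the IO -> OI transpose (shapes and `forget_bias` are toy values):

    import numpy as np

    input_size, hidden_size, forget_bias = 3, 2, 1.0
    weights = np.random.rand(input_size + hidden_size, 4 * hidden_size)  # TF stores IO
    biases = np.random.rand(4 * hidden_size)

    w = weights.reshape([weights.shape[0], 4, hidden_size])
    b = biases.reshape([4, hidden_size])

    gate_reorder = [2, 0, 1, 3]            # icfo -> fico
    w = np.take(w, gate_reorder, axis=1)
    b = np.take(b, gate_reorder, axis=0)
    b[0] += forget_bias                    # the f-gate is now first

    w = w.reshape([w.shape[0], -1]).transpose()  # IO -> OI, as IE requires
    b = b.flatten()
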
index 0027342..73459b0 100644 (file)
 
 import networkx as nx
 
-from mo.graph.graph import erase_node, Node
-from mo.middle.passes.eliminate import remove_op_node
+from mo.graph.graph import Node
+from mo.middle.passes.eliminate import remove_op_node_with_data_node
 from mo.middle.replacement import MiddleReplacementPattern
+from mo.utils.graph import pseudo_topological_sort
 
 
 class ConstSwitchEraser(MiddleReplacementPattern):
@@ -28,10 +29,10 @@ class ConstSwitchEraser(MiddleReplacementPattern):
     enabled = True
 
     def find_and_replace_pattern(self, graph: nx.MultiDiGraph):
-        for n in nx.topological_sort(graph):
+        for n in pseudo_topological_sort(graph):
             if graph.node[n]['kind'] == 'data' or graph.node[n]['op'] != 'Switch':
                 continue
             switch_op_node = Node(graph, n)
             pred_id_data_node = switch_op_node.in_node(1)
-            erase_node(pred_id_data_node)
-            remove_op_node(graph, switch_op_node)
+            graph.remove_edge(pred_id_data_node.id, switch_op_node.id)
+            remove_op_node_with_data_node(graph, switch_op_node)
index 2f5fa88..cec09cc 100644 (file)
@@ -25,7 +25,6 @@ from mo.ops.reshape import Reshape
 from mo.middle.replacement import MiddleReplacementPattern
 from extensions.middle.SliceConverter import ConvertSlice
 
-
 class ConvertGroupedStridedSlice(MiddleReplacementPattern):
     """
         This pass converts subgraphs where StridedSlices used for splitting single channel to single Split layers
@@ -110,15 +109,10 @@ class ConvertGroupedStridedSlice(MiddleReplacementPattern):
                     shape = np.array(input_shape)
                     size_splits.append(l - prev_r)
                     shape[split_channel_dim] = l - prev_r
-                    data_node = Op._create_data_node(graph, 'fake_data', {'shape': shape})
-                    # added fake Reshape to workaround IE issue with Split and fake nodes
-                    fake_op = Reshape(graph, dict(name=out_nodes[0].name + "/" + str(l) + "_fake_op", dim=shape[1:]))
-                    fake_out_node = Op._create_data_node(graph, 'fake_out_data',
-                                                         {'shape': shape[1:], 'is_output': True})
-                    fake_op.create_node_with_data([data_node], fake_op.attrs, data_nodes=[fake_out_node])
-
+                    data_node = Op._create_data_node(graph, 'fake_data', {'shape': shape, 'is_output': True})
                     final_data_nodes_list.append(data_node)
 
+
                 prev_r = r
                 size_splits.append(r - l)
                 final_data_nodes_list.append(out)
@@ -130,11 +124,7 @@ class ConvertGroupedStridedSlice(MiddleReplacementPattern):
                 shape = input_shape.copy()
                 shape[split_channel_dim] = input_shape[split_channel_dim] - prev_r
                 size_splits.append(input_shape[split_channel_dim] - prev_r)
-                data_node = Op._create_data_node(graph, 'fake_data', {'shape': shape})
-                # added fake Reshape to workaround IE issue with Split and fake nodes
-                fake_op = Reshape(graph, dict(name=out_nodes[0].name + "/" + str(l) + "_fake_op", dim=shape[1:]))
-                fake_out_node = Op._create_data_node(graph, 'fake_out_data', {'shape': shape[1:], 'is_output': True})
-                fake_op.create_node_with_data([data_node], fake_op.attrs, data_nodes=[fake_out_node])
+                data_node = Op._create_data_node(graph, 'fake_data', {'shape': shape, 'is_output': True})
                 final_data_nodes_list.append(data_node)
 
             if not valid_for_replacement:
index 919367f..7f2e87c 100644 (file)
@@ -52,12 +52,12 @@ class ConvertLayoutDependentOperations(MiddleReplacementPattern):
                     # if Node has NHWC and graph has NCHW layout
                     permutation = PermuteAttrs.get_nchw_to_nhwc_permutation(len(node.layout))
 
-                # Schematic representation og transformation below
+                # Schematic representation of transformation below
                 #
                 #                                           \            NCHW                              NCHW
                 #            NHWC                        --  \            |  permutation       permutation  |
                 #   data-->Convolution(example)-->data   --  /            |      |       NCHW      |        |
-                #                                             data->Permute->data->Convolution->data->Permute->data
+                #                                           /   data->Permute->data->Convolution->data->Permute->data
 
                 # 1. Insert input Permute
                 #    This Permute will permute input from original input layout to operation layout
index 2371303..74ff069 100644 (file)
@@ -48,7 +48,7 @@ class Eltwise1DInputReshape(MiddleReplacementPattern):
 
     def find_and_replace_pattern(self, graph: nx.MultiDiGraph):
         layout = graph.graph['layout']
-        for n in nx.topological_sort(graph):
+        for n in list(graph.nodes()):
             if 'type' in graph.node[n] and graph.node[n]['type'] == 'Eltwise' and get_value_id(Node(graph, n)) is None:
                 eltwise_op_node = Node(graph, n)
                 out_shape = eltwise_op_node.out_node().shape
index ff9239a..ea5c1c1 100644 (file)
@@ -65,13 +65,14 @@ class FusePermutesSequence(MiddleReplacementPattern):
 
                 if np.array_equal(final_permutation, [x for x in range(len(list_of_permutes[0].order))]):
                     first_data_node, last_data_node = list_of_permutes[0].in_node(), list_of_permutes[-1].out_node()
+                    graph.remove_edge(first_data_node.id, list_of_permutes[0].id)
                 else:
                     if len(list_of_permutes) < 2:
                         continue
                     first_data_node, last_data_node = list_of_permutes[0].out_node(), list_of_permutes[-1].out_node()
                     list_of_permutes[0].order = final_permutation
+                    graph.remove_edge(first_data_node.id, first_data_node.out_node().id)
 
-                graph.remove_edge(first_data_node.id, first_data_node.out_node().id)
                 graph.remove_edge(last_data_node.in_node().id, last_data_node.id)
 
                 merge_data_nodes(graph, first_data_node, last_data_node)
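
For context, the identity test above composes the orders of consecutive Permute nodes; a small NumPy sketch of such a composition (the two orders are illustrative NCHW/NHWC permutations):

    import numpy as np

    order1 = np.array([0, 2, 3, 1])     # e.g. NCHW -> NHWC
    order2 = np.array([0, 3, 1, 2])     # e.g. NHWC -> NCHW
    final_permutation = order1[order2]  # result of applying order1, then order2
    assert np.array_equal(final_permutation, np.arange(4))  # identity: both Permutes can be dropped
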
diff --git a/model-optimizer/extensions/middle/GemmResolver.py b/model-optimizer/extensions/middle/GemmResolver.py
new file mode 100644 (file)
index 0000000..29a39b9
--- /dev/null
@@ -0,0 +1,62 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+
+from extensions.middle.NormalizeFullyConnected import NormalizeFullyConnected
+from mo.front.common.partial_infer.utils import mark_input_bins, assign_dims_to_weights, int64_array
+from mo.middle.replacement import MiddleReplacementPattern
+from mo.ops.op import PermuteAttrs
+
+
+class GemmResolver(MiddleReplacementPattern):
+    enabled = True
+
+    def run_before(self):
+        return [NormalizeFullyConnected]
+
+    def pattern(self):
+        return dict(
+            nodes=[
+                   ('input_0', dict(kind='data')),
+                   ('input_1', dict(kind='data')),
+                   ('fc', dict(op='MatMul')),
+                   ('fc_data', dict(kind='data'))],
+            edges=[
+                ('input_0', 'fc', {'in': 0}),
+                ('input_1', 'fc', {'in': 1}),
+                ('fc', 'fc_data')
+            ]
+        )
+
+    def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
+        if (not match['input_0'].has_valid('value') and not match['input_1'].has_valid('value')) or \
+                (not match['input_0'].has_valid('value') and match['input_1'].has_valid('value') and match['input_1'].shape.size > 2):
+            match['fc']['type'] = 'GEMM'
+        elif not match['input_0'].has_valid('value') and match['input_1'].has_valid('value'):
+            match['fc']['type'] = 'FullyConnected'
+            node = match['fc']
+            mark_input_bins(node)
+            weights_node = match['input_1']
+            assign_dims_to_weights(weights_node, None, 0, 1, 2)
+            PermuteAttrs.set_permutation(weights_node, node, PermuteAttrs.Permutation(perm=int64_array([1, 0]),
+                                                                                      inv=int64_array([0, 1])))
+            weights_shape = weights_node.shape
+
+            node['out-size'] = weights_shape[1]
+
+
+
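
Condensed, the branch above encodes a small decision rule for MatMul typing; a sketch of the same logic over plain booleans (`resolve_matmul_type` is a hypothetical helper, not part of MO):

    def resolve_matmul_type(in0_is_const, in1_is_const, in1_rank):
        # mirrors GemmResolver: two non-constant inputs, or a constant second input
        # of rank > 2, stay as GEMM; a constant 2D second input becomes FullyConnected
        if not in0_is_const and (not in1_is_const or in1_rank > 2):
            return 'GEMM'
        if not in0_is_const and in1_is_const:
            return 'FullyConnected'
        return None  # the pattern is left untouched

    assert resolve_matmul_type(False, False, 2) == 'GEMM'
    assert resolve_matmul_type(False, True, 3) == 'GEMM'
    assert resolve_matmul_type(False, True, 2) == 'FullyConnected'
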
diff --git a/model-optimizer/extensions/middle/NormalizePad.py b/model-optimizer/extensions/middle/NormalizePad.py
new file mode 100644 (file)
index 0000000..2e9e89f
--- /dev/null
@@ -0,0 +1,49 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+import numpy as np
+
+from mo.middle.passes.eliminate import remove_op_node_with_data_node
+from mo.middle.replacement import MiddleReplacementPattern
+
+
+class NormalizePad(MiddleReplacementPattern):
+    """
+    The replacer finds all Pad operations and removes their inputs with indices 1 and 2. These inputs contain padding
+    values for each input tensor dimension and, optionally, the pad value for padding with the 'constant' mode.
+
+    The Pad layer is removed if all padding values are equal to 0.
+    """
+    enabled = True
+
+    def pattern(self):
+        return dict(
+            nodes=[
+                ('pad', dict(kind='op', op='Pad'))
+            ],
+            edges=[]
+        )
+
+    def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
+        node = match['pad']
+        for port, input_node in node.in_nodes().items():
+            if port != 0:
+                graph.remove_edge(input_node.id, node.id)
+
+        # remove Pad operation if all pads are equal to 0
+        if np.all(node.pads == 0):
+            remove_op_node_with_data_node(graph, node)
diff --git a/model-optimizer/extensions/middle/PadToPoolingMiddleReplacer.py b/model-optimizer/extensions/middle/PadToPoolingMiddleReplacer.py
deleted file mode 100644 (file)
index bdf8ab9..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-"""
- Copyright (c) 2018 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-
-import numpy as np
-import networkx as nx
-
-from mo.ops.pooling import Pooling
-from mo.graph.graph import unique_id
-from mo.middle.replacement import MiddleReplacementPattern
-from mo.front.common.layout import get_features_dim
-
-class PadToPoolingMiddleReplacer(MiddleReplacementPattern):
-    op = "Pad"
-    enabled = False
-
-    def pattern(self):
-        return dict(
-            nodes=[
-                ('pad', dict(kind='op', op='Pad'))
-            ],
-            edges=[]
-        )
-
-    def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
-        node = match['pad']
-        input = node.in_node()
-        output = node.out_node()
-        if len(output.out_nodes()) > 0:
-            ndim = len(input.shape)
-            pad = node.pads
-            graph.remove_edge(input.id, node.id)
-            graph.remove_edge(node.id, output.id)
-            pool_node = unique_id(graph, node.name + '/Pool_')
-            Pooling(graph, dict(name=pool_node, window=np.ones(ndim, dtype=np.int64),
-                                output_spatial_shape=None,
-                                batch_dims=np.array([0], dtype=np.int64),
-                                channel_dims=np.array([get_features_dim(graph.graph['layout'], ndim)], dtype=np.int64),
-                                stride=np.array(np.ones(ndim, dtype=np.int64)),
-                                pad=pad, exclude_pad='false', pool_method='max')).create_node_with_data(inputs=[input], data_nodes=[output])
index 7c3f0b6..9564b5d 100644 (file)
@@ -104,6 +104,7 @@ class PixelLinkReshape(MiddleReplacementPattern):
             graph.add_edge(permute_before_node.id, node_split.id, **attrs)
 
             node = match['reshape_pack']
+            node['nchw_layout'] = True
             new_reshape_shape = np.concatenate((np.array([node.in_node(0).shape[0]]),
                                                 np.array([np.prod(node.in_node(0).shape[[1, 2, 3]])]),
                                                 np.array([node.in_node(0).shape[-1]])))
index 8d8537e..6c6c91d 100644 (file)
@@ -19,9 +19,11 @@ import logging as log
 import networkx as nx
 import numpy as np
 
+from mo.front.caffe.extractors.utils import get_canonical_axis_index
 from mo.front.common.layout import get_batch_dim, get_features_dim
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.ops.pooling import Pooling
+from mo.ops.power import Power
 from mo.ops.reshape import Reshape
 
 
@@ -29,7 +31,13 @@ class ReduceReplacer(MiddleReplacementPattern):
     op = "Reduce"
     enabled = True
 
-    supported_reduce_types = ['mean']
+    supported_reduce_types = ['mean', 'max', 'sum']
+
+    pool_method_map = {
+        'max': 'max',
+        'mean': 'avg',
+        'sum': 'avg'
+    }
 
     def pattern(self):
         return dict(
@@ -41,19 +49,24 @@ class ReduceReplacer(MiddleReplacementPattern):
 
     def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
         node = match['reduce']
-        if not node.has_valid('reduce_type') and node.reduce_type.lower() not in self.supported_reduce_types:
+        if not node.has_valid('reduce_type') or node.reduce_type.lower() not in self.supported_reduce_types:
             log.error("Reduce type {} is not supported for node {}".format(node.soft_get('reduce_type'), node.id))
             return
 
+        reduce_type = node.reduce_type.lower()
+        if reduce_type not in self.pool_method_map:
+            log.error("Reduce type {} is not included in pool_method_map. Please update pool_method_map with new key "
+                      "{}".format(reduce_type, reduce_type))
+            return
+
+        input_data = node.in_node()
+        output_data = node.out_node()
+
         input_shape = node.in_node().shape
         output_shape = node.out_node().shape
-        ndim = len(input_shape)
 
-        # Currently only NCHW layout is supported
-        layout = graph.graph['layout']
-        if layout != 'NCHW':
-            log.error('{} layout currently is not supported'.format(layout))
-            return
+        # normalize node.axis to exclude negative indices
+        node.axis = [get_canonical_axis_index(input_shape, a) for a in node.axis]
 
         axis = node.axis
 
@@ -63,42 +76,58 @@ class ReduceReplacer(MiddleReplacementPattern):
                 log.error("Reduce with not consecutive axes {} is not supported ".format(axis))
                 return
 
+        layout = graph.graph['layout']
+
         # So now we are sure that we can convert Reduce to appropriate operation
-        if node.reduce_type.lower() == 'mean':
-            # 1. Calculate shape that will be used in reduction
-            reduction_dim = np.prod([input_shape[idx] for idx in axis])
-            begin_dims = np.array([input_shape[idx] for idx in range(axis[0])])
-            end_dim = np.prod([input_shape[idx] for idx in range(axis[-1] + 1, len(input_shape))])
 
-            # 2. Create reshape with appropriate shape
+        # 1. Calculate shape that will be used in reduction
+        reduction_dim = np.prod([input_shape[idx] for idx in axis])
+        begin_dims = np.array([input_shape[idx] for idx in range(axis[0])])
+        end_dim = np.prod([input_shape[idx] for idx in range(axis[-1] + 1, len(input_shape))])
+
+        # 2. Create reshape with appropriate shape
+        if layout == 'NCHW':
             if len(begin_dims) > 2:
-                begin_dims = np.array([np.prod(begin_dims[0:-1], begin_dims[-1])], dtype=np.int64)
+                begin_dims = np.array([np.prod(begin_dims[0:-1]), begin_dims[-1]], dtype=np.int64)
             else:
                 # Expand begin_dims to 2
                 begin_dims = np.array(np.append(begin_dims, [1] * (2 - len(begin_dims))), dtype=np.int64)
-
             reshape_shape = np.array([*begin_dims, reduction_dim, end_dim], dtype=np.int64)
+            pool_window = np.array([1, 1, reduction_dim, 1], dtype=np.int64)
+        elif layout == 'NHWC':
+            begin_dims = np.prod(begin_dims)
+            reshape_shape = np.array([begin_dims, reduction_dim, 1, end_dim], dtype=np.int64)
+            pool_window = np.array([1, reduction_dim, 1, 1], dtype=np.int64)
+        else:
+            log.error('{} layout currently is not supported'.format(layout))
+            return
 
-            # 3. Reduce => Reshape->Pooling->Reshape
-            reshape_op = Reshape(graph, {'name': node.id + '/Reshape', 'dim': reshape_shape})
-            final_reshape_op = Reshape(graph, {'name': node.id + '/FinalReshape', 'dim': output_shape})
-            pooling_op = Pooling(graph,
-                                 dict(name=node.id + '/Pool', window=np.array([1, 1, reduction_dim, 1], dtype=np.int64),
-                                      output_spatial_shape=None,
-                                      batch_dims=np.array([get_batch_dim(layout, 4)], dtype=np.int64),
-                                      channel_dims=np.array([get_features_dim(layout, 4)], dtype=np.int64),
-                                      exclude_pad='false', pool_method='avg'))
-
-            input_data = node.in_node()
-            output_data = node.out_node()
-
-            graph.remove_edge(input_data.id, node.id)
-            graph.remove_edge(node.id, output_data.id)
-
-            final_reshape_op.create_node_with_data(
-                inputs=[pooling_op.create_node_with_data(
-                    inputs=[reshape_op.create_node_with_data(
-                        inputs=[input_data]
-                    )]
-                )],
-                data_nodes=output_data)
+        # 3. Reduce => Reshape->Pooling->Reshape
+        reshape_op = Reshape(graph, {'name': node.id + '/Reshape', 'dim': reshape_shape})
+        final_reshape_op = Reshape(graph, {'name': node.id + '/FinalReshape', 'dim': output_shape})
+        pooling_op = Pooling(graph,
+                             dict(name=node.id + '/Pool',
+                                  window=pool_window,
+                                  output_spatial_shape=None,
+                                  batch_dims=np.array([get_batch_dim(layout, 4)], dtype=np.int64),
+                                  channel_dims=np.array([get_features_dim(layout, 4)], dtype=np.int64),
+                                  exclude_pad='false', pool_method=self.pool_method_map[reduce_type]))
+
+        graph.remove_edge(input_data.id, node.id)
+        graph.remove_edge(node.id, output_data.id)
+
+        final_reshape_op.create_node_with_data(
+            inputs=[pooling_op.create_node_with_data(
+                inputs=[reshape_op.create_node_with_data(
+                    inputs=[input_data]
+                )]
+            )],
+            data_nodes=output_data)
+
+        # 4. If it is reduction with summation, we need to multiply by size of the reduction slice with Mul op
+        if reduce_type == 'sum':
+            output_data.in_node().insert_node_with_data_after(
+                output_data,
+                Power,
+                {'name': node.name + '/Mul', 'scale': float(reduction_dim)}
+            )
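
As a sanity check on the decomposition above, a NumPy sketch showing that reshaping and averaging over the collapsed axis reproduces a mean reduction, and that a sum is the same result scaled by the reduction size (which is what the trailing Power node's `scale` does); shapes are toy values:

    import numpy as np

    x = np.random.rand(2, 3, 4, 5)
    axis = (2, 3)                                  # consecutive axes, as the pass requires
    reduction_dim = np.prod([x.shape[a] for a in axis])

    # Reshape -> average pooling over the collapsed axis -> Reshape back
    pooled = x.reshape(2, 3, reduction_dim, 1).mean(axis=2).reshape(2, 3)

    assert np.allclose(pooled, x.mean(axis=axis))
    assert np.allclose(pooled * reduction_dim, x.sum(axis=axis))  # sum = avg * N
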
diff --git a/model-optimizer/extensions/middle/ShuffleChannel.py b/model-optimizer/extensions/middle/ShuffleChannel.py
new file mode 100644 (file)
index 0000000..5370aeb
--- /dev/null
@@ -0,0 +1,71 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import networkx as nx
+import numpy as np
+
+from extensions.middle.ShufflenetReshape import FeatureShuffleReshape
+from mo.middle.replacement import MiddleReplacementPattern
+from mo.ops.permute import Permute
+from mo.ops.reshape import Reshape
+from mo.utils.error import Error
+
+
+class ShuffleChannel(MiddleReplacementPattern):
+    """
+    Replaces the Caffe ShuffleChannel layer with Reshape and Permute layers
+    """
+
+    enabled = True
+
+    def run_after(self):
+        return [FeatureShuffleReshape]
+
+    def pattern(self):
+        return dict(
+            nodes=[
+                ('op', dict(op='ShuffleChannel')),
+            ],
+            edges=[
+            ])
+
+    def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
+        if graph.graph['layout'] != "NCHW":
+            return
+
+        node = match['op']
+
+        in_node = node.in_node(0)
+        out_node = node.out_node(0)
+        group = int(node['group'])
+
+        graph.remove_edge(in_node.id, node.id)
+        graph.remove_edge(node.id, out_node.id)
+
+        rows = group
+        cols = in_node.shape[1] // group
+
+        if rows * cols != in_node.shape[1]:
+            raise Error("Group {} should divide input channels number {} without reminder for node {}".format(group, in_node.shape[1], node.id))
+
+        reshape_split = Reshape(graph, attrs={'name': node.id + '/Reshape_split_',
+                                              'dim': np.array([in_node.shape[0], rows, cols, -1])})
+        reshape_split_node = reshape_split.create_node_with_data([in_node])
+        transpose = Permute(graph, attrs={'name': node.id + '/Transpose_',
+                                          'order': np.array([0, 2, 1, 3])})
+        transpose_node = transpose.create_node_with_data([reshape_split_node])
+        reshape_concat = Reshape(graph, attrs={'name': node.id + '/Reshape_concat_',
+                                               'dim': out_node.shape})
+        reshape_concat.create_node_with_data([transpose_node], data_nodes=[out_node])
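
The Reshape -> Permute -> Reshape chain created above is the standard channel-shuffle trick; a NumPy sketch (toy shapes, illustrative `group`) verifying the equivalence:

    import numpy as np

    n, c, h, w, group = 1, 6, 2, 2, 3
    x = np.arange(n * c * h * w).reshape(n, c, h, w)

    rows, cols = group, c // group
    split = x.reshape(n, rows, cols, -1)       # Reshape_split: [N, group, C/group, H*W]
    shuffled = split.transpose(0, 2, 1, 3)     # Transpose with order [0, 2, 1, 3]
    out = shuffled.reshape(n, c, h, w)         # Reshape_concat: back to the input shape

    # output channel k comes from input channel (k % group) * (c // group) + k // group
    assert np.array_equal(out[0, 1], x[0, 2])
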
index de9593c..f85d60d 100644 (file)
@@ -19,6 +19,8 @@ import logging as log
 import networkx as nx
 import numpy as np
 
+from mo.front.common.layout import get_features_dim, get_height_dim, get_width_dim
+from mo.front.common.partial_infer.utils import int64_array
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.ops.reshape import Reshape
 
@@ -74,15 +76,14 @@ class FeatureShuffleReshape(MiddleReplacementPattern):
         # Input shapes can be either NCHW or NHWC, so in case of a channel split, the feature channel can be split as
         # described in the comments below
         # So feature_dims_split list contains possible dims responsible for feature dim
-        if graph.graph['layout'] == 'NCHW':
+        layout = graph.graph['layout']
+        feature_dim = get_features_dim(layout, len(input_shape))
+        spatial_dims = [get_height_dim(layout, len(input_shape)), get_width_dim(layout, len(input_shape))]
+        if layout == 'NCHW':
             # NC1C2HW or NC1C2(H*W)
-            feature_dim = 1
-            spatial_dims = [2, 3]
             feature_dims_split = np.array([feature_dim, feature_dim + 1])
         else:
             # NHWC1C2 or N(H*W)C1C2 or (N*H*W)C1C2
-            feature_dim = 3
-            spatial_dims = [1, 2]
             feature_dims_split = np.array([len(reshape1_shape) - 2, len(reshape1_shape) - 1])
 
         # Check that feature_dims_split suits reshape layer shape
@@ -128,8 +129,10 @@ class FeatureShuffleReshape(MiddleReplacementPattern):
 
 class ReshapeSoftmaxReshape(MiddleReplacementPattern):
     """
-    In case of NHWC this pass finds patterns Reshape(-1,2)->Softmax and changes first Reshape dims for NCHW format.
-    This transformation is necessary because after conversion to NCHW this sequence will have wrong interpretation
+    In case of NHWC this pass finds patterns Reshape(-1,C) -> Softmax and changes first Reshape dims for NCHW format.
+    This transformation is necessary because after conversion to NCHW this sequence will have wrong interpretation.
+    There is no need to permute data before reshape because the Softmax will be performed over the features dimension
+    so the output will be in a correct layout.
     """
 
     enabled = True
@@ -148,7 +151,8 @@ class ReshapeSoftmaxReshape(MiddleReplacementPattern):
                    ])
 
     def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
-        if graph.graph['layout'] != 'NHWC':
+        layout = graph.graph['layout']
+        if layout != 'NHWC':
             return
 
         reshape1 = match['reshape1']
@@ -170,8 +174,8 @@ class ReshapeSoftmaxReshape(MiddleReplacementPattern):
             return
 
         # Define feature dim
-        feature_dim = 3
-        spatial_dims = [1, 2]
+        feature_dim = get_features_dim(layout, len(input_shape))
+        spatial_dims = [get_height_dim(layout, len(input_shape)), get_width_dim(layout, len(input_shape))]
 
         # Skip transform in case if spatial dims in input shape are equal to [1,1]
         if np.array_equal(input_shape[spatial_dims], np.array([1, 1])):
@@ -192,6 +196,9 @@ class ReshapeSoftmaxReshape(MiddleReplacementPattern):
                                              np.array([reshape1_shape[-1]]),
                                              np.array([np.prod(input_shape[spatial_dims])])))
 
+        # update 'dim' attribute but preserve batch dimension size which could be -1
+        reshape1.dim = int64_array([reshape1.dim[0], *new_reshape1_shape[1:]])
+
         old_shape = np.array(reshape1.out_node().shape)
         reshape1.out_node().shape = new_reshape1_shape
         softmax.out_node().shape = new_reshape1_shape
@@ -202,13 +209,13 @@ class ReshapeSoftmaxReshape(MiddleReplacementPattern):
         softmax['nchw_layout'] = True
         softmax.out_node()['nchw_layout'] = True
 
-        # Create final Reshape to keep original shape for softmax output
+        # Create final Reshape to keep original shape for softmax output if softmax is not the last node
         softmax_out_data = softmax.out_node()
-        next_operation = softmax_out_data.out_node()
-        # Save edge attributes & remove edge
-        edge_attrs = graph.get_edge_data(softmax_out_data.id, next_operation.id)[0]
-        graph.remove_edge(softmax_out_data.id, next_operation.id)
-
-        reshape_op = Reshape(graph, dict(name="Reshape_", dim=np.array(old_shape)))
-        reshape_out_data = reshape_op.create_node_with_data(inputs=[softmax_out_data])
-        graph.add_edges_from([(reshape_out_data.id, next_operation.id, edge_attrs)])
+        if len(softmax_out_data.out_nodes()) != 0:
+            next_operation = softmax_out_data.out_node()
+            # Save edge attributes & remove edge
+            edge_attrs = graph.get_edge_data(softmax_out_data.id, next_operation.id)[0]
+            graph.remove_edge(softmax_out_data.id, next_operation.id)
+            reshape_op = Reshape(graph, dict(name=softmax.id + "/Reshape", dim=np.array(old_shape), nchw_layout=True))
+            reshape_out_data = reshape_op.create_node_with_data(inputs=[softmax_out_data])
+            graph.add_edges_from([(reshape_out_data.id, next_operation.id, edge_attrs)])
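
The layout remark above holds because softmax is taken over the features dimension only; a NumPy sketch (toy shapes, hypothetical `softmax` helper) of why the NHWC reshape(-1, C) form and the NCHW [N, C, H*W] form agree per location:

    import numpy as np

    def softmax(x, axis):
        e = np.exp(x - x.max(axis=axis, keepdims=True))
        return e / e.sum(axis=axis, keepdims=True)

    n, h, w, c = 1, 2, 2, 3
    x = np.random.rand(n, h, w, c)

    nhwc = softmax(x.reshape(-1, c), axis=1).reshape(n, h, w, c)  # reshape(-1, C) -> Softmax
    nchw = softmax(x.transpose(0, 3, 1, 2).reshape(n, c, h * w), axis=1)
    assert np.allclose(nhwc.transpose(0, 3, 1, 2).reshape(n, c, h * w), nchw)
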
index c714ef0..f6e925b 100644 (file)
@@ -39,7 +39,7 @@ class ConvertSlice(MiddleReplacementPattern):
     def pattern(self):
         return dict(
             nodes=[
-                ('slice',dict(kind='op', op='Slice'))
+                ('slice', dict(kind='op', op='Slice'))
             ],
             edges=[]
         )
@@ -69,8 +69,11 @@ class ConvertSlice(MiddleReplacementPattern):
         elif dims == 1:
             # If Slice uses only one axis, then
             # convert Slice to StridedSlice
+
             node['op'] = 'StridedSlice'
             node['type'] = 'StridedSlice'
+            node['new_axis_mask'] = np.zeros(len(output_data.shape), dtype=np.bool)
+            node['shrink_axis_mask'] = np.zeros(len(output_data.shape), dtype=np.bool)
 
             convert_negative_indices(begin, input.shape)
             convert_negative_indices(end, input.shape)
diff --git a/model-optimizer/extensions/middle/SwapAxesMiddleReplacer.py b/model-optimizer/extensions/middle/SwapAxesMiddleReplacer.py
new file mode 100644 (file)
index 0000000..276ff7f
--- /dev/null
@@ -0,0 +1,51 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+import numpy as np
+
+from mo.middle.replacement import MiddleReplacementPattern
+from mo.ops.op import Op
+from mo.ops.reshape import Reshape
+
+
+class SwapAxesMiddleReplacer(MiddleReplacementPattern):
+    enabled = False
+
+    def pattern(self):
+        return dict(
+            nodes=[('swapaxes', dict(kind='op', op='swapaxes'))],
+            edges=[],
+        )
+
+    def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
+        """
+            Replace swapaxes layer:
+            swapaxes -> Reshape
+        """
+
+        swapaxes = match['swapaxes']
+        swapaxes_in_node = swapaxes.in_node()
+        swapaxes_out_node = swapaxes.out_node()
+
+        input_edge_attrs = graph.get_edge_data(swapaxes_in_node.id, swapaxes.id)[0]
+        output_edge_attrs = graph.get_edge_data(swapaxes.id, swapaxes_out_node.id)[0]
+
+        graph.remove_edge(swapaxes_in_node.id, swapaxes.id)
+        graph.remove_edge(swapaxes.id, swapaxes_out_node.id)
+        Reshape(graph, {'dim': np.array(swapaxes_in_node.shape)}).create_node_with_data(inputs=[swapaxes_in_node],
+                                                                                      data_nodes=[swapaxes_out_node],
+                                                                                      edge_attrs=[input_edge_attrs, output_edge_attrs])
index 1913704..b029b45 100644 (file)
  limitations under the License.
 """
 
-import logging as log
-import numpy as np
 import networkx as nx
-from mo.graph.graph import Node
-from mo.graph.graph import erase_node
+import numpy as np
+
+from extensions.middle.FusePermutesSequence import FusePermutesSequence
 from mo.middle.replacement import MiddleReplacementPattern
 
 
@@ -34,6 +33,11 @@ class TensorFlowLSTMtoGeneric(MiddleReplacementPattern):
     def run_after(self):
         return []
 
+    def run_before(self):
+        return [
+            FusePermutesSequence,
+        ]
+
     def pattern(self):
         return dict(
             nodes=[('lstm', dict(op='LSTMCell', tf=True))],
@@ -57,8 +61,10 @@ class TensorFlowLSTMtoGeneric(MiddleReplacementPattern):
         hidden_size = node.in_node(1).shape[1]
         weights = weights_node.value
         biases = biases_node.value
-        assert weights.shape[0] == input_size + hidden_size, "weights.shape={} input_size={} hidden_size={}".format(weights.shape, input_size, hidden_size)
-        assert weights.shape[1] == biases.shape[0] == 4 * hidden_size, "weights.shape={} biases.shape={} hidden_size={}".format(weights.shape, biases.shape, hidden_size)
+        assert weights.shape[0] == input_size + hidden_size, "weights.shape={} input_size={} hidden_size={}".format(
+            weights.shape, input_size, hidden_size)
+        assert weights.shape[1] == biases.shape[0] == 4 * hidden_size,\
+            "weights.shape={} biases.shape={} hidden_size={}".format(weights.shape, biases.shape, hidden_size)
 
         weights = weights.reshape([
             weights.shape[0],
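The asserts above encode the TF LSTMCell weight-packing convention; a small numpy sketch of the expected blob shapes (sizes chosen arbitrarily for illustration):

    import numpy as np

    input_size, hidden_size = 3, 5
    weights = np.zeros((input_size + hidden_size, 4 * hidden_size))  # x and h weights packed together
    biases = np.zeros((4 * hidden_size,))                            # one bias block per gate, 4 gates
    assert weights.shape[0] == input_size + hidden_size
    assert weights.shape[1] == biases.shape[0] == 4 * hidden_size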
index 529900e..70b169f 100644 (file)
@@ -71,8 +71,6 @@ Shape -> StridedSlice -> Enter -|    LogicalAnd --> LoopCond (data)
                 ('Enter_2_less_data', dict(kind='data')),
                 ('minimum', dict(kind='op', op='Minimum')),
                 ('minimum_data', dict(kind='data')),
-                ('Maximum',  dict(kind='op', op='Maximum')),
-                ('Maximum_data', dict(kind='data')),
 
                 ('and', dict(kind='op', op='LogicalAnd')),
                 ('and_data', dict(kind='data')),
@@ -152,8 +150,6 @@ Shape -> StridedSlice -> Enter -|    LogicalAnd --> LoopCond (data)
                 ('add_2', 'add_2_data'),
                 ('add_2_data', 'NextIteration_2'),
 
-                ('Maximum', 'Maximum_data'),
-                ('Maximum_data', 'minimum'),
                 ('minimum', 'minimum_data'),
                 ('minimum_data', 'Enter_2_less'),
                 ('Enter_2_less', 'Enter_2_less_data'),
@@ -174,6 +170,8 @@ Shape -> StridedSlice -> Enter -|    LogicalAnd --> LoopCond (data)
     @staticmethod
     def replace_pattern(graph: nx.MultiDiGraph, match: dict):
         log.debug('================== ConditionFind ===============')
+        max_node = match['minimum'].in_node(1).in_node()
+        assert max_node['kind'] == 'op' and max_node['op'] == 'Maximum'
 
         #init_1
         init_1 = match['init_1_data'].value
@@ -205,9 +203,105 @@ Shape -> StridedSlice -> Enter -|    LogicalAnd --> LoopCond (data)
 
         # Delete useless nodes
         safe_nodes = ['loop_cond_data', 'Identity_2_data', 'Strided_slice', 'Strided_slice_data',
-                      'Maximum', 'Maximum_data', 'minimum', 'minimum_data']
+                      'minimum', 'minimum_data']
         nodes_for_remove = []
         for node in match.keys():
             if node not in safe_nodes:
                 nodes_for_remove.append(match[node].id)
         graph.remove_nodes_from(nodes_for_remove)
+
+
+class SimpleConditionMather(MiddleReplacementPattern):
+    @staticmethod
+    def pattern():
+        log.debug('+++++++++++++++ SimpleConditionMatching ++++++++++++++++')
+        return dict(
+            nodes=[
+                ('Enter_1_less', dict(kind='op', op='Enter')),
+                ('Strided_slice', dict(kind='op', op='StridedSlice')),
+                ('Strided_slice_data', dict(kind='data')),
+                ('Enter_1_less_data', dict(kind='data')),
+
+                ('Less_1', dict(kind='op', op='Less')),
+                ('Merge_1', dict(kind='op', op='Merge')),
+                ('Merge_1_data', dict(kind='data')),
+                ('Less_1_data', dict(kind='data')),
+
+                ('loop_cond', dict(kind='op', op='LoopCond')),
+                ('loop_cond_data', dict(kind='data')),
+
+                ('init_1', dict(kind='op', op='Const')),
+                ('init_1_data',  dict(kind='data')),
+                ('Enter_1', dict(kind='op', op='Enter')),
+                ('Enter_1_data',  dict(kind='data')),
+
+
+                ('Switch_1', dict(kind='op', op='Switch')),
+                ('Switch_1_data', dict(kind='data')),
+                ('Identity_1', dict(kind='op', op='Identity')),
+                ('Identity_1_data', dict(kind='data')),
+                ('add_1', dict(kind='op', op='Add')),
+                ('add_1_y',  dict(kind='op', op='Const')),
+                ('add_1_y_data', dict(kind='data')),
+                ('add_1_data', dict(kind='data')),
+                ('NextIteration_1', dict(kind='op', op='NextIteration')),
+            ],
+            edges=[
+                ('Strided_slice', 'Strided_slice_data'),
+                ('Strided_slice_data', 'Enter_1_less'),
+                ('Enter_1_less', 'Enter_1_less_data'),
+                ('Enter_1_less_data', 'Less_1'),
+                ('Less_1', 'Less_1_data'),
+                ('Less_1_data', 'loop_cond'),
+
+                ('loop_cond', 'loop_cond_data'),
+                ('loop_cond_data', 'Switch_1'),
+
+                ('init_1', 'init_1_data'),
+                ('init_1_data', 'Enter_1'),
+                ('Enter_1', 'Enter_1_data'),
+                ('Enter_1_data', 'Merge_1'),
+                ('Merge_1', 'Merge_1_data'),
+                ('Merge_1_data', 'Less_1'),
+
+                ('Merge_1_data', 'Switch_1'),
+                ('Switch_1', 'Switch_1_data'),
+                ('Switch_1_data', 'Identity_1'),
+                ('Identity_1', 'Identity_1_data'),
+                ('Identity_1_data', 'add_1'),
+                ('add_1_y', 'add_1_y_data'),
+                ('add_1_y_data', 'add_1'),
+                ('add_1', 'add_1_data'),
+                ('add_1_data', 'NextIteration_1'),
+
+            ],
+        )
+
+    @staticmethod
+    def replace_pattern(graph: nx.MultiDiGraph, match: dict):
+        log.debug('================== SimpleConditionFind ===============')
+        # init_1
+        init_1 = match['init_1_data'].value
+        assert init_1 is not None
+        init_1 = int(init_1)
+
+        # step_1
+        assert match['add_1_y_data'].value is not None
+        step_1 = int(match['add_1_y_data'].value)
+
+        match['loop_cond_data'].value = None
+
+        # Create condition node and delete all useless nodes from condition pattern
+        condition_attrs = dict(iter=dict(init=init_1, step=step_1),
+                               name=match['loop_cond'].name + '/TensorIteratorCondition_')
+        condition = TensorIteratorCondition(graph, attrs=condition_attrs)
+        condition.create_node_with_data(inputs=[match['Strided_slice_data']],
+                                        data_nodes=[match['loop_cond_data'], match['Identity_1_data']])
+
+        # Delete useless nodes
+        safe_nodes = ['loop_cond_data', 'Identity_1_data', 'Strided_slice', 'Strided_slice_data']
+        nodes_for_remove = []
+        for node in match.keys():
+            if node not in safe_nodes:
+                nodes_for_remove.append(match[node].id)
+        graph.remove_nodes_from(nodes_for_remove)
\ No newline at end of file
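As a rough illustration of how these pattern() dictionaries are interpreted (a self-contained networkx sketch, not the Model Optimizer matcher itself): node entries carry attribute predicates, and edge entries define the subgraph to search for.

    import networkx as nx
    from networkx.algorithms.isomorphism import DiGraphMatcher

    graph = nx.DiGraph()
    graph.add_node('c0', kind='op', op='Const')
    graph.add_node('e0', kind='op', op='Enter')
    graph.add_edge('c0', 'e0')

    pattern = nx.DiGraph()
    pattern.add_node('init_1', kind='op', op='Const')
    pattern.add_node('Enter_1', kind='op', op='Enter')
    pattern.add_edge('init_1', 'Enter_1')

    # a graph node matches a pattern node when it has all attributes the pattern requires
    matcher = DiGraphMatcher(graph, pattern,
                             node_match=lambda g, p: all(g.get(k) == v for k, v in p.items()))
    assert list(matcher.subgraph_isomorphisms_iter()) == [{'c0': 'init_1', 'e0': 'Enter_1'}]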
index 71cd28f..5dfea5b 100644 (file)
@@ -66,7 +66,18 @@ class ConditionChecks(MiddleReplacementPattern):
 
         # Check for comparing SS and seq_length source (it should be one tensor)
         # SIMPLE CHECK
-        assert match['Strided_slice_data'].value == match['minimum_data'].value
+        assert match['Strided_slice_data'].value is not None
+        if match['minimum_data'].value is None:
+            log.warning('TF loop doesn\'t have a constant upper bound produced by node {}, or Model Optimizer '
+                        'cannot detect a constant in this case. Loops with a dynamic number of iterations are not '
+                        'supported, so in the resulting IR the generated TensorIterator will have '
+                        'a maximum number of iterations determined by the input tensor size: {}'
+                        .format(match['minimum_data'].soft_get('name'),
+                                match['Strided_slice_data'].value)
+            )
+        else:
+            assert match['Strided_slice_data'].value == match['minimum_data'].value, \
+                'Values do not match: {} and {}'.format(match['Strided_slice_data'].value, match['minimum_data'].value)
 
         # SMART CHECK
         # TODO: add here some smart check for tensors equality
@@ -80,4 +91,4 @@ class ConditionChecks(MiddleReplacementPattern):
             if ta.has_valid('kind') and ta['kind'] == 'op' and ta['op'] in type_list:
                 assert ta.in_node(0).id == ss.id
 
-        log.debug('+++++++++++++++ Condition Check was successful ++++++++++++++++')
\ No newline at end of file
+        log.debug('+++++++++++++++ Condition Check was successful ++++++++++++++++')
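Worth noting: Python's logging module interpolates %-style by default, so a '{}'-style message with positional arguments never gets formatted, which is why the warning above applies .format() eagerly. A minimal sketch (illustrative logger name):

    import logging

    logging.basicConfig()
    log = logging.getLogger('demo')

    log.warning('value is %s', 42)           # %-style: lazily formatted by logging
    log.warning('value is {}'.format(42))    # str.format: formatted eagerly before logging
    # log.warning('value is {}', 42)         # would fail inside logging: '{}' is not a %-placeholder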
index a36809c..65cdb40 100644 (file)
@@ -15,8 +15,8 @@
 """
 
 import logging as log
-
 import networkx as nx
+import numpy as np
 
 from extensions.ops.TensorIterator_ops import TensorIteratorInput
 from mo.middle.replacement import MiddleReplacementPattern
@@ -37,7 +37,8 @@ class SmartInputMatcher(MiddleReplacementPattern):
         |                                                  ^
         |__________________________________________________|
     """
-    enabled = True
+
+    enabled = False  # called from mo.pipeline.tf directly
 
     @staticmethod
     def pattern():
@@ -139,7 +140,7 @@ class SmartInputMatcher(MiddleReplacementPattern):
         # Create input node with params
         # axis == 0 because in TensorArray we ALWAYS iterate over the 0 axis; other params will be filled later (with
         # condition)
-        input_node = TensorIteratorInput(graph, dict(axis=0, start=start, end=end, stride=None, part_size=None,
+        input_node = TensorIteratorInput(graph, dict(axis=0, start=start, stride=None, part_size=None,
                                                 external_port_id=str(match['Enter_data'].value),
                                                 internal_layer_id=match['TensorArrayRead_data'].id,
                                                 name=match['TensorArrayRead'].name + '/TensorIteratorInput_'
@@ -157,6 +158,9 @@ class SmartInputMatcher(MiddleReplacementPattern):
 
 
 class SimpleInputMatcher(MiddleReplacementPattern):
+
+    enabled = False  # called from mo.pipeline.tf directly
+
     """
     This pattern matches simple inputs (without partitions) in TF while loops (these inputs are set by Enter nodes).
     """
@@ -181,4 +185,39 @@ class SimpleInputMatcher(MiddleReplacementPattern):
         input_node.create_node_with_data(inputs=[match['Enter'].in_node()], data_nodes=[match['Enter'].out_node()])
 
         # Delete useless nodes
-        graph.remove_nodes_from([match['Enter'].id])
\ No newline at end of file
+        graph.remove_nodes_from([match['Enter'].id])
+
+
+class BackEdgeSimpleInputMatcher(MiddleReplacementPattern):
+
+    enabled = False  # called from mo.pipeline.tf directly
+
+    @staticmethod
+    def pattern():
+        return dict(
+            nodes=[
+                ('BackEdge', dict(kind='op', op='TensorIteratorBackEdge')),
+            ],
+            edges=[
+            ],
+        )
+
+    @staticmethod
+    def replace_pattern(graph: nx.MultiDiGraph, match: dict):
+        log.debug('================== SimpleBackEdgeInputFind ===============')
+
+        assert len(match['BackEdge'].in_nodes()) == 3
+        condition = match['BackEdge'].in_node(2)
+        init_input = match['BackEdge'].in_node(0)
+        cycle_input = match['BackEdge'].in_node(1)
+
+        # We need to create a new TensorIteratorInput node only if this node doesn't exist already.
+        if len(init_input.in_nodes()) == 0:
+            input_node = TensorIteratorInput(graph, dict(external_port_id=None,
+                                             internal_layer_id=None,
+                                             name=match['BackEdge'].name + '/TensorIteratorInput_'
+                                            ))
+            input_data_node = input_node.create_node_with_data(inputs=[init_input])
+            input_data_node.shape = np.array(init_input.shape, dtype=np.int64)
+            graph.remove_edges_from([(init_input.id, match['BackEdge'].id)])
+            graph.add_edges_from([(input_data_node.id, match['BackEdge'].id, {'in': 0, 'out': 0})])
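For intuition, a plain-Python sketch of what a TensorIteratorBackEdge models (loop mechanics only; the names are illustrative): the initial input seeds iteration 0, and each iteration's result flows back as the next iteration's input.

    def run_loop(init_input, iterations):
        value = init_input           # in_node(0): initial value entering the loop
        for _ in range(iterations):  # in_node(2): condition controlling iteration count
            value = value + 1        # in_node(1): body result carried along the back edge
        return value

    assert run_loop(0, 5) == 5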
index 813bad1..218b129 100644 (file)
@@ -75,16 +75,15 @@ def dfs(graph: nx.MultiDiGraph, node_name: str, stop_nodes: list, visited: set =
                     visited.add(out_node_name)
                     d.append(out_node_name)
 
-def get_body(graph, cond, inputs, outputs):
+def get_body(graph, inputs, outputs):
     nodes, extra_inputs = sub_graph_between_nodes(
         graph,
-        [cond] + inputs,
+        inputs,
         outputs,
         lambda node: node.soft_get('op') == 'TensorIteratorInput'
     )
-    nodes = list(set(nodes) - set([cond] + inputs) - set(outputs) - set(extra_inputs))
+    nodes = list(set(nodes) - set(inputs) - set(outputs) - set(extra_inputs))
     return nodes, extra_inputs
-    #return nx.MultiDiGraph()
 
 
 class TensorIteratorMerge(MiddleReplacementPattern):
@@ -102,11 +101,10 @@ class TensorIteratorMerge(MiddleReplacementPattern):
         # Here we will found all parts of TI: condition, inputs/outputs, back edges, body and create TensorIterator Op
         # and make all checks needed for TensorIteator work
         cond_data = match['condition'].out_node(0)
-        time_data = match['condition'].out_node(1)
+        time_data = match['condition'].out_node(1) if len(match['condition'].out_nodes()) > 1 else None
         name = match['condition'].name
 
         assert match['condition'].in_node(0).has_valid('value')
-        assert match['condition'].in_node(1).has_valid('value')
 
         back_edges = []
         inputs = []
@@ -120,18 +118,23 @@ class TensorIteratorMerge(MiddleReplacementPattern):
             elif node['kind'] == 'op' and node['op'] == 'TensorIteratorOutput':
                 outputs.append(node.id)
 
-        for node in time_data.out_nodes():
-            if node['kind'] == 'op' and node['op'] == 'TensorIteratorInput':
-                inputs.append(node.id)
-            elif node['kind'] == 'op' and node['op'] == 'TensorIteratorOutput':
-                outputs.append(node.id)
-            else:
-                # something goes wrong here
-                assert False
+        if time_data is not None:
+            for node in time_data.out_nodes():
+                if node['kind'] == 'op' and node['op'] == 'TensorIteratorInput':
+                    inputs.append(node.id)
+                elif node['kind'] == 'op' and node['op'] == 'TensorIteratorOutput':
+                    outputs.append(node.id)
+                else:
+                    # something went wrong here
+                    assert False
 
-        graph.remove_nodes_from([cond_data.id, time_data.id])
+        condition = match['condition']
+        tensor_sequence_length = condition.in_node(0)
+        graph.remove_nodes_from([condition.id, cond_data.id, tensor_sequence_length.id])
+        if time_data is not None:
+            graph.remove_nodes_from([time_data.id])
 
-        body_nodes, extra_inputs = get_body(graph, match['condition'].id, inputs, outputs)
+        body_nodes, extra_inputs = get_body(graph, inputs, outputs)
         body_nodes = list(set(body_nodes) - set([cond_data]))
 
         inputs += extra_inputs
@@ -170,18 +173,19 @@ class TensorIteratorMerge(MiddleReplacementPattern):
             {
                 'from_data_id': node.in_node(1),
                 'to_data_id': node.out_node(0),
-                'init_data_id': node.in_node(0)
+                'init_data_id': node.in_node(0),
             } for node in back_edges
         ]
-        
+
         body = nx.MultiDiGraph(name='body')
         body.graph['layout'] = graph.graph['layout']
         body.add_nodes_from([(node, graph.node[node]) for node in body_nodes])
         body.add_edges_from([(u, v, k, d) for u, v, k, d in graph.edges(data=True, keys=True) if u in body_nodes and v in body_nodes])
-        
-        graph.remove_nodes_from(body_nodes + [match['condition'].id] + [inp.id for inp in inputs] + [out.id for out in outputs])
 
-        for i, edge in enumerate(back_edges_data, start=0):
+        graph.remove_nodes_from(body_nodes + [match['condition'].id] + [inp.id for inp in inputs] + [out.id for out in outputs])
+        internal_id_count = 0
+        real_back_edges = []
+        for edge in back_edges_data:
             assert edge['from_data_id'].id in body.nodes()
             assert edge['to_data_id'].id in body.nodes()
             assert edge['init_data_id'].id in body.nodes()
@@ -190,31 +194,65 @@ class TensorIteratorMerge(MiddleReplacementPattern):
             edge['init_data_id'] = Node(body, edge['init_data_id'].id)
             edge['from_data_id']['is_output'] = True
 
+            # Assign/reuse ids for the back-edge start; it comes from from_data_id
+            assert len(edge['from_data_id'].in_nodes()) == 1
+            # layer id
             if not edge['from_data_id'].in_node().has_valid('internal_layer_id'):
-                edge['from_data_id'].in_node()['internal_layer_id'] = 4*i+0
+                edge['from_data_id'].in_node()['internal_layer_id'] = internal_id_count
+                internal_id_count += 1
             edge['from_layer'] = edge['from_data_id'].in_node()['internal_layer_id']
+
+            # port id
             if 'internal_port_id' not in edge['from_data_id'].in_edge():
-                edge['from_data_id'].in_edge()['internal_port_id'] = 4*i+1
+                edge['from_data_id'].in_edge()['internal_port_id'] = internal_id_count
+                internal_id_count += 1
             edge['from_port'] = edge['from_data_id'].in_edge()['internal_port_id']
 
-            #assert not edge['to_data_id'].out_node().has_valid('internal_layer_id')
-            if edge['to_data_id'].in_node().has_valid('internal_layer_id'):
-                edge['to_data_id'].out_node()['internal_layer_id'] = edge['to_data_id'].in_node().internal_layer_id
-            elif not edge['to_data_id'].out_node().has_valid('internal_layer_id'):
-                edge['to_data_id'].out_node()['internal_layer_id'] = 4*i+2
-            edge['to_layer'] = edge['to_data_id'].out_node()['internal_layer_id']
-            
-            assert 'internal_port_id' not in edge['to_data_id'].out_edge()
-            if 'internal_port_id' in edge['init_data_id'].out_edge():
-                edge['to_data_id'].out_edge()['internal_port_id'] = edge['init_data'].out_edge()['internal_port_id']
-            else:
-                edge['to_data_id'].out_edge()['internal_port_id'] = 4*i+3
-            edge['to_port'] = edge['to_data_id'].out_edge()['internal_port_id']
-
-            body.add_edges_from([(edge['init_data_id'].id, edge['to_data_id'].out_node().id, deepcopy(edge['to_data_id'].out_edge()))])
-            body.remove_nodes_from([edge['to_data_id'].in_node().id, edge['to_data_id'].id])
-
-        for i, ext_inp in enumerate(external_inputs, start=4*len(back_edges_data)):
+            # Look at all consumers of the data node that ends a back-edge.
+            # For each such consumer, there will be a separate back-edge (and input)
+            current_real_back_edges = []
+            for _, consumer, key, edge_attrs in body.out_edges(edge['to_data_id'].id, data=True, keys=True):
+
+                real_edge = {}
+                real_edge.update(edge) # all real back_edges have the same back-edge start
+
+                consumer = Node(body, consumer)
+
+                if real_edge['to_data_id'].in_node().has_valid('internal_layer_id'):
+                    assert False
+                    real_edge['to_data_id'].out_node()['internal_layer_id'] = real_edge['to_data_id'].in_node().internal_layer_id
+                elif not consumer.has_valid('internal_layer_id'):
+                    consumer['internal_layer_id'] = internal_id_count
+                    internal_id_count += 1
+                real_edge['to_layer'] = consumer['internal_layer_id']
+
+                assert 'internal_port_id' not in edge_attrs
+                assert len(real_edge['init_data_id'].out_edges()) == 1
+                assert 'internal_port_id' not in real_edge['init_data_id'].out_edge()
+                edge_attrs['internal_port_id'] = internal_id_count
+                internal_id_count += 1
+                real_edge['to_port'] = edge_attrs['internal_port_id']
+                real_edge['consumer'] = consumer
+                real_edge['consumer_key'] = key
+
+                real_edge['attrs'] = deepcopy(edge_attrs)
+                current_real_back_edges.append(real_edge)
+
+            # connect initial data node with each consumer providing actual edge attributes
+            body.add_edges_from([
+                (
+                    real_edge['init_data_id'].id,
+                    real_edge['consumer'].id,
+                    real_edge['consumer_key'],
+                    real_edge['attrs'])
+            for real_edge in current_real_back_edges])
+
+            body.remove_nodes_from([edge['to_data_id'].id, edge['to_data_id'].in_node().id])
+            real_back_edges += current_real_back_edges
+
+        real_external_inputs = []
+
+        for ext_inp in external_inputs:
             assert ext_inp['external_data_id'].id not in body.nodes()
             assert ext_inp['internal_data_id'].id in body.nodes()
             ext_inp['internal_data_id'] = Node(body, ext_inp['internal_data_id'].id)
@@ -224,31 +262,45 @@ class TensorIteratorMerge(MiddleReplacementPattern):
                 shape = ext_inp['internal_data_id'].shape.copy()
                 assert not ext_inp['internal_data_id'].has_valid('value')
                 new_input_data = Op._create_data_node(body, ext_inp['internal_data_id'].name + '/UnsqueezedInput', dict(shape=np.insert(shape, ext_inp['axis'], 1)))
-                reshape_op = Reshape(body, dict(name=ext_inp['internal_data_id'].name + '/InputSqueeze', dim=shape))
+                dim = shape.copy()
+                # try to make it dynamically reshapable along one of the axes;
+                # it is practically useful to reshape along the batch dimension, but here we cannot detect where it is,
+                # so we are guessing, based on other transformations, that it is the major dimension
+                dim[0] = -1
+                reshape_op = Reshape(body, dict(name=ext_inp['internal_data_id'].name + '/InputSqueeze', dim=dim))
                 reshape_op.create_node_with_data([new_input_data], data_nodes=[ext_inp['internal_data_id']])
                 ext_inp['internal_data_id'] = new_input_data
 
             ext_inp['internal_data_id']['is_input'] = True
             assert len(ext_inp['internal_data_id'].in_nodes()) == 0
-            assert len(ext_inp['internal_data_id'].out_nodes()) == 1
-            if not 'internal_layer_id' in  ext_inp['internal_data_id'].out_node():
-                ext_inp['internal_data_id'].out_node()['internal_layer_id'] = i
-            if not 'internal_port_id' in ext_inp['internal_data_id'].out_edge():
-                ext_inp['internal_data_id'].out_edge()['internal_port_id'] = i
-            ext_inp['internal_layer_id'] = ext_inp['internal_data_id'].out_node()['internal_layer_id']
-            ext_inp['internal_port_id'] = ext_inp['internal_data_id'].out_edge()['internal_port_id']
-            ext_inp['external_port_id'] = i
-
-        for i, ext_out in enumerate(external_outputs, start=4*len(back_edges_data) + len(external_inputs)):
+            ext_inp['external_port_id'] = internal_id_count
+            internal_id_count += 1
+            for _, consumer, edge_attrs in body.out_edges(ext_inp['internal_data_id'].id, data=True):
+                real_ext_inp = {}
+                real_ext_inp.update(ext_inp)
+                consumer = Node(body, consumer)
+                if not consumer.has_valid('internal_layer_id'):
+                    consumer['internal_layer_id'] = internal_id_count
+                    internal_id_count += 1
+                if not 'internal_port_id' in edge_attrs:
+                    edge_attrs['internal_port_id'] = internal_id_count
+                    internal_id_count += 1
+                real_ext_inp['internal_layer_id'] = consumer['internal_layer_id']
+                real_ext_inp['internal_port_id'] = edge_attrs['internal_port_id']
+                real_external_inputs.append(real_ext_inp)
+
+        for ext_out in external_outputs:
             assert ext_out['external_data_id'].id not in body.nodes()
             assert ext_out['internal_data_id'].id in body.nodes()
             ext_out['internal_data_id'] = Node(body, ext_out['internal_data_id'].id)
 
             if ext_out['axis'] is not None:
                 # Insert unsqueezing resize at output port that has partitioning
-                shape = ext_out['internal_data_id'].shape.copy()
+                dim = ext_out['internal_data_id'].shape.copy()
+                # trying to make it dynamically reshapable (see related comment above for the first Reshape)
+                dim[0] = -1
                 assert not ext_out['internal_data_id'].has_valid('value')
-                reshape_op = Reshape(body, dict(name=ext_out['internal_data_id'].name + '/OutputUnsqueeze', dim=np.insert(shape, ext_out['axis'], 1)))
+                reshape_op = Reshape(body, dict(name=ext_out['internal_data_id'].name + '/OutputUnsqueeze', dim=np.insert(dim, ext_out['axis'], 1)))
                 ext_out['internal_data_id'] = reshape_op.create_node_with_data([ext_out['internal_data_id']])
 
             # TODO: add here working with simple outputs
@@ -257,28 +309,30 @@ class TensorIteratorMerge(MiddleReplacementPattern):
             #assert len(ext_out['internal_data_id'].out_nodes()) == 0
             assert len(ext_out['internal_data_id'].in_nodes()) == 1
             if not 'internal_layer_id' in ext_out['internal_data_id'].in_node():
-                ext_out['internal_data_id'].in_node()['internal_layer_id'] = i
+                ext_out['internal_data_id'].in_node()['internal_layer_id'] = internal_id_count
+                internal_id_count += 1
             if not 'internal_port_id' in ext_out['internal_data_id'].in_edge():
-                ext_out['internal_data_id'].in_edge()['internal_port_id'] = i
+                ext_out['internal_data_id'].in_edge()['internal_port_id'] = internal_id_count
+                internal_id_count += 1
             ext_out['internal_layer_id'] = ext_out['internal_data_id'].in_node()['internal_layer_id']
             ext_out['internal_port_id'] = ext_out['internal_data_id'].in_edge()['internal_port_id']
-            ext_out['external_port_id'] = i
+            ext_out['external_port_id'] = internal_id_count
+            internal_id_count += 1
 
         ti_op = TensorIterator(graph, {
             'name': name + '/TensorIterator',
             'body': body,
 
-            # FOR TESTING PURPOSES
             'input_port_map': [
                 {field: external_input[field] for field in [ 'external_port_id', 'internal_layer_id', 'internal_port_id', 'axis', 'stride', 'part_size', 'start', 'end']}
-                for external_input in external_inputs],
+                for external_input in real_external_inputs],
 
             'output_port_map': [
                 {field: external_output[field] for field in [ 'external_port_id', 'internal_layer_id', 'internal_port_id', 'axis', 'stride', 'part_size', 'start', 'end']}
                 for external_output in external_outputs],
             'back_edges': [
                 {field: edge[field] for field in [ 'from_layer', 'from_port', 'to_layer', 'to_port']}
-                for edge in back_edges_data],
+                for edge in real_back_edges],
         })
 
         ti_outs = ti_op.create_node_with_data(
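The dim[0] = -1 idiom used for the squeeze/unsqueeze Reshapes above has a direct numpy analogue: -1 lets one dimension be inferred at runtime, which keeps the reshape valid when the major (batch) dimension changes.

    import numpy as np

    x = np.zeros((8, 1, 16))     # batch, iteration axis of size 1, features
    y = x.reshape((-1, 16))      # batch dimension inferred, works for any batch size
    assert y.shape == (8, 16)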
index 7908bdf..b0923bc 100644 (file)
@@ -19,7 +19,7 @@ import logging as log
 import networkx as nx
 
 from extensions.middle.ConstSwitchResolver import ConstSwitchEraser
-from mo.graph.graph import erase_node
+from mo.middle.passes.eliminate import remove_op_node_with_data_node
 from mo.middle.replacement import MiddleReplacementPattern
 
 
@@ -31,14 +31,11 @@ class UselessMergeEraser(MiddleReplacementPattern):
 
     def pattern(self):
         return dict(
-            nodes=[('merge', dict(kind='op', op='Merge')),
-                   ('merge_data', dict(kind='data'))],
-            edges=[('merge', 'merge_data')]
+            nodes=[('merge', dict(kind='op', op='Merge'))],
+            edges=[]
         )
 
     def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
         if len(graph.in_edges(match['merge'].id)) <= 1:
-            erase_node(match['merge'])
-            erase_node(match['merge_data'])
-            log.info("Useles Merge op and data nodes was deleted op='{}' data='{}'"
-                     "".format(match['merge'].id, match['merge_data'].id))
+            remove_op_node_with_data_node(graph, match['merge'])
+            log.info("Useles Merge op and data nodes was deleted op='{}'".format(match['merge'].id))
index 18d687f..b8272ea 100644 (file)
@@ -21,7 +21,7 @@ import numpy as np
 
 from extensions.middle.ConvertGroupedStridedSlice import ConvertGroupedStridedSlice
 from extensions.middle.SliceConverter import ConvertSlice
-from mo.graph.graph import erase_node
+from mo.middle.passes.eliminate import remove_op_node_with_data_node
 from mo.middle.replacement import MiddleReplacementPattern
 
 
@@ -46,10 +46,9 @@ class UselessStridedSliceEraser(MiddleReplacementPattern):
         if np.array_equal(input_data_node.shape, output_data_node.shape) and \
                 all(elem.step == 1 for elem in match['strided_slice'].slices):
             log.info("Useless StridedSlice op '{}' has been detected".format(match['strided_slice'].id))
-            # remove inputs to Strided Slice so it has just one input with data so we can use 'erase_node' function
+            # remove extra inputs of StridedSlice so it has just one data input and 'remove_op_node_with_data_node' can be used
             graph.remove_edge(match['strided_slice'].in_node(1).id, match['strided_slice'].id)
             graph.remove_edge(match['strided_slice'].in_node(2).id, match['strided_slice'].id)
             graph.remove_edge(match['strided_slice'].in_node(3).id, match['strided_slice'].id)
 
-            erase_node(match['strided_slice'])
-            erase_node(output_data_node)
+            remove_op_node_with_data_node(graph, match['strided_slice'])
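The "useless" condition checked above has a one-line numpy counterpart: a slice with all-unit steps whose output shape equals the input shape is an identity and can be dropped.

    import numpy as np

    x = np.arange(12).reshape(3, 4)
    slices = (slice(0, 3, 1), slice(0, 4, 1))
    assert x[slices].shape == x.shape and all(s.step == 1 for s in slices)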
diff --git a/model-optimizer/extensions/middle/decompose_bi_lstm.py b/model-optimizer/extensions/middle/decompose_bi_lstm.py
new file mode 100644 (file)
index 0000000..0cfad4e
--- /dev/null
@@ -0,0 +1,188 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+import numpy as np
+from copy import deepcopy
+
+from extensions.ops.lstm_sequence import LSTMSequence
+from mo.utils.error import Error
+from mo.middle.replacement import MiddleReplacementPattern
+from mo.ops.concat import Concat
+from mo.ops.op import Op
+from mo.ops.split import Split
+from mo.graph.graph import Node
+
+
+class DecomposeBiLSTM(MiddleReplacementPattern):
+    ''' Decomposes bidirectional LSTMSequence to forward and reverse LSTM ops.
+
+        To extract the forward and reverse parts from the initial blobs, helper
+        functions are used; they should already be built into the operation attributes.
+
+        Both initial states are split into two parts, and the two parts of the results are concatenated.
+        The axis of split/concat is completely defined by the ONNX LSTM specification.
+    '''
+
+    enabled = True
+
+    def pattern(self):
+        return dict(
+            nodes=[
+                ('lstm', dict(kind='op', op='LSTMSequence', format='onnx', direction='bidirectional')),
+                ('input', dict(kind='data')),
+                ('W', dict(kind='data')),
+                ('R', dict(kind='data')),
+            ],
+            edges=[
+                ('input', 'lstm', {'in': 0}),
+                ('W', 'lstm', {'bin': 'W'}),
+                ('R', 'lstm', {'bin': 'R'}),
+            ]
+        )
+
+
+    def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
+        bilstm = match['lstm']
+        new_init_hiddens = self.split_data(bilstm.in_node(5))
+        new_init_cells = self.split_data(bilstm.in_node(6))
+        assert bilstm.has_valid('blob_bidirectional_split'), \
+            'Node {} doesn\'t have the blob_bidirectional_split attribute defined.'.format(bilstm.soft_get('name'))
+        splitted_W = bilstm.blob_bidirectional_split(bilstm.in_node(1))
+        splitted_R = bilstm.blob_bidirectional_split(bilstm.in_node(2))
+        splitted_B = bilstm.blob_bidirectional_split(bilstm.in_node(3)) if 3 in bilstm.in_nodes() else (None, None)
+
+        outputs = self.split_bilstm(
+            bilstm,
+            new_init_hiddens,
+            new_init_cells,
+            splitted_W,
+            splitted_R,
+            splitted_B,
+        )
+
+        self.concat(bilstm, outputs[0], outputs[1], bilstm.out_nodes())
+
+    def split_data(self, data: Node):
+        """ Split data node into two part along 0 axis """
+        assert len(data.shape) == 3
+        assert data.shape[0] == 2
+
+        output_data = [Op._create_data_node(data.graph, name=data.name + '/SplittedBiLSTM/{}'.format(['forward', 'reverse'][i])) for i in [0, 1]]
+        split_op = Split(data.graph, dict(name=data.name + '/DecomposedBiLSTM_0', axis=0, num_split=2))
+        return split_op.create_node_with_data([data], data_nodes=output_data)
+
+
+    def split_bilstm(self,
+                     bilstm,
+                     new_init_hiddens,
+                     new_init_cells,
+                     splitted_W,
+                     splitted_R,
+                     splitted_B):
+        """ Split one bilstm node into 2 one-directional lstm nodes.
+
+            All input data nodes should be already prepared; they
+            have 2 in the major dimension.
+        """
+        assert len(bilstm.out_nodes()) == 3
+        all_outputs = []
+        for i in [0, 1]:
+            direction = ['forward', 'reverse'][i]
+            op = LSTMSequence(bilstm.graph, {
+                'hidden_size': bilstm.hidden_size,
+                'direction': direction,
+                'batch_dim': bilstm.batch_dim,
+                'sequence_dim': bilstm.sequence_dim,
+                'blobs_wrb': bilstm.blobs_wrb,
+                'has_num_directions': bilstm.has_num_directions,
+                'format': bilstm.format,
+                'name': bilstm.name + '/Split/' + direction,
+            })
+
+            output_data = Op._create_data_node(
+                bilstm.graph,
+                name=bilstm.out_node(0).name + '/Split/' + str(i),
+                attrs = {'shape': bilstm.out_node(0).shape.copy()}
+            )
+
+            assert output_data.shape[1] == 2
+            output_data.shape[1] = 1
+
+            output_hidden = Op._create_data_node(
+                bilstm.graph,
+                name=bilstm.out_node(1).name + '/Split/' + str(i),
+                attrs = {'shape': bilstm.out_node(1).shape.copy()}
+            )
+
+            assert output_hidden.shape[0] == 2
+            output_hidden.shape[0] = 1
+
+            output_cell = Op._create_data_node(
+                bilstm.graph,
+                name=bilstm.out_node(2).name + '/Split/' + str(i),
+                attrs = {'shape': bilstm.out_node(2).shape.copy()}
+            )
+
+            assert output_cell.shape[0] == 2
+            output_cell.shape[0] = 1
+
+            all_outputs.append(
+                op.create_node_with_data(
+                    inputs = [
+                        bilstm.in_node(0),
+                        splitted_W[i],
+                        splitted_R[i],
+                        splitted_B[i],
+                        None,
+                        new_init_hiddens[i],
+                        new_init_cells[i],
+                    ],
+                    data_nodes = [
+                        output_data,
+                        output_hidden,
+                        output_cell
+                    ]
+                )
+            )
+        return all_outputs
+
+
+    def concat(self, bilstm, forward_outputs, reverse_outputs, final_outputs):
+        """ Concatenates two set of outputs from BiLSTM """
+
+        concat_ops = [
+            Concat(bilstm.graph, {
+                'name': bilstm.name + '/FinalConcat/Data',
+                'axis': 1
+            }),
+            Concat(bilstm.graph, {
+                'name': bilstm.name + '/FinalConcat/HiddenState',
+                'axis': 0
+            }),
+            Concat(bilstm.graph, {
+                'name': bilstm.name + '/FinalConcat/CellState',
+                'axis': 0
+            })
+        ]
+
+        bilstm.graph.remove_node(bilstm.id)
+
+        for i in final_outputs:
+            concat_ops[i].create_node_with_data(
+                [forward_outputs[i], reverse_outputs[i]],
+                data_nodes=[final_outputs[i]]
+            )
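A numpy sketch of the decomposition contract (illustrative shapes; direction is the major axis, matching the split axis=0 and the concat axes used above):

    import numpy as np

    init_h = np.zeros((2, 4, 8))                 # [num_directions, batch, hidden_size]
    fwd_h, rev_h = np.split(init_h, 2, axis=0)   # one initial state per direction

    out_fwd = np.zeros((4, 1, 10, 8))            # per-direction output: [batch, 1, seq, hidden]
    out_rev = np.zeros((4, 1, 10, 8))
    out = np.concatenate([out_fwd, out_rev], axis=1)   # data concat along the direction axis
    assert out.shape == (4, 2, 10, 8)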
index 7158619..f2fe561 100644 (file)
@@ -18,6 +18,7 @@ import networkx as nx
 import numpy as np
 from copy import deepcopy
 
+from extensions.middle.decompose_bi_lstm import DecomposeBiLSTM
 from mo.utils.error import Error
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.ops.op import Op
@@ -49,7 +50,8 @@ def permute_before_and_after(inp: Node, middle: Node, out: Node, order):
     permute = Permute(middle.graph, dict(order=inverse_perm(np.array(order))))
 
     middle.graph.remove_edge(middle.id, out.id)
-    new_out = Op.create_data_node(middle.graph, middle, {'shape': out.shape[order]})
+    new_out = Op._create_data_node(middle.graph, name=middle.name + '/WithoutPermute', attrs={'shape': out.shape[order]})
+    middle.graph.add_edge(middle.id, new_out.id, key=0, out=0)
     permute.create_node_with_data([new_out], dict(name=middle.name + '/OutputPermute'), data_nodes=out)
 
 
@@ -57,7 +59,9 @@ class LSTMSequenceNormalize(MiddleReplacementPattern):
     ''' Convert blobs and shapes of ONNX-like LSTM to IE compatible form.
 
         Fuse W, R and optional B input blobs to weights and biases according
-        to IE LSTM specification.
+        to the IE LSTM specification. In case of a bidirectional LSTM, the resulting
+        blobs are not directly supported by IE, but they will be further processed
+        by a separate transformation that breaks them down into one-directional LSTMs.
 
         The target form of this operation is not normally covered by a dedicated
         layer in IE. It should be further transformed to some other layer
@@ -66,7 +70,7 @@ class LSTMSequenceNormalize(MiddleReplacementPattern):
 
         Post-conditions:
 
-        Inputs have the forllowing order:
+        Inputs have the following order:
             0: input data
             1: weights blob
             2: biases blob
@@ -77,27 +81,32 @@ class LSTMSequenceNormalize(MiddleReplacementPattern):
     enabled = True
 
 
+    def run_after(self):
+        return [
+            DecomposeBiLSTM
+        ]
+
+
     def pattern(self):
         return dict(
             nodes=[
-                ('lstm', dict(kind='op', op='LSTMSequence')),
+                ('lstm', dict(kind='op', op='LSTMSequence', format='onnx')),
                 ('input', dict(kind='data')),
                 ('W', dict(kind='data')),
                 ('R', dict(kind='data')),
-                # don't capture B here as it is optional, as well as extra outputs
-                ('output', dict(kind='data')),
             ],
             edges=[
                 ('input', 'lstm', {'in': 0}),
                 ('W', 'lstm', {'bin': 'W'}),
                 ('R', 'lstm', {'bin': 'R'}),
-                ('lstm', 'output', {'out': 0}),
             ]
         )
 
 
     def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
         self.repack_weights(graph, match)
+        if match['lstm'].has_num_directions:
+            self.squeeze_num_directions(graph, match)
         self.batch_sequence_transpose(graph, match)
         self.check_not_supported_ports(graph, match)
         self.states_squeeze(graph, match)
@@ -109,11 +118,16 @@ class LSTMSequenceNormalize(MiddleReplacementPattern):
         W = match['W'].value.copy()
         R = match['R'].value.copy()
 
+        # bidirectional case should be processed separately before this transformation
+        if lstm.direction not in ['forward', 'reverse']:
+            raise Error('Only ONNX LSTM operators with `forward` or `reverse` direction are supported. '
+                'Node {} has direction = {}, which is not supported.'.format(lstm.name, lstm.direction))
+
         graph.remove_edge(match['W'].id, lstm.id)
         graph.remove_edge(match['R'].id, lstm.id)
 
         # find optional 'B'
-        if len(lstm.in_nodes()) > 3:
+        if 3 in lstm.in_nodes():
             # TODO: check if 'bin': 'B' attribute is assigned to this edge
             B = lstm.in_node(3).value.copy()
             graph.remove_edge(lstm.in_node(3).id, lstm.id)
@@ -132,16 +146,16 @@ class LSTMSequenceNormalize(MiddleReplacementPattern):
 
         W, R = [x.reshape([
                 1,  # 0: num of directions, limitation: should be 1
-                1,  # 1: placeholder for concatenation of W and R matrices
+                1,  # 1: dummy dimension to be aligned with B
                 4,  # 2: four output parts of the matrix for all gates in order: i, o, f, c
                 lstm.hidden_size,  # 3: output size per direction and gate
-                -1])  # 4: input size
+                -1])  # 4: input size/hidden size in W/R
             for x in (W, R)]
 
         input_size = match['input'].shape[2]
         assert input_size == W.shape[-1]
 
-        WR = np.concatenate([W, R], axis=1)
+        WR = np.concatenate([W, R], axis=4)
 
         # Reorder gates: iofc --> fico
         gate_reorder = [2, 0, 3, 1]
@@ -186,11 +200,28 @@ class LSTMSequenceNormalize(MiddleReplacementPattern):
             )
 
 
+    def squeeze_num_directions(self, graph: nx.MultiDiGraph, match: dict):
+        """ Assuming considered LSTM node has num_directions in output shape, remove it. """
+        lstm = match['lstm']
+        # num_directions is at 1st position in output shape, please refer to LSTMSequence op definition
+
+        direction_dim = [1, 0, 0]  # per-output index of the num_directions dimension
+        for i in lstm.out_nodes():
+            old_data_node = lstm.out_node(i)
+            old_shape = old_data_node.shape.copy()
+            new_shape = np.delete(old_shape, direction_dim[i])
+            data = Op._create_data_node(graph, name=lstm.name + '/Out/{}/'.format(i), attrs={'shape': new_shape})
+            graph.remove_edge(lstm.id, old_data_node.id)
+            graph.add_edge(lstm.id, data.id, key=0, out=i)
+            reshape = Reshape(graph, dict(dim=old_shape))
+            reshape.create_node_with_data([data], dict(name=lstm.name + '/SqueezeNumDirections/{}'.format(i)), data_nodes=[old_data_node])
+
+
     def batch_sequence_transpose(self, graph: nx.MultiDiGraph, match: dict):
 
         lstm = match['lstm']
         inp = match['input']
-        out = match['output']
+        out = lstm.out_node(0)
 
         if lstm.batch_dim == 0:
             assert lstm.sequence_dim == 1
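The gate reorder applied in repack_weights above is easiest to see on a toy axis: with the four gate blocks stacked on their own dimension in ONNX order i, o, f, c, taking indices [2, 0, 3, 1] yields the IE order f, i, c, o.

    import numpy as np

    gates = np.array(['i', 'o', 'f', 'c'])                             # ONNX gate order
    assert list(np.take(gates, [2, 0, 3, 1])) == ['f', 'i', 'c', 'o']  # IE gate order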
index 55dd79a..f576cde 100644 (file)
 
 import networkx as nx
 import numpy as np
-from copy import deepcopy
 
+from extensions.middle.FusePermutesSequence import FusePermutesSequence
 from extensions.middle.lstm_sequence_normalize import LSTMSequenceNormalize
+from extensions.middle.mxnet_lstm_sequence_normalize import MXNetLSTMSequenceNormalize
+from extensions.ops.lstm_cell import LSTMCell
+from extensions.ops.tensor_iterator import TensorIterator
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.ops.op import Op
-from mo.ops.permute import Permute
 from mo.ops.reshape import Reshape
-from extensions.ops.lstm_cell import LSTMCell
-from extensions.ops.tensor_iterator import TensorIterator
-from extensions.middle.FusePermutesSequence import FusePermutesSequence
 
 
 class LSTMSequenceTensorIterator(MiddleReplacementPattern):
-    ''' Converts normalized LSTMSequence op to TensorIterator.
+    """ Converts normalized LSTMSequence op to TensorIterator.
 
         Normalized LSTMSequence means that it should be processed by
         LSTMSequenceNormalize transform that ensures its strict form.
@@ -38,19 +37,16 @@ class LSTMSequenceTensorIterator(MiddleReplacementPattern):
         with TensorIterator connected in the same way as an original LSTMSequence
         node and with internal body represented as LSTMCell op node with necessary
         squeezes and unsqueezes around.
-    '''
+    """
 
     enabled = True
 
-
     def run_after(self):
-        return [LSTMSequenceNormalize]
-
+        return [LSTMSequenceNormalize, MXNetLSTMSequenceNormalize]
 
     def run_before(self):
         return [FusePermutesSequence]
 
-
     def pattern(self):
         return dict(
             nodes=[
@@ -70,42 +66,94 @@ class LSTMSequenceTensorIterator(MiddleReplacementPattern):
             ]
         )
 
-
     def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
         lstm = match['lstm']
 
         # Build TensorIterator body first
-        body = nx.MultiDiGraph(name=lstm.name + '/sub_graph')
-        inputs = [Op._create_data_node(body, lstm.name + '/inport/' + str(inp), {'shape': lstm.in_node(inp).shape.copy(), 'value': lstm.in_node(inp).value.copy() if lstm.in_node(inp).value is not None else None}) for inp in [0, 3, 4, 1, 2]]
+        body = nx.MultiDiGraph(name=lstm.name + '/sub_graph', layout=graph.graph['layout'])
+        inputs = [Op._create_data_node(body, lstm.name + '/inport/' + str(inp),
+                                       {'shape': lstm.in_node(inp).shape.copy(),
+                                        'value': lstm.in_node(inp).value.copy()
+                                        if lstm.in_node(inp).value is not None and inp in [1, 2] else None})
+                                        for inp in [0, 3, 4, 1, 2]]
         inputs[0].shape[lstm.sequence_dim] = 1
-        input_squeeze = Reshape(body, dict(name=lstm.name + '/input_squeeze', dim=np.delete(inputs[0].shape, lstm.sequence_dim), internal_layer_id=0))
+        reshape_dim = inputs[0].shape.copy()
+        reshape_dim[lstm.batch_dim] = -1
+        reshape_dim = np.delete(reshape_dim, lstm.sequence_dim)
+        input_squeeze = Reshape(
+            body,
+            dict(name=lstm.name + '/input_squeeze', internal_layer_id=0, dim=reshape_dim)
+        )
         inputs[0] = input_squeeze.create_node_with_data([inputs[0]], edge_attrs=[{'internal_port_id': 0}])
-        lstm_cell_op = LSTMCell(body, dict(hidden_size=match['lstm'].hidden_size, name=lstm.name + '/LSTMCell', internal_layer_id=1))
-        outputs = [Op._create_data_node(body, lstm.name + '/outport/' + str(out),  {'shape': lstm.out_node(out).shape.copy() if out in lstm.out_nodes() else lstm.in_node(3).shape.copy()}) for out in [0,1] ]
+        lstm_cell_op = LSTMCell(body, dict(hidden_size=match['lstm'].hidden_size, name=lstm.name + '/LSTMCell',
+                                           internal_layer_id=1))
+        outputs = [Op._create_data_node(body, lstm.name + '/outport/' + str(out),
+                                        {'shape': lstm.out_node(out).shape.copy() if out in lstm.out_nodes()
+                                        else lstm.in_node(3).shape.copy(), 'is_output': True}) for out in [0, 1]]
         unsqueezed_output_shape = outputs[0].shape.copy()
         unsqueezed_output_shape[lstm.sequence_dim] = 1
         squeezed_output_shape = np.delete(unsqueezed_output_shape, lstm.sequence_dim)
         outputs[0].shape = squeezed_output_shape
-        output_unsqueeze = Reshape(body, dict(name=lstm.name + 'output_unsqueeze', dim=unsqueezed_output_shape, internal_layer_id=2))
+        unsqueezed_output_shape[lstm.batch_dim] = -1
+        output_unsqueeze = Reshape(body, dict(name=lstm.name + 'output_unsqueeze', dim=unsqueezed_output_shape,
+                                              internal_layer_id=2))
         # TODO edge attributes should be assigned by the op itself
-        lstm_cell_node = lstm_cell_op.create_node_with_data(inputs, data_nodes=outputs, edge_attrs=[{}, {'internal_port_id': 1}, {'internal_port_id': 2}, {'bin': 'weights'}, {'bin': 'biases'}])
+        lstm_cell_node = lstm_cell_op.create_node_with_data(inputs, data_nodes=outputs,
+                                                            edge_attrs=[{}, {'internal_port_id': 1},
+                                                                        {'internal_port_id': 2}, {'bin': 'weights'},
+                                                                        {'bin': 'biases'}])
         lstm_cell_node[0].in_node().out_edge(0)['internal_port_id'] = 4
         lstm_cell_node[0].in_node().out_edge(1)['internal_port_id'] = 5
         lstm_cell_node[0] = output_unsqueeze.create_node_with_data([lstm_cell_node[0]])
         lstm_cell_node[0].in_node().out_edge(0)['internal_port_id'] = 3
-        
+        lstm_cell_node[0]['is_output'] = True
+
+        assert lstm.direction in ['forward', 'reverse']
+        if lstm.direction == 'forward':
+            stride = 1
+            start = None
+            end = None
+        else:
+            assert lstm.direction == 'reverse'
+            stride = -1
+            start = -1
+            end = 0
+
+        output_port_map = [{
+            'external_port_id': 3,
+            'internal_layer_id': 2,
+            'internal_port_id': 3,
+            'axis': lstm.sequence_dim,
+            'stride': stride,
+            'start': start,
+            'end': end,
+            'part_size': 1,
+        }]
+
+        if len(lstm.out_nodes()) == 3:
+            output_port_map.extend([{
+                'external_port_id': 4,
+                'internal_layer_id': 1,
+                'internal_port_id': 4,
+            }, {
+                'external_port_id': 5,
+                'internal_layer_id': 1,
+                'internal_port_id': 5,
+            }])
+
         ti_op = TensorIterator(graph, {
             'name': lstm.name + '/TensorIterator',
             'body': body,
 
-            # FOR TESTING PURPOSES
             'input_port_map': [
                 {
                     'external_port_id': 0,
                     'internal_layer_id': 0,
                     'internal_port_id': 0,
                     'axis': lstm.sequence_dim,
-                    'stride': 1,
+                    'stride': stride,
+                    'start': start,
+                    'end': end,
                     'part_size': 1,
                 },
                 {
@@ -120,16 +168,8 @@ class LSTMSequenceTensorIterator(MiddleReplacementPattern):
                 },
             ],
 
-            'output_port_map': [
-                {
-                    'external_port_id': 3,
-                    'internal_layer_id': 2,
-                    'internal_port_id': 3,
-                    'axis': lstm.sequence_dim,
-                    'stride': 1,
-                    'part_size': 1,
-                },
-            ],
+            'output_port_map': output_port_map,
+
             'back_edges': [
                 {
                     'from_layer': 1,
@@ -146,6 +186,18 @@ class LSTMSequenceTensorIterator(MiddleReplacementPattern):
             ]
         })
 
-        outs = ti_op.create_node_with_data([lstm.in_node(i) for i in [0, 3, 4]], data_nodes=list(lstm.out_nodes().values()), edge_attrs=[{'external_port_id': 0}, {'external_port_id': 1}, {'external_port_id': 2}])
+        assert sorted(lstm.out_nodes().keys()) == list(range(len(lstm.out_nodes()))), \
+            "There are gaps in output ports of LSTMSequence operation. Node {}".format(lstm.id)
+        outs = ti_op.create_node_with_data([lstm.in_node(i) for i in [0, 3, 4]],
+                                           data_nodes=[lstm.out_node(i) for i in range(len(lstm.out_nodes()))],
+                                           edge_attrs=[{'external_port_id': 0}, {'external_port_id': 1},
+                                                       {'external_port_id': 2}])
+
+        if not isinstance(outs, list):
+            outs = list([outs])
+
         graph.remove_node(lstm.id)
-        outs.in_edge(0)['external_port_id'] = 3
+        outs[0].in_edge(0)['external_port_id'] = 3
+        for i, out in enumerate(outs[1:]):
+            external_port_id = 4 + i
+            out.in_edge()['external_port_id'] = external_port_id
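A plain-Python analogue of the iteration parameters chosen above (approximate semantics, for illustration only): stride 1 walks the sequence axis forward, while stride -1 with start -1 and end 0 visits the same slices in reverse order.

    seq = list(range(5))
    forward = seq[::1]     # stride=1, start/end left default
    reverse = seq[::-1]    # stride=-1, starting from the last element
    assert reverse == list(reversed(forward))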
index 9c7878d..a7b6b56 100644 (file)
@@ -37,7 +37,7 @@ class TensorIteratorLSTM(MiddleReplacementPattern):
         in a separate pass.
     """
 
-    enabled = True
+    enabled = False
 
     def run_after(self):
         return [TensorIteratorMerge, LSTMSequenceNormalize, LSTMSequenceTensorIterator, TensorFlowLSTMtoGeneric]
@@ -111,7 +111,7 @@ class CheckUnsupportedLSTMCell(MiddleReplacementPattern):
         Initiates the second translation round if find any not supported LSTMCell instances.
     """
 
-    enabled = True
+    enabled = False
 
     def run_after(self):
         return [TensorIteratorLSTM]
diff --git a/model-optimizer/extensions/middle/mxnet_lstm_sequence_normalize.py b/model-optimizer/extensions/middle/mxnet_lstm_sequence_normalize.py
new file mode 100644 (file)
index 0000000..17fb9b1
--- /dev/null
@@ -0,0 +1,168 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+import numpy as np
+from copy import deepcopy
+
+from mo.middle.replacement import MiddleReplacementPattern
+from mo.ops.op import Op
+from mo.ops.reshape import Reshape
+from mo.graph.graph import Node
+
+
+class MXNetLSTMSequenceNormalize(MiddleReplacementPattern):
+    ''' Convert blobs and shapes of MXNet-like LSTM to IE compatible form.
+
+        The target form of this operation is not normally covered by a dedicated
+        layer in IE. It should be further transformed to some other layer
+        that are supported by IE. This transformation pass involves weights and
+        shapes processing only.
+
+        Post-conditions:
+
+        Inputs have the following order:
+            0: input data
+            1: weights blob
+            2: biases blob
+            3: initial hidden state [optional]
+            4: initial cell state [optional]
+    '''
+    enabled = True
+
+    def pattern(self):
+        return dict(
+            nodes=[
+                ('lstm', dict(kind='op', op='LSTMSequence', format='mxnet')),
+                ('input', dict(kind='data')),
+                ('hidden_state', dict(kind='data')),
+                ('cell_state', dict(kind='data')),
+                ('params', dict(kind='data')),
+            ],
+            edges=[
+                ('input', 'lstm', {'in': 0}),
+                ('hidden_state', 'lstm', {'in': 2}),
+                ('cell_state', 'lstm', {'in': 3}),
+                ('params', 'lstm', {'in': 1}),
+            ]
+        )
+
+
+    def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
+        input = match['input']
+        lstm = match['lstm']
+        params = match['params'].value.copy()
+        hidden_state = match['hidden_state']
+        cell_state = match['cell_state']
+
+        hidden_state_edge_attrs = deepcopy(graph.get_edge_data(hidden_state.id, lstm.id)[0])
+        cell_state_edge_attrs = deepcopy(graph.get_edge_data(cell_state.id, lstm.id)[0])
+
+        graph.remove_edge(match['params'].id, lstm.id)
+        graph.remove_edge(match['hidden_state'].id, lstm.id)
+        graph.remove_edge(match['cell_state'].id, lstm.id)
+
+        self.repack_weights(graph, input, lstm, params)
+
+        reshape = Reshape(graph, dict(dim=[lstm.in_node(0).shape[0], lstm.hidden_size]))
+
+        if len(lstm.in_nodes()) > 2:
+            hidden_state_edge_attrs['in'] = 3
+            new_init_h = reshape.create_node_with_data([hidden_state], attrs=dict(name=lstm.name + '/HiddenStateResize'))
+            graph.add_edge(new_init_h.id, lstm.id, **hidden_state_edge_attrs)
+
+        if len(lstm.in_nodes()) > 3:
+            cell_state_edge_attrs['in'] = 4
+            new_init_c = reshape.create_node_with_data([cell_state], attrs=dict(name=lstm.name + '/CellStateResize'))
+            graph.add_edge(new_init_c.id, lstm.id, **cell_state_edge_attrs)
+
+
+    def repack_weights(self, graph: nx.MultiDiGraph, input: Node, lstm: Node, params: np.array):
+        input_size = input.shape[2]
+
+        direction = 2 if lstm.has_num_directions else 1
+        bsize = (2*lstm.hidden_size*direction*1)*4
+
+        assert direction == 1
+
+        W = np.array(params[0:len(params) - bsize])
+        B = np.array(params[len(params) - bsize:])
+
+        WX = np.array(W[0:lstm.hidden_size*4*input_size])
+        WH = np.array(W[lstm.hidden_size*4*input_size:])
+
+        WX = WX.reshape([lstm.hidden_size*4, input_size])
+        WH = WH.reshape([lstm.hidden_size*4, lstm.hidden_size])
+
+        WX = WX.transpose([1, 0])
+        WH = WH.transpose([1, 0])
+
+        WX = WX.reshape([
+                1,  # 0: num of directions, limitation: should be 1
+               -1,  # 3: input size
+                4,  # 1: four output parts of the matrix for all gates in order: i, f, c, o
+                lstm.hidden_size,  # 2: output size per direction and gate
+        ])
+
+        WH = WH.reshape([
+                1,  # 0: num of directions, limitation: should be 1
+               -1,  # 3: hidden state size
+                4,  # 1: four output parts of the matrix for all gates in order: i, f, c, o
+                lstm.hidden_size,  # 2: output size per direction and gate
+        ])
+
+        B = B.reshape([
+                 1,  # 0: num of directions, limitation: should be 1
+                 2,  # 3: num of component B
+                 4,  # 1: four output parts of the matrix for all gates in order: i, f, c, o
+                 lstm.hidden_size,  # 2: output size per direction and gate
+        ])
+
+        assert WX.shape[1] == input_size
+        assert WH.shape[1] == lstm.hidden_size
+
+        W = np.concatenate([WX, WH], axis=1)
+
+        # Reorder gates: ifco --> fico
+        gate_reorder = [1, 0, 2, 3]
+        W = np.take(W, gate_reorder, axis=2)
+        B = np.take(B, gate_reorder, axis=2)
+
+        inout_reorder = [0, 2, 3, 1]
+        W = W.transpose(inout_reorder)
+        B = B.transpose(inout_reorder)
+
+        final_shape = [W.shape[0] * W.shape[1] * lstm.hidden_size, -1]
+        W = W.reshape(final_shape)
+        B = B.reshape(final_shape)
+
+        # Sum component of B
+        B = np.add.reduce(B, axis=1, keepdims=True)
+        B = B.squeeze(axis=1)
+
+        assert W.ndim == 2
+        assert B.ndim == 1
+        assert W.shape[0] == lstm.hidden_size * 4
+        assert B.shape[0] == lstm.hidden_size * 4
+        assert W.shape[1] == lstm.hidden_size + input_size
+
+        for blob, port, name in [(W, 1, 'weights'), (B, 2, 'biases')]:
+            Op.create_and_connect_input_data_node(
+                graph,
+                lstm,
+                {'value': blob, 'shape': np.array(blob.shape, dtype=np.int64)},
+                {'in': port, 'bin': name, 'permutation': None}
+            )
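
The repacking above is easier to follow on concrete numbers. Below is a minimal standalone numpy sketch of the same steps with assumed toy sizes (hidden_size=2, input_size=3, single direction); the node/graph plumbing is omitted and the asserts mirror the post-conditions checked in repack_weights:

    import numpy as np

    hidden_size, input_size = 2, 3
    # flat MXNet parameter blob: gate weights for X and H, then two bias components
    params = np.arange(4 * hidden_size * (input_size + hidden_size) + 2 * 4 * hidden_size,
                       dtype=np.float32)

    bsize = 2 * 4 * hidden_size                       # two bias components, 4 gates each
    W, B = params[:-bsize], params[-bsize:]
    WX = W[:4 * hidden_size * input_size].reshape(4 * hidden_size, input_size).T
    WH = W[4 * hidden_size * input_size:].reshape(4 * hidden_size, hidden_size).T

    # [directions, input size, gates, output size] for both weight blobs
    W = np.concatenate([WX.reshape(1, -1, 4, hidden_size),
                        WH.reshape(1, -1, 4, hidden_size)], axis=1)

    gate_reorder = [1, 0, 2, 3]                       # ifco -> fico
    W = np.take(W, gate_reorder, axis=2)
    W = W.transpose([0, 2, 3, 1]).reshape(4 * hidden_size, -1)

    B = np.take(B.reshape(1, 2, 4, hidden_size), gate_reorder, axis=2)
    B = B.transpose([0, 2, 3, 1]).reshape(4 * hidden_size, -1)
    B = np.add.reduce(B, axis=1)                      # sum the two bias components

    assert W.shape == (4 * hidden_size, input_size + hidden_size)
    assert B.shape == (4 * hidden_size,)
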
index 4c927d4..fbd3d63 100644 (file)
@@ -18,9 +18,9 @@ import networkx as nx
 import numpy as np
 from copy import deepcopy
 
-from mo.graph.graph import copy_node
+from mo.graph.graph import copy_node, Node, dict_includes
 from mo.utils.error import Error
-from mo.middle.passes.eliminate import remove_op_node
+from mo.middle.passes.eliminate import remove_op_node_with_data_node
 from mo.middle.pattern_match import find_isomorphisms, find_pattern_matches
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.ops.op import Op
@@ -29,6 +29,7 @@ from extensions.middle.FusePermutesSequence import FusePermutesSequence
 from extensions.middle.TensorIteratorMerge import TensorIteratorMerge
 from extensions.middle.lstm_sequence_normalize import LSTMSequenceNormalize, permute_before_and_after
 from extensions.middle.lstm_sequence_tensor_iterator import LSTMSequenceTensorIterator
+from extensions.middle.decompose_bi_lstm import DecomposeBiLSTM
 
 
 class PermuteTensorIteratorLSTM(MiddleReplacementPattern):
@@ -41,7 +42,7 @@ class PermuteTensorIteratorLSTM(MiddleReplacementPattern):
     enabled = True
 
     def run_after(self):
-        return [TensorIteratorMerge, LSTMSequenceNormalize, LSTMSequenceTensorIterator, FusePermutesSequence]
+        return [TensorIteratorMerge, LSTMSequenceNormalize, LSTMSequenceTensorIterator, FusePermutesSequence, DecomposeBiLSTM]
 
     def pattern(self):
         return dict(
@@ -62,10 +63,10 @@ class PermuteTensorIteratorLSTM(MiddleReplacementPattern):
                 ('input', 'direct_permute'),
                 ('direct_permute', 'input_permuted'),
 
-                ('input_permuted', 'ti', {'in': 0}),
+                ('input_permuted', 'ti', {'in': 0}),   # affected by permute
                 ('init_hidden', 'ti', {'in': 1}),
                 ('init_cell', 'ti', {'in': 2}),
-                ('ti', 'output_permuted', {'out': 0}),
+                ('ti', 'output_permuted', {'out': 0}), # affected by permute
 
                 ('output_permuted', 'inverse_permute'),
                 ('inverse_permute', 'output'),
@@ -73,6 +74,9 @@ class PermuteTensorIteratorLSTM(MiddleReplacementPattern):
         )
 
     def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
+
+        # This transformation works if and only if the body of the TI
+        # matches the following topology: Reshape -> LSTMCell -> Reshape
         nodes=[
             ('input_unsqueezed'),
             ('squeeze', dict(op='Reshape')),
@@ -109,23 +113,61 @@ class PermuteTensorIteratorLSTM(MiddleReplacementPattern):
         isomorphisms = find_isomorphisms(ti.body, nodes, edges)
         if len(list(isomorphisms)) != 1:
             return
+        isomorphism = isomorphisms[0]
 
         direct_permute = match['direct_permute']
         inverse_permute = match['inverse_permute']
 
-        if not direct_permute.has_valid('order') or not np.array_equal(direct_permute.order, [1, 0, 2]):
+        permute_order = [1, 0, 2]
+
+        # Check that both permute orders exactly match the expected one: [1, 0, 2]
+        if not direct_permute.has_valid('order') or not np.array_equal(direct_permute.order, permute_order):
             return
-        if not inverse_permute.has_valid('order') or not np.array_equal(inverse_permute.order, [1, 0, 2]):
+        if not inverse_permute.has_valid('order') or not np.array_equal(inverse_permute.order, permute_order):
             return
 
+
+        def find_ports(port_map: list, attrs: dict):
+            """ Find all ports in a given port map with specified attributes """
+            result = []
+            for i, port in enumerate(port_map):
+                if dict_includes(port, attrs):
+                    result.append(i)
+            return result
+
+        # Check that the TI has only a single partitioned input and output port; all partitioned ports must have a defined axis
+        data_input_port = find_ports(ti.input_port_map, {'axis': lambda attr: attr in [0, 1]})
+        data_output_port = find_ports(ti.output_port_map, {'axis': lambda attr: attr in [0, 1]})
+        assert len(data_input_port) == 1
+        assert len(data_output_port) == 1
+        data_input_port = data_input_port[0]
+        data_output_port = data_output_port[0]
+        # Verify that they are really connected to the Permute layers (guaranteed by the TI port numbers, see the pattern)
+        assert ti.in_edge(0)['external_port_id'] == ti.input_port_map[data_input_port]['external_port_id']
+        assert ti.out_edge(0)['external_port_id'] == ti.output_port_map[data_output_port]['external_port_id']
+
+        # Verify that the TI body has the required Reshapes connected to the found ports
+        squeeze = isomorphism['squeeze']
+        unsqueeze = isomorphism['unsqueeze']
+        assert squeeze['internal_layer_id'] == ti.input_port_map[data_input_port]['internal_layer_id']
+        assert squeeze.in_edge(0)['internal_port_id'] == ti.input_port_map[data_input_port]['internal_port_id']
+        assert unsqueeze['internal_layer_id'] == ti.output_port_map[data_output_port]['internal_layer_id']
+        assert unsqueeze.out_edge(0)['internal_port_id'] == ti.output_port_map[data_output_port]['internal_port_id']
+        assert len(squeeze.in_node().shape) == 3
+        assert len(squeeze.out_node().shape) == 2
+        assert len(unsqueeze.in_node().shape) == 2
+        assert len(unsqueeze.out_node().shape) == 3
+
         # Remove permutes
-        remove_op_node(graph, direct_permute)
-        remove_op_node(graph, inverse_permute)
-        match['output'].shape = match['output'].shape[[1, 0, 2]]
-
-        # Modify axis in TI
-        for port_map in [ti.input_port_map, ti.output_port_map]:
-            for port in port_map:
-                if 'axis' in port and port['axis'] is not None:
-                    assert port['axis'] in [0, 1]
-                    port['axis'] = 1 - port['axis']
+        remove_op_node_with_data_node(graph, direct_permute)
+        remove_op_node_with_data_node(graph, inverse_permute)
+        match['output'].shape = match['output'].shape[permute_order]
+
+        # Swap the 0/1 axes for the partitioned ports
+        ti.input_port_map[data_input_port]['axis'] = 1 - ti.input_port_map[data_input_port]['axis']
+        ti.output_port_map[data_output_port]['axis'] = 1 - ti.output_port_map[data_output_port]['axis']
+
+        # Swap the 0th and 1st shape entries for the Reshapes inside the body
+        squeeze.in_node().shape = squeeze.in_node().shape[[1, 0, 2]]
+        unsqueeze.out_node().shape = unsqueeze.out_node().shape[[1, 0, 2]]
+        unsqueeze.dim = unsqueeze.dim[[1, 0, 2]]
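
The net effect of this fusion is pure bookkeeping: with both Permutes gone, the sequence and batch axes trade places, so each partitioned port flips its axis between 0 and 1. A minimal sketch on bare dicts, assuming the TI port-map layout used above:

    # hypothetical partitioned ports following the TI port-map convention above
    input_port_map = [{'axis': 0, 'external_port_id': 0}]
    output_port_map = [{'axis': 0, 'external_port_id': 3}]

    for port in (input_port_map[0], output_port_map[0]):
        assert port['axis'] in [0, 1]
        port['axis'] = 1 - port['axis']   # sequence axis and batch axis trade places

    assert input_port_map[0]['axis'] == 1 and output_port_map[0]['axis'] == 1
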
diff --git a/model-optimizer/extensions/middle/reverse_tensor_iterator.py b/model-optimizer/extensions/middle/reverse_tensor_iterator.py
new file mode 100644 (file)
index 0000000..7cd529b
--- /dev/null
@@ -0,0 +1,105 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+
+from mo.middle.replacement import MiddleReplacementPattern
+from extensions.ops.lstm_sequence import LSTMSequence
+from extensions.middle.FusePermutesSequence import FusePermutesSequence
+from extensions.middle.TensorIteratorMerge import TensorIteratorMerge
+from extensions.middle.lstm_sequence_normalize import LSTMSequenceNormalize
+from extensions.middle.lstm_sequence_tensor_iterator import LSTMSequenceTensorIterator
+from extensions.middle.permute_tensor_iterator import PermuteTensorIteratorLSTM
+from mo.middle.passes.eliminate import remove_op_node_with_data_node
+from mo.middle.replacement import MiddleReplacementPattern
+
+
+class ReverseTensorIteratorLSTM(MiddleReplacementPattern):
+    """ Fuses Reverse operations around TI: ReverseSequence --> TI  --> ReverseSequence.
+
+        WARNING: This transformation supports only a very special case of TI,
+        but the code doesn't check all the preconditions.
+    """
+
+    enabled = True
+
+    def run_after(self):
+        return [
+            TensorIteratorMerge,
+            LSTMSequenceNormalize,
+            LSTMSequenceTensorIterator,
+            FusePermutesSequence,
+            PermuteTensorIteratorLSTM,
+        ]
+
+    def pattern(self):
+        return dict(
+            nodes=[
+                ('input'),
+                ('direct_reverse', dict(op='ReverseSequence')),
+                ('input_reversed'),
+                ('init_hidden'),
+                ('init_cell'),
+
+                ('ti', dict(kind='op', op='TensorIterator')),
+
+                ('output_reversed'),
+                ('inverse_reverse', dict(op='ReverseSequence')),
+                ('output'),
+            ],
+            edges=[
+                ('input', 'direct_reverse', {'in': 0}),
+                ('direct_reverse', 'input_reversed'),
+
+                ('input_reversed', 'ti', {'in': 0}),
+                ('init_hidden', 'ti', {'in': 1}),
+                ('init_cell', 'ti', {'in': 2}),
+                ('ti', 'output_reversed', {'out': 0}),
+
+                ('output_reversed', 'inverse_reverse', {'in': 0}),
+                ('inverse_reverse', 'output'),
+            ]
+        )
+
+    def replace_pattern(self, graph: nx.MultiDiGraph, match: dict):
+        ti = match['ti']
+        direct_reverse = match['direct_reverse']
+        inverse_reverse = match['inverse_reverse']
+
+        assert direct_reverse.seq_dim == inverse_reverse.seq_dim
+        assert direct_reverse.batch_dim is None and inverse_reverse.batch_dim is None or \
+            direct_reverse.batch_dim == inverse_reverse.batch_dim
+
+        # Modify stride in TI
+        for port_map in [ti.input_port_map, ti.output_port_map]:
+            for port in port_map:
+                if 'axis' in port and port['axis'] is not None and 'external_port_id' in port:
+                    assert port['axis'] == direct_reverse.seq_dim, \
+                        'axis == {} != {} == direct_reverse.seq_dim'.format(port['axis'], direct_reverse.seq_dim)
+                    if 'stride' not in port or port['stride'] is None:
+                        port['stride'] = 1
+                    assert port['stride'] in [-1, 1]
+                    port['stride'] = -port['stride']
+                    if port['stride'] == -1:
+                        port['start'] = -1
+                        port['end'] = 0
+                    elif port['stride'] == 1:
+                        port['start'] = None
+                        port['end'] = None
+
+        # Remove reverses
+        remove_op_node_with_data_node(graph, direct_reverse)
+        remove_op_node_with_data_node(graph, inverse_reverse)
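
The stride update is the heart of this fusion: a forward partitioned port (stride 1) becomes a backward one (stride -1, iterating from the last element down to 0) and vice versa. A minimal sketch of the same update on a bare dict, under the port-map conventions used above:

    def flip_port_direction(port: dict) -> dict:
        """Negate the iteration direction of a partitioned TI port (sketch)."""
        stride = port.get('stride') or 1      # missing/None stride means forward
        assert stride in [-1, 1]
        port['stride'] = -stride
        if port['stride'] == -1:
            port['start'], port['end'] = -1, 0        # iterate from the last element
        else:
            port['start'], port['end'] = None, None   # default forward iteration
        return port

    assert flip_port_direction({'axis': 0}) == {'axis': 0, 'stride': -1, 'start': -1, 'end': 0}
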
diff --git a/model-optimizer/extensions/ops/BlockLSTM.py b/model-optimizer/extensions/ops/BlockLSTM.py
new file mode 100644 (file)
index 0000000..8e3ac7f
--- /dev/null
@@ -0,0 +1,63 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+
+from mo.front.common.partial_infer.utils import mark_input_bins
+from mo.graph.graph import Node
+from mo.ops.op import Op
+import numpy as np
+
+
+class BlockLSTM(Op):
+    op = 'BlockLSTM'
+
+    def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
+        mandatory_props = {
+            'op': __class__.op,
+            'infer': __class__.infer
+        }
+        super().__init__(graph, mandatory_props, attrs)
+
+    @staticmethod
+    def infer(node: Node):
+        """
+         MO input edges:   |   Description:
+         -------------------------------------------------
+                0          | x: The sequence input to the LSTM, shape (timelen, batch_size, num_inputs)
+                1          | w: The weight matrix
+                2          | b: The bias vector
+                3          | h_prev: Previous/initial hidden state
+                4          | cs_prev: Value of the initial cell state
+         """
+        assert len(node.in_nodes()) == 5
+
+        """
+        MO output edges:    |   Description:
+                0           | cs: Output data / output hidden states concatenated over the whole time sequence
+                1           | h: Output cell states concatenated over the whole time sequence
+        """
+
+        assert len(node.out_nodes()) in [1, 2]
+
+        mark_input_bins(node)
+        input_shape = node.in_node(0).shape
+
+        assert len(input_shape) == 3
+        out_shape = input_shape
+        node.out_node(0).shape = out_shape
+        if len(node.out_nodes()) > 1:
+            node.out_node(1).shape = out_shape
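
Shape-wise the infer function is deliberately simple: every output mirrors the 3D input shape. A minimal standalone sketch of that propagation, with plain numpy arrays standing in for node shapes:

    import numpy as np

    def block_lstm_out_shapes(input_shape: np.ndarray, n_outputs: int):
        """Sketch of BlockLSTM.infer: every output mirrors the 3D input shape."""
        assert len(input_shape) == 3 and n_outputs in [1, 2]
        return [input_shape.copy() for _ in range(n_outputs)]

    shapes = block_lstm_out_shapes(np.array([10, 4, 16], dtype=np.int64), 2)
    assert all((s == [10, 4, 16]).all() for s in shapes)
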
index 3e4c13c..6eb3d93 100644 (file)
@@ -29,8 +29,11 @@ class DetectionOutput(Op):
             'type': __class__.op,
             'op': __class__.op,
             'infer': multi_box_detection_infer,
+            'input_width': 1,
+            'input_height': 1,
+            'normalized': 1,
             'share_location': 1,
-            'variance_encoded_in_target': 0
+            'variance_encoded_in_target': 0,
         }, attrs)
 
     def supported_attrs(self):
index 5f9876b..221c0c9 100644 (file)
@@ -19,7 +19,7 @@ import numpy as np
 
 from mo.graph.graph import Node
 from mo.ops.op import Op
-from mo.utils.utils import match_shapes
+from mo.utils.utils import symm_match_shapes
 
 
 class TensorArrayGather(Op):
@@ -43,17 +43,12 @@ class TensorArrayGather(Op):
 
         ta_node = Node(node.graph, str(handle.value))
 
-        if ta_node.has_valid('element_shape'):
-            assert match_shapes(ta_node['element_shape'], node.element_shape)
-        ta_node['element_shape'] = node.element_shape
+        if ta_node.has_valid('element_shape') and ta_node.element_shape is not None and len(ta_node.element_shape) > 0:
+            assert symm_match_shapes(ta_node['element_shape'], node.element_shape)
+        else:
+            ta_node['element_shape'] = node.element_shape
         data_shape = ta_node['element_shape']
-        if -1 in data_shape:
-            assert data_shape.size == 2 and data_shape[0] == -1 and data_shape[1] != -1
-            # This is a workaround for a bug that we cannot deduce element_shape
-            # when it is required for inference, so we putting 1 here instead of -1.
-            # This makes impossible to have batch size > 1 for LSTM-like loops
-            data_shape[0] = 1
-            ta_node.element_shape = data_shape
+        assert -1 not in data_shape or data_shape.size == 2 and data_shape[0] == -1 and data_shape[1] != -1
 
         assert ta_node.has_valid('size')
         size = ta_node['size']
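
symm_match_shapes is presumably the symmetric counterpart of match_shapes, accepting -1 as a wildcard on either side; the following hypothetical stand-in only illustrates the intended check, not the actual implementation:

    import numpy as np

    def symm_match_shapes_sketch(a, b) -> bool:
        # hypothetical: same rank and every dimension pair agrees,
        # with -1 acting as a wildcard on either side
        a, b = np.asarray(a), np.asarray(b)
        return a.size == b.size and all(x == y or x == -1 or y == -1
                                        for x, y in zip(a, b))

    assert symm_match_shapes_sketch([-1, 128], [32, 128])
    assert not symm_match_shapes_sketch([64, 128], [32, 128])
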
index 349f788..cb30e87 100644 (file)
@@ -41,8 +41,12 @@ class TensorArrayScatter(Op):
         flow_in = node.in_node(3)
 
         ta_node = Node(node.graph, str(handle.value))
-        if ta_node.has_valid('element_shape'):
-            assert match_shapes(ta_node['element_shape'], value.shape[1:])
+        if ta_node.has_valid('element_shape') and len(ta_node.element_shape) > 0:
+            assert match_shapes(ta_node['element_shape'], value.shape[1:]), \
+                'Shapes are not compatible: {} and {}'.format(ta_node['element_shape'], value.shape[1:])
+        else:
+            ta_node['element_shape'] = value.shape[1:]
+
         # Assign element_shape anyway, because the original element_shape can contain -1
         ta_node['element_shape'] = value.shape[1:]
         #TODO: add smart check that indices and value.shape[0] is compatible
index a6c65c4..4330460 100644 (file)
@@ -45,8 +45,9 @@ class TensorArrayWriter(Op):
         value_shape = value.shape
 
         ta_node = Node(node.graph, str(handle.value))
-        if ta_node.has_valid('element_shape'):
-            assert match_shapes(ta_node['element_shape'], value.shape)
+        if ta_node.has_valid('element_shape') and len(ta_node.element_shape) > 0:
+            assert match_shapes(ta_node['element_shape'], value.shape), \
+                'Shapes are not compatible: {} and {}'.format(ta_node['element_shape'], value.shape)
         ta_node['element_shape'] = value_shape
 
         output_shape = flow_in.shape
diff --git a/model-optimizer/extensions/ops/axpy.py b/model-optimizer/extensions/ops/axpy.py
new file mode 100644 (file)
index 0000000..26e15cd
--- /dev/null
@@ -0,0 +1,34 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+
+from mo.ops.op import Op
+
+
+class AxpyOp(Op):
+    """
+    Empty Op for the Axpy layer. It will be replaced by the AxpyToEltwise FrontReplacer.
+    """
+    op = 'Axpy'
+    enabled = True
+
+    def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
+        super().__init__(graph, {
+            'type': __class__.op,
+            'op': __class__.op,
+            'infer': None
+        }, attrs)
diff --git a/model-optimizer/extensions/ops/bn.py b/model-optimizer/extensions/ops/bn.py
new file mode 100644 (file)
index 0000000..69f7bf1
--- /dev/null
@@ -0,0 +1,34 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+
+from mo.ops.op import Op
+
+
+class BNOp(Op):
+    """
+    Empty Op for the BN layer. It will be replaced by the BNToScaleShift FrontReplacer.
+    """
+    op = 'BN'
+    enabled = True
+
+    def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
+        super().__init__(graph, {
+            'type': __class__.op,
+            'op': __class__.op,
+            'infer': None
+        }, attrs)
index 49390ee..0a51160 100644 (file)
@@ -57,4 +57,4 @@ class ConstantFill(Op):
         assert shape is not None
 
         node.out_node(0).value = np.full(shape, node.fill_value, np.float32)
-        node.out_node(0).shape = node.out_node(0).value.shape
+        node.out_node(0).shape = np.array(node.out_node(0).value.shape, dtype=np.int64)
diff --git a/model-optimizer/extensions/ops/gather.py b/model-optimizer/extensions/ops/gather.py
new file mode 100644 (file)
index 0000000..255fd1f
--- /dev/null
@@ -0,0 +1,67 @@
+"""
+ Copyright (c) 2017-2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import logging as log
+
+import networkx as nx
+import numpy as np
+
+from mo.graph.graph import Node
+from mo.ops.op import Op
+
+
+class Gather(Op):
+    op = 'Gather'
+
+    def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
+        mandatory_props = {
+            'type': __class__.op,
+            'op': __class__.op,
+            'axis': 0,
+            'infer': __class__.infer
+        }
+        super().__init__(graph, mandatory_props, attrs)
+
+    def supported_attrs(self):
+        return [
+            'axis',
+        ]
+
+    @staticmethod
+    def infer(node: Node):
+        assert len(node.in_nodes()) == 2 or len(node.in_nodes()) == 3
+
+        # TensorFlow may provide three inputs; the third one is the axis
+        if len(node.in_nodes()) == 3:
+            if node.in_node(2).value is None:
+                log.error("Gather is supported only with constant axis value")
+                return
+            node.axis = node.in_node(2).value.item()
+            node.graph.remove_edge(node.in_node(2).id, node.id)
+
+        axis = node.axis
+        data = node.in_node(0)
+        indices = node.in_node(1)
+
+        # both inputs are constant
+        if data.value is not None and indices.value is not None:
+            node.out_node(0).value = np.take(data.value, indices.value, axis)
+            node.out_node(0).shape = np.array(node.out_node(0).value.shape, dtype=np.int64)
+            return
+
+        shape = np.concatenate((data.shape[:axis], indices.shape))
+        if axis < len(data.shape) - 1:
+            shape = np.concatenate((shape, data.shape[axis+1:]))
+
+        node.out_node(0).shape = np.array(shape, dtype=np.int64)
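
The shape rule above is the standard numpy take rule: the gathered axis of data is replaced by the shape of indices. A quick standalone check:

    import numpy as np

    data = np.zeros((2, 5, 7))
    indices = np.zeros((3, 4), dtype=np.int64)
    axis = 1

    shape = np.concatenate((data.shape[:axis], indices.shape, data.shape[axis + 1:]))
    assert tuple(shape) == np.take(data, indices, axis).shape  # (2, 3, 4, 7)
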
diff --git a/model-optimizer/extensions/ops/identity.py b/model-optimizer/extensions/ops/identity.py
new file mode 100644 (file)
index 0000000..30995a1
--- /dev/null
@@ -0,0 +1,39 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+
+from mo.front.common.partial_infer.elemental import copy_shape_infer
+from mo.ops.op import Op
+from mo.front.common.partial_infer.utils import mark_input_bins
+
+
+class IdentityOp(Op):
+    op = 'Identity'
+    enabled = True
+
+    def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
+        super().__init__(graph, {
+            'type': __class__.op,
+            'op': __class__.op,
+            'identity': True,
+            'infer': IdentityOp.shape_infer
+        }, attrs)
+
+    @staticmethod
+    def shape_infer(node):
+        copy_shape_infer(node)
+
index 380eaf2..8768582 100644 (file)
@@ -35,6 +35,7 @@ class InterpOp(Op):
             'op': __class__.op,
             'factor': None,
             'align_corners': 1,
+            'parse_2nd_input': 'value',
             'infer': InterpOp.interp_infer
         }
         super().__init__(graph, mandatory_props, attrs)
@@ -57,7 +58,15 @@ class InterpOp(Op):
         assert len(layout) == 4
         if len(node.in_nodes()) == 2:
             src_shape = node.in_node(0).shape
-            dst_shape = node.in_node(1).value
+            dst_shape = node.in_node(1).shape
+
+            # Caffe can also have two inputs; in that case the target size is taken from the shape of the second input
+            if node.parse_2nd_input == 'shape':
+                dst_shape = [dst_shape[get_height_dim(layout, 4)], dst_shape[get_width_dim(layout, 4)]]
+            else:
+                # TF case: the second input holds the target size as a value
+                dst_shape = node.in_node(1).value
+
             if src_shape is None or dst_shape is None or len(src_shape) != 4 or len(dst_shape) != 2:
                 log.error(
                 'Node {} with op {} cannot be converted to Resample layer because there is not enough info about '
index 5dc9b60..0f3c63b 100644 (file)
@@ -26,25 +26,33 @@ from mo.utils.utils import refer_to_faq_msg
 
 
 class LSTMSequence(Op):
-    ''' Implements a layer that incorporates LSTM cell in a loop like it is specified in ONNX
+    """ Implements a layer that incorporates LSTM cell in a loop like it is specified in ONNX
 
         It is assumed that there is no equivalent of this op in IE,
         so it is considered as intermediate operation that will be translated differently.
         We define a type for this operation to enable debugging at IE side.
 
-        There are several flavors of this op depending on how it was created:
+        There are several flavors of this op depending on how it was created and in which framework.
+        There are several attributes that specify the LSTM flavor:
             - ONNX/LSTM gives this op in non-normalized form and will require normalization
                 as a separate transformation (see LSTMSequenceNormalize middle transformation);
-                in this case blobs_wrb=True
-            - other sources should give already normalized operation (with blobs_wrb=False).
-    '''
+                in this case blobs_wrb=True. Normalized weights/biases for MatMul are used when
+                blobs_wrb=True.
+            - ONNX/LSTM defines output shape as 4D: [seq_length, num_directions, batch_size,
+                hidden_size], where num_directions = 1 is supported only. In this case
+                has_num_directions=True. Otherwise, output is 3D and doesn't contain num_directions.
+            - Depending on the original framework, the `format` attribute is specified accordingly.
+                Its value controls which normalize transformations are called.
+    """
     op = 'LSTMSequence'
 
     def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
         mandatory_props = {
-            'type': '__LSTMSequence',
+            'type': '__LSTMSequence',   # should never be emitted to IR; for debugging purposes
             'op': __class__.op,
             'blobs_wrb': False,
+            'has_num_directions': False,
+            'direction': 'forward',
             'infer': __class__.infer
         }
         super().__init__(graph, mandatory_props, attrs)
@@ -52,9 +60,12 @@ class LSTMSequence(Op):
     def supported_attrs(self):
         return [
             'hidden_size',  # number of the elements in hidden cell size
+            'direction',    # one of 'forward', 'reverse', or 'bidirectional'
             'batch_dim',    # batch dimension index in input/output shape
-            'sequence_dim', # sequence dimension index in input/output shape
+            'sequence_dim', # sequence dimension index in input shape
             'blobs_wrb',    # input blobs have three separate components W, R and B like in ONNX/LSTM
+            'has_num_directions',  # if True, output shape has 4 dimensions; 3D otherwise
+            'format',       # format type of input blobs for different frameworks (onnx, tf, mxnet)
         ]
 
     def backend_attrs(self):
@@ -65,22 +76,42 @@ class LSTMSequence(Op):
     @staticmethod
     def infer(node: Node):
         # there are limitations coming from ONNX LSTM definition and normalization rules
-        assert len(node.in_nodes()) >= 3
+        assert len(node.in_nodes()) >= 3  # X, W and R
         assert len(node.in_nodes()) <= 7
         assert len(node.out_nodes()) <= 3
         assert node.batch_dim <= 1
         assert node.sequence_dim <=1
         assert node.batch_dim != node.sequence_dim
 
+        assert node.direction in ['forward', 'reverse', 'bidirectional']
+
         if node.blobs_wrb:
             mark_input_bins(node, ['W', 'R', 'B'])
         else:
             mark_input_bins(node)
         input_shape = node.in_node(0).shape
         assert len(input_shape) == 3
-        node.out_node(0).shape = np.array([input_shape[0], input_shape[1], node.hidden_size], dtype=np.int64)
-        if len(node.out_nodes()) > 1:
-            state_size = np.array([input_shape[1], node.hidden_size], dtype=np.int64)
-            node.out_node(1).shape = state_size.copy()
-            if len(node.out_nodes()) > 2:
-                node.out_node(2).shape = state_size.copy()
+        out_shape = np.array([input_shape[node.sequence_dim], input_shape[node.batch_dim], node.hidden_size], dtype=np.int64)
+        assert not node.has_num_directions or node.sequence_dim == 0, \
+            'If has_num_directions == True, then node.sequence_dim should be equal 0, but it is {}'.format(
+                node.sequence_dim)
+        num_directions = 2 if node.direction in ['bidirectional'] else 1
+        if node.has_num_directions:
+            # insert extra dimension to output shape for num_directions
+            out_shape = np.insert(out_shape, 1, np.int64(num_directions))
+        node.out_node(0).shape = out_shape
+        # extra outputs for hidden/cell states
+        state_size = np.array([input_shape[1], node.hidden_size], dtype=np.int64)
+        if node.has_num_directions:
+            state_size = np.insert(state_size, 0, num_directions)
+        for i in [1, 2]:
+            if i not in node.out_nodes():
+                data_node = Op._create_data_node(
+                    node.graph,
+                    name=node.node+'/ExtraOutput/' + str(i),
+                    attrs={'is_output': True, 'executable': None}
+                )
+                node.graph.add_edge(node.id, data_node.id, key=0, out=i)
+            else:
+                data_node = node.out_node(i)
+            data_node.shape = state_size.copy()
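
For an ONNX-style node (has_num_directions=True, sequence_dim=0) the arithmetic above works out as follows; a minimal standalone sketch with assumed toy sizes:

    import numpy as np

    seq_len, batch, hidden_size = 10, 4, 32
    num_directions = 2   # 'bidirectional'; 1 for 'forward'/'reverse'

    out_shape = np.array([seq_len, batch, hidden_size], dtype=np.int64)
    out_shape = np.insert(out_shape, 1, np.int64(num_directions))
    state_size = np.insert(np.array([batch, hidden_size], dtype=np.int64), 0, num_directions)

    assert (out_shape == [10, 2, 4, 32]).all() and (state_size == [2, 4, 32]).all()
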
index c3ea993..040cbf5 100644 (file)
@@ -26,7 +26,6 @@ class Merge(Op):
 
     def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
         mandatory_props = {
-            'type': __class__.op,
             'op': __class__.op,
             'infer': __class__.merge_infer
         }
@@ -1,5 +1,5 @@
 """
- Copyright (c) 2018 Intel Corporation
+ Copyright (c) 2017-2018 Intel Corporation
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
 """
 
 import numpy as np
+import networkx as nx
 
+from mo.graph.graph import Node
+from mo.ops.op import Op
+
+
+class PackOp(Op):
+    op = 'Pack'
+
+    def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
+        mandatory_props = {
+            'op': __class__.op
+        }
+        super().__init__(graph, mandatory_props, attrs)
+
+    def supported_attrs(self):
+        return [
+            'axis'
+        ]
 
-def up_sampling_infer(node):
-    if node.scale is None:
-        return
-    input_shape = node.in_node(0).shape
-    batch = input_shape[0]
-    channel = input_shape[1]
-    y = input_shape[2] * node.scale
-    x = input_shape[3] * node.scale
-    node.out_node(0).shape = np.array([batch, channel, y, x])
index 0caf0e9..e494097 100644 (file)
@@ -52,7 +52,7 @@ class PriorBoxOp(Op):
             'step',
             'step_h',
             'step_w',
-            'offset'
+            'offset',
         ]
 
     def backend_attrs(self):
@@ -71,6 +71,22 @@ class PriorBoxOp(Op):
     def priorbox_infer(node: Node):
         layout = node.graph.graph['layout']
         data_shape = node.in_node(0).shape
-        num_ratios = ((node.flip + 1) * len(node.aspect_ratio) + 1) * len(node.min_size) + len(node.max_size)
+
+        # Calculate all distinct aspect ratios (the first one is always 1).
+        # If flip is True, 1/x is added for every x in aspect_ratio.
+        ar_seen = [1.0]
+        ar_seen.extend(node.aspect_ratio.copy())
+        if node.flip:
+            for s in node.aspect_ratio:
+                ar_seen.append(1.0/s)
+
+        ar_seen = np.unique(np.array(ar_seen).round(decimals=6))
+        
+        num_ratios = 0
+        if len(node.min_size) > 0:
+            num_ratios = len(ar_seen) * len(node.min_size)
+
+        num_ratios = num_ratios + len(node.max_size)
+
         res_prod = data_shape[get_height_dim(layout, 4)] * data_shape[get_width_dim(layout, 4)] * num_ratios * 4
         node.out_node(0).shape = np.array([1, 2, res_prod], dtype=np.int64)
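
For example, with aspect_ratio=[2], flip=True and one min_size/max_size each, the distinct ratios are {0.5, 1, 2}, so num_ratios = 3 * 1 + 1 = 4. The same arithmetic standalone:

    import numpy as np

    aspect_ratio, flip = [2.0], True
    min_size, max_size = [30.0], [60.0]

    ar_seen = [1.0] + list(aspect_ratio)
    if flip:
        ar_seen += [1.0 / s for s in aspect_ratio]
    ar_seen = np.unique(np.round(ar_seen, decimals=6))

    num_ratios = (len(ar_seen) * len(min_size) if len(min_size) > 0 else 0) + len(max_size)
    assert num_ratios == 4
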
diff --git a/model-optimizer/extensions/ops/rank.py b/model-optimizer/extensions/ops/rank.py
new file mode 100644 (file)
index 0000000..ed17048
--- /dev/null
@@ -0,0 +1,40 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+import numpy as np
+
+from mo.graph.graph import Node
+from mo.ops.op import Op
+from mo.utils.error import Error
+
+
+class Rank(Op):
+    op = 'Rank'
+
+    def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
+        mandatory_props = {
+            'op': __class__.op,
+            'infer': __class__.infer,
+        }
+        super().__init__(graph, mandatory_props, attrs)
+
+    @staticmethod
+    def infer(node: Node):
+        rank = len(node.in_node(0).shape)
+        out_value = np.array(rank)
+        node.out_node().value = out_value
+        node.out_node().shape = out_value.shape
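
The inferred value is simply the rank of the input shape, emitted as a 0-d tensor; standalone:

    import numpy as np

    input_shape = np.array([2, 3, 224, 224], dtype=np.int64)
    out_value = np.array(len(input_shape))
    assert out_value.item() == 4 and out_value.shape == ()
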
similarity index 53%
rename from model-optimizer/extensions/ops/take.py
rename to model-optimizer/extensions/ops/reverse_sequence.py
index c2c80e4..ff7329d 100644 (file)
 """
 
 import logging as log
-
 import networkx as nx
 import numpy as np
 
 from mo.graph.graph import Node
-from mo.ops.op import Op
-from mo.utils.utils import refer_to_faq_msg
-
+from mo.ops.op import Op, PermuteAttrs
 
-class Take(Op):
-    ''' Implements regular numpy.take function
 
-        It is assumed that there is no equivalent of this op in IE,
-        so it is usually relevant to constant folding.
-    '''
-    op = 'Take'
+class ReverseSequence(Op):
+    op = 'ReverseSequence'
 
     def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
         mandatory_props = {
-            'type': None, # do not set type as there is no IE equivalent
+            # 'type' is intentionally not set: this op should not be translated to a real layer
+            'seq_dim': None,
+            'batch_dim': None,
             'op': __class__.op,
-            'axis': 0,
-            'infer': __class__.infer
+            'infer': __class__.infer,
         }
         super().__init__(graph, mandatory_props, attrs)
 
     def supported_attrs(self):
         return [
-            'axis',
         ]
 
     @staticmethod
-    def infer(node: Node):
-        assert len(node.in_nodes()) == 2
-        data = node.in_node(0).value
-        indices = node.in_node(1).value
-        assert data is not None
-        assert indices is not None
-        assert node.axis is not None
-
-        node.out_node(0).value = np.take(data, indices, node.axis)
-        node.out_node(0).shape = np.array(node.out_node(0).value.shape, dtype=np.int64)
+    def infer(node):
+        if not node.has_valid('seq_dim'):
+            assert 1 in node.in_nodes()
+            assert node.in_node(1).has_valid('value')
+            assert node.in_node(1).value.size == 1
+            node['seq_dim'] = node.in_node(1).value.item()
+            node.graph.remove_edge(node.in_node(1).id, node.id)
+        assert len(node.out_nodes()) == 1
+        node.out_node().shape = node.in_node().shape.copy()
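
When seq_dim arrives as the second input rather than as an attribute, it must be a scalar constant; a minimal sketch of that extraction, divorced from the Node machinery:

    import numpy as np

    def extract_seq_dim(value) -> int:
        # the second input must carry exactly one constant element
        value = np.asarray(value)
        assert value.size == 1
        return value.item()

    assert extract_seq_dim(np.array([1])) == 1
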
  limitations under the License.
 """
 
+import networkx as nx
+
 from mo.front.common.partial_infer.elemental import copy_shape_infer
+from mo.ops.op import Op
+
 
+class ShuffleChannelOp(Op):
+    """
+    Op for the ShuffleChannel layer. It will be replaced by the ShuffleChannel MiddleReplacer.
+    """
+    op = 'ShuffleChannel'
+    enabled = True
 
-def tf_softmax_ext(pb):
-    return {
-        'type': 'SoftMax',
-        'infer': copy_shape_infer
-    }
+    def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
+        super().__init__(graph, {
+            'type': None,
+            'op': __class__.op,
+            'infer': copy_shape_infer
+        }, attrs)
  limitations under the License.
 """
 
-from mo.front.common.partial_infer.elemental import copy_shape_infer
+import networkx as nx
+from mo.ops.op import Op
 
 
-def softmax_ext(pb_layer, pb_model):
-    param = pb_layer.softmax_param
+class Splice(Op):
+    op = 'Splice'
+
+    def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
+        mandatory_props = {
+            'type': None,
+            'op': __class__.op,
+        }
+        super().__init__(graph, mandatory_props, attrs)
 
-    return {
-        'type': 'SoftMax',
-        'axis': param.axis,
-        'infer': copy_shape_infer
-    }
index db8f3c4..7c1fd42 100644 (file)
  limitations under the License.
 """
 
-import logging as log
-
 import networkx as nx
-import numpy as np
 
 from mo.front.common.partial_infer.split import tf_split_v_infer
-from mo.graph.graph import Node
 from mo.ops.op import Op
 
 
@@ -32,10 +28,13 @@ class SplitV(Op):
         super().__init__(graph, {
             'type': 'Split',
             'op': 'SplitV',
-            'axis' : 1,
+            'axis': 1,
             'input_port': 0,
             'infer': tf_split_v_infer
         }, attrs)
-        
+
     def supported_attrs(self):
-        return ['axis', 'split_sizes']
+        return ['axis', 'size_splits']
+
+    def backend_attrs(self):
+        return ['axis']
diff --git a/model-optimizer/extensions/ops/stop_gradient.py b/model-optimizer/extensions/ops/stop_gradient.py
new file mode 100644 (file)
index 0000000..58ad9bc
--- /dev/null
@@ -0,0 +1,39 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+
+from mo.front.common.partial_infer.elemental import copy_shape_infer
+from mo.ops.op import Op
+from mo.front.common.partial_infer.utils import mark_input_bins
+
+
+class StopGradientOp(Op):
+    op = 'StopGradient'
+    enabled = True
+
+    def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
+        super().__init__(graph, {
+            'type': __class__.op,
+            'op': __class__.op,
+            'identity': True,
+            'infer': StopGradientOp.shape_infer
+        }, attrs)
+
+    @staticmethod
+    def shape_infer(node):
+        copy_shape_infer(node)
+
diff --git a/model-optimizer/extensions/ops/swapaxes.py b/model-optimizer/extensions/ops/swapaxes.py
new file mode 100644 (file)
index 0000000..e8507fd
--- /dev/null
@@ -0,0 +1,35 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+
+from mo.graph.graph import Node
+from mo.ops.permute import Permute
+
+
+class SwapAxes(Permute):
+    op = 'SwapAxis'
+    enabled = False
+
+    def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
+        attrs.update({'infer': SwapAxes.infer})
+        super().__init__(graph, attrs)
+
+    @staticmethod
+    def infer(node: Node):
+        node.order = list(range(node.in_node().shape.size))
+        node.order[node.dim2], node.order[node.dim1] = node.order[node.dim1], node.order[node.dim2]
+        Permute.infer(node)
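
The permutation order is just the identity with dim1 and dim2 exchanged; e.g. for a rank-4 input with dim1=1 and dim2=3:

    rank, dim1, dim2 = 4, 1, 3
    order = list(range(rank))
    order[dim2], order[dim1] = order[dim1], order[dim2]
    assert order == [0, 3, 2, 1]
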
index 3bfb1e4..cb6da98 100755 (executable)
@@ -46,7 +46,7 @@ for ((i=1;i <= $#;i++)) {
         esac
 }
 
-SCRIPTDIR="$(cd "$(dirname "$0")" && pwd)"
+SCRIPTDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 if [[ -f /etc/centos-release ]]; then
     DISTRO="centos"
index b903968..72e3283 100644 (file)
@@ -17,7 +17,6 @@
 from mo.front.caffe.extractors.batchnorm import batch_norm_ext
 from mo.front.caffe.extractors.concat import concat_ext
 from mo.front.caffe.extractors.eltwise import eltwise_ext
-from mo.front.caffe.extractors.flatten import flatten_ext
 from mo.front.caffe.extractors.inner_product import inner_product_ext
 from mo.front.caffe.extractors.input import global_input_ext, input_ext
 from mo.front.caffe.extractors.lrn import lrn_ext
@@ -29,10 +28,9 @@ from mo.front.caffe.extractors.reshape import reshape_ext
 from mo.front.caffe.extractors.roipooling import roipooling_ext
 from mo.front.caffe.extractors.scale import scale_ext
 from mo.front.caffe.extractors.slice import slice_ext
-from mo.front.caffe.extractors.softmax import softmax_ext
 from mo.front.common.partial_infer.elemental import copy_shape_infer
 from mo.front.common.register_custom_ops import extension_op_extractor
-from mo.front.extractor import CaffePythonFrontExtractorOp, FrontExtractorOp
+from mo.front.extractor import CaffePythonFrontExtractorOp
 from mo.graph.graph import Node
 from mo.ops.op import Op
 from mo.utils.error import Error
@@ -69,10 +67,8 @@ caffe_type_extractors = {
     # Utility Layers
     'concat': node_pb_arg(concat_ext),
     'eltwise': node_pb_arg(eltwise_ext),
-    'flatten': node_pb_arg(flatten_ext),
     'reshape': node_pb_arg(reshape_ext),
     'slice': node_pb_arg(slice_ext),
-    'softmax': node_pb_arg(softmax_ext),
 
     # Custom, implemented in IE, SSD-specific
     'permute': node_pb_arg(permute_ext),
index 66f3a7b..464a77f 100644 (file)
@@ -14,6 +14,7 @@
  limitations under the License.
 """
 
+from mo.front.caffe.collect_attributes import collect_attributes
 from mo.front.extractor import FrontExtractorOp
 from mo.ops.activation import Activation
 
@@ -24,5 +25,9 @@ class ELUFrontExtractor(FrontExtractorOp):
 
     @staticmethod
     def extract(node):
-        Activation.update_node_stat(node, {'operation': 'elu'})
+        param = node.pb.elu_param
+        attrs = collect_attributes(param)
+        attrs['operation'] = 'elu'
+
+        Activation.update_node_stat(node, attrs)
         return ELUFrontExtractor.enabled
diff --git a/model-optimizer/mo/front/caffe/extractors/flatten.py b/model-optimizer/mo/front/caffe/extractors/flatten.py
deleted file mode 100644 (file)
index 2eb5f2d..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-"""
- Copyright (c) 2018 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-
-import numpy as np
-
-from mo.front.caffe.collect_attributes import merge_attrs
-from mo.front.common.partial_infer.flatten import flatten_infer
-
-
-def flatten_ext(pl, ml):
-    param = pl.flatten_param
-    update_attrs = {
-        'axis': param.axis,
-        'end_axis': param.end_axis,
-        'num_axes': 0
-    }
-    mapping_rule = merge_attrs(param, update_attrs)
-    mapping_rule.update({
-        'type': "Flatten",
-        'infer': flatten_infer
-    })
-    return mapping_rule
index 734ff8c..13deb99 100644 (file)
@@ -13,6 +13,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
+import logging as log
 
 from mo.front.common.partial_infer.elemental import single_output_infer
 from mo.front.common.partial_infer.reshape import tf_reshape_shape_infer
@@ -29,5 +30,12 @@ def reshape_ext(pl, ml):
         'dim': list(param.shape.dim),
         'infer': lambda node: single_output_infer(node, tf_reshape_shape_infer)
     }
+    if attrs['axis'] != 0:
+        log.error('The operation "Reshape" has attribute "axis" with unsupported value "{}"'.format(attrs['axis']))
+        return None
+    if attrs['num_axes'] != -1:
+        log.error('The operation "Reshape" has attribute "num_axes" with unsupported value "{}"'.format(
+            attrs['num_axes']))
+        return None
 
     return attrs
index 8ce0bd6..196b7d5 100644 (file)
@@ -16,7 +16,7 @@
 
 import numpy as np
 
-from mo.front.caffe.extractors.utils import weights_biases
+from mo.front.caffe.extractors.utils import embed_input, weights_biases
 from mo.front.common.partial_infer.elemental import copy_shape_infer
 from mo.utils.utils import NamedAttrsClass
 
@@ -26,11 +26,22 @@ def scale_ext(pl, ml):
     attrs = {
         'op': 'ScaleShift',
         'type': 'ScaleShift',
+        'axis': param.axis,
         'infer': copy_shape_infer
     }
-    if ml is None:
+    if ml is None and len(pl.bottom) == 1:
         # default weights and biases for scale layer if the caffemodel file doesn't contain them
         ml = NamedAttrsClass({'blobs': np.array([NamedAttrsClass({'data': np.array([1])}),
                                                  NamedAttrsClass({'data': np.array([0])})])})
-    attrs.update(weights_biases(param.bias_term, ml))
+    # scale with 1 input and 1 or 2 blobs
+    if ml and len(ml.blobs) != 0 and len(pl.bottom) == 1:
+        attrs.update(weights_biases(param.bias_term, ml))
+    # 2 inputs + bias
+    elif len(pl.bottom) == 2 and param.bias_term:
+        if ml is None or len(ml.blobs) == 0:
+            # default bias for scale layer with 2 inputs if the caffemodel file doesn't contain them
+            ml = NamedAttrsClass({'blobs': np.array([NamedAttrsClass({'data': np.array([0])})])})
+
+        embed_input(attrs, 1, 'biases', ml.blobs[0].data)
+
     return attrs
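
The branching above reduces to a small decision table. A sketch of that structure only (n_inputs and has_blobs are hypothetical stand-ins for len(pl.bottom) and len(ml.blobs) != 0; the real extractor embeds actual blob data):

    def scale_blob_case(n_inputs: int, has_blobs: bool, bias_term: bool) -> str:
        # sketch of the branch structure only; the real extractor embeds blob data
        if n_inputs == 1:
            return 'weights/biases from model' if has_blobs else 'default weights=1, biases=0'
        if n_inputs == 2 and bias_term:
            return 'bias from model' if has_blobs else 'default biases=0'
        return 'second input provides the scale; nothing embedded'

    assert scale_blob_case(1, False, True) == 'default weights=1, biases=0'
    assert scale_blob_case(2, False, True) == 'default biases=0'
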
index c8d1f37..953f88c 100644 (file)
@@ -30,6 +30,8 @@ def slice_ext(proto_layer, model_layer):
         'slice_point': param.slice_point,
     }
     mapping_rule = merge_attrs(param, update_attrs)
+    if 'slice_point' not in mapping_rule:
+        mapping_rule['slice_point'] = []
     mapping_rule.update({
         'type': 'Slice',
         'infer': caffe_slice_infer
index 729c442..69f63f2 100644 (file)
@@ -26,7 +26,7 @@ from google.protobuf.internal import api_implementation
 
 from mo.front.caffe.proto import caffe_pb2
 from mo.graph.graph import Node, unique_id
-from mo.utils.error import Error
+from mo.utils.error import Error, FrameworkError
 from mo.utils.utils import refer_to_faq_msg
 
 
@@ -103,18 +103,40 @@ def load_caffe_proto_model(proto_path: str, model_path: [str, None] = None):
                        'python -m easy_install protobuf-3.5.1-py($your_python_version)-win-amd64.egg \n' \
                        'set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp'
         print(message + '\n\n' + refer_to_faq_msg(80))
+
     # Read proto layers
-    proto = caffe_pb2.NetParameter()
-    with open(proto_path, "r") as file:
-        text_format.Merge(str(file.read()), proto)
+    try:
+        proto = caffe_pb2.NetParameter()
+        with open(proto_path, "r") as file:
+            text_format.Merge(str(file.read()), proto)
+    except Exception as e:
+        log.error('Exception message: {}\n\n'.format(e) +
+                  '    Possible reasons:\n' +
+                  '      1. {} does not exist\n'.format(proto_path) +
+                  '      2. {} does not have a valid structure, for example, it was downloaded as html\n'.format(proto_path) +
+                  '      3. {} contains custom layers or attributes that are not supported\n'.format(proto_path) +
+                  '         in Model Optimizer by default.\n\n' +
+                  '    After you have made sure that {} has a valid structure and still see this issue, then\n'.format(proto_path) +
+                  '    you need to generate a python parser for caffe.proto that was used when the model\n' +
+                  '    was created.\n' +
+                  '    Run "python3 generate_caffe_pb2.py --input_proto ${PATH_TO_CAFFE}/src/caffe/proto/caffe.proto"' +
+                  refer_to_faq_msg(1) + '\n\n', extra={'framework_error': True})
+        raise FrameworkError('Model Optimizer is not able to parse {}'.format(proto_path)) from e
 
     # Read model layer if exists
     model = None
-    if model_path:
-        model = caffe_pb2.NetParameter()
-        with open(model_path, "rb") as infile:
-            map = mmap.mmap(infile.fileno(), 0, access=mmap.ACCESS_READ)
-            model.MergeFromString(map)
+    try:
+        if model_path:
+            model = caffe_pb2.NetParameter()
+            with open(model_path, "rb") as infile:
+                map = mmap.mmap(infile.fileno(), 0, access=mmap.ACCESS_READ)
+                model.MergeFromString(map)
+    except Exception as e:
+        log.error('Exception message: {}\n\n'.format(e) +
+                  '    Possible reasons:\n' +
+                  '      1. {} does not exist\n'.format(model_path) +
+                  '      2. {} does not have a valid structure\n'.format(model_path), extra={'framework_error': True})
+        raise FrameworkError('Model Optimizer is not able to parse {}'.format(model_path)) from e
 
     return proto, model
 
index e79d2d5..c32fa78 100644 (file)
@@ -19,7 +19,7 @@ _sym_db = _symbol_database.Default()
 DESCRIPTOR = _descriptor.FileDescriptor(
   name='mo_caffe.proto',
   package='mo_caffe',
-  serialized_pb=_b('\n\x0emo_caffe.proto\x12\x08mo_caffe\"\x1c\n\tBlobShape\x12\x0f\n\x03\x64im\x18\x01 \x03(\x03\x42\x02\x10\x01\"\xcf\x01\n\tBlobProto\x12\"\n\x05shape\x18\x07 \x01(\x0b\x32\x13.mo_caffe.BlobShape\x12\x10\n\x04\x64\x61ta\x18\x05 \x03(\x02\x42\x02\x10\x01\x12\x10\n\x04\x64iff\x18\x06 \x03(\x02\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_data\x18\x08 \x03(\x01\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_diff\x18\t \x03(\x01\x42\x02\x10\x01\x12\x0e\n\x03num\x18\x01 \x01(\x05:\x01\x30\x12\x13\n\x08\x63hannels\x18\x02 \x01(\x05:\x01\x30\x12\x11\n\x06height\x18\x03 \x01(\x05:\x01\x30\x12\x10\n\x05width\x18\x04 \x01(\x05:\x01\x30\"5\n\x0f\x42lobProtoVector\x12\"\n\x05\x62lobs\x18\x01 \x03(\x0b\x32\x13.mo_caffe.BlobProto\"M\n\x1e\x43osineSimilarityBatchParameter\x12\x14\n\tpos_label\x18\x01 \x01(\x01:\x01\x31\x12\x15\n\tneg_label\x18\x02 \x01(\x01:\x02-1\"\x81\x01\n\x05\x44\x61tum\x12\x10\n\x08\x63hannels\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\r\n\x05width\x18\x03 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x04 \x01(\x0c\x12\r\n\x05label\x18\x05 \x01(\x05\x12\x12\n\nfloat_data\x18\x06 \x03(\x02\x12\x16\n\x07\x65ncoded\x18\x07 \x01(\x08:\x05\x66\x61lse\"A\n\x0cLabelMapItem\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05label\x18\x02 \x01(\x05\x12\x14\n\x0c\x64isplay_name\x18\x03 \x01(\t\"0\n\x08LabelMap\x12$\n\x04item\x18\x01 \x03(\x0b\x32\x16.mo_caffe.LabelMapItem\"\x87\x01\n\x0eNormalizedBBox\x12\x0c\n\x04xmin\x18\x01 \x01(\x02\x12\x0c\n\x04ymin\x18\x02 \x01(\x02\x12\x0c\n\x04xmax\x18\x03 \x01(\x02\x12\x0c\n\x04ymax\x18\x04 \x01(\x02\x12\r\n\x05label\x18\x05 \x01(\x05\x12\x11\n\tdifficult\x18\x06 \x01(\x08\x12\r\n\x05score\x18\x07 \x01(\x02\x12\x0c\n\x04size\x18\x08 \x01(\x02\"\xad\x02\n\x0f\x46illerParameter\x12\x16\n\x04type\x18\x01 \x01(\t:\x08\x63onstant\x12\x10\n\x05value\x18\x02 \x01(\x02:\x01\x30\x12\x0e\n\x03min\x18\x03 \x01(\x02:\x01\x30\x12\x0e\n\x03max\x18\x04 \x01(\x02:\x01\x31\x12\x0f\n\x04mean\x18\x05 \x01(\x02:\x01\x30\x12\x0e\n\x03std\x18\x06 \x01(\x02:\x01\x31\x12\x12\n\x06sparse\x18\x07 \x01(\x05:\x02-1\x12\x45\n\rvariance_norm\x18\x08 \x01(\x0e\x32&.mo_caffe.FillerParameter.VarianceNorm:\x06\x46\x41N_IN\x12\x0c\n\x04\x66ile\x18\t \x01(\t\x12\x10\n\x08\x64iag_val\x18\n \x03(\x02\"4\n\x0cVarianceNorm\x12\n\n\x06\x46\x41N_IN\x10\x00\x12\x0b\n\x07\x46\x41N_OUT\x10\x01\x12\x0b\n\x07\x41VERAGE\x10\x02\"\xed\x02\n\x0cNetParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05input\x18\x03 \x03(\t\x12(\n\x0binput_shape\x18\x08 \x03(\x0b\x32\x13.mo_caffe.BlobShape\x12\x11\n\tinput_dim\x18\x04 \x03(\x05\x12\x1d\n\x0e\x66orce_backward\x18\x05 \x01(\x08:\x05\x66\x61lse\x12!\n\x05state\x18\x06 \x01(\x0b\x32\x12.mo_caffe.NetState\x12\x19\n\ndebug_info\x18\x07 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0cprofile_info\x18\t \x01(\x08:\x05\x66\x61lse\x12\x18\n\x0cprofile_iter\x18\n \x01(\x05:\x02\x35\x30\x12\x1a\n\x0eprofile_warmup\x18\x0b \x01(\x05:\x02\x31\x30\x12\'\n\x05layer\x18\x64 \x03(\x0b\x32\x18.mo_caffe.LayerParameter\x12*\n\x06layers\x18\x02 \x03(\x0b\x32\x1a.mo_caffe.V1LayerParameter\"\xf4\n\n\x0fSolverParameter\x12\x0b\n\x03net\x18\x18 \x01(\t\x12)\n\tnet_param\x18\x19 \x01(\x0b\x32\x16.mo_caffe.NetParameter\x12\x11\n\ttrain_net\x18\x01 \x01(\t\x12\x10\n\x08test_net\x18\x02 \x03(\t\x12/\n\x0ftrain_net_param\x18\x15 \x01(\x0b\x32\x16.mo_caffe.NetParameter\x12.\n\x0etest_net_param\x18\x16 \x03(\x0b\x32\x16.mo_caffe.NetParameter\x12\'\n\x0btrain_state\x18\x1a \x01(\x0b\x32\x12.mo_caffe.NetState\x12&\n\ntest_state\x18\x1b 
\x03(\x0b\x32\x12.mo_caffe.NetState\x12\x11\n\ttest_iter\x18\x03 \x03(\x05\x12\x18\n\rtest_interval\x18\x04 \x01(\x05:\x01\x30\x12 \n\x11test_compute_loss\x18\x13 \x01(\x08:\x05\x66\x61lse\x12!\n\x13test_initialization\x18  \x01(\x08:\x04true\x12\x0f\n\x07\x62\x61se_lr\x18\x05 \x01(\x02\x12\x0f\n\x07\x64isplay\x18\x06 \x01(\x05\x12\x17\n\x0c\x61verage_loss\x18! \x01(\x05:\x01\x31\x12\x10\n\x08max_iter\x18\x07 \x01(\x05\x12\x14\n\titer_size\x18$ \x01(\x05:\x01\x31\x12\x11\n\tlr_policy\x18\x08 \x01(\t\x12\r\n\x05gamma\x18\t \x01(\x02\x12\r\n\x05power\x18\n \x01(\x02\x12\x10\n\x08momentum\x18\x0b \x01(\x02\x12\x14\n\x0cweight_decay\x18\x0c \x01(\x02\x12\x1f\n\x13regularization_type\x18\x1d \x01(\t:\x02L2\x12\x10\n\x08stepsize\x18\r \x01(\x05\x12\x11\n\tstepvalue\x18\" \x03(\x05\x12\x17\n\x0fplateau_winsize\x18* \x03(\x05\x12\x1a\n\x0e\x63lip_gradients\x18# \x01(\x02:\x02-1\x12\x13\n\x08snapshot\x18\x0e \x01(\x05:\x01\x30\x12\x17\n\x0fsnapshot_prefix\x18\x0f \x01(\t\x12\x1c\n\rsnapshot_diff\x18\x10 \x01(\x08:\x05\x66\x61lse\x12N\n\x0fsnapshot_format\x18% \x01(\x0e\x32(.mo_caffe.SolverParameter.SnapshotFormat:\x0b\x42INARYPROTO\x12>\n\x0bsolver_mode\x18\x11 \x01(\x0e\x32$.mo_caffe.SolverParameter.SolverMode:\x03GPU\x12\x14\n\tdevice_id\x18\x12 \x01(\x05:\x01\x30\x12\x17\n\x0brandom_seed\x18\x14 \x01(\x03:\x02-1\x12\x11\n\x04type\x18( \x01(\t:\x03SGD\x12\x14\n\x05\x64\x65lta\x18\x1f \x01(\x02:\x05\x31\x65-08\x12\x18\n\tmomentum2\x18\' \x01(\x02:\x05\x30.999\x12\x17\n\trms_decay\x18& \x01(\x02:\x04\x30.99\x12\x19\n\ndebug_info\x18\x17 \x01(\x08:\x05\x66\x61lse\x12\"\n\x14snapshot_after_train\x18\x1c \x01(\x08:\x04true\x12>\n\x0bsolver_type\x18\x1e \x01(\x0e\x32$.mo_caffe.SolverParameter.SolverType:\x03SGD\x12\x1f\n\x11layer_wise_reduce\x18) \x01(\x08:\x04true\"+\n\x0eSnapshotFormat\x12\x08\n\x04HDF5\x10\x00\x12\x0f\n\x0b\x42INARYPROTO\x10\x01\"\x1e\n\nSolverMode\x12\x07\n\x03\x43PU\x10\x00\x12\x07\n\x03GPU\x10\x01\"U\n\nSolverType\x12\x07\n\x03SGD\x10\x00\x12\x0c\n\x08NESTEROV\x10\x01\x12\x0b\n\x07\x41\x44\x41GRAD\x10\x02\x12\x0b\n\x07RMSPROP\x10\x03\x12\x0c\n\x08\x41\x44\x41\x44\x45LTA\x10\x04\x12\x08\n\x04\x41\x44\x41M\x10\x05\"\xa8\x01\n\x0bSolverState\x12\x0c\n\x04iter\x18\x01 \x01(\x05\x12\x13\n\x0blearned_net\x18\x02 \x01(\t\x12$\n\x07history\x18\x03 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x17\n\x0c\x63urrent_step\x18\x04 \x01(\x05:\x01\x30\x12\x1b\n\x0cminimum_loss\x18\x05 \x01(\x02:\x05\x31\x65+38\x12\x1a\n\x0fiter_last_event\x18\x06 \x01(\x05:\x01\x30\"Q\n\x08NetState\x12$\n\x05phase\x18\x01 \x01(\x0e\x32\x0f.mo_caffe.Phase:\x04TEST\x12\x10\n\x05level\x18\x02 \x01(\x05:\x01\x30\x12\r\n\x05stage\x18\x03 \x03(\t\"v\n\x0cNetStateRule\x12\x1e\n\x05phase\x18\x01 \x01(\x0e\x32\x0f.mo_caffe.Phase\x12\x11\n\tmin_level\x18\x02 \x01(\x05\x12\x11\n\tmax_level\x18\x03 \x01(\x05\x12\r\n\x05stage\x18\x04 \x03(\t\x12\x11\n\tnot_stage\x18\x05 \x03(\t\"\xad\x02\n\x1bSpatialTransformerParameter\x12\x1e\n\x0etransform_type\x18\x01 \x01(\t:\x06\x61\x66\x66ine\x12\x1e\n\x0csampler_type\x18\x02 \x01(\t:\x08\x62ilinear\x12\x10\n\x08output_H\x18\x03 \x01(\x05\x12\x10\n\x08output_W\x18\x04 \x01(\x05\x12\x1b\n\rto_compute_dU\x18\x05 \x01(\x08:\x04true\x12\x11\n\ttheta_1_1\x18\x06 \x01(\x01\x12\x11\n\ttheta_1_2\x18\x07 \x01(\x01\x12\x11\n\ttheta_1_3\x18\x08 \x01(\x01\x12\x11\n\ttheta_2_1\x18\t \x01(\x01\x12\x11\n\ttheta_2_2\x18\n \x01(\x01\x12\x11\n\ttheta_2_3\x18\x0b \x01(\x01\x12\x1b\n\x0c\x64\x65_transform\x18\x0c \x01(\x08:\x05\x66\x61lse\"(\n\x12PowerFileParameter\x12\x12\n\nshift_file\x18\x01 
\x01(\t\"5\n\x0fSTLossParameter\x12\x10\n\x08output_H\x18\x01 \x02(\x05\x12\x10\n\x08output_W\x18\x02 \x02(\x05\"%\n\x10LocLossParameter\x12\x11\n\tthreshold\x18\x01 \x02(\x01\"\xa6\x01\n\tParamSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x34\n\nshare_mode\x18\x02 \x01(\x0e\x32 .mo_caffe.ParamSpec.DimCheckMode\x12\x12\n\x07lr_mult\x18\x03 \x01(\x02:\x01\x31\x12\x15\n\ndecay_mult\x18\x04 \x01(\x02:\x01\x31\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\xb1#\n\x0eLayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0e\n\x06\x62ottom\x18\x03 \x03(\t\x12\x0b\n\x03top\x18\x04 \x03(\t\x12\x1e\n\x05phase\x18\n \x01(\x0e\x32\x0f.mo_caffe.Phase\x12\x13\n\x0bloss_weight\x18\x05 \x03(\x02\x12\"\n\x05param\x18\x06 \x03(\x0b\x32\x13.mo_caffe.ParamSpec\x12\"\n\x05\x62lobs\x18\x07 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x16\n\x0epropagate_down\x18\x0b \x03(\x08\x12\'\n\x07include\x18\x08 \x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12\'\n\x07\x65xclude\x18\t \x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12:\n\x0ftransform_param\x18\x64 \x01(\x0b\x32!.mo_caffe.TransformationParameter\x12+\n\nloss_param\x18\x65 \x01(\x0b\x32\x17.mo_caffe.LossParameter\x12\x33\n\x0e\x61\x63\x63uracy_param\x18\x66 \x01(\x0b\x32\x1b.mo_caffe.AccuracyParameter\x12/\n\x0c\x61rgmax_param\x18g \x01(\x0b\x32\x19.mo_caffe.ArgMaxParameter\x12\x37\n\x10\x62\x61tch_norm_param\x18\x8b\x01 \x01(\x0b\x32\x1c.mo_caffe.BatchNormParameter\x12,\n\nbias_param\x18\x8d\x01 \x01(\x0b\x32\x17.mo_caffe.BiasParameter\x12I\n\x19\x63hannel_permutation_param\x18\x92? \x01(\x0b\x32%.mo_caffe.ChannelPermutationParameter\x12/\n\x0c\x63oncat_param\x18h \x01(\x0b\x32\x19.mo_caffe.ConcatParameter\x12\x42\n\x16\x63ontrastive_loss_param\x18i \x01(\x0b\x32\".mo_caffe.ContrastiveLossParameter\x12\x39\n\x11\x63onvolution_param\x18j \x01(\x0b\x32\x1e.mo_caffe.ConvolutionParameter\x12,\n\ncrop_param\x18\x90\x01 \x01(\x0b\x32\x17.mo_caffe.CropParameter\x12\x39\n\x11\x63tc_decoder_param\x18\x95\x01 \x01(\x0b\x32\x1d.mo_caffe.CTCDecoderParameter\x12\x33\n\x0e\x63tc_loss_param\x18\x94\x01 \x01(\x0b\x32\x1a.mo_caffe.CTCLossParameter\x12+\n\ndata_param\x18k \x01(\x0b\x32\x17.mo_caffe.DataParameter\x12\x31\n\rdropout_param\x18l \x01(\x0b\x32\x1a.mo_caffe.DropoutParameter\x12\x36\n\x10\x64ummy_data_param\x18m \x01(\x0b\x32\x1c.mo_caffe.DummyDataParameter\x12\x31\n\reltwise_param\x18n \x01(\x0b\x32\x1a.mo_caffe.EltwiseParameter\x12*\n\telu_param\x18\x8c\x01 \x01(\x0b\x32\x16.mo_caffe.ELUParameter\x12.\n\x0b\x65mbed_param\x18\x89\x01 \x01(\x0b\x32\x18.mo_caffe.EmbedParameter\x12)\n\texp_param\x18o \x01(\x0b\x32\x16.mo_caffe.ExpParameter\x12\x32\n\rflatten_param\x18\x87\x01 \x01(\x0b\x32\x1a.mo_caffe.FlattenParameter\x12*\n\tgrn_param\x18\xd5\x01 \x01(\x0b\x32\x16.mo_caffe.GRNParameter\x12\x34\n\x0fhdf5_data_param\x18p \x01(\x0b\x32\x1b.mo_caffe.HDF5DataParameter\x12\x38\n\x11hdf5_output_param\x18q \x01(\x0b\x32\x1d.mo_caffe.HDF5OutputParameter\x12\x36\n\x10hinge_loss_param\x18r \x01(\x0b\x32\x1c.mo_caffe.HingeLossParameter\x12\x36\n\x10image_data_param\x18s \x01(\x0b\x32\x1c.mo_caffe.ImageDataParameter\x12<\n\x13infogain_loss_param\x18t \x01(\x0b\x32\x1f.mo_caffe.InfogainLossParameter\x12<\n\x13inner_product_param\x18u \x01(\x0b\x32\x1f.mo_caffe.InnerProductParameter\x12.\n\x0binput_param\x18\x8f\x01 \x01(\x0b\x32\x18.mo_caffe.InputParameter\x12*\n\tlog_param\x18\x86\x01 \x01(\x0b\x32\x16.mo_caffe.LogParameter\x12)\n\tlrn_param\x18v \x01(\x0b\x32\x16.mo_caffe.LRNParameter\x12\x38\n\x11memory_data_param\x18w 
\x01(\x0b\x32\x1d.mo_caffe.MemoryDataParameter\x12)\n\tmvn_param\x18x \x01(\x0b\x32\x16.mo_caffe.MVNParameter\x12\x36\n\x0fparameter_param\x18\x91\x01 \x01(\x0b\x32\x1c.mo_caffe.ParameterParameter\x12\x31\n\rpooling_param\x18y \x01(\x0b\x32\x1a.mo_caffe.PoolingParameter\x12\x32\n\rpermute_param\x18\x9a\x01 \x01(\x0b\x32\x1a.mo_caffe.PermuteParameter\x12-\n\x0bpower_param\x18z \x01(\x0b\x32\x18.mo_caffe.PowerParameter\x12.\n\x0bprelu_param\x18\x83\x01 \x01(\x0b\x32\x18.mo_caffe.PReLUParameter\x12\x30\n\x0cpython_param\x18\x82\x01 \x01(\x0b\x32\x19.mo_caffe.PythonParameter\x12\x36\n\x0frecurrent_param\x18\x92\x01 \x01(\x0b\x32\x1c.mo_caffe.RecurrentParameter\x12\x36\n\x0freduction_param\x18\x88\x01 \x01(\x0b\x32\x1c.mo_caffe.ReductionParameter\x12+\n\nrelu_param\x18{ \x01(\x0b\x32\x17.mo_caffe.ReLUParameter\x12\x32\n\rreshape_param\x18\x85\x01 \x01(\x0b\x32\x1a.mo_caffe.ReshapeParameter\x12\x32\n\rreverse_param\x18\x93\x01 \x01(\x0b\x32\x1a.mo_caffe.ReverseParameter\x12.\n\x0bscale_param\x18\x8e\x01 \x01(\x0b\x32\x18.mo_caffe.ScaleParameter\x12\x31\n\rsigmoid_param\x18| \x01(\x0b\x32\x1a.mo_caffe.SigmoidParameter\x12\x31\n\rsoftmax_param\x18} \x01(\x0b\x32\x1a.mo_caffe.SoftmaxParameter\x12*\n\tspp_param\x18\x84\x01 \x01(\x0b\x32\x16.mo_caffe.SPPParameter\x12-\n\x0bslice_param\x18~ \x01(\x0b\x32\x18.mo_caffe.SliceParameter\x12+\n\ntanh_param\x18\x7f \x01(\x0b\x32\x17.mo_caffe.TanHParameter\x12\x36\n\x0fthreshold_param\x18\x80\x01 \x01(\x0b\x32\x1c.mo_caffe.ThresholdParameter\x12,\n\ntile_param\x18\x8a\x01 \x01(\x0b\x32\x17.mo_caffe.TileParameter\x12\x39\n\x11window_data_param\x18\x81\x01 \x01(\x0b\x32\x1d.mo_caffe.WindowDataParameter\x12\x38\n\x08st_param\x18\x96\x01 \x01(\x0b\x32%.mo_caffe.SpatialTransformerParameter\x12\x31\n\rst_loss_param\x18\x97\x01 \x01(\x0b\x32\x19.mo_caffe.STLossParameter\x12\x37\n\x10power_file_param\x18\x98\x01 \x01(\x0b\x32\x1c.mo_caffe.PowerFileParameter\x12\x33\n\x0eloc_loss_param\x18\x99\x01 \x01(\x0b\x32\x1a.mo_caffe.LocLossParameter\x12\x34\n\x0eproposal_param\x18\xc9\x01 \x01(\x0b\x32\x1b.mo_caffe.ProposalParameter\x12P\n\x1d\x63osine_similarity_batch_param\x18\xca\x01 \x01(\x0b\x32(.mo_caffe.CosineSimilarityBatchParameter\x12\x45\n\x0erss_loss_param\x18\xcb\x01 \x01(\x0b\x32,.mo_caffe.RandomSamplingSoftmaxLossParameter\x12\x31\n\nnorm_param\x18\xcc\x01 \x01(\x0b\x32\x1c.mo_caffe.NormalizeParameter\x12\x39\n\x11roi_warping_param\x18\xcd\x01 \x01(\x0b\x32\x1d.mo_caffe.ROIWarpingParameter\x12=\n\x13psroi_pooling_param\x18\xcf\x01 \x01(\x0b\x32\x1f.mo_caffe.PSROIPoolingParameter\x12\x39\n\x11roi_pooling_param\x18\xd0\x01 \x01(\x0b\x32\x1d.mo_caffe.ROIPoolingParameter\x12>\n\x14smooth_l1_loss_param\x18\xd1\x01 \x01(\x0b\x32\x1f.mo_caffe.SmoothL1LossParameter\x12\x46\n\x18\x62ox_annotator_ohem_param\x18\xd2\x01 \x01(\x0b\x32#.mo_caffe.BoxAnnotatorOHEMParameter\x12\x43\n\x16\x64\x65tection_output_param\x18\xd3\x01 \x01(\x0b\x32\".mo_caffe.DetectionOutputParameter\x12\x35\n\x0fprior_box_param\x18\xd4\x01 \x01(\x0b\x32\x1b.mo_caffe.PriorBoxParameter\x12\x39\n\x11region_yolo_param\x18\xd6\x01 \x01(\x0b\x32\x1d.mo_caffe.RegionYoloParameter\x12\x37\n\x10reorg_yolo_param\x18\xd7\x01 \x01(\x0b\x32\x1c.mo_caffe.ReorgYoloParameter\x12.\n\x0brelu6_param\x18\xd8\x01 \x01(\x0b\x32\x18.mo_caffe.ReLU6Parameter\x12\x30\n\x0cinterp_param\x18\xd9\x01 \x01(\x0b\x32\x19.mo_caffe.InterpParameter\x12<\n\x12\x61ugmentation_param\x18\xda\x01 \x01(\x0b\x32\x1f.mo_caffe.AugmentationParameter\x12:\n\x11\x63orrelation_param\x18\xdb\x01 
\x01(\x0b\x32\x1e.mo_caffe.CorrelationParameter\x12\x34\n\x0eresample_param\x18\xdc\x01 \x01(\x0b\x32\x1b.mo_caffe.ResampleParameter\x12\x35\n\x0f\x66low_warp_param\x18\xdd\x01 \x01(\x0b\x32\x1b.mo_caffe.FlowWarpParameter\x12.\n\x0b\x61\x63\x63um_param\x18\xde\x01 \x01(\x0b\x32\x18.mo_caffe.AccumParameter\x12?\n\x14\x63oeff_schedule_param\x18\xdf\x01 \x01(\x0b\x32 .mo_caffe.CoeffScheduleParameter\"\x90\x01\n\x0fInterpParameter\x12\x11\n\x06height\x18\x01 \x01(\x05:\x01\x30\x12\x10\n\x05width\x18\x02 \x01(\x05:\x01\x30\x12\x16\n\x0bzoom_factor\x18\x03 \x01(\x05:\x01\x31\x12\x18\n\rshrink_factor\x18\x04 \x01(\x05:\x01\x31\x12\x12\n\x07pad_beg\x18\x05 \x01(\x05:\x01\x30\x12\x12\n\x07pad_end\x18\x06 \x01(\x05:\x01\x30\"n\n\"RandomSamplingSoftmaxLossParameter\x12 \n\x13random_sampling_num\x18\x01 \x01(\x05:\x03\x31\x30\x30\x12&\n\x16random_sampling_policy\x18\x02 \x01(\t:\x06random\"\xc8\x01\n\x11ProposalParameter\x12\x17\n\x0b\x66\x65\x61t_stride\x18\x01 \x01(\r:\x02\x31\x36\x12\x15\n\tbase_size\x18\x02 \x01(\r:\x02\x31\x36\x12\x14\n\x08min_size\x18\x03 \x01(\r:\x02\x31\x36\x12\r\n\x05ratio\x18\x04 \x03(\x02\x12\r\n\x05scale\x18\x05 \x03(\x02\x12\x1a\n\x0cpre_nms_topn\x18\x06 \x01(\r:\x04\x36\x30\x30\x30\x12\x1a\n\rpost_nms_topn\x18\x07 \x01(\r:\x03\x33\x30\x30\x12\x17\n\nnms_thresh\x18\x08 \x01(\x02:\x03\x30.7\"\x95\x01\n\x12NormalizeParameter\x12\x1c\n\x0e\x61\x63ross_spatial\x18\x01 \x01(\x08:\x04true\x12/\n\x0cscale_filler\x18\x02 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x1c\n\x0e\x63hannel_shared\x18\x03 \x01(\x08:\x04true\x12\x12\n\x03\x65ps\x18\x04 \x01(\x02:\x05\x31\x65-10\"!\n\x10PermuteParameter\x12\r\n\x05order\x18\x01 \x03(\r\"\xb6\x01\n\x17TransformationParameter\x12\x10\n\x05scale\x18\x01 \x01(\x02:\x01\x31\x12\x15\n\x06mirror\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x14\n\tcrop_size\x18\x03 \x01(\r:\x01\x30\x12\x11\n\tmean_file\x18\x04 \x01(\t\x12\x12\n\nmean_value\x18\x05 \x03(\x02\x12\x1a\n\x0b\x66orce_color\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\nforce_gray\x18\x07 \x01(\x08:\x05\x66\x61lse\"\xb4\x02\n\rLossParameter\x12\x14\n\x0cignore_label\x18\x01 \x01(\x05\x12G\n\rnormalization\x18\x03 \x01(\x0e\x32).mo_caffe.LossParameter.NormalizationMode:\x05VALID\x12\x11\n\tnormalize\x18\x02 \x01(\x08\x12\x1f\n\x14pre_fixed_normalizer\x18\x04 \x01(\x02:\x01\x31\x12$\n\x15weight_by_label_freqs\x18\x05 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63lass_weighting\x18\x06 \x03(\x02\"Q\n\x11NormalizationMode\x12\x08\n\x04\x46ULL\x10\x00\x12\t\n\x05VALID\x10\x01\x12\x0e\n\nBATCH_SIZE\x10\x02\x12\r\n\tPRE_FIXED\x10\x03\x12\x08\n\x04NONE\x10\x04\"L\n\x11\x41\x63\x63uracyParameter\x12\x10\n\x05top_k\x18\x01 \x01(\r:\x01\x31\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x14\n\x0cignore_label\x18\x03 \x01(\x05\"M\n\x0f\x41rgMaxParameter\x12\x1a\n\x0bout_max_val\x18\x01 \x01(\x08:\x05\x66\x61lse\x12\x10\n\x05top_k\x18\x02 \x01(\r:\x01\x31\x12\x0c\n\x04\x61xis\x18\x03 \x01(\x05\"D\n\x18\x43hannelPermutationAction\x12\x0c\n\x04\x63han\x18\x01 \x02(\r\x12\x0c\n\x04\x63opy\x18\x02 \x01(\r\x12\x0c\n\x04\x66ill\x18\x03 \x01(\x02\"\x9a\x01\n\x1b\x43hannelPermutationParameter\x12\x32\n\x06\x61\x63tion\x18\x01 \x03(\x0b\x32\".mo_caffe.ChannelPermutationAction\x12\x12\n\nnum_output\x18\x10 \x02(\r\x12\x1f\n\x10inplace_possible\x18\x11 \x01(\x08:\x05\x66\x61lse\x12\x12\n\x07version\x18\x12 \x01(\x05:\x01\x30\"9\n\x0f\x43oncatParameter\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x15\n\nconcat_dim\x18\x01 
\x01(\r:\x01\x31\"j\n\x12\x42\x61tchNormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12&\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x05\x30.999\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x31\x65-05\"J\n\x19\x42oxAnnotatorOHEMParameter\x12\x13\n\x0broi_per_img\x18\x01 \x02(\r\x12\x18\n\x0cignore_label\x18\x02 \x01(\x05:\x02-1\"`\n\rBiasParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12)\n\x06\x66iller\x18\x03 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\"L\n\x18\x43ontrastiveLossParameter\x12\x11\n\x06margin\x18\x01 \x01(\x02:\x01\x31\x12\x1d\n\x0elegacy_version\x18\x02 \x01(\x08:\x05\x66\x61lse\"\x85\x04\n\x14\x43onvolutionParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12\x0b\n\x03pad\x18\x03 \x03(\r\x12\x13\n\x0bkernel_size\x18\x04 \x03(\r\x12\x0e\n\x06stride\x18\x06 \x03(\r\x12\x10\n\x08\x64ilation\x18\x12 \x03(\r\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x10\n\x08kernel_h\x18\x0b \x01(\r\x12\x10\n\x08kernel_w\x18\x0c \x01(\r\x12\x10\n\x08stride_h\x18\r \x01(\r\x12\x10\n\x08stride_w\x18\x0e \x01(\r\x12\x10\n\x05group\x18\x05 \x01(\r:\x01\x31\x12\x30\n\rweight_filler\x18\x07 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x08 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12>\n\x06\x65ngine\x18\x0f \x01(\x0e\x32%.mo_caffe.ConvolutionParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x10 \x01(\x05:\x01\x31\x12\x1e\n\x0f\x66orce_nd_im2col\x18\x11 \x01(\x08:\x05\x66\x61lse\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"A\n\rCropParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x32\x12\x0e\n\x06offset\x18\x02 \x03(\r\x12\x0f\n\x07\x64imsize\x18\x03 \x03(\r\"P\n\x13\x43TCDecoderParameter\x12\x17\n\x0b\x62lank_index\x18\x01 \x01(\x05:\x02-1\x12 \n\x12\x63tc_merge_repeated\x18\x02 \x01(\x08:\x04true\"\xb2\x01\n\x10\x43TCLossParameter\x12\x17\n\x0coutput_delay\x18\x01 \x01(\x05:\x01\x30\x12\x17\n\x0b\x62lank_index\x18\x02 \x01(\x05:\x02-1\x12+\n\x1cpreprocess_collapse_repeated\x18\x03 \x01(\x08:\x05\x66\x61lse\x12 \n\x12\x63tc_merge_repeated\x18\x04 \x01(\x08:\x04true\x12\x1d\n\x12loss_calculation_t\x18\x05 \x01(\x05:\x01\x30\"\xa7\x02\n\rDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x34\n\x07\x62\x61\x63kend\x18\x08 \x01(\x0e\x32\x1a.mo_caffe.DataParameter.DB:\x07LEVELDB\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\"\n\x13\x66orce_encoded_color\x18\t \x01(\x08:\x05\x66\x61lse\x12\x13\n\x08prefetch\x18\n \x01(\r:\x01\x34\"\x1b\n\x02\x44\x42\x12\x0b\n\x07LEVELDB\x10\x00\x12\x08\n\x04LMDB\x10\x01\"[\n\x1eNonMaximumSuppressionParameter\x12\x1a\n\rnms_threshold\x18\x01 \x01(\x02:\x03\x30.3\x12\r\n\x05top_k\x18\x02 \x01(\x05\x12\x0e\n\x03\x65ta\x18\x03 \x01(\x02:\x01\x31\"\x99\x04\n\x0fResizeParameter\x12\x0f\n\x04prob\x18\x01 \x01(\x02:\x01\x31\x12@\n\x0bresize_mode\x18\x02 \x01(\x0e\x32%.mo_caffe.ResizeParameter.Resize_mode:\x04WARP\x12\x11\n\x06height\x18\x03 \x01(\r:\x01\x30\x12\x10\n\x05width\x18\x04 \x01(\r:\x01\x30\x12\x17\n\x0cheight_scale\x18\x08 \x01(\r:\x01\x30\x12\x16\n\x0bwidth_scale\x18\t \x01(\r:\x01\x30\x12>\n\x08pad_mode\x18\x05 
\x01(\x0e\x32\".mo_caffe.ResizeParameter.Pad_mode:\x08\x43ONSTANT\x12\x11\n\tpad_value\x18\x06 \x03(\x02\x12:\n\x0binterp_mode\x18\x07 \x03(\x0e\x32%.mo_caffe.ResizeParameter.Interp_mode\"G\n\x0bResize_mode\x12\x08\n\x04WARP\x10\x01\x12\x12\n\x0e\x46IT_SMALL_SIZE\x10\x02\x12\x1a\n\x16\x46IT_LARGE_SIZE_AND_PAD\x10\x03\":\n\x08Pad_mode\x12\x0c\n\x08\x43ONSTANT\x10\x01\x12\x0c\n\x08MIRRORED\x10\x02\x12\x12\n\x0eREPEAT_NEAREST\x10\x03\"I\n\x0bInterp_mode\x12\n\n\x06LINEAR\x10\x01\x12\x08\n\x04\x41REA\x10\x02\x12\x0b\n\x07NEAREST\x10\x03\x12\t\n\x05\x43UBIC\x10\x04\x12\x0c\n\x08LANCZOS4\x10\x05\"\xdb\x01\n\x13SaveOutputParameter\x12\x18\n\x10output_directory\x18\x01 \x01(\t\x12\x1a\n\x12output_name_prefix\x18\x02 \x01(\t\x12\x15\n\routput_format\x18\x03 \x01(\t\x12\x16\n\x0elabel_map_file\x18\x04 \x01(\t\x12\x16\n\x0ename_size_file\x18\x05 \x01(\t\x12\x16\n\x0enum_test_image\x18\x06 \x01(\r\x12/\n\x0cresize_param\x18\x07 \x01(\x0b\x32\x19.mo_caffe.ResizeParameter\"\x9d\x04\n\x18\x44\x65tectionOutputParameter\x12\x13\n\x0bnum_classes\x18\x01 \x01(\r\x12\x1c\n\x0eshare_location\x18\x02 \x01(\x08:\x04true\x12\x1e\n\x13\x62\x61\x63kground_label_id\x18\x03 \x01(\x05:\x01\x30\x12;\n\tnms_param\x18\x04 \x01(\x0b\x32(.mo_caffe.NonMaximumSuppressionParameter\x12\x38\n\x11save_output_param\x18\x05 \x01(\x0b\x32\x1d.mo_caffe.SaveOutputParameter\x12?\n\tcode_type\x18\x06 \x01(\x0e\x32$.mo_caffe.PriorBoxParameter.CodeType:\x06\x43ORNER\x12)\n\x1avariance_encoded_in_target\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x16\n\nkeep_top_k\x18\x07 \x01(\x05:\x02-1\x12\x1c\n\x14\x63onfidence_threshold\x18\t \x01(\x02\x12\x18\n\tvisualize\x18\n \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x13visualize_threshold\x18\x0b \x01(\x02\x12\x11\n\tsave_file\x18\x0c \x01(\t\x12\x17\n\x0binput_width\x18\r \x01(\x05:\x02-1\x12\x18\n\x0cinput_height\x18\x0e \x01(\x05:\x02-1\x12\x18\n\nnormalized\x18\x0f \x01(\x08:\x04true\".\n\x10\x44ropoutParameter\x12\x1a\n\rdropout_ratio\x18\x01 \x01(\x02:\x03\x30.5\"\xa6\x01\n\x12\x44ummyDataParameter\x12.\n\x0b\x64\x61ta_filler\x18\x01 \x03(\x0b\x32\x19.mo_caffe.FillerParameter\x12\"\n\x05shape\x18\x06 \x03(\x0b\x32\x13.mo_caffe.BlobShape\x12\x0b\n\x03num\x18\x02 \x03(\r\x12\x10\n\x08\x63hannels\x18\x03 \x03(\r\x12\x0e\n\x06height\x18\x04 \x03(\r\x12\r\n\x05width\x18\x05 \x03(\r\"\xa8\x01\n\x10\x45ltwiseParameter\x12<\n\toperation\x18\x01 \x01(\x0e\x32$.mo_caffe.EltwiseParameter.EltwiseOp:\x03SUM\x12\r\n\x05\x63oeff\x18\x02 \x03(\x02\x12\x1e\n\x10stable_prod_grad\x18\x03 \x01(\x08:\x04true\"\'\n\tEltwiseOp\x12\x08\n\x04PROD\x10\x00\x12\x07\n\x03SUM\x10\x01\x12\x07\n\x03MAX\x10\x02\" \n\x0c\x45LUParameter\x12\x10\n\x05\x61lpha\x18\x01 \x01(\x02:\x01\x31\"\xb2\x01\n\x0e\x45mbedParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x11\n\tinput_dim\x18\x02 \x01(\r\x12\x17\n\tbias_term\x18\x03 \x01(\x08:\x04true\x12\x30\n\rweight_filler\x18\x04 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\"D\n\x0c\x45xpParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"9\n\x10\x46lattenParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x14\n\x08\x65nd_axis\x18\x02 \x01(\x05:\x02-1\"O\n\x11HDF5DataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x02 \x01(\r\x12\x16\n\x07shuffle\x18\x03 \x01(\x08:\x05\x66\x61lse\"(\n\x13HDF5OutputParameter\x12\x11\n\tfile_name\x18\x01 
\x01(\t\"a\n\x12HingeLossParameter\x12\x33\n\x04norm\x18\x01 \x01(\x0e\x32!.mo_caffe.HingeLossParameter.Norm:\x02L1\"\x16\n\x04Norm\x12\x06\n\x02L1\x10\x01\x12\x06\n\x02L2\x10\x02\"\x97\x02\n\x12ImageDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x15\n\nbatch_size\x18\x04 \x01(\r:\x01\x31\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x16\n\x07shuffle\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x15\n\nnew_height\x18\t \x01(\r:\x01\x30\x12\x14\n\tnew_width\x18\n \x01(\r:\x01\x30\x12\x16\n\x08is_color\x18\x0b \x01(\x08:\x04true\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\x0c \x01(\t:\x00\"\'\n\x15InfogainLossParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\"\xd1\x01\n\x15InnerProductParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12\x30\n\rweight_filler\x18\x03 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x04 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x0f\n\x04\x61xis\x18\x05 \x01(\x05:\x01\x31\x12\x18\n\ttranspose\x18\x06 \x01(\x08:\x05\x66\x61lse\"4\n\x0eInputParameter\x12\"\n\x05shape\x18\x01 \x03(\x0b\x32\x13.mo_caffe.BlobShape\"D\n\x0cLogParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"\xbe\x02\n\x0cLRNParameter\x12\x15\n\nlocal_size\x18\x01 \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x02 \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x03 \x01(\x02:\x04\x30.75\x12G\n\x0bnorm_region\x18\x04 \x01(\x0e\x32!.mo_caffe.LRNParameter.NormRegion:\x0f\x41\x43ROSS_CHANNELS\x12\x0c\n\x01k\x18\x05 \x01(\x02:\x01\x31\x12\x36\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1d.mo_caffe.LRNParameter.Engine:\x07\x44\x45\x46\x41ULT\"5\n\nNormRegion\x12\x13\n\x0f\x41\x43ROSS_CHANNELS\x10\x00\x12\x12\n\x0eWITHIN_CHANNEL\x10\x01\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\x1f\n\x0cGRNParameter\x12\x0f\n\x04\x62ias\x18\x01 \x01(\x02:\x01\x31\"Z\n\x13MemoryDataParameter\x12\x12\n\nbatch_size\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\x12\x0e\n\x06height\x18\x03 \x01(\r\x12\r\n\x05width\x18\x04 \x01(\r\"d\n\x0cMVNParameter\x12 \n\x12normalize_variance\x18\x01 \x01(\x08:\x04true\x12\x1e\n\x0f\x61\x63ross_channels\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x31\x65-09\"8\n\x12ParameterParameter\x12\"\n\x05shape\x18\x01 \x01(\x0b\x32\x13.mo_caffe.BlobShape\"\xc1\x03\n\x10PoolingParameter\x12\x38\n\x04pool\x18\x01 \x01(\x0e\x32%.mo_caffe.PoolingParameter.PoolMethod:\x03MAX\x12\x0e\n\x03pad\x18\x04 \x01(\r:\x01\x30\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x13\n\x0bkernel_size\x18\x02 \x01(\r\x12\x10\n\x08kernel_h\x18\x05 \x01(\r\x12\x10\n\x08kernel_w\x18\x06 \x01(\r\x12\x11\n\x06stride\x18\x03 \x01(\r:\x01\x31\x12\x10\n\x08stride_h\x18\x07 \x01(\r\x12\x10\n\x08stride_w\x18\x08 \x01(\r\x12:\n\x06\x65ngine\x18\x0b \x01(\x0e\x32!.mo_caffe.PoolingParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x1d\n\x0eglobal_pooling\x18\x0c \x01(\x08:\x05\x66\x61lse\x12\x17\n\tceil_mode\x18\r 
\x01(\x08:\x04true\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"F\n\x0ePowerParameter\x12\x10\n\x05power\x18\x01 \x01(\x02:\x01\x31\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"\xd4\x02\n\x11PriorBoxParameter\x12\x10\n\x08min_size\x18\x01 \x03(\x02\x12\x10\n\x08max_size\x18\x02 \x03(\x02\x12\x14\n\x0c\x61spect_ratio\x18\x03 \x03(\x02\x12\x12\n\x04\x66lip\x18\x04 \x01(\x08:\x04true\x12\x13\n\x04\x63lip\x18\x05 \x01(\x08:\x05\x66\x61lse\x12\x10\n\x08variance\x18\x06 \x03(\x02\x12\x10\n\x08img_size\x18\x07 \x01(\r\x12\r\n\x05img_h\x18\x08 \x01(\r\x12\r\n\x05img_w\x18\t \x01(\r\x12\x0c\n\x04step\x18\n \x01(\x02\x12\x0e\n\x06step_h\x18\x0b \x01(\x02\x12\x0e\n\x06step_w\x18\x0c \x01(\x02\x12\x13\n\x06offset\x18\r \x01(\x02:\x03\x30.5\x12\r\n\x05width\x18\x0e \x03(\x02\x12\x0e\n\x06height\x18\x0f \x03(\x02\"8\n\x08\x43odeType\x12\n\n\x06\x43ORNER\x10\x01\x12\x0f\n\x0b\x43\x45NTER_SIZE\x10\x02\x12\x0f\n\x0b\x43ORNER_SIZE\x10\x03\"V\n\x15PSROIPoolingParameter\x12\x15\n\rspatial_scale\x18\x01 \x02(\x02\x12\x12\n\noutput_dim\x18\x02 \x02(\x05\x12\x12\n\ngroup_size\x18\x03 \x02(\x05\"g\n\x0fPythonParameter\x12\x0e\n\x06module\x18\x01 \x01(\t\x12\r\n\x05layer\x18\x02 \x01(\t\x12\x13\n\tparam_str\x18\x03 \x01(\t:\x00\x12 \n\x11share_in_parallel\x18\x04 \x01(\x08:\x05\x66\x61lse\"\xc6\x01\n\x12RecurrentParameter\x12\x15\n\nnum_output\x18\x01 \x01(\r:\x01\x30\x12\x30\n\rweight_filler\x18\x02 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x03 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x19\n\ndebug_info\x18\x04 \x01(\x08:\x05\x66\x61lse\x12\x1c\n\rexpose_hidden\x18\x05 \x01(\x08:\x05\x66\x61lse\"\xb0\x01\n\x12ReductionParameter\x12@\n\toperation\x18\x01 \x01(\x0e\x32(.mo_caffe.ReductionParameter.ReductionOp:\x03SUM\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x10\n\x05\x63oeff\x18\x03 \x01(\x02:\x01\x31\"5\n\x0bReductionOp\x12\x07\n\x03SUM\x10\x01\x12\x08\n\x04\x41SUM\x10\x02\x12\t\n\x05SUMSQ\x10\x03\x12\x08\n\x04MEAN\x10\x04\"\x90\x01\n\rReLUParameter\x12\x19\n\x0enegative_slope\x18\x01 \x01(\x02:\x01\x30\x12\x37\n\x06\x65ngine\x18\x02 \x01(\x0e\x32\x1e.mo_caffe.ReLUParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\x1e\n\x0eReLU6Parameter\x12\x0c\n\x01n\x18\x01 \x01(\x02:\x01\x36\"]\n\x10ReshapeParameter\x12\"\n\x05shape\x18\x01 \x01(\x0b\x32\x13.mo_caffe.BlobShape\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x14\n\x08num_axes\x18\x03 \x01(\x05:\x02-1\"#\n\x10ReverseParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x30\"Y\n\x13ROIPoolingParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"]\n\x17ROIWarpingTestParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"Y\n\x13ROIWarpingParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"\xab\x01\n\x0eScaleParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12)\n\x06\x66iller\x18\x03 
\x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x18\n\tbias_term\x18\x04 \x01(\x08:\x05\x66\x61lse\x12.\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\"{\n\x10SigmoidParameter\x12:\n\x06\x65ngine\x18\x01 \x01(\x0e\x32!.mo_caffe.SigmoidParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"L\n\x0eSliceParameter\x12\x0f\n\x04\x61xis\x18\x03 \x01(\x05:\x01\x31\x12\x13\n\x0bslice_point\x18\x02 \x03(\r\x12\x14\n\tslice_dim\x18\x01 \x01(\r:\x01\x31\")\n\x15SmoothL1LossParameter\x12\x10\n\x05sigma\x18\x01 \x01(\x02:\x01\x31\"\x8c\x01\n\x10SoftmaxParameter\x12:\n\x06\x65ngine\x18\x01 \x01(\x0e\x32!.mo_caffe.SoftmaxParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"u\n\rTanHParameter\x12\x37\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1e.mo_caffe.TanHParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"/\n\rTileParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\r\n\x05tiles\x18\x02 \x01(\x05\"*\n\x12ThresholdParameter\x12\x14\n\tthreshold\x18\x01 \x01(\x02:\x01\x30\"\xc1\x02\n\x13WindowDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\x0c\x66g_threshold\x18\x07 \x01(\x02:\x03\x30.5\x12\x19\n\x0c\x62g_threshold\x18\x08 \x01(\x02:\x03\x30.5\x12\x19\n\x0b\x66g_fraction\x18\t \x01(\x02:\x04\x30.25\x12\x16\n\x0b\x63ontext_pad\x18\n \x01(\r:\x01\x30\x12\x17\n\tcrop_mode\x18\x0b \x01(\t:\x04warp\x12\x1b\n\x0c\x63\x61\x63he_images\x18\x0c \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\r \x01(\t:\x00\"\xf1\x01\n\x0cSPPParameter\x12\x16\n\x0epyramid_height\x18\x01 \x01(\r\x12\x34\n\x04pool\x18\x02 \x01(\x0e\x32!.mo_caffe.SPPParameter.PoolMethod:\x03MAX\x12\x36\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1d.mo_caffe.SPPParameter.Engine:\x07\x44\x45\x46\x41ULT\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\xcc\x14\n\x10V1LayerParameter\x12\x0e\n\x06\x62ottom\x18\x02 \x03(\t\x12\x0b\n\x03top\x18\x03 \x03(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12\'\n\x07include\x18  \x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12\'\n\x07\x65xclude\x18! 
\x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12\x32\n\x04type\x18\x05 \x01(\x0e\x32$.mo_caffe.V1LayerParameter.LayerType\x12\"\n\x05\x62lobs\x18\x06 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x0e\n\x05param\x18\xe9\x07 \x03(\t\x12\x41\n\x0f\x62lob_share_mode\x18\xea\x07 \x03(\x0e\x32\'.mo_caffe.V1LayerParameter.DimCheckMode\x12\x10\n\x08\x62lobs_lr\x18\x07 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x08 \x03(\x02\x12\x13\n\x0bloss_weight\x18# \x03(\x02\x12\x33\n\x0e\x61\x63\x63uracy_param\x18\x1b \x01(\x0b\x32\x1b.mo_caffe.AccuracyParameter\x12/\n\x0c\x61rgmax_param\x18\x17 \x01(\x0b\x32\x19.mo_caffe.ArgMaxParameter\x12/\n\x0c\x63oncat_param\x18\t \x01(\x0b\x32\x19.mo_caffe.ConcatParameter\x12\x42\n\x16\x63ontrastive_loss_param\x18( \x01(\x0b\x32\".mo_caffe.ContrastiveLossParameter\x12\x39\n\x11\x63onvolution_param\x18\n \x01(\x0b\x32\x1e.mo_caffe.ConvolutionParameter\x12+\n\ndata_param\x18\x0b \x01(\x0b\x32\x17.mo_caffe.DataParameter\x12\x31\n\rdropout_param\x18\x0c \x01(\x0b\x32\x1a.mo_caffe.DropoutParameter\x12\x36\n\x10\x64ummy_data_param\x18\x1a \x01(\x0b\x32\x1c.mo_caffe.DummyDataParameter\x12\x31\n\reltwise_param\x18\x18 \x01(\x0b\x32\x1a.mo_caffe.EltwiseParameter\x12)\n\texp_param\x18) \x01(\x0b\x32\x16.mo_caffe.ExpParameter\x12\x34\n\x0fhdf5_data_param\x18\r \x01(\x0b\x32\x1b.mo_caffe.HDF5DataParameter\x12\x38\n\x11hdf5_output_param\x18\x0e \x01(\x0b\x32\x1d.mo_caffe.HDF5OutputParameter\x12\x36\n\x10hinge_loss_param\x18\x1d \x01(\x0b\x32\x1c.mo_caffe.HingeLossParameter\x12\x36\n\x10image_data_param\x18\x0f \x01(\x0b\x32\x1c.mo_caffe.ImageDataParameter\x12<\n\x13infogain_loss_param\x18\x10 \x01(\x0b\x32\x1f.mo_caffe.InfogainLossParameter\x12<\n\x13inner_product_param\x18\x11 \x01(\x0b\x32\x1f.mo_caffe.InnerProductParameter\x12)\n\tlrn_param\x18\x12 \x01(\x0b\x32\x16.mo_caffe.LRNParameter\x12\x38\n\x11memory_data_param\x18\x16 \x01(\x0b\x32\x1d.mo_caffe.MemoryDataParameter\x12)\n\tmvn_param\x18\" \x01(\x0b\x32\x16.mo_caffe.MVNParameter\x12\x31\n\rpooling_param\x18\x13 \x01(\x0b\x32\x1a.mo_caffe.PoolingParameter\x12-\n\x0bpower_param\x18\x15 \x01(\x0b\x32\x18.mo_caffe.PowerParameter\x12+\n\nrelu_param\x18\x1e \x01(\x0b\x32\x17.mo_caffe.ReLUParameter\x12\x31\n\rsigmoid_param\x18& \x01(\x0b\x32\x1a.mo_caffe.SigmoidParameter\x12\x31\n\rsoftmax_param\x18\' \x01(\x0b\x32\x1a.mo_caffe.SoftmaxParameter\x12-\n\x0bslice_param\x18\x1f \x01(\x0b\x32\x18.mo_caffe.SliceParameter\x12+\n\ntanh_param\x18% \x01(\x0b\x32\x17.mo_caffe.TanHParameter\x12\x35\n\x0fthreshold_param\x18\x19 \x01(\x0b\x32\x1c.mo_caffe.ThresholdParameter\x12\x38\n\x11window_data_param\x18\x14 \x01(\x0b\x32\x1d.mo_caffe.WindowDataParameter\x12:\n\x0ftransform_param\x18$ \x01(\x0b\x32!.mo_caffe.TransformationParameter\x12+\n\nloss_param\x18* \x01(\x0b\x32\x17.mo_caffe.LossParameter\x12)\n\x05layer\x18\x01 \x01(\x0b\x32\x1a.mo_caffe.V0LayerParameter\"\xd8\x04\n\tLayerType\x12\x08\n\x04NONE\x10\x00\x12\n\n\x06\x41\x42SVAL\x10#\x12\x0c\n\x08\x41\x43\x43URACY\x10\x01\x12\n\n\x06\x41RGMAX\x10\x1e\x12\x08\n\x04\x42NLL\x10\x02\x12\n\n\x06\x43ONCAT\x10\x03\x12\x14\n\x10\x43ONTRASTIVE_LOSS\x10%\x12\x0f\n\x0b\x43ONVOLUTION\x10\x04\x12\x08\n\x04\x44\x41TA\x10\x05\x12\x11\n\rDECONVOLUTION\x10\'\x12\x0b\n\x07\x44ROPOUT\x10\x06\x12\x0e\n\nDUMMY_DATA\x10 
\x12\x12\n\x0e\x45UCLIDEAN_LOSS\x10\x07\x12\x0b\n\x07\x45LTWISE\x10\x19\x12\x07\n\x03\x45XP\x10&\x12\x0b\n\x07\x46LATTEN\x10\x08\x12\r\n\tHDF5_DATA\x10\t\x12\x0f\n\x0bHDF5_OUTPUT\x10\n\x12\x0e\n\nHINGE_LOSS\x10\x1c\x12\n\n\x06IM2COL\x10\x0b\x12\x0e\n\nIMAGE_DATA\x10\x0c\x12\x11\n\rINFOGAIN_LOSS\x10\r\x12\x11\n\rINNER_PRODUCT\x10\x0e\x12\x07\n\x03LRN\x10\x0f\x12\x0f\n\x0bMEMORY_DATA\x10\x1d\x12\x1d\n\x19MULTINOMIAL_LOGISTIC_LOSS\x10\x10\x12\x07\n\x03MVN\x10\"\x12\x0b\n\x07POOLING\x10\x11\x12\t\n\x05POWER\x10\x1a\x12\x08\n\x04RELU\x10\x12\x12\x0b\n\x07SIGMOID\x10\x13\x12\x1e\n\x1aSIGMOID_CROSS_ENTROPY_LOSS\x10\x1b\x12\x0b\n\x07SILENCE\x10$\x12\x0b\n\x07SOFTMAX\x10\x14\x12\x10\n\x0cSOFTMAX_LOSS\x10\x15\x12\t\n\x05SPLIT\x10\x16\x12\t\n\x05SLICE\x10!\x12\x08\n\x04TANH\x10\x17\x12\x0f\n\x0bWINDOW_DATA\x10\x18\x12\r\n\tTHRESHOLD\x10\x1f\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\x8c\x08\n\x10V0LayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x12\n\nnum_output\x18\x03 \x01(\r\x12\x16\n\x08\x62iasterm\x18\x04 \x01(\x08:\x04true\x12\x30\n\rweight_filler\x18\x05 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x06 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x0e\n\x03pad\x18\x07 \x01(\r:\x01\x30\x12\x12\n\nkernelsize\x18\x08 \x01(\r\x12\x10\n\x05group\x18\t \x01(\r:\x01\x31\x12\x11\n\x06stride\x18\n \x01(\r:\x01\x31\x12\x38\n\x04pool\x18\x0b \x01(\x0e\x32%.mo_caffe.V0LayerParameter.PoolMethod:\x03MAX\x12\x1a\n\rdropout_ratio\x18\x0c \x01(\x02:\x03\x30.5\x12\x15\n\nlocal_size\x18\r \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x0e \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x0f \x01(\x02:\x04\x30.75\x12\x0c\n\x01k\x18\x16 \x01(\x02:\x01\x31\x12\x0e\n\x06source\x18\x10 \x01(\t\x12\x10\n\x05scale\x18\x11 \x01(\x02:\x01\x31\x12\x10\n\x08meanfile\x18\x12 \x01(\t\x12\x11\n\tbatchsize\x18\x13 \x01(\r\x12\x13\n\x08\x63ropsize\x18\x14 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x15 \x01(\x08:\x05\x66\x61lse\x12\"\n\x05\x62lobs\x18\x32 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x10\n\x08\x62lobs_lr\x18\x33 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x34 \x03(\x02\x12\x14\n\trand_skip\x18\x35 \x01(\r:\x01\x30\x12\x1d\n\x10\x64\x65t_fg_threshold\x18\x36 \x01(\x02:\x03\x30.5\x12\x1d\n\x10\x64\x65t_bg_threshold\x18\x37 \x01(\x02:\x03\x30.5\x12\x1d\n\x0f\x64\x65t_fg_fraction\x18\x38 \x01(\x02:\x04\x30.25\x12\x1a\n\x0f\x64\x65t_context_pad\x18: \x01(\r:\x01\x30\x12\x1b\n\rdet_crop_mode\x18; \x01(\t:\x04warp\x12\x12\n\x07new_num\x18< \x01(\x05:\x01\x30\x12\x17\n\x0cnew_channels\x18= \x01(\x05:\x01\x30\x12\x15\n\nnew_height\x18> \x01(\x05:\x01\x30\x12\x14\n\tnew_width\x18? 
\x01(\x05:\x01\x30\x12\x1d\n\x0eshuffle_images\x18@ \x01(\x08:\x05\x66\x61lse\x12\x15\n\nconcat_dim\x18\x41 \x01(\r:\x01\x31\x12\x39\n\x11hdf5_output_param\x18\xe9\x07 \x01(\x0b\x32\x1d.mo_caffe.HDF5OutputParameter\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"Z\n\x0ePReLUParameter\x12)\n\x06\x66iller\x18\x01 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x1d\n\x0e\x63hannel_shared\x18\x02 \x01(\x08:\x05\x66\x61lse\"\x86\x01\n\x13RegionYoloParameter\x12\x11\n\x06\x63oords\x18\x01 \x01(\x05:\x01\x34\x12\x13\n\x07\x63lasses\x18\x02 \x01(\x05:\x02\x32\x30\x12\x0e\n\x03num\x18\x03 \x01(\x05:\x01\x31\x12\x18\n\ndo_softmax\x18\x04 \x01(\x08:\x04true\x12\x0f\n\x07\x61nchors\x18\x05 \x03(\x02\x12\x0c\n\x04mask\x18\x06 \x03(\x05\"\'\n\x12ReorgYoloParameter\x12\x11\n\x06stride\x18\x01 \x01(\x05:\x01\x31\"\xcf\x01\n\x18RandomGeneratorParameter\x12\x1a\n\trand_type\x18\x01 \x01(\t:\x07uniform\x12\x12\n\x03\x65xp\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x0f\n\x04mean\x18\x04 \x01(\x02:\x01\x30\x12\x11\n\x06spread\x18\x05 \x01(\x02:\x01\x30\x12\x0f\n\x04prob\x18\x06 \x01(\x02:\x01\x31\x12\x1c\n\x0e\x61pply_schedule\x18\x07 \x01(\x08:\x04true\x12\x19\n\ndiscretize\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x15\n\nmultiplier\x18\t \x01(\x02:\x01\x31\"`\n\x16\x43oeffScheduleParameter\x12\x14\n\thalf_life\x18\x01 \x01(\x02:\x01\x31\x12\x18\n\rinitial_coeff\x18\x02 \x01(\x02:\x01\x31\x12\x16\n\x0b\x66inal_coeff\x18\x03 \x01(\x02:\x01\x31\"\xde\x07\n\x11\x41ugmentationCoeff\x12\x11\n\x06mirror\x18\x01 \x01(\x02:\x01\x30\x12\r\n\x02\x64x\x18\x02 \x01(\x02:\x01\x30\x12\r\n\x02\x64y\x18\x03 \x01(\x02:\x01\x30\x12\x10\n\x05\x61ngle\x18\x04 \x01(\x02:\x01\x30\x12\x11\n\x06zoom_x\x18\x05 \x01(\x02:\x01\x31\x12\x11\n\x06zoom_y\x18\x06 \x01(\x02:\x01\x31\x12\x10\n\x05gamma\x18\x64 \x01(\x02:\x01\x31\x12\x15\n\nbrightness\x18\x65 \x01(\x02:\x01\x30\x12\x13\n\x08\x63ontrast\x18\x66 \x01(\x02:\x01\x31\x12\x11\n\x06\x63olor1\x18g \x01(\x02:\x01\x31\x12\x11\n\x06\x63olor2\x18h \x01(\x02:\x01\x31\x12\x11\n\x06\x63olor3\x18i \x01(\x02:\x01\x31\x12\x16\n\x0bpow_nomean0\x18\n \x01(\x02:\x01\x31\x12\x16\n\x0bpow_nomean1\x18\x0b \x01(\x02:\x01\x31\x12\x16\n\x0bpow_nomean2\x18\x0c \x01(\x02:\x01\x31\x12\x16\n\x0b\x61\x64\x64_nomean0\x18\r \x01(\x02:\x01\x30\x12\x16\n\x0b\x61\x64\x64_nomean1\x18\x0e \x01(\x02:\x01\x30\x12\x16\n\x0b\x61\x64\x64_nomean2\x18\x0f \x01(\x02:\x01\x30\x12\x17\n\x0cmult_nomean0\x18\x10 \x01(\x02:\x01\x31\x12\x17\n\x0cmult_nomean1\x18\x11 \x01(\x02:\x01\x31\x12\x17\n\x0cmult_nomean2\x18\x12 \x01(\x02:\x01\x31\x12\x18\n\rpow_withmean0\x18\x13 \x01(\x02:\x01\x31\x12\x18\n\rpow_withmean1\x18\x14 \x01(\x02:\x01\x31\x12\x18\n\rpow_withmean2\x18\x15 \x01(\x02:\x01\x31\x12\x18\n\radd_withmean0\x18\x16 \x01(\x02:\x01\x30\x12\x18\n\radd_withmean1\x18\x17 \x01(\x02:\x01\x30\x12\x18\n\radd_withmean2\x18\x18 \x01(\x02:\x01\x30\x12\x19\n\x0emult_withmean0\x18\x19 \x01(\x02:\x01\x31\x12\x19\n\x0emult_withmean1\x18\x1a \x01(\x02:\x01\x31\x12\x19\n\x0emult_withmean2\x18\x1b \x01(\x02:\x01\x31\x12\x14\n\tlmult_pow\x18\x1c \x01(\x02:\x01\x31\x12\x14\n\tlmult_add\x18\x1d \x01(\x02:\x01\x30\x12\x15\n\nlmult_mult\x18\x1e \x01(\x02:\x01\x31\x12\x14\n\tcol_angle\x18\x1f \x01(\x02:\x01\x30\x12\x15\n\nfog_amount\x18& \x01(\x02:\x01\x30\x12\x13\n\x08\x66og_size\x18\' \x01(\x02:\x01\x30\x12\x1c\n\x11motion_blur_angle\x18( \x01(\x02:\x01\x30\x12\x1b\n\x10motion_blur_size\x18) \x01(\x02:\x01\x30\x12\x17\n\x0cshadow_angle\x18* \x01(\x02:\x01\x30\x12\x1a\n\x0fshadow_distance\x18+ 
\x01(\x02:\x01\x30\x12\x1a\n\x0fshadow_strength\x18, \x01(\x02:\x01\x30\x12\x10\n\x05noise\x18- \x01(\x02:\x01\x30\"\xcc\x10\n\x15\x41ugmentationParameter\x12\x15\n\ncrop_width\x18! \x01(\r:\x01\x30\x12\x16\n\x0b\x63rop_height\x18\" \x01(\r:\x01\x30\x12\x19\n\x0fwrite_augmented\x18\x02 \x01(\t:\x00\x12\x1b\n\x0emax_multiplier\x18\x03 \x01(\x02:\x03\x32\x35\x35\x12\"\n\x13\x61ugment_during_test\x18\x04 \x01(\x08:\x05\x66\x61lse\x12\x19\n\x0erecompute_mean\x18\x05 \x01(\r:\x01\x30\x12\x14\n\nwrite_mean\x18\x06 \x01(\t:\x00\x12\x1c\n\x0emean_per_pixel\x18\x07 \x01(\x08:\x04true\x12\x0c\n\x04mean\x18\x12 \x03(\x02\x12\x11\n\x04mode\x18\x08 \x01(\t:\x03\x61\x64\x64\x12\x16\n\x0b\x62ottomwidth\x18P \x01(\r:\x01\x30\x12\x17\n\x0c\x62ottomheight\x18Q \x01(\r:\x01\x30\x12\x0e\n\x03num\x18R \x01(\r:\x01\x30\x12\x18\n\x10\x63hromatic_eigvec\x18S \x03(\x02\x12\x32\n\x06mirror\x18\n \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\ttranslate\x18\x0b \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x32\n\x06rotate\x18\x0c \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x30\n\x04zoom\x18\r \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07squeeze\x18\x0e \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x37\n\x0btranslate_x\x18\x0f \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x37\n\x0btranslate_y\x18\x10 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x31\n\x05gamma\x18# \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\nbrightness\x18$ \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08\x63ontrast\x18% \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x31\n\x05\x63olor\x18& \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\tlmult_pow\x18\x14 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\nlmult_mult\x18\x15 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\tlmult_add\x18\x16 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07sat_pow\x18\x17 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08sat_mult\x18\x18 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07sat_add\x18\x19 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07\x63ol_pow\x18\x1a \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08\x63ol_mult\x18\x1b \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07\x63ol_add\x18\x1c \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08ladd_pow\x18\x1d \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\tladd_mult\x18\x1e \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08ladd_add\x18\x1f \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\ncol_rotate\x18  \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\nfog_amount\x18\x64 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08\x66og_size\x18\x65 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12=\n\x11motion_blur_angle\x18\x66 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12<\n\x10motion_blur_size\x18g \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x38\n\x0cshadow_angle\x18h \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12;\n\x0fshadow_distance\x18i \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12;\n\x0fshadow_strength\x18j \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x31\n\x05noise\x18k \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\"\x85\x01\n\x11\x46lowWarpParameter\x12\x43\n\nfill_value\x18\x01 
\x01(\x0e\x32).mo_caffe.FlowWarpParameter.FillParameter:\x04ZERO\"+\n\rFillParameter\x12\x08\n\x04ZERO\x10\x01\x12\x10\n\x0cNOT_A_NUMBER\x10\x02\"\xb6\x02\n\x14\x43orrelationParameter\x12\x0e\n\x03pad\x18\x02 \x01(\r:\x01\x30\x12\x13\n\x0bkernel_size\x18\x03 \x01(\r\x12\x18\n\x10max_displacement\x18\x04 \x01(\r\x12\x13\n\x08stride_1\x18\x05 \x01(\r:\x01\x31\x12\x13\n\x08stride_2\x18\x06 \x01(\r:\x01\x31\x12\x1b\n\x10single_direction\x18\x08 \x01(\x05:\x01\x30\x12\x15\n\x06\x64o_abs\x18\x07 \x01(\x08:\x05\x66\x61lse\x12R\n\x10\x63orrelation_type\x18\x0f \x01(\x0e\x32..mo_caffe.CorrelationParameter.CorrelationType:\x08MULTIPLY\"-\n\x0f\x43orrelationType\x12\x0c\n\x08MULTIPLY\x10\x00\x12\x0c\n\x08SUBTRACT\x10\x01\"\xdc\x01\n\x11ResampleParameter\x12\x17\n\tantialias\x18\x04 \x01(\x08:\x04true\x12\r\n\x05width\x18\x01 \x01(\r\x12\x0e\n\x06height\x18\x02 \x01(\r\x12>\n\x04type\x18\x03 \x01(\x0e\x32(.mo_caffe.ResampleParameter.ResampleType:\x06LINEAR\x12\x11\n\x06\x66\x61\x63tor\x18\x05 \x01(\x02:\x01\x31\"<\n\x0cResampleType\x12\x0b\n\x07NEAREST\x10\x01\x12\n\n\x06LINEAR\x10\x02\x12\t\n\x05\x43UBIC\x10\x03\x12\x08\n\x04\x41REA\x10\x04\"z\n\x0e\x41\x63\x63umParameter\x12\x15\n\ntop_height\x18\x01 \x01(\r:\x01\x30\x12\x14\n\ttop_width\x18\x02 \x01(\r:\x01\x30\x12\x1c\n\x11size_divisible_by\x18\x03 \x01(\r:\x01\x30\x12\x1d\n\x0ehave_reference\x18\x04 \x01(\x08:\x05\x66\x61lse*\x1c\n\x05Phase\x12\t\n\x05TRAIN\x10\x00\x12\x08\n\x04TEST\x10\x01')
+  serialized_pb=_b('\n\x0emo_caffe.proto\x12\x08mo_caffe\"\x1c\n\tBlobShape\x12\x0f\n\x03\x64im\x18\x01 \x03(\x03\x42\x02\x10\x01\"\xcf\x01\n\tBlobProto\x12\"\n\x05shape\x18\x07 \x01(\x0b\x32\x13.mo_caffe.BlobShape\x12\x10\n\x04\x64\x61ta\x18\x05 \x03(\x02\x42\x02\x10\x01\x12\x10\n\x04\x64iff\x18\x06 \x03(\x02\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_data\x18\x08 \x03(\x01\x42\x02\x10\x01\x12\x17\n\x0b\x64ouble_diff\x18\t \x03(\x01\x42\x02\x10\x01\x12\x0e\n\x03num\x18\x01 \x01(\x05:\x01\x30\x12\x13\n\x08\x63hannels\x18\x02 \x01(\x05:\x01\x30\x12\x11\n\x06height\x18\x03 \x01(\x05:\x01\x30\x12\x10\n\x05width\x18\x04 \x01(\x05:\x01\x30\"5\n\x0f\x42lobProtoVector\x12\"\n\x05\x62lobs\x18\x01 \x03(\x0b\x32\x13.mo_caffe.BlobProto\"M\n\x1e\x43osineSimilarityBatchParameter\x12\x14\n\tpos_label\x18\x01 \x01(\x01:\x01\x31\x12\x15\n\tneg_label\x18\x02 \x01(\x01:\x02-1\"\x81\x01\n\x05\x44\x61tum\x12\x10\n\x08\x63hannels\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\r\n\x05width\x18\x03 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x04 \x01(\x0c\x12\r\n\x05label\x18\x05 \x01(\x05\x12\x12\n\nfloat_data\x18\x06 \x03(\x02\x12\x16\n\x07\x65ncoded\x18\x07 \x01(\x08:\x05\x66\x61lse\"A\n\x0cLabelMapItem\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05label\x18\x02 \x01(\x05\x12\x14\n\x0c\x64isplay_name\x18\x03 \x01(\t\"0\n\x08LabelMap\x12$\n\x04item\x18\x01 \x03(\x0b\x32\x16.mo_caffe.LabelMapItem\"\x87\x01\n\x0eNormalizedBBox\x12\x0c\n\x04xmin\x18\x01 \x01(\x02\x12\x0c\n\x04ymin\x18\x02 \x01(\x02\x12\x0c\n\x04xmax\x18\x03 \x01(\x02\x12\x0c\n\x04ymax\x18\x04 \x01(\x02\x12\r\n\x05label\x18\x05 \x01(\x05\x12\x11\n\tdifficult\x18\x06 \x01(\x08\x12\r\n\x05score\x18\x07 \x01(\x02\x12\x0c\n\x04size\x18\x08 \x01(\x02\"\xad\x02\n\x0f\x46illerParameter\x12\x16\n\x04type\x18\x01 \x01(\t:\x08\x63onstant\x12\x10\n\x05value\x18\x02 \x01(\x02:\x01\x30\x12\x0e\n\x03min\x18\x03 \x01(\x02:\x01\x30\x12\x0e\n\x03max\x18\x04 \x01(\x02:\x01\x31\x12\x0f\n\x04mean\x18\x05 \x01(\x02:\x01\x30\x12\x0e\n\x03std\x18\x06 \x01(\x02:\x01\x31\x12\x12\n\x06sparse\x18\x07 \x01(\x05:\x02-1\x12\x45\n\rvariance_norm\x18\x08 \x01(\x0e\x32&.mo_caffe.FillerParameter.VarianceNorm:\x06\x46\x41N_IN\x12\x0c\n\x04\x66ile\x18\t \x01(\t\x12\x10\n\x08\x64iag_val\x18\n \x03(\x02\"4\n\x0cVarianceNorm\x12\n\n\x06\x46\x41N_IN\x10\x00\x12\x0b\n\x07\x46\x41N_OUT\x10\x01\x12\x0b\n\x07\x41VERAGE\x10\x02\"\xed\x02\n\x0cNetParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05input\x18\x03 \x03(\t\x12(\n\x0binput_shape\x18\x08 \x03(\x0b\x32\x13.mo_caffe.BlobShape\x12\x11\n\tinput_dim\x18\x04 \x03(\x05\x12\x1d\n\x0e\x66orce_backward\x18\x05 \x01(\x08:\x05\x66\x61lse\x12!\n\x05state\x18\x06 \x01(\x0b\x32\x12.mo_caffe.NetState\x12\x19\n\ndebug_info\x18\x07 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0cprofile_info\x18\t \x01(\x08:\x05\x66\x61lse\x12\x18\n\x0cprofile_iter\x18\n \x01(\x05:\x02\x35\x30\x12\x1a\n\x0eprofile_warmup\x18\x0b \x01(\x05:\x02\x31\x30\x12\'\n\x05layer\x18\x64 \x03(\x0b\x32\x18.mo_caffe.LayerParameter\x12*\n\x06layers\x18\x02 \x03(\x0b\x32\x1a.mo_caffe.V1LayerParameter\"\xf4\n\n\x0fSolverParameter\x12\x0b\n\x03net\x18\x18 \x01(\t\x12)\n\tnet_param\x18\x19 \x01(\x0b\x32\x16.mo_caffe.NetParameter\x12\x11\n\ttrain_net\x18\x01 \x01(\t\x12\x10\n\x08test_net\x18\x02 \x03(\t\x12/\n\x0ftrain_net_param\x18\x15 \x01(\x0b\x32\x16.mo_caffe.NetParameter\x12.\n\x0etest_net_param\x18\x16 \x03(\x0b\x32\x16.mo_caffe.NetParameter\x12\'\n\x0btrain_state\x18\x1a \x01(\x0b\x32\x12.mo_caffe.NetState\x12&\n\ntest_state\x18\x1b 
\x03(\x0b\x32\x12.mo_caffe.NetState\x12\x11\n\ttest_iter\x18\x03 \x03(\x05\x12\x18\n\rtest_interval\x18\x04 \x01(\x05:\x01\x30\x12 \n\x11test_compute_loss\x18\x13 \x01(\x08:\x05\x66\x61lse\x12!\n\x13test_initialization\x18  \x01(\x08:\x04true\x12\x0f\n\x07\x62\x61se_lr\x18\x05 \x01(\x02\x12\x0f\n\x07\x64isplay\x18\x06 \x01(\x05\x12\x17\n\x0c\x61verage_loss\x18! \x01(\x05:\x01\x31\x12\x10\n\x08max_iter\x18\x07 \x01(\x05\x12\x14\n\titer_size\x18$ \x01(\x05:\x01\x31\x12\x11\n\tlr_policy\x18\x08 \x01(\t\x12\r\n\x05gamma\x18\t \x01(\x02\x12\r\n\x05power\x18\n \x01(\x02\x12\x10\n\x08momentum\x18\x0b \x01(\x02\x12\x14\n\x0cweight_decay\x18\x0c \x01(\x02\x12\x1f\n\x13regularization_type\x18\x1d \x01(\t:\x02L2\x12\x10\n\x08stepsize\x18\r \x01(\x05\x12\x11\n\tstepvalue\x18\" \x03(\x05\x12\x17\n\x0fplateau_winsize\x18* \x03(\x05\x12\x1a\n\x0e\x63lip_gradients\x18# \x01(\x02:\x02-1\x12\x13\n\x08snapshot\x18\x0e \x01(\x05:\x01\x30\x12\x17\n\x0fsnapshot_prefix\x18\x0f \x01(\t\x12\x1c\n\rsnapshot_diff\x18\x10 \x01(\x08:\x05\x66\x61lse\x12N\n\x0fsnapshot_format\x18% \x01(\x0e\x32(.mo_caffe.SolverParameter.SnapshotFormat:\x0b\x42INARYPROTO\x12>\n\x0bsolver_mode\x18\x11 \x01(\x0e\x32$.mo_caffe.SolverParameter.SolverMode:\x03GPU\x12\x14\n\tdevice_id\x18\x12 \x01(\x05:\x01\x30\x12\x17\n\x0brandom_seed\x18\x14 \x01(\x03:\x02-1\x12\x11\n\x04type\x18( \x01(\t:\x03SGD\x12\x14\n\x05\x64\x65lta\x18\x1f \x01(\x02:\x05\x31\x65-08\x12\x18\n\tmomentum2\x18\' \x01(\x02:\x05\x30.999\x12\x17\n\trms_decay\x18& \x01(\x02:\x04\x30.99\x12\x19\n\ndebug_info\x18\x17 \x01(\x08:\x05\x66\x61lse\x12\"\n\x14snapshot_after_train\x18\x1c \x01(\x08:\x04true\x12>\n\x0bsolver_type\x18\x1e \x01(\x0e\x32$.mo_caffe.SolverParameter.SolverType:\x03SGD\x12\x1f\n\x11layer_wise_reduce\x18) \x01(\x08:\x04true\"+\n\x0eSnapshotFormat\x12\x08\n\x04HDF5\x10\x00\x12\x0f\n\x0b\x42INARYPROTO\x10\x01\"\x1e\n\nSolverMode\x12\x07\n\x03\x43PU\x10\x00\x12\x07\n\x03GPU\x10\x01\"U\n\nSolverType\x12\x07\n\x03SGD\x10\x00\x12\x0c\n\x08NESTEROV\x10\x01\x12\x0b\n\x07\x41\x44\x41GRAD\x10\x02\x12\x0b\n\x07RMSPROP\x10\x03\x12\x0c\n\x08\x41\x44\x41\x44\x45LTA\x10\x04\x12\x08\n\x04\x41\x44\x41M\x10\x05\"\xa8\x01\n\x0bSolverState\x12\x0c\n\x04iter\x18\x01 \x01(\x05\x12\x13\n\x0blearned_net\x18\x02 \x01(\t\x12$\n\x07history\x18\x03 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x17\n\x0c\x63urrent_step\x18\x04 \x01(\x05:\x01\x30\x12\x1b\n\x0cminimum_loss\x18\x05 \x01(\x02:\x05\x31\x65+38\x12\x1a\n\x0fiter_last_event\x18\x06 \x01(\x05:\x01\x30\"Q\n\x08NetState\x12$\n\x05phase\x18\x01 \x01(\x0e\x32\x0f.mo_caffe.Phase:\x04TEST\x12\x10\n\x05level\x18\x02 \x01(\x05:\x01\x30\x12\r\n\x05stage\x18\x03 \x03(\t\"v\n\x0cNetStateRule\x12\x1e\n\x05phase\x18\x01 \x01(\x0e\x32\x0f.mo_caffe.Phase\x12\x11\n\tmin_level\x18\x02 \x01(\x05\x12\x11\n\tmax_level\x18\x03 \x01(\x05\x12\r\n\x05stage\x18\x04 \x03(\t\x12\x11\n\tnot_stage\x18\x05 \x03(\t\"\xad\x02\n\x1bSpatialTransformerParameter\x12\x1e\n\x0etransform_type\x18\x01 \x01(\t:\x06\x61\x66\x66ine\x12\x1e\n\x0csampler_type\x18\x02 \x01(\t:\x08\x62ilinear\x12\x10\n\x08output_H\x18\x03 \x01(\x05\x12\x10\n\x08output_W\x18\x04 \x01(\x05\x12\x1b\n\rto_compute_dU\x18\x05 \x01(\x08:\x04true\x12\x11\n\ttheta_1_1\x18\x06 \x01(\x01\x12\x11\n\ttheta_1_2\x18\x07 \x01(\x01\x12\x11\n\ttheta_1_3\x18\x08 \x01(\x01\x12\x11\n\ttheta_2_1\x18\t \x01(\x01\x12\x11\n\ttheta_2_2\x18\n \x01(\x01\x12\x11\n\ttheta_2_3\x18\x0b \x01(\x01\x12\x1b\n\x0c\x64\x65_transform\x18\x0c \x01(\x08:\x05\x66\x61lse\"(\n\x12PowerFileParameter\x12\x12\n\nshift_file\x18\x01 
\x01(\t\"5\n\x0fSTLossParameter\x12\x10\n\x08output_H\x18\x01 \x02(\x05\x12\x10\n\x08output_W\x18\x02 \x02(\x05\"%\n\x10LocLossParameter\x12\x11\n\tthreshold\x18\x01 \x02(\x01\"\xa6\x01\n\tParamSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x34\n\nshare_mode\x18\x02 \x01(\x0e\x32 .mo_caffe.ParamSpec.DimCheckMode\x12\x12\n\x07lr_mult\x18\x03 \x01(\x02:\x01\x31\x12\x15\n\ndecay_mult\x18\x04 \x01(\x02:\x01\x31\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\xf4#\n\x0eLayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x0e\n\x06\x62ottom\x18\x03 \x03(\t\x12\x0b\n\x03top\x18\x04 \x03(\t\x12\x1e\n\x05phase\x18\n \x01(\x0e\x32\x0f.mo_caffe.Phase\x12\x13\n\x0bloss_weight\x18\x05 \x03(\x02\x12\"\n\x05param\x18\x06 \x03(\x0b\x32\x13.mo_caffe.ParamSpec\x12\"\n\x05\x62lobs\x18\x07 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x16\n\x0epropagate_down\x18\x0b \x03(\x08\x12\'\n\x07include\x18\x08 \x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12\'\n\x07\x65xclude\x18\t \x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12:\n\x0ftransform_param\x18\x64 \x01(\x0b\x32!.mo_caffe.TransformationParameter\x12+\n\nloss_param\x18\x65 \x01(\x0b\x32\x17.mo_caffe.LossParameter\x12\x33\n\x0e\x61\x63\x63uracy_param\x18\x66 \x01(\x0b\x32\x1b.mo_caffe.AccuracyParameter\x12/\n\x0c\x61rgmax_param\x18g \x01(\x0b\x32\x19.mo_caffe.ArgMaxParameter\x12\x37\n\x10\x62\x61tch_norm_param\x18\x8b\x01 \x01(\x0b\x32\x1c.mo_caffe.BatchNormParameter\x12,\n\nbias_param\x18\x8d\x01 \x01(\x0b\x32\x17.mo_caffe.BiasParameter\x12I\n\x19\x63hannel_permutation_param\x18\x92? \x01(\x0b\x32%.mo_caffe.ChannelPermutationParameter\x12/\n\x0c\x63oncat_param\x18h \x01(\x0b\x32\x19.mo_caffe.ConcatParameter\x12\x42\n\x16\x63ontrastive_loss_param\x18i \x01(\x0b\x32\".mo_caffe.ContrastiveLossParameter\x12\x39\n\x11\x63onvolution_param\x18j \x01(\x0b\x32\x1e.mo_caffe.ConvolutionParameter\x12,\n\ncrop_param\x18\x90\x01 \x01(\x0b\x32\x17.mo_caffe.CropParameter\x12\x39\n\x11\x63tc_decoder_param\x18\x95\x01 \x01(\x0b\x32\x1d.mo_caffe.CTCDecoderParameter\x12\x33\n\x0e\x63tc_loss_param\x18\x94\x01 \x01(\x0b\x32\x1a.mo_caffe.CTCLossParameter\x12+\n\ndata_param\x18k \x01(\x0b\x32\x17.mo_caffe.DataParameter\x12\x31\n\rdropout_param\x18l \x01(\x0b\x32\x1a.mo_caffe.DropoutParameter\x12\x36\n\x10\x64ummy_data_param\x18m \x01(\x0b\x32\x1c.mo_caffe.DummyDataParameter\x12\x31\n\reltwise_param\x18n \x01(\x0b\x32\x1a.mo_caffe.EltwiseParameter\x12*\n\telu_param\x18\x8c\x01 \x01(\x0b\x32\x16.mo_caffe.ELUParameter\x12.\n\x0b\x65mbed_param\x18\x89\x01 \x01(\x0b\x32\x18.mo_caffe.EmbedParameter\x12)\n\texp_param\x18o \x01(\x0b\x32\x16.mo_caffe.ExpParameter\x12\x32\n\rflatten_param\x18\x87\x01 \x01(\x0b\x32\x1a.mo_caffe.FlattenParameter\x12*\n\tgrn_param\x18\xd5\x01 \x01(\x0b\x32\x16.mo_caffe.GRNParameter\x12\x34\n\x0fhdf5_data_param\x18p \x01(\x0b\x32\x1b.mo_caffe.HDF5DataParameter\x12\x38\n\x11hdf5_output_param\x18q \x01(\x0b\x32\x1d.mo_caffe.HDF5OutputParameter\x12\x36\n\x10hinge_loss_param\x18r \x01(\x0b\x32\x1c.mo_caffe.HingeLossParameter\x12\x36\n\x10image_data_param\x18s \x01(\x0b\x32\x1c.mo_caffe.ImageDataParameter\x12<\n\x13infogain_loss_param\x18t \x01(\x0b\x32\x1f.mo_caffe.InfogainLossParameter\x12<\n\x13inner_product_param\x18u \x01(\x0b\x32\x1f.mo_caffe.InnerProductParameter\x12.\n\x0binput_param\x18\x8f\x01 \x01(\x0b\x32\x18.mo_caffe.InputParameter\x12*\n\tlog_param\x18\x86\x01 \x01(\x0b\x32\x16.mo_caffe.LogParameter\x12)\n\tlrn_param\x18v \x01(\x0b\x32\x16.mo_caffe.LRNParameter\x12\x38\n\x11memory_data_param\x18w 
\x01(\x0b\x32\x1d.mo_caffe.MemoryDataParameter\x12)\n\tmvn_param\x18x \x01(\x0b\x32\x16.mo_caffe.MVNParameter\x12\x36\n\x0fparameter_param\x18\x91\x01 \x01(\x0b\x32\x1c.mo_caffe.ParameterParameter\x12\x31\n\rpooling_param\x18y \x01(\x0b\x32\x1a.mo_caffe.PoolingParameter\x12\x32\n\rpermute_param\x18\x9a\x01 \x01(\x0b\x32\x1a.mo_caffe.PermuteParameter\x12-\n\x0bpower_param\x18z \x01(\x0b\x32\x18.mo_caffe.PowerParameter\x12.\n\x0bprelu_param\x18\x83\x01 \x01(\x0b\x32\x18.mo_caffe.PReLUParameter\x12\x30\n\x0cpython_param\x18\x82\x01 \x01(\x0b\x32\x19.mo_caffe.PythonParameter\x12\x36\n\x0frecurrent_param\x18\x92\x01 \x01(\x0b\x32\x1c.mo_caffe.RecurrentParameter\x12\x36\n\x0freduction_param\x18\x88\x01 \x01(\x0b\x32\x1c.mo_caffe.ReductionParameter\x12+\n\nrelu_param\x18{ \x01(\x0b\x32\x17.mo_caffe.ReLUParameter\x12\x32\n\rreshape_param\x18\x85\x01 \x01(\x0b\x32\x1a.mo_caffe.ReshapeParameter\x12\x32\n\rreverse_param\x18\x93\x01 \x01(\x0b\x32\x1a.mo_caffe.ReverseParameter\x12.\n\x0bscale_param\x18\x8e\x01 \x01(\x0b\x32\x18.mo_caffe.ScaleParameter\x12\x31\n\rsigmoid_param\x18| \x01(\x0b\x32\x1a.mo_caffe.SigmoidParameter\x12\x31\n\rsoftmax_param\x18} \x01(\x0b\x32\x1a.mo_caffe.SoftmaxParameter\x12*\n\tspp_param\x18\x84\x01 \x01(\x0b\x32\x16.mo_caffe.SPPParameter\x12-\n\x0bslice_param\x18~ \x01(\x0b\x32\x18.mo_caffe.SliceParameter\x12+\n\ntanh_param\x18\x7f \x01(\x0b\x32\x17.mo_caffe.TanHParameter\x12\x36\n\x0fthreshold_param\x18\x80\x01 \x01(\x0b\x32\x1c.mo_caffe.ThresholdParameter\x12,\n\ntile_param\x18\x8a\x01 \x01(\x0b\x32\x17.mo_caffe.TileParameter\x12\x39\n\x11window_data_param\x18\x81\x01 \x01(\x0b\x32\x1d.mo_caffe.WindowDataParameter\x12\x38\n\x08st_param\x18\x96\x01 \x01(\x0b\x32%.mo_caffe.SpatialTransformerParameter\x12\x31\n\rst_loss_param\x18\x97\x01 \x01(\x0b\x32\x19.mo_caffe.STLossParameter\x12\x37\n\x10power_file_param\x18\x98\x01 \x01(\x0b\x32\x1c.mo_caffe.PowerFileParameter\x12\x33\n\x0eloc_loss_param\x18\x99\x01 \x01(\x0b\x32\x1a.mo_caffe.LocLossParameter\x12\x34\n\x0eproposal_param\x18\xc9\x01 \x01(\x0b\x32\x1b.mo_caffe.ProposalParameter\x12P\n\x1d\x63osine_similarity_batch_param\x18\xca\x01 \x01(\x0b\x32(.mo_caffe.CosineSimilarityBatchParameter\x12\x45\n\x0erss_loss_param\x18\xcb\x01 \x01(\x0b\x32,.mo_caffe.RandomSamplingSoftmaxLossParameter\x12\x31\n\nnorm_param\x18\xcc\x01 \x01(\x0b\x32\x1c.mo_caffe.NormalizeParameter\x12\x39\n\x11roi_warping_param\x18\xcd\x01 \x01(\x0b\x32\x1d.mo_caffe.ROIWarpingParameter\x12=\n\x13psroi_pooling_param\x18\xcf\x01 \x01(\x0b\x32\x1f.mo_caffe.PSROIPoolingParameter\x12\x39\n\x11roi_pooling_param\x18\xd0\x01 \x01(\x0b\x32\x1d.mo_caffe.ROIPoolingParameter\x12>\n\x14smooth_l1_loss_param\x18\xd1\x01 \x01(\x0b\x32\x1f.mo_caffe.SmoothL1LossParameter\x12\x46\n\x18\x62ox_annotator_ohem_param\x18\xd2\x01 \x01(\x0b\x32#.mo_caffe.BoxAnnotatorOHEMParameter\x12\x43\n\x16\x64\x65tection_output_param\x18\xd3\x01 \x01(\x0b\x32\".mo_caffe.DetectionOutputParameter\x12\x35\n\x0fprior_box_param\x18\xd4\x01 \x01(\x0b\x32\x1b.mo_caffe.PriorBoxParameter\x12\x39\n\x11region_yolo_param\x18\xd6\x01 \x01(\x0b\x32\x1d.mo_caffe.RegionYoloParameter\x12\x37\n\x10reorg_yolo_param\x18\xd7\x01 \x01(\x0b\x32\x1c.mo_caffe.ReorgYoloParameter\x12.\n\x0brelu6_param\x18\xd8\x01 \x01(\x0b\x32\x18.mo_caffe.ReLU6Parameter\x12\x30\n\x0cinterp_param\x18\xd9\x01 \x01(\x0b\x32\x19.mo_caffe.InterpParameter\x12<\n\x12\x61ugmentation_param\x18\xda\x01 \x01(\x0b\x32\x1f.mo_caffe.AugmentationParameter\x12:\n\x11\x63orrelation_param\x18\xdb\x01 
\x01(\x0b\x32\x1e.mo_caffe.CorrelationParameter\x12\x34\n\x0eresample_param\x18\xdc\x01 \x01(\x0b\x32\x1b.mo_caffe.ResampleParameter\x12\x35\n\x0f\x66low_warp_param\x18\xdd\x01 \x01(\x0b\x32\x1b.mo_caffe.FlowWarpParameter\x12.\n\x0b\x61\x63\x63um_param\x18\xde\x01 \x01(\x0b\x32\x18.mo_caffe.AccumParameter\x12?\n\x14\x63oeff_schedule_param\x18\xdf\x01 \x01(\x0b\x32 .mo_caffe.CoeffScheduleParameter\x12\x41\n\x15shuffle_channel_param\x18\xe0\x01 \x01(\x0b\x32!.mo_caffe.ShuffleChannelParameter\"\x90\x01\n\x0fInterpParameter\x12\x11\n\x06height\x18\x01 \x01(\x05:\x01\x30\x12\x10\n\x05width\x18\x02 \x01(\x05:\x01\x30\x12\x16\n\x0bzoom_factor\x18\x03 \x01(\x05:\x01\x31\x12\x18\n\rshrink_factor\x18\x04 \x01(\x05:\x01\x31\x12\x12\n\x07pad_beg\x18\x05 \x01(\x05:\x01\x30\x12\x12\n\x07pad_end\x18\x06 \x01(\x05:\x01\x30\"n\n\"RandomSamplingSoftmaxLossParameter\x12 \n\x13random_sampling_num\x18\x01 \x01(\x05:\x03\x31\x30\x30\x12&\n\x16random_sampling_policy\x18\x02 \x01(\t:\x06random\"\xc8\x01\n\x11ProposalParameter\x12\x17\n\x0b\x66\x65\x61t_stride\x18\x01 \x01(\r:\x02\x31\x36\x12\x15\n\tbase_size\x18\x02 \x01(\r:\x02\x31\x36\x12\x14\n\x08min_size\x18\x03 \x01(\r:\x02\x31\x36\x12\r\n\x05ratio\x18\x04 \x03(\x02\x12\r\n\x05scale\x18\x05 \x03(\x02\x12\x1a\n\x0cpre_nms_topn\x18\x06 \x01(\r:\x04\x36\x30\x30\x30\x12\x1a\n\rpost_nms_topn\x18\x07 \x01(\r:\x03\x33\x30\x30\x12\x17\n\nnms_thresh\x18\x08 \x01(\x02:\x03\x30.7\"\x95\x01\n\x12NormalizeParameter\x12\x1c\n\x0e\x61\x63ross_spatial\x18\x01 \x01(\x08:\x04true\x12/\n\x0cscale_filler\x18\x02 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x1c\n\x0e\x63hannel_shared\x18\x03 \x01(\x08:\x04true\x12\x12\n\x03\x65ps\x18\x04 \x01(\x02:\x05\x31\x65-10\"!\n\x10PermuteParameter\x12\r\n\x05order\x18\x01 \x03(\r\"\xb6\x01\n\x17TransformationParameter\x12\x10\n\x05scale\x18\x01 \x01(\x02:\x01\x31\x12\x15\n\x06mirror\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x14\n\tcrop_size\x18\x03 \x01(\r:\x01\x30\x12\x11\n\tmean_file\x18\x04 \x01(\t\x12\x12\n\nmean_value\x18\x05 \x03(\x02\x12\x1a\n\x0b\x66orce_color\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\nforce_gray\x18\x07 \x01(\x08:\x05\x66\x61lse\"\xb4\x02\n\rLossParameter\x12\x14\n\x0cignore_label\x18\x01 \x01(\x05\x12G\n\rnormalization\x18\x03 \x01(\x0e\x32).mo_caffe.LossParameter.NormalizationMode:\x05VALID\x12\x11\n\tnormalize\x18\x02 \x01(\x08\x12\x1f\n\x14pre_fixed_normalizer\x18\x04 \x01(\x02:\x01\x31\x12$\n\x15weight_by_label_freqs\x18\x05 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63lass_weighting\x18\x06 \x03(\x02\"Q\n\x11NormalizationMode\x12\x08\n\x04\x46ULL\x10\x00\x12\t\n\x05VALID\x10\x01\x12\x0e\n\nBATCH_SIZE\x10\x02\x12\r\n\tPRE_FIXED\x10\x03\x12\x08\n\x04NONE\x10\x04\"L\n\x11\x41\x63\x63uracyParameter\x12\x10\n\x05top_k\x18\x01 \x01(\r:\x01\x31\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\x12\x14\n\x0cignore_label\x18\x03 \x01(\x05\"M\n\x0f\x41rgMaxParameter\x12\x1a\n\x0bout_max_val\x18\x01 \x01(\x08:\x05\x66\x61lse\x12\x10\n\x05top_k\x18\x02 \x01(\r:\x01\x31\x12\x0c\n\x04\x61xis\x18\x03 \x01(\x05\"D\n\x18\x43hannelPermutationAction\x12\x0c\n\x04\x63han\x18\x01 \x02(\r\x12\x0c\n\x04\x63opy\x18\x02 \x01(\r\x12\x0c\n\x04\x66ill\x18\x03 \x01(\x02\"\x9a\x01\n\x1b\x43hannelPermutationParameter\x12\x32\n\x06\x61\x63tion\x18\x01 \x03(\x0b\x32\".mo_caffe.ChannelPermutationAction\x12\x12\n\nnum_output\x18\x10 \x02(\r\x12\x1f\n\x10inplace_possible\x18\x11 \x01(\x08:\x05\x66\x61lse\x12\x12\n\x07version\x18\x12 \x01(\x05:\x01\x30\"9\n\x0f\x43oncatParameter\x12\x0f\n\x04\x61xis\x18\x02 
\x01(\x05:\x01\x31\x12\x15\n\nconcat_dim\x18\x01 \x01(\r:\x01\x31\"j\n\x12\x42\x61tchNormParameter\x12\x18\n\x10use_global_stats\x18\x01 \x01(\x08\x12&\n\x17moving_average_fraction\x18\x02 \x01(\x02:\x05\x30.999\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x31\x65-05\"J\n\x19\x42oxAnnotatorOHEMParameter\x12\x13\n\x0broi_per_img\x18\x01 \x02(\r\x12\x18\n\x0cignore_label\x18\x02 \x01(\x05:\x02-1\"`\n\rBiasParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12)\n\x06\x66iller\x18\x03 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\"L\n\x18\x43ontrastiveLossParameter\x12\x11\n\x06margin\x18\x01 \x01(\x02:\x01\x31\x12\x1d\n\x0elegacy_version\x18\x02 \x01(\x08:\x05\x66\x61lse\"\x85\x04\n\x14\x43onvolutionParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12\x0b\n\x03pad\x18\x03 \x03(\r\x12\x13\n\x0bkernel_size\x18\x04 \x03(\r\x12\x0e\n\x06stride\x18\x06 \x03(\r\x12\x10\n\x08\x64ilation\x18\x12 \x03(\r\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x10\n\x08kernel_h\x18\x0b \x01(\r\x12\x10\n\x08kernel_w\x18\x0c \x01(\r\x12\x10\n\x08stride_h\x18\r \x01(\r\x12\x10\n\x08stride_w\x18\x0e \x01(\r\x12\x10\n\x05group\x18\x05 \x01(\r:\x01\x31\x12\x30\n\rweight_filler\x18\x07 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x08 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12>\n\x06\x65ngine\x18\x0f \x01(\x0e\x32%.mo_caffe.ConvolutionParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x10 \x01(\x05:\x01\x31\x12\x1e\n\x0f\x66orce_nd_im2col\x18\x11 \x01(\x08:\x05\x66\x61lse\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"A\n\rCropParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x32\x12\x0e\n\x06offset\x18\x02 \x03(\r\x12\x0f\n\x07\x64imsize\x18\x03 \x03(\r\"P\n\x13\x43TCDecoderParameter\x12\x17\n\x0b\x62lank_index\x18\x01 \x01(\x05:\x02-1\x12 \n\x12\x63tc_merge_repeated\x18\x02 \x01(\x08:\x04true\"\xb2\x01\n\x10\x43TCLossParameter\x12\x17\n\x0coutput_delay\x18\x01 \x01(\x05:\x01\x30\x12\x17\n\x0b\x62lank_index\x18\x02 \x01(\x05:\x02-1\x12+\n\x1cpreprocess_collapse_repeated\x18\x03 \x01(\x08:\x05\x66\x61lse\x12 \n\x12\x63tc_merge_repeated\x18\x04 \x01(\x08:\x04true\x12\x1d\n\x12loss_calculation_t\x18\x05 \x01(\x05:\x01\x30\"\xa7\x02\n\rDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x34\n\x07\x62\x61\x63kend\x18\x08 \x01(\x0e\x32\x1a.mo_caffe.DataParameter.DB:\x07LEVELDB\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\"\n\x13\x66orce_encoded_color\x18\t \x01(\x08:\x05\x66\x61lse\x12\x13\n\x08prefetch\x18\n \x01(\r:\x01\x34\"\x1b\n\x02\x44\x42\x12\x0b\n\x07LEVELDB\x10\x00\x12\x08\n\x04LMDB\x10\x01\"[\n\x1eNonMaximumSuppressionParameter\x12\x1a\n\rnms_threshold\x18\x01 \x01(\x02:\x03\x30.3\x12\r\n\x05top_k\x18\x02 \x01(\x05\x12\x0e\n\x03\x65ta\x18\x03 \x01(\x02:\x01\x31\"\x99\x04\n\x0fResizeParameter\x12\x0f\n\x04prob\x18\x01 \x01(\x02:\x01\x31\x12@\n\x0bresize_mode\x18\x02 \x01(\x0e\x32%.mo_caffe.ResizeParameter.Resize_mode:\x04WARP\x12\x11\n\x06height\x18\x03 \x01(\r:\x01\x30\x12\x10\n\x05width\x18\x04 \x01(\r:\x01\x30\x12\x17\n\x0cheight_scale\x18\x08 \x01(\r:\x01\x30\x12\x16\n\x0bwidth_scale\x18\t 
\x01(\r:\x01\x30\x12>\n\x08pad_mode\x18\x05 \x01(\x0e\x32\".mo_caffe.ResizeParameter.Pad_mode:\x08\x43ONSTANT\x12\x11\n\tpad_value\x18\x06 \x03(\x02\x12:\n\x0binterp_mode\x18\x07 \x03(\x0e\x32%.mo_caffe.ResizeParameter.Interp_mode\"G\n\x0bResize_mode\x12\x08\n\x04WARP\x10\x01\x12\x12\n\x0e\x46IT_SMALL_SIZE\x10\x02\x12\x1a\n\x16\x46IT_LARGE_SIZE_AND_PAD\x10\x03\":\n\x08Pad_mode\x12\x0c\n\x08\x43ONSTANT\x10\x01\x12\x0c\n\x08MIRRORED\x10\x02\x12\x12\n\x0eREPEAT_NEAREST\x10\x03\"I\n\x0bInterp_mode\x12\n\n\x06LINEAR\x10\x01\x12\x08\n\x04\x41REA\x10\x02\x12\x0b\n\x07NEAREST\x10\x03\x12\t\n\x05\x43UBIC\x10\x04\x12\x0c\n\x08LANCZOS4\x10\x05\"\xdb\x01\n\x13SaveOutputParameter\x12\x18\n\x10output_directory\x18\x01 \x01(\t\x12\x1a\n\x12output_name_prefix\x18\x02 \x01(\t\x12\x15\n\routput_format\x18\x03 \x01(\t\x12\x16\n\x0elabel_map_file\x18\x04 \x01(\t\x12\x16\n\x0ename_size_file\x18\x05 \x01(\t\x12\x16\n\x0enum_test_image\x18\x06 \x01(\r\x12/\n\x0cresize_param\x18\x07 \x01(\x0b\x32\x19.mo_caffe.ResizeParameter\"\x9d\x04\n\x18\x44\x65tectionOutputParameter\x12\x13\n\x0bnum_classes\x18\x01 \x01(\r\x12\x1c\n\x0eshare_location\x18\x02 \x01(\x08:\x04true\x12\x1e\n\x13\x62\x61\x63kground_label_id\x18\x03 \x01(\x05:\x01\x30\x12;\n\tnms_param\x18\x04 \x01(\x0b\x32(.mo_caffe.NonMaximumSuppressionParameter\x12\x38\n\x11save_output_param\x18\x05 \x01(\x0b\x32\x1d.mo_caffe.SaveOutputParameter\x12?\n\tcode_type\x18\x06 \x01(\x0e\x32$.mo_caffe.PriorBoxParameter.CodeType:\x06\x43ORNER\x12)\n\x1avariance_encoded_in_target\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x16\n\nkeep_top_k\x18\x07 \x01(\x05:\x02-1\x12\x1c\n\x14\x63onfidence_threshold\x18\t \x01(\x02\x12\x18\n\tvisualize\x18\n \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x13visualize_threshold\x18\x0b \x01(\x02\x12\x11\n\tsave_file\x18\x0c \x01(\t\x12\x17\n\x0binput_width\x18\r \x01(\x05:\x02-1\x12\x18\n\x0cinput_height\x18\x0e \x01(\x05:\x02-1\x12\x18\n\nnormalized\x18\x0f \x01(\x08:\x04true\".\n\x10\x44ropoutParameter\x12\x1a\n\rdropout_ratio\x18\x01 \x01(\x02:\x03\x30.5\"\xa6\x01\n\x12\x44ummyDataParameter\x12.\n\x0b\x64\x61ta_filler\x18\x01 \x03(\x0b\x32\x19.mo_caffe.FillerParameter\x12\"\n\x05shape\x18\x06 \x03(\x0b\x32\x13.mo_caffe.BlobShape\x12\x0b\n\x03num\x18\x02 \x03(\r\x12\x10\n\x08\x63hannels\x18\x03 \x03(\r\x12\x0e\n\x06height\x18\x04 \x03(\r\x12\r\n\x05width\x18\x05 \x03(\r\"\xa8\x01\n\x10\x45ltwiseParameter\x12<\n\toperation\x18\x01 \x01(\x0e\x32$.mo_caffe.EltwiseParameter.EltwiseOp:\x03SUM\x12\r\n\x05\x63oeff\x18\x02 \x03(\x02\x12\x1e\n\x10stable_prod_grad\x18\x03 \x01(\x08:\x04true\"\'\n\tEltwiseOp\x12\x08\n\x04PROD\x10\x00\x12\x07\n\x03SUM\x10\x01\x12\x07\n\x03MAX\x10\x02\" \n\x0c\x45LUParameter\x12\x10\n\x05\x61lpha\x18\x01 \x01(\x02:\x01\x31\"\xb2\x01\n\x0e\x45mbedParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x11\n\tinput_dim\x18\x02 \x01(\r\x12\x17\n\tbias_term\x18\x03 \x01(\x08:\x04true\x12\x30\n\rweight_filler\x18\x04 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\"D\n\x0c\x45xpParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"9\n\x10\x46lattenParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x14\n\x08\x65nd_axis\x18\x02 \x01(\x05:\x02-1\"O\n\x11HDF5DataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x02 \x01(\r\x12\x16\n\x07shuffle\x18\x03 \x01(\x08:\x05\x66\x61lse\"(\n\x13HDF5OutputParameter\x12\x11\n\tfile_name\x18\x01 
\x01(\t\"a\n\x12HingeLossParameter\x12\x33\n\x04norm\x18\x01 \x01(\x0e\x32!.mo_caffe.HingeLossParameter.Norm:\x02L1\"\x16\n\x04Norm\x12\x06\n\x02L1\x10\x01\x12\x06\n\x02L2\x10\x02\"\x97\x02\n\x12ImageDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x15\n\nbatch_size\x18\x04 \x01(\r:\x01\x31\x12\x14\n\trand_skip\x18\x07 \x01(\r:\x01\x30\x12\x16\n\x07shuffle\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x15\n\nnew_height\x18\t \x01(\r:\x01\x30\x12\x14\n\tnew_width\x18\n \x01(\r:\x01\x30\x12\x16\n\x08is_color\x18\x0b \x01(\x08:\x04true\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\x0c \x01(\t:\x00\"\'\n\x15InfogainLossParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\"\xd1\x01\n\x15InnerProductParameter\x12\x12\n\nnum_output\x18\x01 \x01(\r\x12\x17\n\tbias_term\x18\x02 \x01(\x08:\x04true\x12\x30\n\rweight_filler\x18\x03 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x04 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x0f\n\x04\x61xis\x18\x05 \x01(\x05:\x01\x31\x12\x18\n\ttranspose\x18\x06 \x01(\x08:\x05\x66\x61lse\"4\n\x0eInputParameter\x12\"\n\x05shape\x18\x01 \x03(\x0b\x32\x13.mo_caffe.BlobShape\"D\n\x0cLogParameter\x12\x10\n\x04\x62\x61se\x18\x01 \x01(\x02:\x02-1\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"\xbe\x02\n\x0cLRNParameter\x12\x15\n\nlocal_size\x18\x01 \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x02 \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x03 \x01(\x02:\x04\x30.75\x12G\n\x0bnorm_region\x18\x04 \x01(\x0e\x32!.mo_caffe.LRNParameter.NormRegion:\x0f\x41\x43ROSS_CHANNELS\x12\x0c\n\x01k\x18\x05 \x01(\x02:\x01\x31\x12\x36\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1d.mo_caffe.LRNParameter.Engine:\x07\x44\x45\x46\x41ULT\"5\n\nNormRegion\x12\x13\n\x0f\x41\x43ROSS_CHANNELS\x10\x00\x12\x12\n\x0eWITHIN_CHANNEL\x10\x01\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\x1f\n\x0cGRNParameter\x12\x0f\n\x04\x62ias\x18\x01 \x01(\x02:\x01\x31\"Z\n\x13MemoryDataParameter\x12\x12\n\nbatch_size\x18\x01 \x01(\r\x12\x10\n\x08\x63hannels\x18\x02 \x01(\r\x12\x0e\n\x06height\x18\x03 \x01(\r\x12\r\n\x05width\x18\x04 \x01(\r\"d\n\x0cMVNParameter\x12 \n\x12normalize_variance\x18\x01 \x01(\x08:\x04true\x12\x1e\n\x0f\x61\x63ross_channels\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x12\n\x03\x65ps\x18\x03 \x01(\x02:\x05\x31\x65-09\"8\n\x12ParameterParameter\x12\"\n\x05shape\x18\x01 \x01(\x0b\x32\x13.mo_caffe.BlobShape\"\xc1\x03\n\x10PoolingParameter\x12\x38\n\x04pool\x18\x01 \x01(\x0e\x32%.mo_caffe.PoolingParameter.PoolMethod:\x03MAX\x12\x0e\n\x03pad\x18\x04 \x01(\r:\x01\x30\x12\x10\n\x05pad_h\x18\t \x01(\r:\x01\x30\x12\x10\n\x05pad_w\x18\n \x01(\r:\x01\x30\x12\x13\n\x0bkernel_size\x18\x02 \x01(\r\x12\x10\n\x08kernel_h\x18\x05 \x01(\r\x12\x10\n\x08kernel_w\x18\x06 \x01(\r\x12\x11\n\x06stride\x18\x03 \x01(\r:\x01\x31\x12\x10\n\x08stride_h\x18\x07 \x01(\r\x12\x10\n\x08stride_w\x18\x08 \x01(\r\x12:\n\x06\x65ngine\x18\x0b \x01(\x0e\x32!.mo_caffe.PoolingParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x1d\n\x0eglobal_pooling\x18\x0c \x01(\x08:\x05\x66\x61lse\x12\x17\n\tceil_mode\x18\r 
\x01(\x08:\x04true\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"F\n\x0ePowerParameter\x12\x10\n\x05power\x18\x01 \x01(\x02:\x01\x31\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x10\n\x05shift\x18\x03 \x01(\x02:\x01\x30\"\xd4\x02\n\x11PriorBoxParameter\x12\x10\n\x08min_size\x18\x01 \x03(\x02\x12\x10\n\x08max_size\x18\x02 \x03(\x02\x12\x14\n\x0c\x61spect_ratio\x18\x03 \x03(\x02\x12\x12\n\x04\x66lip\x18\x04 \x01(\x08:\x04true\x12\x13\n\x04\x63lip\x18\x05 \x01(\x08:\x05\x66\x61lse\x12\x10\n\x08variance\x18\x06 \x03(\x02\x12\x10\n\x08img_size\x18\x07 \x01(\r\x12\r\n\x05img_h\x18\x08 \x01(\r\x12\r\n\x05img_w\x18\t \x01(\r\x12\x0c\n\x04step\x18\n \x01(\x02\x12\x0e\n\x06step_h\x18\x0b \x01(\x02\x12\x0e\n\x06step_w\x18\x0c \x01(\x02\x12\x13\n\x06offset\x18\r \x01(\x02:\x03\x30.5\x12\r\n\x05width\x18\x0e \x03(\x02\x12\x0e\n\x06height\x18\x0f \x03(\x02\"8\n\x08\x43odeType\x12\n\n\x06\x43ORNER\x10\x01\x12\x0f\n\x0b\x43\x45NTER_SIZE\x10\x02\x12\x0f\n\x0b\x43ORNER_SIZE\x10\x03\"V\n\x15PSROIPoolingParameter\x12\x15\n\rspatial_scale\x18\x01 \x02(\x02\x12\x12\n\noutput_dim\x18\x02 \x02(\x05\x12\x12\n\ngroup_size\x18\x03 \x02(\x05\"g\n\x0fPythonParameter\x12\x0e\n\x06module\x18\x01 \x01(\t\x12\r\n\x05layer\x18\x02 \x01(\t\x12\x13\n\tparam_str\x18\x03 \x01(\t:\x00\x12 \n\x11share_in_parallel\x18\x04 \x01(\x08:\x05\x66\x61lse\"\xc6\x01\n\x12RecurrentParameter\x12\x15\n\nnum_output\x18\x01 \x01(\r:\x01\x30\x12\x30\n\rweight_filler\x18\x02 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x03 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x19\n\ndebug_info\x18\x04 \x01(\x08:\x05\x66\x61lse\x12\x1c\n\rexpose_hidden\x18\x05 \x01(\x08:\x05\x66\x61lse\"\xb0\x01\n\x12ReductionParameter\x12@\n\toperation\x18\x01 \x01(\x0e\x32(.mo_caffe.ReductionParameter.ReductionOp:\x03SUM\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x10\n\x05\x63oeff\x18\x03 \x01(\x02:\x01\x31\"5\n\x0bReductionOp\x12\x07\n\x03SUM\x10\x01\x12\x08\n\x04\x41SUM\x10\x02\x12\t\n\x05SUMSQ\x10\x03\x12\x08\n\x04MEAN\x10\x04\"\x90\x01\n\rReLUParameter\x12\x19\n\x0enegative_slope\x18\x01 \x01(\x02:\x01\x30\x12\x37\n\x06\x65ngine\x18\x02 \x01(\x0e\x32\x1e.mo_caffe.ReLUParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\x1e\n\x0eReLU6Parameter\x12\x0c\n\x01n\x18\x01 \x01(\x02:\x01\x36\"]\n\x10ReshapeParameter\x12\"\n\x05shape\x18\x01 \x01(\x0b\x32\x13.mo_caffe.BlobShape\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x30\x12\x14\n\x08num_axes\x18\x03 \x01(\x05:\x02-1\"#\n\x10ReverseParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x30\"Y\n\x13ROIPoolingParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"]\n\x17ROIWarpingTestParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"Y\n\x13ROIWarpingParameter\x12\x13\n\x08pooled_h\x18\x01 \x01(\r:\x01\x30\x12\x13\n\x08pooled_w\x18\x02 \x01(\r:\x01\x30\x12\x18\n\rspatial_scale\x18\x03 \x01(\x02:\x01\x31\"\xab\x01\n\x0eScaleParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\x13\n\x08num_axes\x18\x02 \x01(\x05:\x01\x31\x12)\n\x06\x66iller\x18\x03 
\x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x18\n\tbias_term\x18\x04 \x01(\x08:\x05\x66\x61lse\x12.\n\x0b\x62ias_filler\x18\x05 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\"{\n\x10SigmoidParameter\x12:\n\x06\x65ngine\x18\x01 \x01(\x0e\x32!.mo_caffe.SigmoidParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"L\n\x0eSliceParameter\x12\x0f\n\x04\x61xis\x18\x03 \x01(\x05:\x01\x31\x12\x13\n\x0bslice_point\x18\x02 \x03(\r\x12\x14\n\tslice_dim\x18\x01 \x01(\r:\x01\x31\")\n\x15SmoothL1LossParameter\x12\x10\n\x05sigma\x18\x01 \x01(\x02:\x01\x31\"\x8c\x01\n\x10SoftmaxParameter\x12:\n\x06\x65ngine\x18\x01 \x01(\x0e\x32!.mo_caffe.SoftmaxParameter.Engine:\x07\x44\x45\x46\x41ULT\x12\x0f\n\x04\x61xis\x18\x02 \x01(\x05:\x01\x31\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"u\n\rTanHParameter\x12\x37\n\x06\x65ngine\x18\x01 \x01(\x0e\x32\x1e.mo_caffe.TanHParameter.Engine:\x07\x44\x45\x46\x41ULT\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"/\n\rTileParameter\x12\x0f\n\x04\x61xis\x18\x01 \x01(\x05:\x01\x31\x12\r\n\x05tiles\x18\x02 \x01(\x05\"*\n\x12ThresholdParameter\x12\x14\n\tthreshold\x18\x01 \x01(\x02:\x01\x30\"\xc1\x02\n\x13WindowDataParameter\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x10\n\x05scale\x18\x02 \x01(\x02:\x01\x31\x12\x11\n\tmean_file\x18\x03 \x01(\t\x12\x12\n\nbatch_size\x18\x04 \x01(\r\x12\x14\n\tcrop_size\x18\x05 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x06 \x01(\x08:\x05\x66\x61lse\x12\x19\n\x0c\x66g_threshold\x18\x07 \x01(\x02:\x03\x30.5\x12\x19\n\x0c\x62g_threshold\x18\x08 \x01(\x02:\x03\x30.5\x12\x19\n\x0b\x66g_fraction\x18\t \x01(\x02:\x04\x30.25\x12\x16\n\x0b\x63ontext_pad\x18\n \x01(\r:\x01\x30\x12\x17\n\tcrop_mode\x18\x0b \x01(\t:\x04warp\x12\x1b\n\x0c\x63\x61\x63he_images\x18\x0c \x01(\x08:\x05\x66\x61lse\x12\x15\n\x0broot_folder\x18\r \x01(\t:\x00\"\xf1\x01\n\x0cSPPParameter\x12\x16\n\x0epyramid_height\x18\x01 \x01(\r\x12\x34\n\x04pool\x18\x02 \x01(\x0e\x32!.mo_caffe.SPPParameter.PoolMethod:\x03MAX\x12\x36\n\x06\x65ngine\x18\x06 \x01(\x0e\x32\x1d.mo_caffe.SPPParameter.Engine:\x07\x44\x45\x46\x41ULT\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"+\n\x06\x45ngine\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\t\n\x05\x43\x41\x46\x46\x45\x10\x01\x12\t\n\x05\x43UDNN\x10\x02\"\xcc\x14\n\x10V1LayerParameter\x12\x0e\n\x06\x62ottom\x18\x02 \x03(\t\x12\x0b\n\x03top\x18\x03 \x03(\t\x12\x0c\n\x04name\x18\x04 \x01(\t\x12\'\n\x07include\x18  \x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12\'\n\x07\x65xclude\x18! 
\x03(\x0b\x32\x16.mo_caffe.NetStateRule\x12\x32\n\x04type\x18\x05 \x01(\x0e\x32$.mo_caffe.V1LayerParameter.LayerType\x12\"\n\x05\x62lobs\x18\x06 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x0e\n\x05param\x18\xe9\x07 \x03(\t\x12\x41\n\x0f\x62lob_share_mode\x18\xea\x07 \x03(\x0e\x32\'.mo_caffe.V1LayerParameter.DimCheckMode\x12\x10\n\x08\x62lobs_lr\x18\x07 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x08 \x03(\x02\x12\x13\n\x0bloss_weight\x18# \x03(\x02\x12\x33\n\x0e\x61\x63\x63uracy_param\x18\x1b \x01(\x0b\x32\x1b.mo_caffe.AccuracyParameter\x12/\n\x0c\x61rgmax_param\x18\x17 \x01(\x0b\x32\x19.mo_caffe.ArgMaxParameter\x12/\n\x0c\x63oncat_param\x18\t \x01(\x0b\x32\x19.mo_caffe.ConcatParameter\x12\x42\n\x16\x63ontrastive_loss_param\x18( \x01(\x0b\x32\".mo_caffe.ContrastiveLossParameter\x12\x39\n\x11\x63onvolution_param\x18\n \x01(\x0b\x32\x1e.mo_caffe.ConvolutionParameter\x12+\n\ndata_param\x18\x0b \x01(\x0b\x32\x17.mo_caffe.DataParameter\x12\x31\n\rdropout_param\x18\x0c \x01(\x0b\x32\x1a.mo_caffe.DropoutParameter\x12\x36\n\x10\x64ummy_data_param\x18\x1a \x01(\x0b\x32\x1c.mo_caffe.DummyDataParameter\x12\x31\n\reltwise_param\x18\x18 \x01(\x0b\x32\x1a.mo_caffe.EltwiseParameter\x12)\n\texp_param\x18) \x01(\x0b\x32\x16.mo_caffe.ExpParameter\x12\x34\n\x0fhdf5_data_param\x18\r \x01(\x0b\x32\x1b.mo_caffe.HDF5DataParameter\x12\x38\n\x11hdf5_output_param\x18\x0e \x01(\x0b\x32\x1d.mo_caffe.HDF5OutputParameter\x12\x36\n\x10hinge_loss_param\x18\x1d \x01(\x0b\x32\x1c.mo_caffe.HingeLossParameter\x12\x36\n\x10image_data_param\x18\x0f \x01(\x0b\x32\x1c.mo_caffe.ImageDataParameter\x12<\n\x13infogain_loss_param\x18\x10 \x01(\x0b\x32\x1f.mo_caffe.InfogainLossParameter\x12<\n\x13inner_product_param\x18\x11 \x01(\x0b\x32\x1f.mo_caffe.InnerProductParameter\x12)\n\tlrn_param\x18\x12 \x01(\x0b\x32\x16.mo_caffe.LRNParameter\x12\x38\n\x11memory_data_param\x18\x16 \x01(\x0b\x32\x1d.mo_caffe.MemoryDataParameter\x12)\n\tmvn_param\x18\" \x01(\x0b\x32\x16.mo_caffe.MVNParameter\x12\x31\n\rpooling_param\x18\x13 \x01(\x0b\x32\x1a.mo_caffe.PoolingParameter\x12-\n\x0bpower_param\x18\x15 \x01(\x0b\x32\x18.mo_caffe.PowerParameter\x12+\n\nrelu_param\x18\x1e \x01(\x0b\x32\x17.mo_caffe.ReLUParameter\x12\x31\n\rsigmoid_param\x18& \x01(\x0b\x32\x1a.mo_caffe.SigmoidParameter\x12\x31\n\rsoftmax_param\x18\' \x01(\x0b\x32\x1a.mo_caffe.SoftmaxParameter\x12-\n\x0bslice_param\x18\x1f \x01(\x0b\x32\x18.mo_caffe.SliceParameter\x12+\n\ntanh_param\x18% \x01(\x0b\x32\x17.mo_caffe.TanHParameter\x12\x35\n\x0fthreshold_param\x18\x19 \x01(\x0b\x32\x1c.mo_caffe.ThresholdParameter\x12\x38\n\x11window_data_param\x18\x14 \x01(\x0b\x32\x1d.mo_caffe.WindowDataParameter\x12:\n\x0ftransform_param\x18$ \x01(\x0b\x32!.mo_caffe.TransformationParameter\x12+\n\nloss_param\x18* \x01(\x0b\x32\x17.mo_caffe.LossParameter\x12)\n\x05layer\x18\x01 \x01(\x0b\x32\x1a.mo_caffe.V0LayerParameter\"\xd8\x04\n\tLayerType\x12\x08\n\x04NONE\x10\x00\x12\n\n\x06\x41\x42SVAL\x10#\x12\x0c\n\x08\x41\x43\x43URACY\x10\x01\x12\n\n\x06\x41RGMAX\x10\x1e\x12\x08\n\x04\x42NLL\x10\x02\x12\n\n\x06\x43ONCAT\x10\x03\x12\x14\n\x10\x43ONTRASTIVE_LOSS\x10%\x12\x0f\n\x0b\x43ONVOLUTION\x10\x04\x12\x08\n\x04\x44\x41TA\x10\x05\x12\x11\n\rDECONVOLUTION\x10\'\x12\x0b\n\x07\x44ROPOUT\x10\x06\x12\x0e\n\nDUMMY_DATA\x10 
\x12\x12\n\x0e\x45UCLIDEAN_LOSS\x10\x07\x12\x0b\n\x07\x45LTWISE\x10\x19\x12\x07\n\x03\x45XP\x10&\x12\x0b\n\x07\x46LATTEN\x10\x08\x12\r\n\tHDF5_DATA\x10\t\x12\x0f\n\x0bHDF5_OUTPUT\x10\n\x12\x0e\n\nHINGE_LOSS\x10\x1c\x12\n\n\x06IM2COL\x10\x0b\x12\x0e\n\nIMAGE_DATA\x10\x0c\x12\x11\n\rINFOGAIN_LOSS\x10\r\x12\x11\n\rINNER_PRODUCT\x10\x0e\x12\x07\n\x03LRN\x10\x0f\x12\x0f\n\x0bMEMORY_DATA\x10\x1d\x12\x1d\n\x19MULTINOMIAL_LOGISTIC_LOSS\x10\x10\x12\x07\n\x03MVN\x10\"\x12\x0b\n\x07POOLING\x10\x11\x12\t\n\x05POWER\x10\x1a\x12\x08\n\x04RELU\x10\x12\x12\x0b\n\x07SIGMOID\x10\x13\x12\x1e\n\x1aSIGMOID_CROSS_ENTROPY_LOSS\x10\x1b\x12\x0b\n\x07SILENCE\x10$\x12\x0b\n\x07SOFTMAX\x10\x14\x12\x10\n\x0cSOFTMAX_LOSS\x10\x15\x12\t\n\x05SPLIT\x10\x16\x12\t\n\x05SLICE\x10!\x12\x08\n\x04TANH\x10\x17\x12\x0f\n\x0bWINDOW_DATA\x10\x18\x12\r\n\tTHRESHOLD\x10\x1f\"*\n\x0c\x44imCheckMode\x12\n\n\x06STRICT\x10\x00\x12\x0e\n\nPERMISSIVE\x10\x01\"\x8c\x08\n\x10V0LayerParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\x12\x12\n\nnum_output\x18\x03 \x01(\r\x12\x16\n\x08\x62iasterm\x18\x04 \x01(\x08:\x04true\x12\x30\n\rweight_filler\x18\x05 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12.\n\x0b\x62ias_filler\x18\x06 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x0e\n\x03pad\x18\x07 \x01(\r:\x01\x30\x12\x12\n\nkernelsize\x18\x08 \x01(\r\x12\x10\n\x05group\x18\t \x01(\r:\x01\x31\x12\x11\n\x06stride\x18\n \x01(\r:\x01\x31\x12\x38\n\x04pool\x18\x0b \x01(\x0e\x32%.mo_caffe.V0LayerParameter.PoolMethod:\x03MAX\x12\x1a\n\rdropout_ratio\x18\x0c \x01(\x02:\x03\x30.5\x12\x15\n\nlocal_size\x18\r \x01(\r:\x01\x35\x12\x10\n\x05\x61lpha\x18\x0e \x01(\x02:\x01\x31\x12\x12\n\x04\x62\x65ta\x18\x0f \x01(\x02:\x04\x30.75\x12\x0c\n\x01k\x18\x16 \x01(\x02:\x01\x31\x12\x0e\n\x06source\x18\x10 \x01(\t\x12\x10\n\x05scale\x18\x11 \x01(\x02:\x01\x31\x12\x10\n\x08meanfile\x18\x12 \x01(\t\x12\x11\n\tbatchsize\x18\x13 \x01(\r\x12\x13\n\x08\x63ropsize\x18\x14 \x01(\r:\x01\x30\x12\x15\n\x06mirror\x18\x15 \x01(\x08:\x05\x66\x61lse\x12\"\n\x05\x62lobs\x18\x32 \x03(\x0b\x32\x13.mo_caffe.BlobProto\x12\x10\n\x08\x62lobs_lr\x18\x33 \x03(\x02\x12\x14\n\x0cweight_decay\x18\x34 \x03(\x02\x12\x14\n\trand_skip\x18\x35 \x01(\r:\x01\x30\x12\x1d\n\x10\x64\x65t_fg_threshold\x18\x36 \x01(\x02:\x03\x30.5\x12\x1d\n\x10\x64\x65t_bg_threshold\x18\x37 \x01(\x02:\x03\x30.5\x12\x1d\n\x0f\x64\x65t_fg_fraction\x18\x38 \x01(\x02:\x04\x30.25\x12\x1a\n\x0f\x64\x65t_context_pad\x18: \x01(\r:\x01\x30\x12\x1b\n\rdet_crop_mode\x18; \x01(\t:\x04warp\x12\x12\n\x07new_num\x18< \x01(\x05:\x01\x30\x12\x17\n\x0cnew_channels\x18= \x01(\x05:\x01\x30\x12\x15\n\nnew_height\x18> \x01(\x05:\x01\x30\x12\x14\n\tnew_width\x18? 
\x01(\x05:\x01\x30\x12\x1d\n\x0eshuffle_images\x18@ \x01(\x08:\x05\x66\x61lse\x12\x15\n\nconcat_dim\x18\x41 \x01(\r:\x01\x31\x12\x39\n\x11hdf5_output_param\x18\xe9\x07 \x01(\x0b\x32\x1d.mo_caffe.HDF5OutputParameter\".\n\nPoolMethod\x12\x07\n\x03MAX\x10\x00\x12\x07\n\x03\x41VE\x10\x01\x12\x0e\n\nSTOCHASTIC\x10\x02\"Z\n\x0ePReLUParameter\x12)\n\x06\x66iller\x18\x01 \x01(\x0b\x32\x19.mo_caffe.FillerParameter\x12\x1d\n\x0e\x63hannel_shared\x18\x02 \x01(\x08:\x05\x66\x61lse\"\x86\x01\n\x13RegionYoloParameter\x12\x11\n\x06\x63oords\x18\x01 \x01(\x05:\x01\x34\x12\x13\n\x07\x63lasses\x18\x02 \x01(\x05:\x02\x32\x30\x12\x0e\n\x03num\x18\x03 \x01(\x05:\x01\x31\x12\x18\n\ndo_softmax\x18\x04 \x01(\x08:\x04true\x12\x0f\n\x07\x61nchors\x18\x05 \x03(\x02\x12\x0c\n\x04mask\x18\x06 \x03(\x05\"\'\n\x12ReorgYoloParameter\x12\x11\n\x06stride\x18\x01 \x01(\x05:\x01\x31\"\xcf\x01\n\x18RandomGeneratorParameter\x12\x1a\n\trand_type\x18\x01 \x01(\t:\x07uniform\x12\x12\n\x03\x65xp\x18\x02 \x01(\x08:\x05\x66\x61lse\x12\x0f\n\x04mean\x18\x04 \x01(\x02:\x01\x30\x12\x11\n\x06spread\x18\x05 \x01(\x02:\x01\x30\x12\x0f\n\x04prob\x18\x06 \x01(\x02:\x01\x31\x12\x1c\n\x0e\x61pply_schedule\x18\x07 \x01(\x08:\x04true\x12\x19\n\ndiscretize\x18\x08 \x01(\x08:\x05\x66\x61lse\x12\x15\n\nmultiplier\x18\t \x01(\x02:\x01\x31\"`\n\x16\x43oeffScheduleParameter\x12\x14\n\thalf_life\x18\x01 \x01(\x02:\x01\x31\x12\x18\n\rinitial_coeff\x18\x02 \x01(\x02:\x01\x31\x12\x16\n\x0b\x66inal_coeff\x18\x03 \x01(\x02:\x01\x31\"\xde\x07\n\x11\x41ugmentationCoeff\x12\x11\n\x06mirror\x18\x01 \x01(\x02:\x01\x30\x12\r\n\x02\x64x\x18\x02 \x01(\x02:\x01\x30\x12\r\n\x02\x64y\x18\x03 \x01(\x02:\x01\x30\x12\x10\n\x05\x61ngle\x18\x04 \x01(\x02:\x01\x30\x12\x11\n\x06zoom_x\x18\x05 \x01(\x02:\x01\x31\x12\x11\n\x06zoom_y\x18\x06 \x01(\x02:\x01\x31\x12\x10\n\x05gamma\x18\x64 \x01(\x02:\x01\x31\x12\x15\n\nbrightness\x18\x65 \x01(\x02:\x01\x30\x12\x13\n\x08\x63ontrast\x18\x66 \x01(\x02:\x01\x31\x12\x11\n\x06\x63olor1\x18g \x01(\x02:\x01\x31\x12\x11\n\x06\x63olor2\x18h \x01(\x02:\x01\x31\x12\x11\n\x06\x63olor3\x18i \x01(\x02:\x01\x31\x12\x16\n\x0bpow_nomean0\x18\n \x01(\x02:\x01\x31\x12\x16\n\x0bpow_nomean1\x18\x0b \x01(\x02:\x01\x31\x12\x16\n\x0bpow_nomean2\x18\x0c \x01(\x02:\x01\x31\x12\x16\n\x0b\x61\x64\x64_nomean0\x18\r \x01(\x02:\x01\x30\x12\x16\n\x0b\x61\x64\x64_nomean1\x18\x0e \x01(\x02:\x01\x30\x12\x16\n\x0b\x61\x64\x64_nomean2\x18\x0f \x01(\x02:\x01\x30\x12\x17\n\x0cmult_nomean0\x18\x10 \x01(\x02:\x01\x31\x12\x17\n\x0cmult_nomean1\x18\x11 \x01(\x02:\x01\x31\x12\x17\n\x0cmult_nomean2\x18\x12 \x01(\x02:\x01\x31\x12\x18\n\rpow_withmean0\x18\x13 \x01(\x02:\x01\x31\x12\x18\n\rpow_withmean1\x18\x14 \x01(\x02:\x01\x31\x12\x18\n\rpow_withmean2\x18\x15 \x01(\x02:\x01\x31\x12\x18\n\radd_withmean0\x18\x16 \x01(\x02:\x01\x30\x12\x18\n\radd_withmean1\x18\x17 \x01(\x02:\x01\x30\x12\x18\n\radd_withmean2\x18\x18 \x01(\x02:\x01\x30\x12\x19\n\x0emult_withmean0\x18\x19 \x01(\x02:\x01\x31\x12\x19\n\x0emult_withmean1\x18\x1a \x01(\x02:\x01\x31\x12\x19\n\x0emult_withmean2\x18\x1b \x01(\x02:\x01\x31\x12\x14\n\tlmult_pow\x18\x1c \x01(\x02:\x01\x31\x12\x14\n\tlmult_add\x18\x1d \x01(\x02:\x01\x30\x12\x15\n\nlmult_mult\x18\x1e \x01(\x02:\x01\x31\x12\x14\n\tcol_angle\x18\x1f \x01(\x02:\x01\x30\x12\x15\n\nfog_amount\x18& \x01(\x02:\x01\x30\x12\x13\n\x08\x66og_size\x18\' \x01(\x02:\x01\x30\x12\x1c\n\x11motion_blur_angle\x18( \x01(\x02:\x01\x30\x12\x1b\n\x10motion_blur_size\x18) \x01(\x02:\x01\x30\x12\x17\n\x0cshadow_angle\x18* \x01(\x02:\x01\x30\x12\x1a\n\x0fshadow_distance\x18+ 
\x01(\x02:\x01\x30\x12\x1a\n\x0fshadow_strength\x18, \x01(\x02:\x01\x30\x12\x10\n\x05noise\x18- \x01(\x02:\x01\x30\"\xcc\x10\n\x15\x41ugmentationParameter\x12\x15\n\ncrop_width\x18! \x01(\r:\x01\x30\x12\x16\n\x0b\x63rop_height\x18\" \x01(\r:\x01\x30\x12\x19\n\x0fwrite_augmented\x18\x02 \x01(\t:\x00\x12\x1b\n\x0emax_multiplier\x18\x03 \x01(\x02:\x03\x32\x35\x35\x12\"\n\x13\x61ugment_during_test\x18\x04 \x01(\x08:\x05\x66\x61lse\x12\x19\n\x0erecompute_mean\x18\x05 \x01(\r:\x01\x30\x12\x14\n\nwrite_mean\x18\x06 \x01(\t:\x00\x12\x1c\n\x0emean_per_pixel\x18\x07 \x01(\x08:\x04true\x12\x0c\n\x04mean\x18\x12 \x03(\x02\x12\x11\n\x04mode\x18\x08 \x01(\t:\x03\x61\x64\x64\x12\x16\n\x0b\x62ottomwidth\x18P \x01(\r:\x01\x30\x12\x17\n\x0c\x62ottomheight\x18Q \x01(\r:\x01\x30\x12\x0e\n\x03num\x18R \x01(\r:\x01\x30\x12\x18\n\x10\x63hromatic_eigvec\x18S \x03(\x02\x12\x32\n\x06mirror\x18\n \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\ttranslate\x18\x0b \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x32\n\x06rotate\x18\x0c \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x30\n\x04zoom\x18\r \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07squeeze\x18\x0e \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x37\n\x0btranslate_x\x18\x0f \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x37\n\x0btranslate_y\x18\x10 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x31\n\x05gamma\x18# \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\nbrightness\x18$ \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08\x63ontrast\x18% \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x31\n\x05\x63olor\x18& \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\tlmult_pow\x18\x14 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\nlmult_mult\x18\x15 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\tlmult_add\x18\x16 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07sat_pow\x18\x17 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08sat_mult\x18\x18 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07sat_add\x18\x19 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07\x63ol_pow\x18\x1a \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08\x63ol_mult\x18\x1b \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x33\n\x07\x63ol_add\x18\x1c \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08ladd_pow\x18\x1d \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x35\n\tladd_mult\x18\x1e \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08ladd_add\x18\x1f \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\ncol_rotate\x18  \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x36\n\nfog_amount\x18\x64 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x34\n\x08\x66og_size\x18\x65 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12=\n\x11motion_blur_angle\x18\x66 \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12<\n\x10motion_blur_size\x18g \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x38\n\x0cshadow_angle\x18h \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12;\n\x0fshadow_distance\x18i \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12;\n\x0fshadow_strength\x18j \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\x12\x31\n\x05noise\x18k \x01(\x0b\x32\".mo_caffe.RandomGeneratorParameter\"\x85\x01\n\x11\x46lowWarpParameter\x12\x43\n\nfill_value\x18\x01 
\x01(\x0e\x32).mo_caffe.FlowWarpParameter.FillParameter:\x04ZERO\"+\n\rFillParameter\x12\x08\n\x04ZERO\x10\x01\x12\x10\n\x0cNOT_A_NUMBER\x10\x02\"\xb6\x02\n\x14\x43orrelationParameter\x12\x0e\n\x03pad\x18\x02 \x01(\r:\x01\x30\x12\x13\n\x0bkernel_size\x18\x03 \x01(\r\x12\x18\n\x10max_displacement\x18\x04 \x01(\r\x12\x13\n\x08stride_1\x18\x05 \x01(\r:\x01\x31\x12\x13\n\x08stride_2\x18\x06 \x01(\r:\x01\x31\x12\x1b\n\x10single_direction\x18\x08 \x01(\x05:\x01\x30\x12\x15\n\x06\x64o_abs\x18\x07 \x01(\x08:\x05\x66\x61lse\x12R\n\x10\x63orrelation_type\x18\x0f \x01(\x0e\x32..mo_caffe.CorrelationParameter.CorrelationType:\x08MULTIPLY\"-\n\x0f\x43orrelationType\x12\x0c\n\x08MULTIPLY\x10\x00\x12\x0c\n\x08SUBTRACT\x10\x01\"\xdc\x01\n\x11ResampleParameter\x12\x17\n\tantialias\x18\x04 \x01(\x08:\x04true\x12\r\n\x05width\x18\x01 \x01(\r\x12\x0e\n\x06height\x18\x02 \x01(\r\x12>\n\x04type\x18\x03 \x01(\x0e\x32(.mo_caffe.ResampleParameter.ResampleType:\x06LINEAR\x12\x11\n\x06\x66\x61\x63tor\x18\x05 \x01(\x02:\x01\x31\"<\n\x0cResampleType\x12\x0b\n\x07NEAREST\x10\x01\x12\n\n\x06LINEAR\x10\x02\x12\t\n\x05\x43UBIC\x10\x03\x12\x08\n\x04\x41REA\x10\x04\"z\n\x0e\x41\x63\x63umParameter\x12\x15\n\ntop_height\x18\x01 \x01(\r:\x01\x30\x12\x14\n\ttop_width\x18\x02 \x01(\r:\x01\x30\x12\x1c\n\x11size_divisible_by\x18\x03 \x01(\r:\x01\x30\x12\x1d\n\x0ehave_reference\x18\x04 \x01(\x08:\x05\x66\x61lse\"(\n\x17ShuffleChannelParameter\x12\r\n\x05group\x18\x01 \x02(\r*\x1c\n\x05Phase\x12\t\n\x05TRAIN\x10\x00\x12\x08\n\x04TEST\x10\x01')
 )
 _sym_db.RegisterFileDescriptor(DESCRIPTOR)
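Note on the offset hunks that follow: `serialized_start`/`serialized_end` are byte offsets into the `DESCRIPTOR.serialized_pb` blob above, so they shift by a constant whenever bytes are inserted into the blob. This revision inserts two things into `mo_caffe`: a `shuffle_channel_param` entry inside `LayerParameter` (65 payload bytes plus a 2-byte tag/length prefix, 67 in total) and the `ShuffleChannelParameter` message definition itself just before the trailing `Phase` enum (42 bytes). Descriptors located between the two insertion points therefore move by +67, while the `Phase` enum, which sits after both, moves by +67+42 = +109. A minimal sanity sketch, using only offsets visible verbatim in this diff:

    # Sketch: serialized_start/serialized_end index into DESCRIPTOR.serialized_pb,
    # so an insertion shifts every later descriptor by the number of inserted bytes.
    # All old/new pairs below are copied from the hunks in this commit.
    shifts = {
        '_LAYERPARAMETER.serialized_end':    (8373, 8440),    # just past the new field
        '_LOSSPARAMETER_NORMALIZATIONMODE':  (9437, 9504),    # between the insertions
        '_FLOWWARPPARAMETER_FILLPARAMETER':  (25927, 25994),  # still before the new message
        '_PHASE':                            (26632, 26741),  # after both insertions
    }
    for name, (old, new) in shifts.items():
        print(name, new - old)  # prints 67, 67, 67, then 109 == 67 + 42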
 
@@ -40,8 +40,8 @@ _PHASE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=26632,
-  serialized_end=26660,
+  serialized_start=26741,
+  serialized_end=26769,
 )
 _sym_db.RegisterEnumDescriptor(_PHASE)
 
@@ -209,8 +209,8 @@ _LOSSPARAMETER_NORMALIZATIONMODE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=9437,
-  serialized_end=9518,
+  serialized_start=9504,
+  serialized_end=9585,
 )
 _sym_db.RegisterEnumDescriptor(_LOSSPARAMETER_NORMALIZATIONMODE)
 
@@ -235,8 +235,8 @@ _CONVOLUTIONPARAMETER_ENGINE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=10798,
-  serialized_end=10841,
+  serialized_start=10865,
+  serialized_end=10908,
 )
 _sym_db.RegisterEnumDescriptor(_CONVOLUTIONPARAMETER_ENGINE)
 
@@ -257,8 +257,8 @@ _DATAPARAMETER_DB = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=11442,
-  serialized_end=11469,
+  serialized_start=11509,
+  serialized_end=11536,
 )
 _sym_db.RegisterEnumDescriptor(_DATAPARAMETER_DB)
 
@@ -283,8 +283,8 @@ _RESIZEPARAMETER_RESIZE_MODE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=11896,
-  serialized_end=11967,
+  serialized_start=11963,
+  serialized_end=12034,
 )
 _sym_db.RegisterEnumDescriptor(_RESIZEPARAMETER_RESIZE_MODE)
 
@@ -309,8 +309,8 @@ _RESIZEPARAMETER_PAD_MODE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=11969,
-  serialized_end=12027,
+  serialized_start=12036,
+  serialized_end=12094,
 )
 _sym_db.RegisterEnumDescriptor(_RESIZEPARAMETER_PAD_MODE)
 
@@ -343,8 +343,8 @@ _RESIZEPARAMETER_INTERP_MODE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=12029,
-  serialized_end=12102,
+  serialized_start=12096,
+  serialized_end=12169,
 )
 _sym_db.RegisterEnumDescriptor(_RESIZEPARAMETER_INTERP_MODE)
 
@@ -369,8 +369,8 @@ _ELTWISEPARAMETER_ELTWISEOP = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=13217,
-  serialized_end=13256,
+  serialized_start=13284,
+  serialized_end=13323,
 )
 _sym_db.RegisterEnumDescriptor(_ELTWISEPARAMETER_ELTWISEOP)
 
@@ -391,8 +391,8 @@ _HINGELOSSPARAMETER_NORM = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=13800,
-  serialized_end=13822,
+  serialized_start=13867,
+  serialized_end=13889,
 )
 _sym_db.RegisterEnumDescriptor(_HINGELOSSPARAMETER_NORM)
 
@@ -413,8 +413,8 @@ _LRNPARAMETER_NORMREGION = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=14704,
-  serialized_end=14757,
+  serialized_start=14771,
+  serialized_end=14824,
 )
 _sym_db.RegisterEnumDescriptor(_LRNPARAMETER_NORMREGION)
 
@@ -439,8 +439,8 @@ _LRNPARAMETER_ENGINE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=10798,
-  serialized_end=10841,
+  serialized_start=10865,
+  serialized_end=10908,
 )
 _sym_db.RegisterEnumDescriptor(_LRNPARAMETER_ENGINE)
 
@@ -465,8 +465,8 @@ _POOLINGPARAMETER_POOLMETHOD = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=15448,
-  serialized_end=15494,
+  serialized_start=15515,
+  serialized_end=15561,
 )
 _sym_db.RegisterEnumDescriptor(_POOLINGPARAMETER_POOLMETHOD)
 
@@ -491,8 +491,8 @@ _POOLINGPARAMETER_ENGINE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=10798,
-  serialized_end=10841,
+  serialized_start=10865,
+  serialized_end=10908,
 )
 _sym_db.RegisterEnumDescriptor(_POOLINGPARAMETER_ENGINE)
 
@@ -517,8 +517,8 @@ _PRIORBOXPARAMETER_CODETYPE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=15898,
-  serialized_end=15954,
+  serialized_start=15965,
+  serialized_end=16021,
 )
 _sym_db.RegisterEnumDescriptor(_PRIORBOXPARAMETER_CODETYPE)
 
@@ -547,8 +547,8 @@ _REDUCTIONPARAMETER_REDUCTIONOP = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=16474,
-  serialized_end=16527,
+  serialized_start=16541,
+  serialized_end=16594,
 )
 _sym_db.RegisterEnumDescriptor(_REDUCTIONPARAMETER_REDUCTIONOP)
 
@@ -573,8 +573,8 @@ _RELUPARAMETER_ENGINE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=10798,
-  serialized_end=10841,
+  serialized_start=10865,
+  serialized_end=10908,
 )
 _sym_db.RegisterEnumDescriptor(_RELUPARAMETER_ENGINE)
 
@@ -599,8 +599,8 @@ _SIGMOIDPARAMETER_ENGINE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=10798,
-  serialized_end=10841,
+  serialized_start=10865,
+  serialized_end=10908,
 )
 _sym_db.RegisterEnumDescriptor(_SIGMOIDPARAMETER_ENGINE)
 
@@ -625,8 +625,8 @@ _SOFTMAXPARAMETER_ENGINE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=10798,
-  serialized_end=10841,
+  serialized_start=10865,
+  serialized_end=10908,
 )
 _sym_db.RegisterEnumDescriptor(_SOFTMAXPARAMETER_ENGINE)
 
@@ -651,8 +651,8 @@ _TANHPARAMETER_ENGINE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=10798,
-  serialized_end=10841,
+  serialized_start=10865,
+  serialized_end=10908,
 )
 _sym_db.RegisterEnumDescriptor(_TANHPARAMETER_ENGINE)
 
@@ -677,8 +677,8 @@ _SPPPARAMETER_POOLMETHOD = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=15448,
-  serialized_end=15494,
+  serialized_start=15515,
+  serialized_end=15561,
 )
 _sym_db.RegisterEnumDescriptor(_SPPPARAMETER_POOLMETHOD)
 
@@ -703,8 +703,8 @@ _SPPPARAMETER_ENGINE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=10798,
-  serialized_end=10841,
+  serialized_start=10865,
+  serialized_end=10908,
 )
 _sym_db.RegisterEnumDescriptor(_SPPPARAMETER_ENGINE)
 
@@ -877,8 +877,8 @@ _V1LAYERPARAMETER_LAYERTYPE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=20453,
-  serialized_end=21053,
+  serialized_start=20520,
+  serialized_end=21120,
 )
 _sym_db.RegisterEnumDescriptor(_V1LAYERPARAMETER_LAYERTYPE)
 
@@ -925,8 +925,8 @@ _V0LAYERPARAMETER_POOLMETHOD = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=15448,
-  serialized_end=15494,
+  serialized_start=15515,
+  serialized_end=15561,
 )
 _sym_db.RegisterEnumDescriptor(_V0LAYERPARAMETER_POOLMETHOD)
 
@@ -947,8 +947,8 @@ _FLOWWARPPARAMETER_FILLPARAMETER = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=25927,
-  serialized_end=25970,
+  serialized_start=25994,
+  serialized_end=26037,
 )
 _sym_db.RegisterEnumDescriptor(_FLOWWARPPARAMETER_FILLPARAMETER)
 
@@ -969,8 +969,8 @@ _CORRELATIONPARAMETER_CORRELATIONTYPE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=26238,
-  serialized_end=26283,
+  serialized_start=26305,
+  serialized_end=26350,
 )
 _sym_db.RegisterEnumDescriptor(_CORRELATIONPARAMETER_CORRELATIONTYPE)
 
@@ -999,8 +999,8 @@ _RESAMPLEPARAMETER_RESAMPLETYPE = _descriptor.EnumDescriptor(
   ],
   containing_type=None,
   options=None,
-  serialized_start=26446,
-  serialized_end=26506,
+  serialized_start=26513,
+  serialized_end=26573,
 )
 _sym_db.RegisterEnumDescriptor(_RESAMPLEPARAMETER_RESAMPLETYPE)
 
@@ -2987,6 +2987,13 @@ _LAYERPARAMETER = _descriptor.Descriptor(
       message_type=None, enum_type=None, containing_type=None,
       is_extension=False, extension_scope=None,
       options=None),
+    _descriptor.FieldDescriptor(
+      name='shuffle_channel_param', full_name='mo_caffe.LayerParameter.shuffle_channel_param', index=89,
+      number=224, type=11, cpp_type=10, label=1,
+      has_default_value=False, default_value=None,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      options=None),
   ],
   extensions=[
   ],
@@ -2999,7 +3006,7 @@ _LAYERPARAMETER = _descriptor.Descriptor(
   oneofs=[
   ],
   serialized_start=3844,
-  serialized_end=8373,
+  serialized_end=8440,
 )
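The `FieldDescriptor` added in the hunk above wires field number 224 (index 89) of `mo_caffe.LayerParameter` to the new `ShuffleChannelParameter` message, whose single required `uint32 group` field is visible in the serialized blob. A hedged usage sketch of the regenerated bindings follows; the import name `caffe_pb2` is an assumption (the real module path inside the Model Optimizer tree may differ), while the message/field names and the round-trip calls are standard generated-protobuf API:

    # Hypothetical usage; module name assumed, protobuf calls are standard.
    import caffe_pb2

    layer = caffe_pb2.LayerParameter()
    layer.name = 'shuffle1'
    layer.type = 'ShuffleChannel'
    layer.shuffle_channel_param.group = 2  # new optional message field, number 224
    payload = layer.SerializeToString()    # round-trip to confirm the field survives
    assert caffe_pb2.LayerParameter.FromString(payload).shuffle_channel_param.group == 2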
 
 
@@ -3063,8 +3070,8 @@ _INTERPPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=8376,
-  serialized_end=8520,
+  serialized_start=8443,
+  serialized_end=8587,
 )
 
 
@@ -3100,8 +3107,8 @@ _RANDOMSAMPLINGSOFTMAXLOSSPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=8522,
-  serialized_end=8632,
+  serialized_start=8589,
+  serialized_end=8699,
 )
 
 
@@ -3179,8 +3186,8 @@ _PROPOSALPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=8635,
-  serialized_end=8835,
+  serialized_start=8702,
+  serialized_end=8902,
 )
 
 
@@ -3230,8 +3237,8 @@ _NORMALIZEPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=8838,
-  serialized_end=8987,
+  serialized_start=8905,
+  serialized_end=9054,
 )
 
 
@@ -3260,8 +3267,8 @@ _PERMUTEPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=8989,
-  serialized_end=9022,
+  serialized_start=9056,
+  serialized_end=9089,
 )
 
 
@@ -3332,8 +3339,8 @@ _TRANSFORMATIONPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=9025,
-  serialized_end=9207,
+  serialized_start=9092,
+  serialized_end=9274,
 )
 
 
@@ -3398,8 +3405,8 @@ _LOSSPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=9210,
-  serialized_end=9518,
+  serialized_start=9277,
+  serialized_end=9585,
 )
 
 
@@ -3442,8 +3449,8 @@ _ACCURACYPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=9520,
-  serialized_end=9596,
+  serialized_start=9587,
+  serialized_end=9663,
 )
 
 
@@ -3486,8 +3493,8 @@ _ARGMAXPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=9598,
-  serialized_end=9675,
+  serialized_start=9665,
+  serialized_end=9742,
 )
 
 
@@ -3530,8 +3537,8 @@ _CHANNELPERMUTATIONACTION = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=9677,
-  serialized_end=9745,
+  serialized_start=9744,
+  serialized_end=9812,
 )
 
 
@@ -3581,8 +3588,8 @@ _CHANNELPERMUTATIONPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=9748,
-  serialized_end=9902,
+  serialized_start=9815,
+  serialized_end=9969,
 )
 
 
@@ -3618,8 +3625,8 @@ _CONCATPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=9904,
-  serialized_end=9961,
+  serialized_start=9971,
+  serialized_end=10028,
 )
 
 
@@ -3662,8 +3669,8 @@ _BATCHNORMPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=9963,
-  serialized_end=10069,
+  serialized_start=10030,
+  serialized_end=10136,
 )
 
 
@@ -3699,8 +3706,8 @@ _BOXANNOTATOROHEMPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=10071,
-  serialized_end=10145,
+  serialized_start=10138,
+  serialized_end=10212,
 )
 
 
@@ -3743,8 +3750,8 @@ _BIASPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=10147,
-  serialized_end=10243,
+  serialized_start=10214,
+  serialized_end=10310,
 )
 
 
@@ -3780,8 +3787,8 @@ _CONTRASTIVELOSSPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=10245,
-  serialized_end=10321,
+  serialized_start=10312,
+  serialized_end=10388,
 )
 
 
@@ -3930,8 +3937,8 @@ _CONVOLUTIONPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=10324,
-  serialized_end=10841,
+  serialized_start=10391,
+  serialized_end=10908,
 )
 
 
@@ -3974,8 +3981,8 @@ _CROPPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=10843,
-  serialized_end=10908,
+  serialized_start=10910,
+  serialized_end=10975,
 )
 
 
@@ -4011,8 +4018,8 @@ _CTCDECODERPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=10910,
-  serialized_end=10990,
+  serialized_start=10977,
+  serialized_end=11057,
 )
 
 
@@ -4069,8 +4076,8 @@ _CTCLOSSPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=10993,
-  serialized_end=11171,
+  serialized_start=11060,
+  serialized_end=11238,
 )
 
 
@@ -4163,8 +4170,8 @@ _DATAPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=11174,
-  serialized_end=11469,
+  serialized_start=11241,
+  serialized_end=11536,
 )
 
 
@@ -4207,8 +4214,8 @@ _NONMAXIMUMSUPPRESSIONPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=11471,
-  serialized_end=11562,
+  serialized_start=11538,
+  serialized_end=11629,
 )
 
 
@@ -4296,8 +4303,8 @@ _RESIZEPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=11565,
-  serialized_end=12102,
+  serialized_start=11632,
+  serialized_end=12169,
 )
 
 
@@ -4368,8 +4375,8 @@ _SAVEOUTPUTPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=12105,
-  serialized_end=12324,
+  serialized_start=12172,
+  serialized_end=12391,
 )
 
 
@@ -4496,8 +4503,8 @@ _DETECTIONOUTPUTPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=12327,
-  serialized_end=12868,
+  serialized_start=12394,
+  serialized_end=12935,
 )
 
 
@@ -4526,8 +4533,8 @@ _DROPOUTPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=12870,
-  serialized_end=12916,
+  serialized_start=12937,
+  serialized_end=12983,
 )
 
 
@@ -4591,8 +4598,8 @@ _DUMMYDATAPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=12919,
-  serialized_end=13085,
+  serialized_start=12986,
+  serialized_end=13152,
 )
 
 
@@ -4636,8 +4643,8 @@ _ELTWISEPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=13088,
-  serialized_end=13256,
+  serialized_start=13155,
+  serialized_end=13323,
 )
 
 
@@ -4666,8 +4673,8 @@ _ELUPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=13258,
-  serialized_end=13290,
+  serialized_start=13325,
+  serialized_end=13357,
 )
 
 
@@ -4724,8 +4731,8 @@ _EMBEDPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=13293,
-  serialized_end=13471,
+  serialized_start=13360,
+  serialized_end=13538,
 )
 
 
@@ -4768,8 +4775,8 @@ _EXPPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=13473,
-  serialized_end=13541,
+  serialized_start=13540,
+  serialized_end=13608,
 )
 
 
@@ -4805,8 +4812,8 @@ _FLATTENPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=13543,
-  serialized_end=13600,
+  serialized_start=13610,
+  serialized_end=13667,
 )
 
 
@@ -4849,8 +4856,8 @@ _HDF5DATAPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=13602,
-  serialized_end=13681,
+  serialized_start=13669,
+  serialized_end=13748,
 )
 
 
@@ -4879,8 +4886,8 @@ _HDF5OUTPUTPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=13683,
-  serialized_end=13723,
+  serialized_start=13750,
+  serialized_end=13790,
 )
 
 
@@ -4910,8 +4917,8 @@ _HINGELOSSPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=13725,
-  serialized_end=13822,
+  serialized_start=13792,
+  serialized_end=13889,
 )
 
 
@@ -5017,8 +5024,8 @@ _IMAGEDATAPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=13825,
-  serialized_end=14104,
+  serialized_start=13892,
+  serialized_end=14171,
 )
 
 
@@ -5047,8 +5054,8 @@ _INFOGAINLOSSPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=14106,
-  serialized_end=14145,
+  serialized_start=14173,
+  serialized_end=14212,
 )
 
 
@@ -5112,8 +5119,8 @@ _INNERPRODUCTPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=14148,
-  serialized_end=14357,
+  serialized_start=14215,
+  serialized_end=14424,
 )
 
 
@@ -5142,8 +5149,8 @@ _INPUTPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=14359,
-  serialized_end=14411,
+  serialized_start=14426,
+  serialized_end=14478,
 )
 
 
@@ -5186,8 +5193,8 @@ _LOGPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=14413,
-  serialized_end=14481,
+  serialized_start=14480,
+  serialized_end=14548,
 )
 
 
@@ -5253,8 +5260,8 @@ _LRNPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=14484,
-  serialized_end=14802,
+  serialized_start=14551,
+  serialized_end=14869,
 )
 
 
@@ -5283,8 +5290,8 @@ _GRNPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=14804,
-  serialized_end=14835,
+  serialized_start=14871,
+  serialized_end=14902,
 )
 
 
@@ -5334,8 +5341,8 @@ _MEMORYDATAPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=14837,
-  serialized_end=14927,
+  serialized_start=14904,
+  serialized_end=14994,
 )
 
 
@@ -5378,8 +5385,8 @@ _MVNPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=14929,
-  serialized_end=15029,
+  serialized_start=14996,
+  serialized_end=15096,
 )
 
 
@@ -5408,8 +5415,8 @@ _PARAMETERPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=15031,
-  serialized_end=15087,
+  serialized_start=15098,
+  serialized_end=15154,
 )
 
 
@@ -5524,8 +5531,8 @@ _POOLINGPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=15090,
-  serialized_end=15539,
+  serialized_start=15157,
+  serialized_end=15606,
 )
 
 
@@ -5568,8 +5575,8 @@ _POWERPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=15541,
-  serialized_end=15611,
+  serialized_start=15608,
+  serialized_end=15678,
 )
 
 
@@ -5697,8 +5704,8 @@ _PRIORBOXPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=15614,
-  serialized_end=15954,
+  serialized_start=15681,
+  serialized_end=16021,
 )
 
 
@@ -5741,8 +5748,8 @@ _PSROIPOOLINGPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=15956,
-  serialized_end=16042,
+  serialized_start=16023,
+  serialized_end=16109,
 )
 
 
@@ -5792,8 +5799,8 @@ _PYTHONPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=16044,
-  serialized_end=16147,
+  serialized_start=16111,
+  serialized_end=16214,
 )
 
 
@@ -5850,8 +5857,8 @@ _RECURRENTPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=16150,
-  serialized_end=16348,
+  serialized_start=16217,
+  serialized_end=16415,
 )
 
 
@@ -5895,8 +5902,8 @@ _REDUCTIONPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=16351,
-  serialized_end=16527,
+  serialized_start=16418,
+  serialized_end=16594,
 )
 
 
@@ -5933,8 +5940,8 @@ _RELUPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=16530,
-  serialized_end=16674,
+  serialized_start=16597,
+  serialized_end=16741,
 )
 
 
@@ -5963,8 +5970,8 @@ _RELU6PARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=16676,
-  serialized_end=16706,
+  serialized_start=16743,
+  serialized_end=16773,
 )
 
 
@@ -6007,8 +6014,8 @@ _RESHAPEPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=16708,
-  serialized_end=16801,
+  serialized_start=16775,
+  serialized_end=16868,
 )
 
 
@@ -6037,8 +6044,8 @@ _REVERSEPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=16803,
-  serialized_end=16838,
+  serialized_start=16870,
+  serialized_end=16905,
 )
 
 
@@ -6081,8 +6088,8 @@ _ROIPOOLINGPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=16840,
-  serialized_end=16929,
+  serialized_start=16907,
+  serialized_end=16996,
 )
 
 
@@ -6125,8 +6132,8 @@ _ROIWARPINGTESTPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=16931,
-  serialized_end=17024,
+  serialized_start=16998,
+  serialized_end=17091,
 )
 
 
@@ -6169,8 +6176,8 @@ _ROIWARPINGPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=17026,
-  serialized_end=17115,
+  serialized_start=17093,
+  serialized_end=17182,
 )
 
 
@@ -6227,8 +6234,8 @@ _SCALEPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=17118,
-  serialized_end=17289,
+  serialized_start=17185,
+  serialized_end=17356,
 )
 
 
@@ -6258,8 +6265,8 @@ _SIGMOIDPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=17291,
-  serialized_end=17414,
+  serialized_start=17358,
+  serialized_end=17481,
 )
 
 
@@ -6302,8 +6309,8 @@ _SLICEPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=17416,
-  serialized_end=17492,
+  serialized_start=17483,
+  serialized_end=17559,
 )
 
 
@@ -6332,8 +6339,8 @@ _SMOOTHL1LOSSPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=17494,
-  serialized_end=17535,
+  serialized_start=17561,
+  serialized_end=17602,
 )
 
 
@@ -6370,8 +6377,8 @@ _SOFTMAXPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=17538,
-  serialized_end=17678,
+  serialized_start=17605,
+  serialized_end=17745,
 )
 
 
@@ -6401,8 +6408,8 @@ _TANHPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=17680,
-  serialized_end=17797,
+  serialized_start=17747,
+  serialized_end=17864,
 )
 
 
@@ -6438,8 +6445,8 @@ _TILEPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=17799,
-  serialized_end=17846,
+  serialized_start=17866,
+  serialized_end=17913,
 )
 
 
@@ -6468,8 +6475,8 @@ _THRESHOLDPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=17848,
-  serialized_end=17890,
+  serialized_start=17915,
+  serialized_end=17957,
 )
 
 
@@ -6582,8 +6589,8 @@ _WINDOWDATAPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=17893,
-  serialized_end=18214,
+  serialized_start=17960,
+  serialized_end=18281,
 )
 
 
@@ -6628,8 +6635,8 @@ _SPPPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=18217,
-  serialized_end=18458,
+  serialized_start=18284,
+  serialized_end=18525,
 )
 
 
@@ -6954,8 +6961,8 @@ _V1LAYERPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=18461,
-  serialized_end=21097,
+  serialized_start=18528,
+  serialized_end=21164,
 )
 
 
@@ -7244,8 +7251,8 @@ _V0LAYERPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=21100,
-  serialized_end=22136,
+  serialized_start=21167,
+  serialized_end=22203,
 )
 
 
@@ -7281,8 +7288,8 @@ _PRELUPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=22138,
-  serialized_end=22228,
+  serialized_start=22205,
+  serialized_end=22295,
 )
 
 
@@ -7346,8 +7353,8 @@ _REGIONYOLOPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=22231,
-  serialized_end=22365,
+  serialized_start=22298,
+  serialized_end=22432,
 )
 
 
@@ -7376,8 +7383,8 @@ _REORGYOLOPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=22367,
-  serialized_end=22406,
+  serialized_start=22434,
+  serialized_end=22473,
 )
 
 
@@ -7455,8 +7462,8 @@ _RANDOMGENERATORPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=22409,
-  serialized_end=22616,
+  serialized_start=22476,
+  serialized_end=22683,
 )
 
 
@@ -7499,8 +7506,8 @@ _COEFFSCHEDULEPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=22618,
-  serialized_end=22714,
+  serialized_start=22685,
+  serialized_end=22781,
 )
 
 
@@ -7816,8 +7823,8 @@ _AUGMENTATIONCOEFF = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=22717,
-  serialized_end=23707,
+  serialized_start=22784,
+  serialized_end=23774,
 )
 
 
@@ -8161,8 +8168,8 @@ _AUGMENTATIONPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=23710,
-  serialized_end=25834,
+  serialized_start=23777,
+  serialized_end=25901,
 )
 
 
@@ -8192,8 +8199,8 @@ _FLOWWARPPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=25837,
-  serialized_end=25970,
+  serialized_start=25904,
+  serialized_end=26037,
 )
 
 
@@ -8272,8 +8279,8 @@ _CORRELATIONPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=25973,
-  serialized_end=26283,
+  serialized_start=26040,
+  serialized_end=26350,
 )
 
 
@@ -8331,8 +8338,8 @@ _RESAMPLEPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=26286,
-  serialized_end=26506,
+  serialized_start=26353,
+  serialized_end=26573,
 )
 
 
@@ -8382,8 +8389,38 @@ _ACCUMPARAMETER = _descriptor.Descriptor(
   extension_ranges=[],
   oneofs=[
   ],
-  serialized_start=26508,
-  serialized_end=26630,
+  serialized_start=26575,
+  serialized_end=26697,
+)
+
+
+_SHUFFLECHANNELPARAMETER = _descriptor.Descriptor(
+  name='ShuffleChannelParameter',
+  full_name='mo_caffe.ShuffleChannelParameter',
+  filename=None,
+  file=DESCRIPTOR,
+  containing_type=None,
+  fields=[
+    _descriptor.FieldDescriptor(
+      name='group', full_name='mo_caffe.ShuffleChannelParameter.group', index=0,
+      number=1, type=13, cpp_type=3, label=2,
+      has_default_value=False, default_value=0,
+      message_type=None, enum_type=None, containing_type=None,
+      is_extension=False, extension_scope=None,
+      options=None),
+  ],
+  extensions=[
+  ],
+  nested_types=[],
+  enum_types=[
+  ],
+  options=None,
+  is_extendable=False,
+  extension_ranges=[],
+  oneofs=[
+  ],
+  serialized_start=26699,
+  serialized_end=26739,
 )
 
 _BLOBPROTO.fields_by_name['shape'].message_type = _BLOBSHAPE
@@ -8494,6 +8531,7 @@ _LAYERPARAMETER.fields_by_name['resample_param'].message_type = _RESAMPLEPARAMET
 _LAYERPARAMETER.fields_by_name['flow_warp_param'].message_type = _FLOWWARPPARAMETER
 _LAYERPARAMETER.fields_by_name['accum_param'].message_type = _ACCUMPARAMETER
 _LAYERPARAMETER.fields_by_name['coeff_schedule_param'].message_type = _COEFFSCHEDULEPARAMETER
+_LAYERPARAMETER.fields_by_name['shuffle_channel_param'].message_type = _SHUFFLECHANNELPARAMETER
 _NORMALIZEPARAMETER.fields_by_name['scale_filler'].message_type = _FILLERPARAMETER
 _LOSSPARAMETER.fields_by_name['normalization'].enum_type = _LOSSPARAMETER_NORMALIZATIONMODE
 _LOSSPARAMETER_NORMALIZATIONMODE.containing_type = _LOSSPARAMETER
@@ -8740,6 +8778,7 @@ DESCRIPTOR.message_types_by_name['FlowWarpParameter'] = _FLOWWARPPARAMETER
 DESCRIPTOR.message_types_by_name['CorrelationParameter'] = _CORRELATIONPARAMETER
 DESCRIPTOR.message_types_by_name['ResampleParameter'] = _RESAMPLEPARAMETER
 DESCRIPTOR.message_types_by_name['AccumParameter'] = _ACCUMPARAMETER
+DESCRIPTOR.message_types_by_name['ShuffleChannelParameter'] = _SHUFFLECHANNELPARAMETER
 DESCRIPTOR.enum_types_by_name['Phase'] = _PHASE
 
 BlobShape = _reflection.GeneratedProtocolMessageType('BlobShape', (_message.Message,), dict(
@@ -9456,6 +9495,13 @@ AccumParameter = _reflection.GeneratedProtocolMessageType('AccumParameter', (_me
   ))
 _sym_db.RegisterMessage(AccumParameter)
 
+ShuffleChannelParameter = _reflection.GeneratedProtocolMessageType('ShuffleChannelParameter', (_message.Message,), dict(
+  DESCRIPTOR = _SHUFFLECHANNELPARAMETER,
+  __module__ = 'mo_caffe_pb2'
+  # @@protoc_insertion_point(class_scope:mo_caffe.ShuffleChannelParameter)
+  ))
+_sym_db.RegisterMessage(ShuffleChannelParameter)
+
 
 _BLOBSHAPE.fields_by_name['dim'].has_options = True
 _BLOBSHAPE.fields_by_name['dim']._options = _descriptor._ParseOptions(descriptor_pb2.FieldOptions(), _b('\020\001'))
index 941f65c..2a86db4 100644 (file)
@@ -22,12 +22,8 @@ import sys
 
 
 def shell(cmd, env=None, cwd=None):
-    kwargs = dict(cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    if sys.platform.startswith('linux') or sys.platform == 'darwin':
-        cmd = ['/bin/bash', '-c', "".join(cmd)]
-    else:
-        kwargs.update({'shell': True})
-    print('Running: "{}"'.format(''.join(cmd)))
+    kwargs = dict(cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)
+    print('Running: "{}"'.format(' '.join(cmd)))
     p = subprocess.Popen(cmd, **kwargs)
     (stdout, stderr) = p.communicate()
     return p.returncode, stdout, stderr
@@ -42,21 +38,19 @@ def get_cli_parser():
 
 
 def build_proto(proto_file_path, python_path):
-    retcode, out, err = shell('protoc -h')
+    retcode, out, err = shell(['protoc', '-h'])
     if retcode:
         print(err)
         return 1
     if not (os.path.exists(proto_file_path) and os.path.isfile(proto_file_path)):
         print('File {} does not exist'.format(proto_file_path))
         return 1
-    if not os.path.exists(proto_file_path):
-        os.makedirs(python_path)
     proto_path = os.path.split(proto_file_path)[0]
     if not proto_path:
         proto_path = os.getcwd()
 
     proto_file = os.path.split(proto_file_path)[1]
-    command = 'protoc {} --python_out={}'.format(proto_file, python_path)
+    command = ['protoc', proto_file, '--python_out={}'.format(python_path)]
 
     retcode, out, err = shell(command, cwd=proto_path)
 
@@ -78,5 +72,8 @@ if __name__ == "__main__":
     argv = get_cli_parser().parse_args()
     proto_file_path = argv.input_proto
     python_path = argv.output
+    if not os.path.exists(python_path):
+        print("Output directory {} does not exist".format(python_path))
+        sys.exit(1)
     status = build_proto(proto_file_path, python_path)
     exit(status)
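
For reference, a minimal sketch of how the reworked helpers compose now that commands are
lists and no shell is involved (file paths here are hypothetical):

    # Each list element reaches protoc verbatim, so arguments with spaces are safe.
    retcode, out, err = shell(['protoc', '--version'])
    if retcode == 0:
        # The output directory must already exist; the script exits early otherwise.
        status = build_proto('mo_caffe.proto', './generated')
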
index 852fccf..82f83a5 100644 (file)
@@ -534,6 +534,9 @@ message LayerParameter {
   optional FlowWarpParameter flow_warp_param = 221;
   optional AccumParameter accum_param = 222;
   optional CoeffScheduleParameter coeff_schedule_param = 223;
+
+  // for Shufflenet v2
+  optional ShuffleChannelParameter shuffle_channel_param = 224;
 }
 
 message InterpParameter {
@@ -2078,3 +2081,7 @@ message AccumParameter {
     optional uint32 size_divisible_by = 3 [default = 0]; // Upscales to the minimal size divisible by the given number
     optional bool have_reference = 4 [ default = false ];
 }
+
+message ShuffleChannelParameter {
+    required uint32 group = 1;
+}
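
For illustration, a small sketch of using the new message after regenerating mo_caffe_pb2.py
with protoc (assuming the generated module is importable):

    import mo_caffe_pb2

    # ShuffleChannelParameter carries a single required field 'group':
    # the number of channel groups to interleave for ShuffleNet v2.
    param = mo_caffe_pb2.ShuffleChannelParameter()
    param.group = 2
    assert param.IsInitialized()  # 'group' is required, so it must be set
    payload = param.SerializeToString()
    print(mo_caffe_pb2.ShuffleChannelParameter.FromString(payload).group)  # 2
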
index 2f368dc..6da7861 100644 (file)
  limitations under the License.
 """
 
+import logging as log
+
 import numpy as np
 
-from mo.graph.graph import Node
 from mo.utils.error import Error
 
 nchw_to_nhwc_permute = np.array([0, 2, 3, 1], dtype=np.int64)
index adbf6ec..99adb1f 100644 (file)
 
 def single_output_infer(node, shape_infer, value_infer=None):
     node.out_node(0).shape = shape_infer(node)
-    if value_infer is not None:
-        node.out_node(0).value = value_infer(node)
 
+    if value_infer is not None and \
+       'value' in node.in_node() and \
+       node.in_node().value is not None:
+        node.out_node(0).value = value_infer(node)
 
 def copy_shape_infer(node):
     """
index bf0803b..50ac4f0 100644 (file)
@@ -50,9 +50,7 @@ def tf_expand_dims_infer(node):
     if input_node.value is not None:
         output_node.value = np.array(np.reshape(input_node.value, output_node.shape))
 
-    node['axis'] = 0
-    node['num_axes'] = -1
     node['dim'] = output_node.shape
 
-    PermuteAttrs.create_permute_attrs(node, attrs=[('axis','output:0'), ('dim','output:0')])
+    PermuteAttrs.create_permute_attrs(node, attrs=[('dim', 'output:0')])
 
diff --git a/model-optimizer/mo/front/common/partial_infer/flatten.py b/model-optimizer/mo/front/common/partial_infer/flatten.py
deleted file mode 100644 (file)
index 223f53d..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-"""
- Copyright (c) 2018 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-
-import logging as log
-
-import numpy as np
-
-from mo.front.caffe.extractors.utils import get_canonical_axis_index
-
-
-def flatten_infer(node):
-    """
-    Infers shape of flatten node as it is done in Caffe.
-    Output shape: [Batch is the same, Production of other dims]
-    Args:
-        node: graph flatten node
-
-    """
-    input_shape = node.in_node(0).shape
-    if input_shape is None:
-        return
-
-    # TODO: Should check that input_shape[1:] part doesn't contain -1 elements
-    axis = get_canonical_axis_index(input_shape, node.axis)
-    end_axis = node.end_axis if node.has('end_axis') else -1
-    end_axis = get_canonical_axis_index(input_shape, end_axis)
-    prod_axes = np.prod(input_shape[axis: end_axis + 1])
-    node.out_node(0).shape = np.array([*input_shape[0: axis], prod_axes, *input_shape[end_axis + 1:]], dtype=np.int64)
-    log.debug('input_shape: {}, output_shape: {}'.format(input_shape, node.out_node().shape))
-
index b1d7189..765363b 100644 (file)
@@ -52,20 +52,13 @@ def onnx_matmul_infer(node):
     if len(node.in_nodes()) != 2:
         raise Error("Wrong number of input nodes for {} node. Should be 2 instead of {}".format(node.name,
                                                                                                 len(node.in_nodes())))
+    input_0_shape = node.in_node(0).shape
+    input_1_shape = node.in_node(1).shape
 
-    input_node = node.in_node(0)
-    weights_node = node.in_node(1)
+    input_shapes = [node.in_node(port).shape for port in node.in_nodes()]
+    max_len = max([len(shape) for shape in input_shapes])
+    new_input_shapes = [np.concatenate([np.ones(max_len - len(input_shapes[i])), input_shapes[i]], axis=0)
+                        for i in range(len(input_shapes))]
 
-    input_shape = input_node.shape
-    weights_shape = weights_node.shape
-
-    if len(weights_shape) > 2:
-        raise Error("MatMul {} with weights shape != 2 is not supported".format(node.name))
-
-    mark_input_bins(node)
-    assign_dims_to_weights(weights_node, None, 0, 1, 2)
-    PermuteAttrs.set_permutation(weights_node, node, PermuteAttrs.Permutation(perm=int64_array([1, 0]),
-                                                                              inv=int64_array([0, 1])))
-
-    node['out-size'] = weights_shape[1]
-    node.out_node().shape = np.array([*input_shape[0:-1], weights_shape[1]])
+    node.out_node().shape = np.concatenate([np.maximum(*[shape[0:-2] for shape in new_input_shapes]),
+                                            [input_0_shape[-2], input_1_shape[-1]]], axis=0)
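
The new broadcasting rule in isolation, with made-up shapes: batch dimensions are
rank-aligned with leading ones and broadcast, while the two innermost dimensions come from
the operands.

    import numpy as np

    a_shape, b_shape = np.array([3, 1, 4, 5]), np.array([2, 5, 6])
    max_len = max(len(a_shape), len(b_shape))
    # Left-pad the shorter shape with ones, mirroring new_input_shapes above.
    padded = [np.concatenate([np.ones(max_len - len(s)), s]) for s in (a_shape, b_shape)]
    out = np.concatenate([np.maximum(padded[0][:-2], padded[1][:-2]),
                          [a_shape[-2], b_shape[-1]]])
    print(out)  # [3. 2. 4. 6.]
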
index f776a12..157402c 100644 (file)
@@ -18,16 +18,44 @@ import logging as log
 
 import numpy as np
 
+from mo.front.common.partial_infer.utils import int64_array
+from mo.ops.op import PermuteAttrs
 from mo.utils.error import Error
 
 
 def tf_matmul_infer(node):
     assert (len(node.in_nodes()) == 2)
-    shapes = [node.in_node(i).shape for i in range(2)]
+
+    shapes = [node.in_node(i).shape.copy() for i in range(2)]
     log.debug('matmul shapes: {}'.format(shapes))
-    if node.transpose_a or node.transpose_b or any(s is None or len(s) < 2 for s in shapes):
+    if any(s is None or len(s) < 2 for s in shapes):
         log.error("MatMul wasn't able to infer shape")
         return
+
+    if node.transpose_a:
+        if not node.in_node(0).has_valid('value'):
+            log.error("MatMul wasn't able to infer shape")
+            return
+        else:
+            perm = np.array(range(len(node.in_node(0).shape)), dtype=np.int64)
+            perm[-1], perm[-2] = perm[-2], perm[-1]
+            inv = PermuteAttrs.get_inverse_permutation(perm)
+            permutation = PermuteAttrs.Permutation(perm=perm, inv=int64_array(inv))
+            PermuteAttrs.set_permutation(node.in_node(0), node, permutation)
+            shapes[0] = shapes[0][perm]
+
+    if node.transpose_b:
+        if not node.in_node(1).has_valid('value'):
+            log.error("MatMul wasn't able to infer shape")
+            return
+        else:
+            perm = np.array(range(len(node.in_node(1).shape)), dtype=np.int64)
+            perm[-1], perm[-2] = perm[-2], perm[-1]
+            inv = PermuteAttrs.get_inverse_permutation(perm)
+            permutation = PermuteAttrs.Permutation(perm=perm, inv=int64_array(inv))
+            PermuteAttrs.set_permutation(node.in_node(1), node, permutation)
+            shapes[1] = shapes[1][perm]
+
     if any(shapes[0][:-2] != shapes[1][:-2]) or shapes[0][-1] != shapes[1][-2]:
         log.error("MatMul wasn't able to infer shape because input dimensions are not compatible")
         return
@@ -49,7 +77,6 @@ def tf_matmul_infer(node):
     log.debug('matmul shape: {}'.format(node.out_node().shape))
 
 
-
 def onnx_gemm_infer(node):
     assert (len(node.in_nodes()) == 3)
     shapeA = node.in_node(0).shape
@@ -60,16 +87,15 @@ def onnx_gemm_infer(node):
 
     if shapeA.size > 2 and node.transpose_a:
         raise Error(
-            'ONNX Gemm operation do not support {}dimensional input with set transA key'.format(shapeA.size))
+            'ONNX Gemm operation does not support {}-dimensional input with the transA key set'.format(shapeA.size))
 
     # apply transposes and broadcasts
     if node.transpose_a:
-        shapeA = shapeA[[1,0]]
+        shapeA = shapeA[[1, 0]]
     if node.transpose_b:
-        shapeB = shapeB[[1,0]]
+        shapeB = shapeB[[1, 0]]
     if node.broadcast_c and shapeC.size == 1:
         shapeC = np.array([shapeA[0], shapeC[0]])
 
     node.out_node().shape = shapeC
     return
-
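
The permutation built for transpose_a/transpose_b above simply swaps the two innermost
axes; a standalone sketch with an illustrative rank-3 shape:

    import numpy as np

    shape = np.array([7, 5, 3], dtype=np.int64)
    perm = np.arange(len(shape), dtype=np.int64)
    perm[-1], perm[-2] = perm[-2], perm[-1]  # swap the last two axes
    print(perm, shape[perm])  # [0 2 1] [7 3 5]
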
index 74fdd40..627badc 100644 (file)
@@ -26,6 +26,9 @@ def tf_reduce_infer(node, op=None):
     if input_shape is None or axis is None or input_shape.ndim != 1 or axis.ndim > 1:
         return
     output_shape = np.array(input_shape)
+    if len(axis.shape) == 0:  # fix since np.delete deprecates negative indices
+        axis = axis.reshape([1])
+    axis[axis < 0] += output_shape.shape[0]
     if node.keep_dims:
         output_shape[axis] = 1
     else:
@@ -34,4 +37,4 @@ def tf_reduce_infer(node, op=None):
     if op is not None and node.in_node(0).value is not None:
         node.out_node(0).value = np.array([op(node.in_node(0).value, (*axis,))],
                                           dtype=node.in_node(0).value.dtype)  # TODO extend to multi-dimensional axis
-        log.debug("value: {}".format(node.out_node(0).value))
+        log.debug("value: {}".format(node.out_node(0).value))
\ No newline at end of file
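
The scalar-axis fix shown standalone: a 0-d axis is promoted to 1-d and negative axes are
wrapped to non-negative indices before np.delete sees them.

    import numpy as np

    axis = np.array(-1)   # 0-d axis as it may come from the model
    if len(axis.shape) == 0:
        axis = axis.reshape([1])
    rank = 4              # illustrative input rank
    axis[axis < 0] += rank
    print(axis)  # [3]
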
index db7cc98..ae61602 100644 (file)
  limitations under the License.
 """
 
-import logging as log
-
-import numpy as np
-
+from mo.front.common.partial_infer.utils import int64_array
 from mo.ops.op import PermuteAttrs
+from mo.utils.error import Error
 
 
 def tf_reshape_shape_infer(node):
@@ -32,12 +30,9 @@ def tf_reshape_shape_infer(node):
     input_shape = node.in_node(0).shape
     reshape_output = node.in_node(1).value if len(node.in_nodes()) > 1 else node.dim
 
-    # In case if Reshape operation was created with two inputs and dim attr wasn't set, we set in automatically
-    if not node.has_valid('dim'):
-        node['dim'] = np.array(reshape_output, dtype=np.int64)
-
     if node.in_node(0).shape is None:
         return None
+
     total = 1
     for index, i in enumerate(input_shape):
         total *= i
@@ -65,11 +60,16 @@ def tf_reshape_shape_infer(node):
         out_shape_total *= i
 
     if total != out_shape_total:
-        log.error(
+        raise Error(
             "Number of elements in input {} and output {} of reshape node {} mismatch".format(input_shape, output_shape,
                                                                                               node.name))
-        return None
 
     PermuteAttrs.create_permute_attrs(node, attrs=[('dim', 'output:0')])
 
-    return np.array(output_shape, dtype=np.int64)
+    output_shape = int64_array(output_shape)
+
+    # In case if Reshape operation was created with two inputs and dim attr wasn't set, we set in automatically
+    if not node.has_valid('dim'):
+        node['dim'] = output_shape
+
+    return output_shape
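
A sketch of the invariant the infer now enforces by raising an Error instead of logging:
the element count must be preserved, with a single -1 entry resolved from the remaining
dimensions (shapes are illustrative).

    import numpy as np

    input_shape, dim = np.array([1, 3, 4, 5]), np.array([1, -1, 5])
    total = int(np.prod(input_shape))
    known = int(np.prod(dim[dim != -1]))
    output_shape = dim.copy()
    output_shape[dim == -1] = total // known
    assert total == int(np.prod(output_shape))  # a mismatch would raise in the infer
    print(output_shape)  # [ 1 12  5]
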
index cac47d4..bf23763 100644 (file)
@@ -71,7 +71,6 @@ def tf_strided_slice_infer(node):
         new_axis_mask.append(False)
 
     value = node.in_node(0).value if node.in_node(0).value is not None else np.zeros(shape)
-
     # fix for the warning: "FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated use
     # `arr[tuple(seq)]` instead of `arr[seq]`"
     value = value[tuple(slice_idx)]
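
The FutureWarning fix in isolation: indexing with a list of slices is deprecated in NumPy,
while a tuple keeps the same semantics.

    import numpy as np

    value = np.arange(24).reshape(2, 3, 4)
    slice_idx = [slice(0, 1), slice(None), slice(1, 3)]
    print(value[tuple(slice_idx)].shape)  # (1, 3, 2), no FutureWarning
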
index 24bf678..ff8abb8 100644 (file)
@@ -19,7 +19,7 @@ import logging as log
 import numpy as np
 
 from mo.ops.op import PermuteAttrs
-from mo.utils.error import Error
+from mo.graph.graph import Node
 
 
 def part_sizes_to_indices(part_sizes: list):
@@ -39,13 +39,13 @@ def part_sizes_to_indices(part_sizes: list):
     return np.array(indices)
 
 
-def split(input, node, outputs, axis, part_sizes):
+def split(input_data_node: Node, node: Node, axis: int, part_sizes: list):
     """
     Partial inference of generic split node.
 
     Args:
-        @input: input tensor node, subject to split
+        @input_data_node: input tensor node, subject to split
-        @outputs: output tensor nodes where we put inferred output shapes
+        @node: node of one of the Split types
         @axis: split dimension index
         @part_sizes: a NumPy array with sizes of all pieces that we split to
 
@@ -54,18 +54,14 @@ def split(input, node, outputs, axis, part_sizes):
 
     """
 
-    if input.shape is None:
-        return
-
-    if len(outputs) != len(part_sizes):
-        log.error('Number of outputs do not match the number of parts with sizes.')
+    if input_data_node.shape is None:
         return
 
     # normalize axis
     if axis < 0:
-        axis = input.shape.size + axis
+        axis = input_data_node.shape.size + axis
 
-    if axis < 0 or axis >= input.shape.size:
+    if axis < 0 or axis >= input_data_node.shape.size:
         log.error('Model is incorrect: axis for split node is out of range')
         return
 
@@ -77,64 +73,67 @@ def split(input, node, outputs, axis, part_sizes):
     if undef_indices.size == 1:
         undef_index = undef_indices[0]
         part_sizes[undef_index] = 0
-        deduced_dim = input.shape[axis] - np.add.reduce(part_sizes)
+        deduced_dim = input_data_node.shape[axis] - np.add.reduce(part_sizes)
         if deduced_dim < 0:
-            log.error(
-                'Just deduced dimension for the split has negative value that means that split input shape and desired parts are not compatible')
+            log.error('Just deduced dimension for the split has negative value that means that split input shape and '
+                      'desired parts are not compatible')
             return
 
     all_parts_size = np.add.reduce(part_sizes)
-    if all_parts_size != input.shape[axis]:
-        log.error("input.shape[{}] = {}  !=  {} = sum of all parts in part_sizes".format(axis, input.shape[axis],
+    if all_parts_size != input_data_node.shape[axis]:
+        log.error("input.shape[{}] = {}  !=  {} = sum of all parts in part_sizes".format(axis,
+                                                                                         input_data_node.shape[axis],
                                                                                          all_parts_size))
         return
 
-    for i, part_size in enumerate(part_sizes):
-        shape = input.shape.copy()
-        shape[axis] = part_size
-        outputs[i].shape = shape
+    splitted = None
+    if input_data_node.value is not None:
+        splitted = np.split(input_data_node.value, part_sizes_to_indices(part_sizes), axis)
 
-    if input.value is not None:
-        splitted = np.split(input.value, part_sizes_to_indices(part_sizes), axis)
-        # log.debug("splitted = {}".format(splitted))
-        for i, part in enumerate(splitted):
-            outputs[i].value = part
-            # log.debug('outputs[i].value.shape = {}, outputs[i].shape = {}'.format(outputs[i].value.shape, outputs[i].shape))
-            assert all(outputs[i].value.shape == outputs[i].shape)
+    # not all outputs of the split may be used, so it is necessary to iterate over the output edges and infer
+    # shapes for the connected nodes only
+    for _, dst, edge_attrs in node.graph.out_edges(node.id, data=True):
+        out_port = edge_attrs['out']
+        out_node = node.out_node(out_port)
+
+        new_out_shape = input_data_node.shape.copy()
+        new_out_shape[axis] = part_sizes[out_port]
+        node.out_node(out_port).shape = new_out_shape
+        if splitted is not None:
+            out_node.value = splitted[out_port]
+            assert all(out_node.value.shape == out_node.shape)
 
     assert not node.has_valid('axis') or node.axis == axis
     node.axis = axis
-    # WARNING: != 4 is supposed to work for NHWC to NCHW translation only; if other global permutations happen this will fail
+    # WARNING: != 4 is supposed to work for NHWC to NCHW translation only.
+    # if other global permutations happen this will fail
     # TODO: redesign it to have this logic built in NHWC to NCHW translation pass; it requires
     #       additional attributes with layout to be propagated through the network
-    if len(input.shape) != 4 and node.has_valid('dim_attrs') and 'axis' in node.dim_attrs:
-        log.warning(
-            'Removed "axis" attribute from the scope of the model relayout pass because len(input.shape) == {} != 4 for node {}'.format(
-                len(input.shape),
-                node.name if node.has_valid('name') else '<UNKNOWN>'))
+    if len(input_data_node.shape) != 4 and node.has_valid('dim_attrs') and 'axis' in node.dim_attrs:
+        log.warning('Removed "axis" attribute from the scope of the model relayout pass because len(input.shape) == {} '
+                    '!= 4 for node {}'.format(len(input_data_node.shape), node.soft_get('name')))
         node.dim_attrs.remove('axis')
         assert 'axis' not in node.dim_attrs
+    log.debug('output shapes after split: {}'.format([v.shape for k, v in node.out_nodes().items()]))
 
 
 def tf_split_infer(node):
     """
     Partial infer of split node similar to Split op of TF.
     """
-
-    if len(node.in_nodes()) == 1:
-        return True
-
-    # Two inputs: [split_dim, input)
-    assert (len(node.in_nodes()) == 2)
+    # Two inputs: [split_dim, input]
+    assert len(node.in_nodes()) == 2, 'Node "{}" must have exactly two inputs'.format(node.soft_get('name'))
     split_dim = node.in_node(0).value
     if split_dim is None:
-        log.error('split_dim value for node {} is None. Cannot do shape inference.')
+        log.error('split_dim value for node {} is None. Cannot do shape inference.'.format(node.soft_get('name')))
         return
-    assert split_dim.ndim == 0
+
+    assert split_dim.ndim == 0, 'The split dimension for node "{}" must be a scalar.'.format(node.soft_get('name'))
     split_dim = split_dim.item()
     input = node.in_node(1)
 
-    if split_dim is None or input.shape is None:
+    if input.shape is None:
+        log.error('Input shape for node {} is not defined'.format(node.soft_get('name')))
         return
 
     log.debug('input shape for split: {}, should be split along {} dim'.format(input.shape, split_dim))
@@ -145,42 +144,36 @@ def tf_split_infer(node):
         log.error("split_dim cannot be evenly divided by a given number of parts")
         return
 
-    outputs = node.out_nodes()
     # split_dim is a numpy array, axis is split_dim[0]
-    log.debug(
-        'split_dim_size = {}, node.num_split = {}, div = {}, typeof div = {}'.format(split_dim_size, node.num_split,
-                                                                                     split_dim_size / node.num_split,
-                                                                                     type(
-                                                                                         split_dim_size / node.num_split)))
-    split(input, node, [outputs[i] for i in range(len(outputs))], split_dim,
-          [int(split_dim_size / node.num_split)] * node.num_split)
-    log.debug('output shapes after split: {}'.format([v.shape for k, v in outputs.items()]))
+    log.debug('split_dim_size = {}, node.num_split = {}, div = {}, typeof div = {}'.format(
+        split_dim_size, node.num_split, split_dim_size / node.num_split, type(split_dim_size / node.num_split)))
+    split(input, node, split_dim, [int(split_dim_size / node.num_split)] * node.num_split)
     node.graph.remove_edge(node.in_node(0).id, node.id)
     node['input_port'] = 1
 
     PermuteAttrs.create_permute_attrs(node, attrs=[('axis', 'input:1')])
 
 
-def tf_split_v_infer(node):
+def tf_split_v_infer(node: Node):
     """
     Partial infer of split node similar to SplitV op of TF.
     """
 
     if len(node.in_nodes()) == 1 and not (node.has_valid('axis') and node.has_valid('size_splits')):
-        return True
+        return
 
     if len(node.in_nodes()) == 3 and (node.has_valid('axis') or node.has_valid('size_splits')):
-        return True
+        return
 
-    # Three inputs: [input, size_splits, split_dim)
+    # Three inputs: [input, size_splits, split_dim]
-    if len(node.in_nodes())==3 :
+    if len(node.in_nodes()) == 3:
         split_dim = node.in_node(2).value
         assert split_dim.ndim == 0
         split_dim = split_dim.item()
         size_splits = node.in_node(1).value
         node.graph.remove_edge(node.in_node(1).id, node.id)
         node.graph.remove_edge(node.in_node(2).id, node.id)
-    else :
+    else:
         split_dim = node.axis
         size_splits = node.size_splits
    
@@ -189,21 +182,19 @@ def tf_split_v_infer(node):
         return
     
     input = node.in_node(0)
-    
-    log.debug(
-        'split_dim = {}, input.shape = {}, size_splits.value = {}'.format(split_dim, input.shape, size_splits))
-
-    if split_dim is None or input.shape is None or size_splits is None:
+    if input.shape is None or size_splits is None:
+        log.error('Input shape or size of splits is not defined for node {}'.format(node.soft_get('name')))
         return
 
-    outputs = node.out_nodes()
+    log.debug('split_dim = {}, input.shape = {}, size_splits.value = {}'.format(split_dim, input.shape, size_splits))
+
     # split_dim is a numpy array, axis is split_dim
-    split(input, node, [outputs[i] for i in range(len(outputs))], split_dim, size_splits)
-    log.debug('output shapes after split: {}'.format([v.shape for k, v in outputs.items()]))
-    
-    PermuteAttrs.create_permute_attrs(node, attrs=[('axis','input:0')])
+    split(input, node, split_dim, size_splits)
+
+    PermuteAttrs.create_permute_attrs(node, attrs=[('axis', 'input:0')])
 
-def tf_unpack_infer(node):
+
+def tf_unpack_infer(node: Node):
     if len(node.in_nodes()) != 1:
         log.debug('Unpack node "{}" must have one input.'.format(node.name))
         return
@@ -229,9 +220,5 @@ def tf_unpack_infer(node):
         log.error("split_dim cannot be evenly divided by a given number of parts")
         return
 
-    outputs = node.out_nodes()
-    split(node.in_node(), node, [outputs[i] for i in range(len(outputs))], split_dim,
-          [int(split_dim_size / node.num_split)] * node.num_split)
-
+    split(node.in_node(), node, split_dim, [int(split_dim_size / node.num_split)] * node.num_split)
     # node shapes will be squeezed in the separate pass
-    log.debug('output shapes after split: {}'.format([v.shape for k, v in outputs.items()]))
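
The core of the split inference, sketched with plain NumPy: part sizes become cut indices
(as in part_sizes_to_indices) and np.split produces the per-port values. Sizes here are
made up.

    import numpy as np

    part_sizes = [2, 3, 5]
    indices = np.cumsum(part_sizes[:-1])  # [2 5]
    pieces = np.split(np.zeros([10, 4]), indices, axis=0)
    print([p.shape for p in pieces])      # [(2, 4), (3, 4), (5, 4)]
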
index 351a391..574ba85 100644 (file)
  See the License for the specific language governing permissions and
  limitations under the License.
 """
+import logging as log
 
 import numpy as np
-from mo.front.caffe.extractors.utils import get_canonical_axis_index
 
+from mo.front.caffe.extractors.utils import get_canonical_axis_index
+from mo.front.common.layout import get_height_dim, get_width_dim, get_depth_dim
+from mo.front.common.partial_infer.utils import int64_array
 from mo.ops.op import PermuteAttrs
+from mo.utils.error import Error
+
+
+def is_spatial_squeeze(layout: str, input_shape: np.ndarray, squeeze_dims: np.ndarray):
+    """
+    Checks that the squeeze operation removes all spatial dimensions.
+    :param layout: graph layout.
+    :param input_shape: numpy array with input shape.
+    :param squeeze_dims: numpy array with dims to squeeze.
+    :return: result of the check.
+    """
+    if len(input_shape) < 4 or len(input_shape) > 5:
+        return False
+    spatial_dims = [get_height_dim(layout, len(input_shape)), get_width_dim(layout, len(input_shape))]
+    if len(input_shape) == 5:
+        spatial_dims.append(get_depth_dim(layout, len(input_shape)))
+    for dim in spatial_dims:
+        if input_shape[dim] != 1:
+            log.debug('The reshape from "{}" with squeezed dims "{}" is not a spatial squeeze'.format(input_shape,
+                                                                                                      squeeze_dims))
+            return False
+    if len(squeeze_dims) != len(spatial_dims):
+        log.debug('The reshape from "{}" with squeezed dims "{}" is not a spatial squeeze'.format(input_shape,
+                                                                                                  squeeze_dims))
+        return False
+    log.debug('The reshape from "{}" with squeezed dims "{}" is a spatial squeeze'.format(input_shape,
+                                                                                          squeeze_dims))
+    return True
 
 
 def tf_squeeze_infer(node):
     if node.squeeze_dims is None:
         # TODO: implement; there is no implementation now because no test
         return
+
     real_squeeze_dims = []
-    shape = node.in_node().shape
-    if shape is None:
+    input_shape = node.in_node().shape
+    if input_shape is None:
         return
     # UGLY
-    shape = shape.copy()
+    output_shape = input_shape.copy()
     for n in node.squeeze_dims:
-        if shape[n] == 1:
-            real_squeeze_dims.append(get_canonical_axis_index(shape, n))
-    shape = np.delete(shape, real_squeeze_dims)
-    node.out_node().shape = shape
-    node['dim'] = shape
+        if output_shape[n] == 1:
+            real_squeeze_dims.append(get_canonical_axis_index(output_shape, n))
+        else:
+            raise Error('Trying to squeeze dimension not equal to 1 for node "{}"'.format(node.soft_get('name')))
+
+    output_shape = np.delete(output_shape, real_squeeze_dims)
+    node.out_node().shape = output_shape
+
+    if is_spatial_squeeze(node.graph.graph['layout'], input_shape, int64_array(real_squeeze_dims)):
+        output_shape = int64_array([0, -1])
+    node['dim'] = output_shape
     if node.in_node().value is not None:
-        node.out_node().value = np.array(np.reshape(node.in_node().value, shape))
+        node.out_node().value = np.array(np.reshape(node.in_node().value, output_shape))
 
-    PermuteAttrs.create_permute_attrs(node, attrs =[('dim','output:0')])
\ No newline at end of file
+    PermuteAttrs.create_permute_attrs(node, attrs=[('dim', 'output:0')])
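
What the spatial-squeeze special case detects, sketched for NCHW: both spatial axes of a
4-d shape equal 1 and are squeezed away, so the stored dim attribute becomes [0, -1] for
the IR.

    import numpy as np

    input_shape = np.array([1, 256, 1, 1])  # NCHW with H == W == 1
    squeeze_dims = np.array([2, 3])         # the spatial axes
    output_shape = np.delete(input_shape, squeeze_dims)
    print(output_shape)                     # [  1 256] -> dim = [0, -1]
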
index dbb76a5..0056a0a 100644 (file)
@@ -24,21 +24,17 @@ def int64_array(l: list):
 
 
 def float_array(l: list):
-    return np.array(l, dtype=np.int64)
+    return np.array(l, dtype=np.float64)
 
 
 def mark_input_bins(node, names=('weights', 'biases'), start_port: int = 1):
     """
     Preparing necessary attributes for edges at input ports starting from start_port.
-    It is applicable for convolution and other operations that has constant inputs which
+    It is applicable for convolution and other operations that have constant inputs which
     are intended to be dumped as IE IR bin file.
     """
-    nports = len(node.in_nodes())
-    for i, name in enumerate(names):
-        port = i + start_port
-        if port >= nports:
-            break
-        if node.in_node(port).value is not None:
+    for port, name in enumerate(names, start=start_port):
+        if port in node.in_nodes() and node.in_node(port).has_valid('value'):
             node.in_edge(port)['bin'] = name
 
 
index 89607f8..1172bf3 100644 (file)
@@ -53,7 +53,6 @@ def check_for_duplicates(extractors_collection: dict):
     Check if extractors_collection has case-insensitive duplicates, if it does,
     raise exception with information about duplicates
     """
-    assert extractors_collection, 'Extractors collection can not be empty.'
     # Check if extractors_collection is a normal form, that is it doesn't have case-insensitive duplicates
     duplicates, keys = find_case_insensitive_duplicates(extractors_collection)
     if len(duplicates) > 0:
index bf55518..6ba1ea4 100644 (file)
@@ -15,6 +15,8 @@
 """
 import ast
 import logging as log
+from collections import defaultdict
+from copy import copy
 
 import networkx as nx
 import numpy as np
@@ -27,8 +29,6 @@ from mo.utils.error import Error
 from mo.utils.graph import dfs
 from mo.utils.unsupported_ops import UnsupportedOps
 from mo.utils.utils import refer_to_faq_msg
-from collections import defaultdict
-from copy import copy
 
 
 def restore_edges(graph: nx.DiGraph, get_edges: callable):
@@ -473,6 +473,7 @@ def update_ie_fields(attrs: dict, ir_version = None):
     ir_version_mapping = {
         # Default behaviour is IR V3 attributes
         None: ir_v3_attrs,
+        4: ir_v3_attrs,
         3: ir_v3_attrs,
         2: ir_v2_attrs
     }
@@ -884,15 +885,9 @@ def add_output_ops(graph: nx.MultiDiGraph, user_defined_outputs: dict, inputs: d
     return sinks
 
 
-def set_is_input_true(graph: nx.MultiDiGraph, placeholders: list):
+def set_is_input(graph: nx.MultiDiGraph, placeholders: list, is_input: bool):
     for placeholder in placeholders:
-        graph.node[placeholder]['is_input'] = True
-
-
-def set_is_input_false(graph: nx.MultiDiGraph):
-    for node, data in list(graph.nodes(data=True)):
-        if 'op' in data and data['op'] == 'Placeholder':
-            graph.node[node]['is_input'] = False
+        graph.node[placeholder]['is_input'] = is_input
 
 
 def check_input(graph: nx.MultiDiGraph, node_name: str):
@@ -912,120 +907,240 @@ def split_node_in_port(node_id: str):
             node_name = separator.join(parts[1:])
             try:
                 port = int(parts[0])
-                return (node_name, port)
+                return node_name, port
             except ValueError as err:
                 log.warning('Didn\'t recognize port:node format for "{}" because port is not an integer.'.format(
                     node_id))
-    return (node_id, None)
-
-
-def add_input_op(graph: nx.MultiDiGraph, node_id: str, port: int = 0, data: bool = False, shape=None):
-    # we import it here because Op imports add_attrs_props and update_ie_fields from this file
+    return node_id, None
+
+
+def add_input_op_input_port_without_data(graph: nx.MultiDiGraph, node_id: str, input_op, edge_attrs: dict):
+    input_node = input_op.create_node()
+    graph.add_edge(input_node.id, node_id, **edge_attrs)
+    log.debug('Input: {} for node {}'.format(input_node.id, node_id))
+    log.debug("Add edge from {} to {}".format(input_node.id, node_id))
+    return input_node.id
+
+
+def add_input_op_input_port_with_data(graph: nx.MultiDiGraph, node_id: str, input_op, edge_attrs: dict):
+    input_data_node = input_op.create_node_with_data()
+    input_node = input_data_node.in_node()
+    graph.add_edge(input_data_node.id, node_id, **edge_attrs)
+    update_ie_fields(graph.node[input_node.id])
+    log.debug('Input: {} for node {}'.format(input_node.id, node_id))
+    log.debug("Add edge from {} to {}".format(input_node.id, input_data_node.id))
+    log.debug("Add edge from {} to {}".format(input_data_node.id, node_id))
+    return input_node.id
+
+
+def add_input_op_output_port_without_data(graph: nx.MultiDiGraph, node_id: str, input_op, port: int):
+    input_node = input_op.create_node()
+    # In this case there can be more than one out edge from one port, so we should iterate over all output edges
+    for _, out_node, attrs in graph.out_edges(node_id, data=True):
+        if attrs['out'] == port:
+            # new out port = 0
+            attrs = attrs.copy()
+            attrs['out'] = 0
+            graph.add_edge(input_node.id, out_node, **attrs)
+            log.debug('Input: {} for node {} output port {}'.format(input_node.id, node_id, port))
+            log.debug("Add edge from {} to {}".format(input_node.id, out_node))
+    return input_node.id
+
+
+def add_input_op_output_port_with_data(graph: nx.MultiDiGraph, node_id: str, input_op, port: int):
+    # we assume that an op node is always followed by a data node
+    data_node = Node(graph, node_id).out_node(port)
+    assert data_node.has_valid('kind') and data_node.kind == 'data'
+    input_op.create_node_with_data(data_nodes=data_node)
+    input_node = data_node.in_node()
+    update_ie_fields(graph.node[input_node.id])
+    log.debug('Input: {} for node {}'.format(input_node.id, node_id))
+    log.debug("Add edge from {} to {}".format(input_node.id, node_id))
+    return input_node.id
+
+
+def add_input_op(graph: nx.MultiDiGraph, node_id: str, port: int = 0, data: bool = False, shape=None,
+                 is_out_port: bool = False):
+    """
+    This function adds an Input node to the node with id == node_id at the specified port (input or output,
+    depending on is_out_port).
+    :param graph: graph to operate on.
+    :param node_id: id of the node to which the new input should be added.
+    :param port: index of the port of the node_id node where the input node is added.
+    :param data: flag that defines whether a data node is needed.
+    :param shape: shape for the new input node.
+    :param is_out_port: flag that defines whether the port is an output port.
+    :return: id of the new Input operation.
+    """
+    # We import it here because Op imports add_attrs_props and update_ie_fields from this file
     from mo.ops.input import Input
-    input = Input(graph, dict(shape=shape, initial_node_name=node_id, name='{}/placeholder_port_{}'.format(node_id, port)))
+    port_type = '_out' if is_out_port else ''
+    input_op = Input(graph, dict(shape=shape, initial_node_name=node_id,
+                                 name='{}/placeholder{}_port_{}'.format(node_id, port_type, port)))
     edge_attrs = {'in': port, 'out': 0, 'in_attrs': ['in'], 'out_attrs': ['out'],
                   'fw_tensor_debug_info': [(Node(graph, node_id).soft_get('name'), port)],
                   'data_attrs': ['fw_tensor_debug_info']}
     if not data:
-        input_node = input.create_node()
-        graph.add_edge(input_node.id, node_id, **edge_attrs)
-        log.debug('Input: {} for node {}'.format(input_node.id, node_id))
-        log.debug("Add edge from {} to {}".format(node_id, input_node.id))
-        return input_node.id
+        if is_out_port:
+            new_input_id = add_input_op_output_port_without_data(graph=graph, node_id=node_id, input_op=input_op,
+                                                                 port=port)
+        else:
+            new_input_id = add_input_op_input_port_without_data(graph=graph, node_id=node_id, input_op=input_op,
+                                                                edge_attrs=edge_attrs)
     else:
-        input_data_node = input.create_node_with_data()
-        input = input_data_node.in_node()
-        graph.add_edge(input_data_node.id, node_id, **edge_attrs)
-        update_ie_fields(graph.node[input.id])
-        log.debug('Input: {} for node {}'.format(input.id, node_id))
-        log.debug("Add edge from {} to {}".format(input.id, input_data_node.id))
-        log.debug("Add edge from {} to {}".format(input_data_node.id, node_id))
-        return input.id
+        if is_out_port:
+            new_input_id = add_input_op_output_port_with_data(graph=graph, node_id=node_id, input_op=input_op,
+                                                              port=port)
+        else:
+            new_input_id = add_input_op_input_port_with_data(graph=graph, node_id=node_id, input_op=input_op,
+                                                             edge_attrs=edge_attrs)
+    return new_input_id
+
+
+def add_input_ops_helper_before_infer_input_port(graph: nx.MultiDiGraph, smart_node: Node, port: int, node_id: str,
+                                                 shape: np.array, inputs: list, edges_to_remove: list):
+    n_inputs = len(smart_node.in_nodes())
+    if n_inputs > 1 and port is None:
+        raise Error(
+            'Node {} has more than 1 input and input shapes were provided. Try not to provide input'
+            ' shapes or specify input port with port:node notation, where port is an integer. '
+            '{}'.format(smart_node.soft_get('name'), refer_to_faq_msg(30)))
+    port = port if port is not None else 0
+    edges_to_remove.append((smart_node.in_node(port).id, smart_node.id))
+    inputs.append(add_input_op(graph=graph, node_id=node_id, port=port, data=False,
+                               shape=shape))
+
+
+def add_input_ops_helper_after_infer_input_port(graph: nx.MultiDiGraph, smart_node: Node, port: int, node_id: str,
+                                                inputs: list, edges_to_remove: list):
+    n_inputs = len(smart_node.in_nodes())
+    if n_inputs > 1 and port is not None and port != 0:
+        raise Error(
+            'Input port > 0 in --input is not supported if --input_shape is not provided. Node:'
+            ' "{}". Omit port index and all input ports will be replaced by placeholders. '
+            'Or provide --input_shape. ' + refer_to_faq_msg(31), node_id)
+    port = port if port is not None else 0
+    in_node = smart_node.in_node(port)
+    shape = in_node['shape'] if 'shape' in in_node else None
+    if shape is None:
+        raise Error('Shape for tensor "{}" is not defined. Can not proceed.' + refer_to_faq_msg(41),
+                    in_node.soft_get('name'))
+    inputs.append(add_input_op(graph=graph, node_id=node_id, port=port, data=True,
+                               shape=shape.copy()))
+    edges_to_remove.append((in_node.id, node_id))
+
+
+def add_input_ops_helper_before_infer_output_port(graph: nx.MultiDiGraph, port: int, node_id: str,
+                                                  shape: np.array, inputs: list, edges_to_remove: list):
+    for u, v, edge_attrs in graph.out_edges(node_id, data=True):
+        if edge_attrs['out'] == port:
+            edges_to_remove.append((u, v))  # we need to remove all edges from this port
+    inputs.append(add_input_op(graph=graph, node_id=node_id, port=port, data=False,
+                               shape=shape, is_out_port=True))
+
+
+def add_input_ops_helper_after_infer_output_port(graph: nx.MultiDiGraph, smart_node: Node, port: int, node_id: str,
+                                                 inputs: list, edges_to_remove: list):
+    out_node = smart_node.out_node(port)
+    shape = out_node['shape'] if 'shape' in out_node else None
+    if shape is None:
+        raise Error('Shape for tensor "{}" is not defined. Can not proceed.' + refer_to_faq_msg(41),
+                    out_node.soft_get('name'))
+    inputs.append(add_input_op(graph=graph, node_id=node_id, port=port, data=True,
+                               shape=shape.copy(), is_out_port=True))
+    edges_to_remove.append((node_id, out_node.id))
 
 
 def add_input_ops(graph: nx.MultiDiGraph, user_defined_inputs: dict, before_infer: bool):
+    """
+    This function adds user-defined input operations.
+    For cutting without port:
+    Op_1 -> Op_2 -> output, user_defined_inputs = {'Op_2': {'shape':[1, 2]}} =>
+    Op_1,  New_input (op=Placeholder, shape=[1, 2]) -> Op_2 -> output
+
+    For cutting with input port:
+    Op_1 -> Op_2 -> output, user_defined_inputs = {'Op_2': {'shape':[1, 2], 'in': 0}} =>
+    Op_1,  New_input (op=Placeholder, shape=[1, 2]) -> Op_2 -> output
+
+    For cutting with output port:
+    Op_1 -> Op_2 -> output, user_defined_inputs = {'Op_2': {'shape':[1, 2], 'out': 0}} =>
+    Op_1 -> Op_2, New_input (op=Placeholder, shape=[1, 2]) -> output
+
+    In the case of before_infer=False, data nodes are added to these schemes.
+    """
     inputs = []
-    set_is_input_false(graph)
+    set_is_input(graph, get_nodes_with_attributes(graph, op='Placeholder'), False)
     if user_defined_inputs is None:
         inputs = get_nodes_with_attributes(graph, op='Placeholder')
     else:
         # cutting the net by inputs
         assert isinstance(user_defined_inputs, dict)
-        for key, values in user_defined_inputs.items():
-            for value in values:
-                if 'out' in value:
-                    raise Error(
-                        'Cutting the net by output ports of nodes is forbidden. Can not cut the edge from output port '
-                        '{} of node {}'.format(value['out'], key))
-
         edges_to_remove = []
         for node_id in user_defined_inputs:
             for port_and_shape_info in user_defined_inputs[node_id]:
                 if 'added' in port_and_shape_info and port_and_shape_info['added']:
                     continue
+
+                is_out_port = 'out' in port_and_shape_info  # by default we assume input port or input node without port
                 shape = port_and_shape_info['shape'] if 'shape' in port_and_shape_info else None
-                port = port_and_shape_info['in'] if 'in' in port_and_shape_info else None
                 smart_node = Node(graph, node_id)
-                n_inputs = len(smart_node.in_nodes())
-                # specific Placeholder cases
+
+                # Common port index check
+                if is_out_port:
+                    port = port_and_shape_info['out']  # we check that 'out' in port_and_shape_info earlier
+                    if port is None:
+                        raise Error('Output port for input node {} should be specified, it cannot be None!'.format(
+                            node_id
+                        ))
+                    if port is not None and port not in smart_node.out_nodes():
+                        raise Error('Output port index {} is out of number of available output ports for node "{}". ' +
+                                    refer_to_faq_msg(29), port, node_id)
+                else:
+                    port = port_and_shape_info['in'] if 'in' in port_and_shape_info else None
+                    if port is not None and port not in smart_node.in_nodes():
+                        raise Error('Input port index {} is out of number of available input ports for node "{}". ' +
+                                    refer_to_faq_msg(29), port, node_id)
+
+                # specific Placeholder case
                 if smart_node.op == 'Placeholder':
                     if port is not None:
-                        raise Error('Placeholder node "{}" doesn\'t have input port, but input port {} was provided. ' +
-                                    refer_to_faq_msg(28), node_id, port)
+                        raise Error(
+                            'Placeholder node "{}" doesn\'t have input port, but input port {} was provided. ' +
+                            refer_to_faq_msg(28), node_id, port)
                     if shape is not None:
                         graph.node[node_id]['shape'] = shape
                     inputs.append(node_id)
                     port_and_shape_info['added'] = True
                     continue
-                # common port index check
-                if port is not None and port >= n_inputs:
-                    raise Error('Port index {} is out of number of available input ports for node "{}". ' +
-                                refer_to_faq_msg(29), port, n_inputs)
+
                 if before_infer:
                     if shape is None:
                         continue
-                    # we cut with shapes provided by user and there is no need to wait till infer
-                    if n_inputs > 1 and port is None:
-                        raise Error('Node {} has more than 1 input and input shapes were provided. Try not to provide input'
-                                    ' shapes or specify input port with port:node notation, where port is an integer. '
-                                    '{}'.format(smart_node.soft_get('name'), refer_to_faq_msg(30)))
-                    if port is None:
-                        assert n_inputs == 1
-                        port = 0
-                    edges_to_remove = [(smart_node.in_node(port).id, smart_node.id)]
-                    inputs.append(add_input_op(graph=graph, node_id=node_id, port=port, data=False, shape=shape))
-                    port_and_shape_info['added'] = True
+                    # We cut with shapes provided by user and there is no need to wait till infer
+                    if is_out_port:
+                        add_input_ops_helper_before_infer_output_port(graph, port, node_id, shape, inputs,
+                                                                      edges_to_remove)
+                    else:
+                        add_input_ops_helper_before_infer_input_port(graph, smart_node, port, node_id, shape, inputs,
+                                                                     edges_to_remove)
                 else:
-
-                    # we cut after infer and
-                    if n_inputs > 1 and port is not None and port != 0:
-                        raise Error('Input port > 0 in --input is not supported if --input_shape is not provided. Node:'
-                                    ' "{}". Omit port index and all input ports will be replaced by placeholders. '
-                                    'Or provide --input_shape. ' + refer_to_faq_msg(31), node_id)
-                    for first, second, edge_attrs in list(graph.in_edges(node_id, data=True)):
-                        if graph.node[first]['value'] is not None:
-                            continue
-                        if port is not None and edge_attrs['in'] != port:
-                            continue
-                        shape = graph.node[first]['shape'].copy()
-                        if shape is None:
-                            raise Error('Shape for tensor "{}" is not defined. Can not proceed. ' + refer_to_faq_msg(41),
-                                        first)
-                        port = port if port is not None else edge_attrs['in']
-                        inputs.append(add_input_op(graph=graph, node_id=node_id, port=port, data=True, shape=shape))
-                        port_and_shape_info['added'] = True
-                        edges_to_remove.append((first, second))
-            graph.remove_edges_from(edges_to_remove)
-            edges_to_remove = []
+                    # We cut after infer and we need inferred shapes in nodes
+                    if is_out_port:
+                        add_input_ops_helper_after_infer_output_port(graph, smart_node, port, node_id, inputs,
+                                                                     edges_to_remove)
+                    else:
+                        add_input_ops_helper_after_infer_input_port(graph, smart_node, port, node_id, inputs,
+                                                                    edges_to_remove)
+                port_and_shape_info['added'] = True
+        graph.remove_edges_from(edges_to_remove)
 
     # if len(inputs) == 0, shapes were not provided for all nodes in input-cut request,
     # we didn't cut inputs before infer, so this check is useless and invalid
     if len(inputs):
-        set_is_input_true(graph, inputs)
+        set_is_input(graph, inputs, True)
         # Check if there are inputs that are not listed in user_defined_inputs and are needed to calculate outputs
         outputs = get_nodes_with_attributes(graph, is_output=True)
+        visited = set()
         for output_name in outputs:
-            reverse_dfs(graph, output_name, check_input)
+            reverse_dfs(graph, output_name, check_input, visited)
 
     return inputs
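
For reference, a hypothetical cut request in the format add_input_ops consumes (node names
and shapes are made up):

    import numpy as np

    user_defined_inputs = {
        'conv1': [{'shape': np.array([1, 3, 224, 224]), 'in': 0}],  # cut at input port 0
        'relu5': [{'shape': np.array([1, 64, 56, 56]), 'out': 0}],  # cut at output port 0 (newly supported)
    }
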
 
index 2d4e9e1..f0e3b3b 100644 (file)
@@ -21,39 +21,27 @@ from mo.utils.utils import refer_to_faq_msg
 
 
 def node_pb_arg(pb_extractor):
-    return lambda node: pb_extractor(node.pb)
+    return lambda node: pb_extractor(node.parameters)
 
 
-kaldi_type_extractors = {
-    # Data Layers
-    'globalinput': node_pb_arg(lambda x: dict(op='Placeholder', type='Input',
-                                              infer=lambda node: single_output_infer(node, lambda n: n.shape))),
-
-    # Utility Layers
-    'softmax': node_pb_arg(lambda _: dict(op='SoftMax', type='SoftMax', infer=copy_shape_infer)),
-}
+kaldi_type_extractors = {}
 
 
 def common_kaldi_fields(node: Node) -> dict:
-    pb = node.pb if node.pb else node
-    layer_type = pb.type
+    layer_type = node.op
     return {
         'kind': 'op',
-        'name': pb.name,
-        'type': layer_type,
+        'name': node.id,
         'op': layer_type,
         # generic code relies on op; it should be overridden by specific op extractor
         'infer': None,
-        'precision': 'FP32'  # TODO use real precision derived from the model
+        'precision': 'FP32'
     }
 
 
 def kaldi_extractor(node: Node) -> (bool, dict):
-    if node.has_valid('op') and node.op == 'Identity':
-        return True, {}
     result = common_kaldi_fields(node)
-
-    layer_type = result['type'].lower()
+    layer_type = result['op']
     if layer_type not in kaldi_type_extractors:
         raise Error('Found unsupported layer {}. '.format(node.id) +
                     'Model Optimizer does not support this layer type: {}. '.format(layer_type) +
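
With kaldi_type_extractors emptied, the per-op FrontExtractorOp subclasses added in the files below presumably take over op handling, and unknown op names now fail fast with the Error above. A self-contained sketch of that registration pattern (class and registry names here are illustrative, not mo internals):

registry = {}

class FrontExtractorOpSketch:
    op = None          # lower-cased Kaldi component name
    enabled = False

    @classmethod
    def register(cls):
        # Only enabled extractors take part in op resolution.
        if cls.enabled:
            registry[cls.op] = cls.extract

class SoftmaxSketch(FrontExtractorOpSketch):
    op = 'softmax'
    enabled = True

    @staticmethod
    def extract(node):
        return True

SoftmaxSketch.register()
assert registry['softmax'] is SoftmaxSketch.extract
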
diff --git a/model-optimizer/mo/front/kaldi/extractors/add_shift_ext.py b/model-optimizer/mo/front/kaldi/extractors/add_shift_ext.py
new file mode 100644 (file)
index 0000000..ff5dff9
--- /dev/null
@@ -0,0 +1,38 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import numpy as np
+
+from mo.front.caffe.extractors.utils import embed_input
+from mo.front.extractor import FrontExtractorOp
+from mo.front.kaldi.utils import read_binary_vector, read_learning_info
+from mo.ops.scale_shift import ScaleShiftOp
+
+
+class AddShiftFrontExtractor(FrontExtractorOp):
+    op = 'addshift'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        pb = node.parameters
+        read_learning_info(pb)
+        biases = read_binary_vector(pb)
+        bias_term = True
+        mapping_rule = {'bias_term': bias_term}
+        embed_input(mapping_rule, 1, 'weights', np.ones(biases.shape))
+        embed_input(mapping_rule, 2, 'biases', biases)
+        ScaleShiftOp.update_node_stat(node, mapping_rule)
+        return __class__.enabled
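
embed_input, used throughout these extractors, is assumed to stash a constant blob in the mapping dict and record which input port it feeds so the IR emitter can materialize it later. A rough self-contained approximation (the real helper lives in mo.front.caffe.extractors.utils and may differ in detail):

import numpy as np

def embed_input_sketch(attrs, port, name, value):
    # Keep the blob itself plus (port, name) bookkeeping for the emitter.
    attrs[name] = np.array(value)
    attrs.setdefault('embedded_inputs', []).append((port, name, {'bin': name}))

rule = {'bias_term': True}
biases = np.array([0.5, -0.5])
embed_input_sketch(rule, 1, 'weights', np.ones(biases.shape))
embed_input_sketch(rule, 2, 'biases', biases)
assert [port for port, _, _ in rule['embedded_inputs']] == [1, 2]
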
diff --git a/model-optimizer/mo/front/kaldi/extractors/affine_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/affine_component_ext.py
new file mode 100644 (file)
index 0000000..7900639
--- /dev/null
@@ -0,0 +1,29 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+from mo.front.extractor import FrontExtractorOp
+from mo.front.kaldi.extractors.fixed_affine_component_ext import FixedAffineComponentFrontExtractor
+from mo.front.kaldi.utils import read_learning_info
+from mo.graph.graph import Node
+
+
+class AffineComponentFrontExtractor(FrontExtractorOp):
+    op = 'affinecomponent'
+    enabled = True
+
+    @staticmethod
+    def extract(node: Node):
+        read_learning_info(node.parameters)
+        return FixedAffineComponentFrontExtractor.extract(node)
diff --git a/model-optimizer/mo/front/kaldi/extractors/affine_component_preconditioned_online_ext.py b/model-optimizer/mo/front/kaldi/extractors/affine_component_preconditioned_online_ext.py
new file mode 100644 (file)
index 0000000..70a8c41
--- /dev/null
@@ -0,0 +1,29 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+from mo.front.extractor import FrontExtractorOp
+from mo.front.kaldi.extractors.fixed_affine_component_ext import FixedAffineComponentFrontExtractor
+from mo.front.kaldi.utils import read_learning_info
+from mo.graph.graph import Node
+
+
+class AffineComponentPreconditionedOnlineFrontExtractor(FrontExtractorOp):
+    op = 'affinecomponentpreconditionedonline'
+    enabled = True
+
+    @staticmethod
+    def extract(node: Node):
+        read_learning_info(node.parameters)
+        return FixedAffineComponentFrontExtractor.extract(node)
index c335d92..8175fb1 100644 (file)
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-
-from mo.front.caffe.extractors.utils import weights_biases
+from mo.front.caffe.extractors.utils import embed_input
 from mo.front.extractor import FrontExtractorOp
-from mo.ops.op import Op
+from mo.front.kaldi.utils import read_binary_matrix, read_binary_vector, read_learning_info
+from mo.ops.inner_product import InnerProduct
 
 
 class AffineTransformFrontExtractor(FrontExtractorOp):
@@ -25,11 +25,17 @@ class AffineTransformFrontExtractor(FrontExtractorOp):
 
     @staticmethod
     def extract(node):
+        pb = node.parameters
+        read_learning_info(pb)
+        weights, weights_shape = read_binary_matrix(pb)
+        biases = read_binary_vector(pb)
+
         mapping_rule = {
-            'out-size': node.pb.num_output,
+            'out-size': weights_shape[0],
             'layout': 'NCHW'
         }
-        mapping_rule.update(weights_biases(node.pb.bias_term, node.pb))
+        embed_input(mapping_rule, 1, 'weights', weights)
+        embed_input(mapping_rule, 2, 'biases', biases)
 
-        Op.get_op_class_by_name('FullyConnected').update_node_stat(node, mapping_rule)
+        InnerProduct.update_node_stat(node, mapping_rule)
         return __class__.enabled
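
read_binary_matrix and read_binary_vector replace the Caffe-style protobuf fields here. As a rough illustration of the Kaldi binary layout they are assumed to parse (an 'FM ' float-matrix marker, integer tokens stored as a 1-byte length prefix plus little-endian bytes, then raw float32 data), a toy parser over an in-memory stream:

import io
import struct

import numpy as np

def toy_read_int(f):
    size = f.read(1)[0]                      # 1-byte length prefix
    return int.from_bytes(f.read(size), 'little')

def toy_read_matrix(f):
    assert f.read(3) == b'FM '               # float-matrix marker
    rows, cols = toy_read_int(f), toy_read_int(f)
    data = np.frombuffer(f.read(4 * rows * cols), dtype='<f4')
    return data.reshape(rows, cols)

payload = (b'FM ' + b'\x04' + struct.pack('<i', 2)
           + b'\x04' + struct.pack('<i', 3)
           + np.arange(6, dtype='<f4').tobytes())
print(toy_read_matrix(io.BytesIO(payload)))  # 2x3 matrix of 0..5
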
index b0cd822..9299c7c 100644 (file)
@@ -15,7 +15,7 @@
 """
 
 from mo.front.extractor import FrontExtractorOp
-from mo.ops.op import Op
+from mo.ops.concat import Concat
 
 
 class ConcatFrontExtractor(FrontExtractorOp):
@@ -25,8 +25,7 @@ class ConcatFrontExtractor(FrontExtractorOp):
     @staticmethod
     def extract(node):
         mapping_rule = {
-           'axis': node.pb.axis
+           'axis': 1
         }
-
-        Op.get_op_class_by_name('Concat').update_node_stat(node, mapping_rule)
+        Concat.update_node_stat(node, mapping_rule)
         return __class__.enabled
diff --git a/model-optimizer/mo/front/kaldi/extractors/convolution_ext.py b/model-optimizer/mo/front/kaldi/extractors/convolution_ext.py
deleted file mode 100644 (file)
index 16998d8..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-"""
- Copyright (c) 2018 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-
-import copy
-
-import numpy as np
-
-from mo.front.caffe.extractors.utils import weights_biases
-from mo.front.common.extractors.utils import layout_attrs
-from mo.front.common.partial_infer.utils import int64_array
-from mo.front.extractor import FrontExtractorOp
-from mo.graph.graph import Node
-from mo.ops.convolution import Convolution
-from mo.ops.op import Op
-
-
-class Convolution1DFrontExtractor(FrontExtractorOp):
-    op = 'convolution'
-    enabled = True
-
-    @staticmethod
-    def extract(node: Node) -> bool:
-        params = node.pb
-        mapping_rule = {
-            'output': params.output,
-            'patch_stride': params.patch_stride,
-            'bias_term': None,
-            'pad': int64_array([[0, 0], [0, 0], [0, 0], [0, 0]]),
-            'pad_spatial_shape': int64_array([[0, 0], [0, 0]]),
-            'dilation': int64_array([1, 1, 1, 1]),
-            'kernel': int64_array([1, 1, 1, params.kernel]),
-            'stride': int64_array([1, 1, 1, params.stride]),
-            'kernel_spatial': int64_array([1, params.kernel]),
-            'input_feature_channel': 1,
-            'output_feature_channel': 0,
-            'kernel_spatial_idx': [2,3],
-            'group': 1,
-            'reshape_kernel': True,
-        }
-        mapping_rule.update(layout_attrs())
-        mapping_rule.update(weights_biases(params.bias_term, params))
-        if len(params.blobs) > 1 and len(params.blobs[1]) > 0:
-            mapping_rule['bias_addable'] = True
-        else:
-            mapping_rule['bias_addable'] = False
-
-        Op.get_op_class_by_name('Convolution').update_node_stat(node, mapping_rule)
-        return __class__.enabled
diff --git a/model-optimizer/mo/front/kaldi/extractors/convolutional_1d_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/convolutional_1d_component_ext.py
new file mode 100644 (file)
index 0000000..d77eeb3
--- /dev/null
@@ -0,0 +1,97 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import numpy as np
+
+from mo.front.caffe.extractors.utils import embed_input
+from mo.front.common.extractors.utils import layout_attrs
+from mo.front.extractor import FrontExtractorOp
+from mo.front.kaldi.loader.utils import read_token_value, collect_until_whitespace, find_next_tag
+from mo.front.kaldi.utils import read_learning_info, read_binary_matrix, read_binary_vector
+from mo.graph.graph import Node
+from mo.ops.convolution import Convolution
+from mo.utils.error import Error
+from mo.utils.utils import refer_to_faq_msg
+
+
+class Convolutional1DComponentFrontExtractor(FrontExtractorOp):
+    op = 'convolutional1dcomponent'  # name as it appears in Kaldi
+    enabled = True
+
+    @staticmethod
+    def extract(node: Node) -> bool:
+        """
+        Extract conv parameters from node.parameters.
+        node.parameters like file descriptor object.
+        :param node: Convolution node
+        :return:
+        """
+        pb = node.parameters
+        read_learning_info(pb)
+
+        kernel = read_token_value(pb, b'<PatchDim>')
+        stride = read_token_value(pb, b'<PatchStep>')
+        patch_stride = read_token_value(pb, b'<PatchStride>')
+
+        appended_conv = False
+        token = find_next_tag(pb)
+        if token == '<AppendedConv>':
+            appended_conv = True
+            token = find_next_tag(pb)
+        if token != '<FilterParams>':
+            raise Error('Cannot load token {} from Kaldi model'.format(token) +
+                        refer_to_faq_msg(94))
+        collect_until_whitespace(pb)
+        weights, weights_shape = read_binary_matrix(pb)
+
+        collect_until_whitespace(pb)
+        biases = read_binary_vector(pb)
+
+        if (patch_stride - kernel) % stride != 0:
+            raise Error(
+                'Kernel size and stride do not correspond to the `patch_stride` attribute of the Convolution layer. ' +
+                refer_to_faq_msg(93))
+
+        output = biases.shape[0]
+        if weights_shape[0] != output:
+            raise Error('Weights shape does not correspond to the `output` attribute of Convolution layer. ' +
+                        refer_to_faq_msg(93))
+
+        mapping_rule = {
+            'output': output,
+            'patch_stride': patch_stride,
+            'bias_term': None,
+            'pad': np.array([[0, 0], [0, 0], [0, 0], [0, 0]], dtype=np.int64),
+            'pad_spatial_shape': np.array([[0, 0], [0, 0]], dtype=np.int64),
+            'dilation': np.array([1, 1, 1, 1], dtype=np.int64),
+            'kernel': np.array([1, 1, 1, kernel], dtype=np.int64),
+            'stride': np.array([1, 1, 1, stride], dtype=np.int64),
+            'kernel_spatial': np.array([1, kernel], dtype=np.int64),
+            'input_feature_channel': 1,
+            'output_feature_channel': 0,
+            'kernel_spatial_idx': [2, 3],
+            'group': 1,
+            'reshape_kernel': True,
+            'appended_conv': appended_conv
+        }
+
+        mapping_rule.update(layout_attrs())
+        embed_input(mapping_rule, 1, 'weights', weights)
+        embed_input(mapping_rule, 2, 'biases', biases)
+
+        mapping_rule['bias_addable'] = len(biases) > 0
+
+        Convolution.update_node_stat(node, mapping_rule)
+        return __class__.enabled
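
A quick worked check of the (patch_stride - kernel) % stride validation above, with hypothetical values: patch_stride=10, kernel=4, stride=2 fits (10 - 4) / 2 + 1 = 4 whole patches, so the modulo is zero; kernel=3 with the same stride leaves a remainder and the extractor raises.

def patches_fit(patch_stride, kernel, stride):
    # Mirrors the extractor's consistency check between kernel, step and stride.
    return (patch_stride - kernel) % stride == 0

assert patches_fit(10, 4, 2)       # 4 whole patches
assert not patches_fit(10, 3, 2)   # remainder 1 -> Error in the extractor
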
diff --git a/model-optimizer/mo/front/kaldi/extractors/convolutional_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/convolutional_component_ext.py
new file mode 100644 (file)
index 0000000..21a1e33
--- /dev/null
@@ -0,0 +1,88 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import numpy as np
+
+from mo.front.caffe.extractors.utils import embed_input
+from mo.front.common.extractors.utils import layout_attrs
+from mo.front.extractor import FrontExtractorOp
+from mo.front.kaldi.loader.utils import read_token_value, collect_until_whitespace
+from mo.front.kaldi.utils import read_learning_info, read_binary_matrix, read_binary_vector
+from mo.graph.graph import Node
+from mo.ops.convolution import Convolution
+from mo.utils.error import Error
+from mo.utils.utils import refer_to_faq_msg
+
+
+class ConvolutionalComponentFrontExtractor(FrontExtractorOp):
+    op = 'convolutionalcomponent'  # name as it appears in Kaldi
+    enabled = True
+
+    @staticmethod
+    def extract(node: Node) -> bool:
+        """
+        Extract conv parameters from node.parameters.
+        node.parameters like file descriptor object.
+        :param node: Convolution node
+        :return:
+        """
+        pb = node.parameters
+        kernel = read_token_value(pb, b'<PatchDim>')
+        stride = read_token_value(pb, b'<PatchStep>')
+        patch_stride = read_token_value(pb, b'<PatchStride>')
+
+        read_learning_info(pb)
+
+        collect_until_whitespace(pb)
+        weights, weights_shape = read_binary_matrix(pb)
+
+        collect_until_whitespace(pb)
+        biases = read_binary_vector(pb)
+
+        if (patch_stride - kernel) % stride != 0:
+            raise Error(
+                'Kernel size and stride do not correspond to the `patch_stride` attribute of the Convolution layer. ' +
+                refer_to_faq_msg(93))
+
+        output = biases.shape[0]
+        if weights_shape[0] != output:
+            raise Error('Weights shape does not correspond to the `output` attribute of Convolution layer. ' +
+                        refer_to_faq_msg(93))
+
+        mapping_rule = {
+            'output': output,
+            'patch_stride': patch_stride,
+            'bias_term': None,
+            'pad': np.array([[0, 0], [0, 0], [0, 0], [0, 0]], dtype=np.int64),
+            'pad_spatial_shape': np.array([[0, 0], [0, 0]], dtype=np.int64),
+            'dilation': np.array([1, 1, 1, 1], dtype=np.int64),
+            'kernel': np.array([1, 1, 1, kernel], dtype=np.int64),
+            'stride': np.array([1, 1, 1, stride], dtype=np.int64),
+            'kernel_spatial': np.array([1, kernel], dtype=np.int64),
+            'input_feature_channel': 1,
+            'output_feature_channel': 0,
+            'kernel_spatial_idx': [2, 3],
+            'group': 1,
+            'reshape_kernel': True,
+        }
+
+        mapping_rule.update(layout_attrs())
+        embed_input(mapping_rule, 1, 'weights', weights)
+        embed_input(mapping_rule, 2, 'biases', biases)
+
+        mapping_rule['bias_addable'] = len(biases) > 0
+
+        Convolution.update_node_stat(node, mapping_rule)
+        return __class__.enabled
diff --git a/model-optimizer/mo/front/kaldi/extractors/copy_ext.py b/model-optimizer/mo/front/kaldi/extractors/copy_ext.py
new file mode 100644 (file)
index 0000000..3348ef1
--- /dev/null
@@ -0,0 +1,40 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import numpy as np
+
+from mo.front.caffe.extractors.utils import embed_input
+from mo.front.common.partial_infer.elemental import copy_shape_infer
+from mo.front.extractor import FrontExtractorOp
+from mo.front.kaldi.loader.utils import read_binary_integer32_token, read_blob
+from mo.ops.permute import Permute
+
+
+class CopyFrontExtractor(FrontExtractorOp):
+    op = 'copy'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        pb = node.parameters
+        weights_size = read_binary_integer32_token(pb)
+        weights = read_blob(pb, weights_size, dtype=np.int32) - 1
+        attrs = {
+            'infer': copy_shape_infer
+        }
+        embed_input(attrs, 1, 'indexes', weights)
+        Permute.update_node_stat(node, attrs)
+        return __class__.enabled
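
The `- 1` above converts Kaldi's 1-based Copy indexes to the 0-based `indexes` blob embedded into the Permute. A small numpy sketch of the assumed runtime effect (reordering channels of a frame by those indexes):

import numpy as np

kaldi_indexes = np.array([2, 1, 3], dtype=np.int32)  # 1-based, as stored in the model
indexes = kaldi_indexes - 1                          # 0-based, as embedded above

frame = np.array([[10., 20., 30.]])                  # one frame, three channels
print(frame[:, indexes])                             # [[20. 10. 30.]]
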
diff --git a/model-optimizer/mo/front/kaldi/extractors/fixed_affine_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/fixed_affine_component_ext.py
new file mode 100644 (file)
index 0000000..eee267f
--- /dev/null
@@ -0,0 +1,51 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.caffe.extractors.utils import embed_input
+from mo.front.extractor import FrontExtractorOp
+from mo.front.kaldi.loader.utils import find_next_tag, read_placeholder
+from mo.front.kaldi.utils import read_binary_matrix, read_binary_vector
+from mo.ops.inner_product import InnerProduct
+from mo.utils.error import Error
+
+
+class FixedAffineComponentFrontExtractor(FrontExtractorOp):
+    op = 'fixedaffinecomponent'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        pb = node.parameters
+        tag = find_next_tag(pb)
+        if tag != '<LinearParams>':
+            raise Error('FixedAffineComponent must contain LinearParams')
+        read_placeholder(pb, 1)
+        weights, weights_shape = read_binary_matrix(pb)
+        tag = find_next_tag(pb)
+        if tag != '<BiasParams>':
+            raise Error('FixedAffineComponent must contain BiasParams')
+        read_placeholder(pb, 1)
+        biases = read_binary_vector(pb)
+
+        mapping_rule = {
+            'out-size': weights_shape[0],
+            'layout': 'NCHW'
+        }
+        embed_input(mapping_rule, 1, 'weights', weights)
+        embed_input(mapping_rule, 2, 'biases', biases)
+
+        InnerProduct.update_node_stat(node, mapping_rule)
+        return __class__.enabled
diff --git a/model-optimizer/mo/front/kaldi/extractors/lstm_projected_streams_ext.py b/model-optimizer/mo/front/kaldi/extractors/lstm_projected_streams_ext.py
new file mode 100644 (file)
index 0000000..09e8061
--- /dev/null
@@ -0,0 +1,68 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+from extensions.ops.lstm_cell import LSTMCell
+from mo.front.caffe.extractors.utils import embed_input
+from mo.front.extractor import FrontExtractorOp
+from mo.front.kaldi.loader.utils import collect_until_token, collect_until_whitespace, get_uint32
+from mo.front.kaldi.utils import read_binary_matrix, read_binary_vector
+
+
+class LSTMProjectedStreamsFrontExtractor(FrontExtractorOp):
+    op = 'lstmprojectedstreams'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        clip_value = 50
+        pb = node.parameters
+        res = collect_until_whitespace(pb)
+        if res == b'<CellClip>':
+            clip_value = get_uint32(pb.read(4))
+        collect_until_token(pb, b'FM')
+        gifo_x_weights, gifo_x_weights_shape = read_binary_matrix(pb, False)
+        gifo_r_weights, gifo_r_weights_shape = read_binary_matrix(pb)
+        gifo_biases = read_binary_vector(pb)
+        input_gate_weights = read_binary_vector(pb)
+        forget_gate_weights = read_binary_vector(pb)
+        output_gate_weights = read_binary_vector(pb)
+
+        projection_weights, projection_weights_shape = read_binary_matrix(pb)
+
+        mapping_rule = {'gifo_x_weights_shape': gifo_x_weights_shape,
+                        'gifo_r_weights_shape': gifo_r_weights_shape,
+                        'projection_weights_shape': projection_weights_shape,
+                        'clip_value': clip_value
+                        }
+
+        embed_input(mapping_rule, 1, 'gifo_x_weights', gifo_x_weights)
+        embed_input(mapping_rule, 2, 'gifo_r_weights', gifo_r_weights)
+        embed_input(mapping_rule, 3, 'gifo_biases', gifo_biases)
+        embed_input(mapping_rule, 4, 'input_gate_weights', input_gate_weights)
+        embed_input(mapping_rule, 5, 'forget_gate_weights', forget_gate_weights)
+        embed_input(mapping_rule, 6, 'output_gate_weights', output_gate_weights)
+        embed_input(mapping_rule, 7, 'projection_weights', projection_weights)
+
+        LSTMCell.update_node_stat(node, mapping_rule)
+        return __class__.enabled
+
+
+class LSTMProjectedFrontExtractor(FrontExtractorOp):
+    op = 'lstmprojected'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        return LSTMProjectedStreamsFrontExtractor.extract(node)
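
The gifo_* blobs follow Kaldi's LSTM convention of stacking the four gate parameter blocks (g = cell input, then the i, f, o gates) along the rows, so gifo_x_weights_shape is (4 * hidden, input). A sketch of how a consumer could split them, under that assumption:

import numpy as np

hidden, inp = 4, 3                        # illustrative sizes
gifo_x = np.zeros((4 * hidden, inp))      # shape as read by read_binary_matrix

# Recover the per-gate blocks in Kaldi's g, i, f, o order.
g_w, i_w, f_w, o_w = np.split(gifo_x, 4, axis=0)
assert g_w.shape == (hidden, inp)
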
diff --git a/model-optimizer/mo/front/kaldi/extractors/max_pooling_ext.py b/model-optimizer/mo/front/kaldi/extractors/max_pooling_ext.py
new file mode 100644 (file)
index 0000000..0e38dd3
--- /dev/null
@@ -0,0 +1,60 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import numpy as np
+
+from mo.front.common.extractors.utils import layout_attrs
+from mo.front.extractor import FrontExtractorOp
+from mo.front.kaldi.loader.utils import read_token_value, collect_until_whitespace, collect_until_token, \
+    read_binary_integer32_token, find_next_tag, read_placeholder
+from mo.ops.pooling import Pooling
+from mo.utils.error import Error
+
+
+class MaxPoolingComponentFrontExtractor(FrontExtractorOp):
+    op = 'maxpoolingcomponent'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        pb = node.parameters
+        collect_until_token(pb, b'<PoolSize>')
+        kernel = read_binary_integer32_token(pb)
+        tag = find_next_tag(pb)
+        if tag == '<PoolStep>':
+            read_placeholder(pb, 1)
+            stride = read_binary_integer32_token(pb)
+            pool_step = stride
+            pool_stride = read_token_value(pb, b'<PoolStride>')
+        elif tag == '<PoolStride>':
+            stride = 1
+            pool_step = None
+            read_placeholder(pb, 1)
+            pool_stride = read_binary_integer32_token(pb)
+        else:
+            raise Error('Cannot extract pooling parameters for {}'.format(node))
+
+        mapping_rule = {
+            'window': np.array([1, 1, 1, kernel], dtype=np.int64),
+            'stride': np.array([1, 1, stride, stride], dtype=np.int64),
+            'pool_stride': pool_stride,
+            'pool_step': pool_step,
+            'pad': np.array([[0, 0], [0, 0], [0, 0], [0, 0]], dtype=np.int64),
+            'pad_spatial_shape': np.array([[0, 0], [0, 0]], dtype=np.int64),
+            'pool_method': 'max',
+        }
+        mapping_rule.update(layout_attrs())
+        Pooling.update_node_stat(node, mapping_rule)
+        return __class__.enabled
diff --git a/model-optimizer/mo/front/kaldi/extractors/normalize_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/normalize_component_ext.py
new file mode 100644 (file)
index 0000000..4d1e9e9
--- /dev/null
@@ -0,0 +1,43 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import numpy as np
+
+from mo.front.caffe.extractors.utils import embed_input
+from mo.front.extractor import FrontExtractorOp
+from mo.front.kaldi.loader.utils import read_binary_integer32_token, collect_until_token
+from mo.ops.scale_shift import ScaleShiftOp
+
+
+class NormalizeComponentFrontExtractor(FrontExtractorOp):
+    op = 'normalizecomponent'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        pb = node.parameters
+        collect_until_token(pb, b'<Dim>')
+        dim = read_binary_integer32_token(pb)
+        target_rms = 1
+        d_scaled = dim * target_rms ** 2
+        in_norm = np.zeros([dim], np.float64)
+        in_norm += 1.0 / d_scaled
+        in_norm = np.maximum(in_norm, 2. ** (-66))
+        in_norm = np.power(in_norm, -0.5)
+        attrs = {}
+        embed_input(attrs, 1, 'weights', in_norm)
+        ScaleShiftOp.update_node_stat(node, attrs)
+        return __class__.enabled
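
The arithmetic above reduces to a constant per-channel scale of 1 / sqrt(dim * target_rms ** 2), clamped away from zero by the 2 ** -66 floor; for target_rms = 1 that is simply 1 / sqrt(dim). A quick numerical check:

import numpy as np

dim, target_rms = 16, 1
in_norm = np.power(np.maximum(np.full(dim, 1.0 / (dim * target_rms ** 2)), 2. ** -66), -0.5)
assert np.allclose(in_norm, 1.0 / np.sqrt(dim))   # 0.25 for dim == 16
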
diff --git a/model-optimizer/mo/front/kaldi/extractors/pooling_ext.py b/model-optimizer/mo/front/kaldi/extractors/pooling_ext.py
deleted file mode 100644 (file)
index 44c64a3..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-"""
- Copyright (c) 2018 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-
-from mo.front.common.extractors.utils import layout_attrs
-from mo.front.common.partial_infer.utils import int64_array
-from mo.front.extractor import FrontExtractorOp
-from mo.ops.op import Op
-
-
-class PoolingFrontExtractor(FrontExtractorOp):
-    op = 'pooling'
-    enabled = True
-
-    @staticmethod
-    def extract(node):
-        mapping_rule = {
-            'window': int64_array([1, 1, 1, node.pb.kernel]),
-            'stride': int64_array([1, 1, node.pb.stride, node.pb.stride]),
-            'pool_stride': node.pb.pool_stride,
-            'pad': int64_array([[0, 0], [0, 0], [0, 0], [0, 0]]),
-            'pad_spatial_shape': int64_array([[0, 0], [0, 0]]),
-        }
-        mapping_rule.update(layout_attrs())
-        Op.get_op_class_by_name('Pooling').update_node_stat(node, mapping_rule)
-        return __class__.enabled
diff --git a/model-optimizer/mo/front/kaldi/extractors/rectified_linear_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/rectified_linear_component_ext.py
new file mode 100644 (file)
index 0000000..713db4b
--- /dev/null
@@ -0,0 +1,28 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.extractor import FrontExtractorOp
+from mo.ops.relu import ReLU
+
+
+class RectifiedLinearComponentFrontExtractor(FrontExtractorOp):
+    op = 'rectifiedlinearcomponent'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        ReLU.update_node_stat(node, {})
+        return __class__.enabled
diff --git a/model-optimizer/mo/front/kaldi/extractors/rescale_ext.py b/model-optimizer/mo/front/kaldi/extractors/rescale_ext.py
new file mode 100644 (file)
index 0000000..459e558
--- /dev/null
@@ -0,0 +1,36 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.caffe.extractors.utils import embed_input
+from mo.front.extractor import FrontExtractorOp
+from mo.front.kaldi.loader.utils import collect_until_whitespace, read_binary_integer32_token
+from mo.front.kaldi.utils import read_binary_vector, read_learning_info
+from mo.ops.scale_shift import ScaleShiftOp
+
+
+class RescaleFrontExtractor(FrontExtractorOp):
+    op = 'rescale'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        pb = node.parameters
+        read_learning_info(pb)
+        weights = read_binary_vector(pb)
+        mapping_rule = {}
+        embed_input(mapping_rule, 1, 'weights', weights)
+        ScaleShiftOp.update_node_stat(node, mapping_rule)
+        return __class__.enabled
diff --git a/model-optimizer/mo/front/kaldi/extractors/reshape.py b/model-optimizer/mo/front/kaldi/extractors/reshape.py
deleted file mode 100644 (file)
index cf47f67..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-"""
- Copyright (c) 2018 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-import numpy as np
-
-from mo.front.common.partial_infer.reshape import tf_reshape_shape_infer
-from mo.front.common.partial_infer.utils import int64_array
-from mo.front.extractor import FrontExtractorOp
-from mo.graph.graph import Node
-from mo.ops.op import Op
-from mo.ops.reshape import Reshape
-
-
-class ReshapeFrontExtractor(FrontExtractorOp):
-    op = 'reshape'
-    enabled = True
-
-    @staticmethod
-    def extract(node):
-        mapping_rule = {
-            'dim': node.pb.dim if hasattr(node.pb, 'dim') else None,
-            'axis': node.pb.axis,
-            'num_axes': node.pb.num_axes,
-            'infer': ReshapeFrontExtractor.infer
-        }
-        Op.get_op_class_by_name('Reshape').update_node_stat(node, mapping_rule)
-        return __class__.enabled
-
-    @staticmethod
-    def infer(node: Node):
-        in_node = node.in_node().in_node()  # prev_layer_node -> data -> this_node
-        input_shape = node.in_node().shape
-        # Kaldi Reshape hugely depends on the layers that precedes or succeeds
-        # Convolution/Pooling layers. Therefore there are 4 cases with different
-        # partial inference.
-        batch = input_shape[0]
-        if in_node.type == 'Convolution' or in_node.type == 'Pooling':
-            output_spatial = int64_array([batch, np.prod(input_shape[1:])])
-            return ReshapeFrontExtractor.set_shape_and_dim(node, output_spatial)
-        # Supports ONLY NCHW and NH layouts
-        spatial_shape = input_shape[1]
-        if input_shape.shape == (4,):
-            spatial_shape = input_shape[2:3]
-        out_node = node.out_node().out_node()
-        if out_node.type == 'Convolution':
-            output_spatial = int64_array([batch, int(np.ceil(spatial_shape / out_node.patch_stride)), 1, out_node.patch_stride])
-            return ReshapeFrontExtractor.set_shape_and_dim(node, output_spatial)
-        elif out_node.type == 'Pooling':
-            output_spatial = int64_array([batch, out_node.pool_stride, 1, int(np.ceil(spatial_shape / out_node.pool_stride))])
-            return ReshapeFrontExtractor.set_shape_and_dim(node, output_spatial)
-
-    @staticmethod
-    def set_shape_and_dim(node: Node, reshape_dim):
-        Reshape.update_node_stat(node, {'dim': reshape_dim})
-        node.out_node().shape = reshape_dim
-
index 31f2845..a68ad4f 100644 (file)
@@ -18,7 +18,7 @@ from mo.front.extractor import FrontExtractorOp
 from mo.ops.activation import Activation
 
 
-class Sigmoid(FrontExtractorOp):
+class SigmoidFrontExtractor(FrontExtractorOp):
     op = 'sigmoid'
     enabled = True
 
index 1acfb9b..4235c0d 100644 (file)
  See the License for the specific language governing permissions and
  limitations under the License.
 """
+import numpy as np
+
 from mo.front.common.partial_infer.slice import caffe_slice_infer
 from mo.front.extractor import FrontExtractorOp
-from mo.ops.op import Op
+from mo.front.kaldi.loader.utils import read_binary_integer32_token, read_blob
+from mo.ops.slice import Slice
 
 
 class SliceFrontExtractor(FrontExtractorOp):
@@ -24,12 +27,15 @@ class SliceFrontExtractor(FrontExtractorOp):
 
     @staticmethod
     def extract(node):
+        pb = node.parameters
+        num_slice_points = read_binary_integer32_token(pb)
         mapping_rule = {
-            'axis': node.pb.axis if hasattr(node.pb, 'axis') else 1,
-            'slice_point': node.pb.slice_point,
+            'axis': 1,
+            'slice_point': read_blob(pb, num_slice_points, np.int32),
             'batch_dims': 0,
             'spatial_dims': 1,
             'infer': caffe_slice_infer
         }
-        Op.get_op_class_by_name('Slice').update_node_stat(node, mapping_rule)
+        node.parameters.close()
+        Slice.update_node_stat(node, mapping_rule)
         return __class__.enabled
diff --git a/model-optimizer/mo/front/kaldi/extractors/softmax_ext.py b/model-optimizer/mo/front/kaldi/extractors/softmax_ext.py
new file mode 100644 (file)
index 0000000..da9f0a1
--- /dev/null
@@ -0,0 +1,37 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+from mo.front.common.partial_infer.elemental import copy_shape_infer
+from mo.front.extractor import FrontExtractorOp
+from mo.ops.softmax import Softmax
+
+
+class SoftmaxComponentFrontExtractor(FrontExtractorOp):
+    op = 'softmaxcomponent'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        return SoftmaxFrontExtractor.extract(node)
+
+
+class SoftmaxFrontExtractor(FrontExtractorOp):
+    op = 'softmax'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        Softmax.update_node_stat(node, {'infer': copy_shape_infer})
+        return __class__.enabled
diff --git a/model-optimizer/mo/front/kaldi/extractors/splice_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/splice_component_ext.py
new file mode 100644 (file)
index 0000000..47cbc23
--- /dev/null
@@ -0,0 +1,53 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import numpy as np
+
+from extensions.ops.splice import Splice
+from mo.front.extractor import FrontExtractorOp
+from mo.front.kaldi.loader.utils import find_next_tag, read_placeholder, read_binary_integer32_token, \
+    collect_until_whitespace
+from mo.front.kaldi.utils import read_binary_vector
+from mo.utils.error import Error
+
+
+class SpliceFrontExtractor(FrontExtractorOp):
+    op = 'splicecomponent'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        pb = node.parameters
+        mapping_rule = {
+            'context': list()
+        }
+        tag = find_next_tag(pb)
+        if tag == '<LeftContext>':
+            read_placeholder(pb, 1)
+            l_context = read_binary_integer32_token(pb)
+            tag = find_next_tag(pb)
+            if tag != '<RightContext>':
+                raise Error('Unknown token {} in SpliceComponent node {}'.format(tag, node.id))
+            read_placeholder(pb, 1)
+            r_context = read_binary_integer32_token(pb)
+            for i in range(-l_context, r_context + 1):
+                mapping_rule['context'].append(i)
+        elif tag == '<Context>':
+            collect_until_whitespace(pb)
+            mapping_rule['context'] = read_binary_vector(pb, False, dtype=np.int32)
+        else:
+            raise Error('Unknown token {} in SpliceComponent node {}'.format(tag, node.id))
+        Splice.update_node_stat(node, mapping_rule)
+        return __class__.enabled
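
With <LeftContext> 2 and <RightContext> 2 the loop above yields context = [-2, -1, 0, 1, 2], so the spliced output concatenates five shifted copies of each frame and the feature dimension grows by a factor of len(context). A minimal numpy illustration of that splicing, clamping at the edges (one plausible padding policy; the actual Splice op may handle borders differently):

import numpy as np

context = list(range(-2, 3))                             # [-2, -1, 0, 1, 2]
frames = np.arange(12, dtype=np.float32).reshape(6, 2)   # 6 frames of dim 2

spliced = np.concatenate(
    [frames[np.clip(np.arange(len(frames)) + c, 0, len(frames) - 1)] for c in context],
    axis=1)
assert spliced.shape == (6, 2 * len(context))            # output dim = input dim * context size
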
diff --git a/model-optimizer/mo/front/kaldi/extractors/tanh_component_ext.py b/model-optimizer/mo/front/kaldi/extractors/tanh_component_ext.py
new file mode 100644 (file)
index 0000000..e67f9c4
--- /dev/null
@@ -0,0 +1,28 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.extractor import FrontExtractorOp
+from mo.ops.activation import Activation
+
+
+class TanhFrontExtractor(FrontExtractorOp):
+    op = 'tanhcomponent'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        Activation.update_node_stat(node, {'operation': 'tanh'})
+        return __class__.enabled
diff --git a/model-optimizer/mo/front/kaldi/loader.py b/model-optimizer/mo/front/kaldi/loader.py
deleted file mode 100644 (file)
index 7638a6c..0000000
+++ /dev/null
@@ -1,544 +0,0 @@
-"""
- Copyright (c) 2018 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-import os
-from io import IOBase
-
-from mo.front.kaldi.extractor import common_kaldi_fields
-from mo.front.kaldi.utils import get_uint32, get_uint16, KaldiNode
-from mo.graph.graph import unique_id, Node
-from mo.utils.error import Error
-
-import networkx as nx
-import numpy as np
-
-from mo.utils.utils import refer_to_faq_msg
-
-
-def read_placeholder(file_desc):
-    """
-    Placeholder is like: |FW | or |FV | - they take 3 spaces and appear before a matrix or a vector respectively
-    :param file_path:
-    :return:
-    """
-    file_desc.read(3)
-
-
-def read_binary_matrix(file_desc, skip: bool = False):
-    if not skip:
-        read_placeholder(file_desc)
-    rows_number = read_binary_integer_token(file_desc)
-    cols_number = read_binary_integer_token(file_desc)
-    # to compare: ((float *)a->buffer())[10]
-    return read_blob(file_desc, rows_number * cols_number), (rows_number, cols_number)
-
-
-def read_binary_vector(file_desc):
-    read_placeholder(file_desc)
-    elements_number = read_binary_integer_token(file_desc)
-    return read_blob(file_desc, elements_number)
-
-
-def collect_until_token(f, token):
-    while True:
-        # usually there is the following structure <CellDim> DIM<ClipGradient> VALUEFM
-        res = collect_until_whitespace(f)
-        if res[-2:] == token:
-            return
-
-
-class KaldiLayer:
-    def __init__(self, f, graph, layer_i, layer_o, layer_name, prev_layer_name, is_switch_board_style=False,
-                 type='General'):
-        self.f = f
-        self.graph = graph
-        self.type = type
-        self.layer_i = layer_i
-        self.layer_o = layer_o
-        self.layer_name = layer_name
-        self.prev_layer_name = prev_layer_name
-        self.is_switch_board_style = is_switch_board_style
-        self.attrs = dict(type=type)
-        self.weights = None
-        self.biases = None
-
-    def construct_sub_graph(self):
-        return add_single_node(self.graph, self.layer_name, self.prev_layer_name, self.attrs, self.weights, self.biases)
-
-    def load_build(self):
-        return self.construct_sub_graph()
-
-
-class SigmoidKaldiLayer(KaldiLayer):
-    def load_build(self):
-        self.attrs.update({
-            'operation': 'sigmoid'
-        })
-        return self.construct_sub_graph()
-
-
-class AffineTransformKaldiLayer(KaldiLayer):
-    def load_weights_biases_attrs(self):
-        collect_until_token(self.f, b'FM')
-        self.weights, weights_shape = read_binary_matrix(self.f, skip=True)
-        self.biases = read_binary_vector(self.f)
-        self.attrs = {
-            'num_output': self.layer_o,
-            'bias_term': True,
-            'weights_shape': weights_shape,
-            'type': 'AffineTransform'
-        }
-
-    def load_build(self):
-        self.load_weights_biases_attrs()
-        return self.construct_sub_graph()
-
-
-class LSTMProjectedStreamsKaldiLayer(KaldiLayer):
-    def __init__(self, f, graph, layer_i, layer_o, layer_name, prev_layer_name, is_switch_board_style=False,
-                 type='General'):
-        super().__init__(f, graph, layer_i, layer_o, layer_name, prev_layer_name, is_switch_board_style, type)
-        self.clip_value = None
-        self.gifo_x_weights = None
-        self.gifo_r_weights = None
-        self.gifo_biases = None
-        self.input_gate_weights = None
-        self.forget_gate_weights = None
-        self.output_gate_weights = None
-        self.projection_weights = None
-        self.gifo_x_weights_shape = None
-        self.gifo_r_weights_shape = None
-        self.projection_weights_shape = None
-
-    def load_weights_biases_attrs(self):
-        self.clip_value = 1 if self.is_switch_board_style else 50
-
-        if not self.is_switch_board_style:
-            res = collect_until_whitespace(self.f)  # <CellClip>
-            if res == b'<CellClip>':
-                self.clip_value = get_uint32(self.f.read(4))
-
-            collect_until_token(self.f, b'FM')
-
-        self.gifo_x_weights, self.gifo_x_weights_shape = read_binary_matrix(self.f, skip=True)
-        self.gifo_r_weights, self.gifo_r_weights_shape = read_binary_matrix(self.f)
-        self.gifo_biases = read_binary_vector(self.f)
-        self.input_gate_weights = read_binary_vector(self.f)
-        self.forget_gate_weights = read_binary_vector(self.f)
-        self.output_gate_weights = read_binary_vector(self.f)
-
-        if not self.is_switch_board_style:
-            self.projection_weights, self.projection_weights_shape = read_binary_matrix(self.f)
-
-    def load_build(self):
-        self.load_weights_biases_attrs()
-        return self.construct_sub_graph()
-
-    def construct_sub_graph(self):
-        self.attrs.update(dict(gifo_x_weights=self.gifo_x_weights, gifo_r_weights=self.gifo_r_weights,
-                               gifo_biases=self.gifo_biases, input_gate_weights=self.input_gate_weights,
-                               forget_gate_weights=self.forget_gate_weights,
-                               clip_value=self.clip_value,
-                               output_gate_weights=self.output_gate_weights,
-                               projection_weights=self.projection_weights,
-                               gifo_x_weights_shape=self.gifo_x_weights_shape,
-                               gifo_r_weights_shape=self.gifo_r_weights_shape,
-                               projection_weights_shape=self.projection_weights_shape,
-                               type='LSTMProjectedStreams'))
-        return add_single_node(self.graph, self.layer_name, self.prev_layer_name, self.attrs, self.weights, self.biases)
-
-
-class ConvolutionKaldiLayer(KaldiLayer):
-    def __init__(self, f, graph, layer_i, layer_o, layer_name, prev_layer_name, is_switch_board_style=False):
-        super().__init__(f, graph, layer_i, layer_o, layer_name, prev_layer_name, is_switch_board_style, 'Convolution')
-        self.kernel = None
-        self.stride = None
-        self.output = None
-        self.weights_shape = None
-        self.shape = None
-        self.patch_stride = None
-
-    def load_build(self):
-        '''
-        /* Prepare feature patches, the layout is:
-        274      |----------|----------|----------|---------| (in = spliced frames)
-        275       xxx        xxx        xxx        xxx        (x = selected elements)
-        276
-        277        xxx : patch dim
-        278         xxx
-        279        ^---: patch step
-        280      |----------| : patch stride
-        281
-        282        xxx-xxx-xxx-xxx : filter dim
-        283
-        '''
-        self.kernel = read_token_value(self.f, b'<PatchDim>')
-        self.stride = read_token_value(self.f, b'<PatchStep>')
-        self.patch_stride = read_token_value(self.f, b'<PatchStride>')
-
-        if (self.patch_stride - self.kernel) % self.stride != 0:
-            raise Error(
-                'Kernel size and stride does not correspond to `patch_stride` attribute of Convolution layer. ' +
-                refer_to_faq_msg(93))
-
-        do_loop = True
-        while do_loop:
-            self.f.read(1)
-            first_char = self.f.read(1)
-            self.f.seek(-2, os.SEEK_CUR)
-            if first_char == b'L':
-                read_token_value(self.f, b'<LearnRateCoef>')
-            elif first_char == b'B':
-                read_token_value(self.f, b'<BiasLearnRateCoef>')
-            elif first_char == b'M':
-                read_token_value(self.f, b'<MaxNorm>')
-            elif first_char == b'!':
-                read_token_value(self.f, b'<EndOfComponent>')
-                do_loop = False
-            else:
-                do_loop = False
-        self.load_weights_biases_attrs()
-
-        self.output = self.biases.shape[0]
-        if self.weights_shape[0] != self.output:
-            raise Error('Weights shape does not correspond to the `output` attribute of Convolution layer. ' +
-                        refer_to_faq_msg(93))
-        self.attrs.update({
-            'kernel': self.kernel,
-            'stride': self.stride,
-            'output': self.output,
-            'bias_term': True,
-            'patch_stride': self.patch_stride
-        })
-        return self.construct_sub_graph()
-
-    def load_weights_biases_attrs(self):
-        collect_until_whitespace(self.f)
-        self.weights, self.weights_shape = read_binary_matrix(self.f)
-        collect_until_whitespace(self.f)
-        self.biases = read_binary_vector(self.f)
-
-
-class PoolingKaldiLayer(KaldiLayer):
-    def __init__(self, f, graph, layer_i, layer_o, layer_name, prev_layer_name, is_switch_board_style=False,
-                 pool_method='Max'):
-        super().__init__(f, graph, layer_i, layer_o, layer_name, prev_layer_name, is_switch_board_style, 'Pooling')
-        self.pad = 0
-        self.window = None
-        self.pool_method = pool_method
-        self.stride = None
-
-    def load_build(self):
-        self.window = read_token_value(self.f, b'<PoolSize>')
-        self.stride = read_token_value(self.f, b'<PoolStep>')
-        pool_stride = read_token_value(self.f, b'<PoolStride>')
-
-        self.attrs.update({
-            'kernel': self.window,
-            'stride': self.stride,
-            'pool_stride': pool_stride,
-            'pool_method': self.pool_method
-        })
-        return self.construct_sub_graph()
-
-
-class ScaleShiftKaldiLayer(KaldiLayer):
-    def __init__(self, f, graph, layer_i, layer_o, layer_name, prev_layer_name, is_switch_board_style=False,
-                 weights=None):
-        super().__init__(f, graph, layer_i, layer_o, layer_name, prev_layer_name, is_switch_board_style, 'ScaleShift')
-        self.weights = weights
-        self.bias_term = False
-
-    def load_build(self):
-        if collect_until_whitespace(self.f) == b'<AddShift>':
-            self.layer_o = read_binary_integer_token(self.f)
-            self.layer_o = read_binary_integer_token(self.f)
-            self.biases = read_binary_vector(self.f)
-            self.bias_term = True
-            self.attrs.update({'bias_term': self.bias_term})
-        return self.construct_sub_graph()
-
-
-class RescaleKaldiLayer(KaldiLayer):
-    def __init__(self, f, graph, layer_i, layer_o, layer_name, prev_layer_name, is_switch_board_style=False):
-        super().__init__(f, graph, layer_i, layer_o, layer_name, prev_layer_name, is_switch_board_style, 'ScaleShift')
-        self.weights = None
-        self.bias_term = False
-
-    def load_build(self):
-        if self.f.read(1) == b'<':
-            self.f.seek(-1, os.SEEK_CUR)
-            read_token_value(self.f, b'<LearnRateCoef>')
-        else:
-            self.f.seek(-1, os.SEEK_CUR)
-        self.weights = read_binary_vector(self.f)
-        next_token = collect_until_whitespace(self.f)
-        if next_token == b'<!EndOfComponent>':
-            next_token = collect_until_whitespace(self.f)
-        if next_token == b'<AddShift>':
-            read_binary_integer_token(self.f)  # input layer
-            self.layer_o = read_binary_integer_token(self.f)
-            if self.f.read(1) == b'<':
-                self.f.seek(-1, os.SEEK_CUR)
-                read_token_value(self.f, b'<LearnRateCoef>')
-            else:
-                self.f.seek(-1, os.SEEK_CUR)
-            self.biases = read_binary_vector(self.f)
-            self.bias_term = True
-            self.attrs.update({'bias_term': self.bias_term})
-        else:
-            self.f.seek(-len(next_token), os.SEEK_CUR)
-        return self.construct_sub_graph()
-
-
-class ParallelKaldiLayer(KaldiLayer):
-    def __init__(self, f, graph, layer_i, layer_o, layer_name, prev_layer_name, is_switch_board_style=False):
-        super().__init__(f, graph, layer_i, layer_o, layer_name, prev_layer_name, is_switch_board_style, 'Parallel')
-        self.output_nodes = []
-        self.edge_attrs = {
-            'out': None,
-            'in': 0,
-            'name': None,
-            'fw_tensor_name': None,
-            'fw_tensor_port': None,
-            'in_attrs': ['in', 'name'],
-            'out_attrs': ['out', 'name'],
-            'data_attrs': ['fw_tensor_name', 'fw_tensor_port']
-        }
-
-    def load_build(self):
-        nnet_count = read_token_value(self.f, b'<NestedNnetCount>')
-        slice_id = add_single_node(self.graph, 'Slice', self.prev_layer_name,
-                                   {'type': 'Slice', 'axis': 1, 'slice_point': []}, None, None)
-        for i in range(nnet_count):
-            read_token_value(self.f, b'<NestedNnet>')
-            graph, shape = load_kaldi_nnet_model(self.f, None)
-            input_nodes = [n for n in graph.nodes(data=True) if n[1]['type'] == 'GlobalInput']
-            for input_node in input_nodes:
-                shape_subgraph = input_node[1]['shape']
-                if i != nnet_count - 1:
-                    self.graph.node[slice_id]['pb'].slice_point.append(shape_subgraph[1])
-                graph.remove_node(input_node[0])
-            mapping = {node: unique_id(self.graph, node) for node in graph.nodes(data=False) if node in self.graph}
-            g = nx.relabel_nodes(graph, mapping)
-            for val in mapping.values():
-                g.node[val]['name'] = val
-            self.graph.add_nodes_from(g.nodes(data=True))
-            self.graph.add_edges_from(g.edges(data=True))
-            sorted_nodes = tuple(nx.topological_sort(g))
-            self.edge_attrs['out'] = i
-            self.edge_attrs['name'] = sorted_nodes[0]
-            self.edge_attrs['fw_tensor_name'] = slice_id
-            self.edge_attrs['fw_tensor_port'] = sorted_nodes[0]
-            self.graph.add_edge(slice_id, sorted_nodes[0], **self.edge_attrs)
-            self.output_nodes.append(sorted_nodes[-1])
-        end_token = collect_until_whitespace(self.f)
-        if end_token != b'</ParallelComponent>':
-            raise Error('Expected token `</ParallelComponent>`, has {}'.format(end_token) + refer_to_faq_msg(99))
-        return self.construct_sub_graph()
-
-    def construct_sub_graph(self):
-        new_id = unique_id(self.graph, '{}_'.format('Concat'))
-        layer = KaldiNode(new_id)
-        layer.set_attrs(dict(axis=1))
-        layer.type = 'Concat'
-        self.graph.add_node(new_id, pb=layer, kind='op')
-        self.graph.node[layer.name].update(common_kaldi_fields(Node(self.graph, layer.name)))
-        self.edge_attrs['out'] = 0
-        self.edge_attrs['name'] = layer.name
-        self.edge_attrs['fw_tensor_port'] = layer.name
-        for i, output_node in enumerate(self.output_nodes):
-            self.edge_attrs['fw_tensor_name'] = output_node
-            self.edge_attrs['in'] = i
-            self.graph.add_edge(output_node, layer.name, **self.edge_attrs)
-        return new_id
-
-
-def read_token_value(file, token: bytes = b'', value_type: type = np.uint32):
-    getters = {
-        np.uint32: read_binary_integer_token
-    }
-    current_token = collect_until_whitespace(file)
-    if token != b'' and token != current_token:
-        raise Error('Can not load token {} from Kaldi model'.format(token) +
-                    refer_to_faq_msg(94))
-    return getters[value_type](file)
-
-
-def read_binary_integer_token(file_path):
-    buffer_size = file_path.read(1)
-    return get_uint32(file_path.read(buffer_size[0]))
-
-
-def collect_until_whitespace(file_path):
-    res = b''
-    while True:
-        new_sym = file_path.read(1)
-        if new_sym == b' ':
-            break
-        res += new_sym
-    return res
-
-
-def read_blob(file_path, size):
-    float_size = 4
-    data = file_path.read(size * float_size)
-    return np.fromstring(data, dtype='<f4')
-
-
-layer_weights_biases_attrs_getter = {
-    'affinetransform': AffineTransformKaldiLayer,
-    'sigmoid': lambda f, g, i, o, name, prev, style: KaldiLayer(f, g, i, o, name, prev, style, type='Sigmoid'),
-    'softmax': lambda f, g, i, o, name, prev, style: KaldiLayer(f, g, i, o, name, prev, style, type='SoftMax'),
-    'lstmprojectedstreams': LSTMProjectedStreamsKaldiLayer,
-    'lstmprojected': LSTMProjectedStreamsKaldiLayer,
-    'maxpoolingcomponent': PoolingKaldiLayer,
-    'convolutionalcomponent': ConvolutionKaldiLayer,
-    'rescale': RescaleKaldiLayer,
-    'parallelcomponent': ParallelKaldiLayer,
-}
-
-
-def add_single_node(graph, layer_name, prev_layer_name, attrs, weights, biases):
-    new_id = unique_id(graph, '{}_'.format(layer_name))
-
-    layer = KaldiNode(new_id)
-    layer.set_weight(weights)
-    layer.set_bias(biases)
-    if attrs:
-        layer.set_attrs(attrs)
-
-    graph.add_node(layer.name, pb=layer, kind='op')
-    graph.node[layer.name].update(common_kaldi_fields(Node(graph, layer.name)))
-
-    edge_attrs = {
-        'out': 0,
-        'in': 0,
-        'name': layer.name,
-        'fw_tensor_debug_info': [(prev_layer_name, layer.name)],  # debug anchor for a framework tensor name and port
-        'in_attrs': ['in', 'name'],
-        'out_attrs': ['out', 'name'],
-        'data_attrs': ['fw_tensor_debug_info']
-    }
-
-    graph.add_edge(prev_layer_name, layer.name, **edge_attrs)
-
-    return new_id
-
-
-def find_first_tag(file):
-    tag = b''
-    while True:
-        symbol = file.read(1)
-        if tag == b'' and symbol != b'<':
-            continue
-        tag += symbol
-        if symbol != b'>':
-            continue
-        return tag
-
-
-def find_first_component(file):
-    while True:
-        tag = find_first_tag(file)
-        component_name = tag.decode('ascii').lower()
-        if component_name[1:-1] in layer_weights_biases_attrs_getter.keys() or tag == b'</Nnet>' or tag == b'<EndOfComponent>':
-            file.read(1)  # read ' '
-            return component_name
-
-
-def load_kaldi_nnet_model(nnet_path, check_sum: int = 16896):
-    """
-    Structure of the file is the following:
-    magic-number(16896)<Nnet> <Next Layer Name> weights etc.
-    :param nnet_path:
-    :param check_sum:
-    :return:
-    """
-    if isinstance(nnet_path, str):
-        file = open(nnet_path, "rb")
-    elif isinstance(nnet_path, IOBase):
-        file = nnet_path
-
-    # 1. check the file
-    # first element is 16896<Nnet>
-    if check_sum and get_uint16(file.read(2)) != check_sum:
-        raise Error('File {} does not appear to be a Kaldi file (magic number does not match). ', nnet_path,
-                    refer_to_faq_msg(89)
-                    )
-
-    while True:
-        name = find_first_tag(file)
-        if name == b'<Nnet>':
-            file.read(1)
-            break
-        elif len(name) == 6:
-            raise Error('Kaldi model should start with <Nnet> tag. ',
-                        refer_to_faq_msg(89))
-    graph = nx.MultiDiGraph()
-    input_name = 'Input'
-    graph.add_node(input_name, pb=None, type='GlobalInput', name=input_name, shape=None, kind='op')
-
-    prev_layer_name = input_name
-    input_shapes = {}
-
-    while True:
-        """
-        Typical structure of the layer
-        <Layer> |Size of output value in bits|Actual value of output|Size of input value in bits|Actual value of input|\
-        FM Matrix|FV Vector| </Layer>
-        """
-        layer_name = find_first_component(file)
-        if layer_name == '</nnet>':
-            break
-        elif layer_name == '<!endofcomponent>':
-            continue
-        extracted_name = layer_name[1:-1]
-
-        layer_o = read_binary_integer_token(file)
-        layer_i = read_binary_integer_token(file)
-
-        if prev_layer_name == 'Input':
-            graph.node['Input']['shape'] = np.array([1, layer_i], dtype=np.int64)
-
-        cls = layer_weights_biases_attrs_getter[extracted_name]
-        cls_instance = cls(file, graph, layer_i, layer_o, extracted_name, prev_layer_name, False)
-
-        prev_layer_name = cls_instance.load_build()
-    return graph, input_shapes
-
-
-def read_counts_file(file_path):
-    with open(file_path, 'r') as f:
-        file_content = f.readlines()
-    if len(file_content) > 1:
-        raise Error('Expect counts file to be one-line file. ' +
-                    refer_to_faq_msg(90))
-
-    counts_line = file_content[0].strip().replace('[', '').replace(']', '')
-    try:
-        counts = np.fromstring(counts_line, dtype=int, sep=' ')
-    except TypeError:
-        raise Error('Expect counts file to contain list of integers.' +
-                    refer_to_faq_msg(90))
-    cutoff = 1.00000001e-10
-    counts = [cutoff if count < cutoff else count for count in counts]
-    scale = 1.0 / np.sum(counts)
-    for idx, count in enumerate(counts):
-        val = np.log(scale * count)
-        if count == cutoff:
-            val += np.iinfo(np.float32).max / 2
-        counts[idx] = val
-    return counts
diff --git a/model-optimizer/mo/front/kaldi/loader/__init__.py b/model-optimizer/mo/front/kaldi/loader/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/model-optimizer/mo/front/kaldi/loader/loader.py b/model-optimizer/mo/front/kaldi/loader/loader.py
new file mode 100644 (file)
index 0000000..8bf9085
--- /dev/null
@@ -0,0 +1,215 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import io
+
+import numpy as np
+import struct
+from io import IOBase
+
+import networkx as nx
+import logging as log
+
+from mo.front.kaldi.loader.utils import find_next_tag, read_placeholder, find_next_component, get_name_from_path, \
+    find_end_of_component, end_of_nnet_tag, read_binary_integer32_token, get_parameters, read_token_value, collect_until_token, \
+    create_edge_attrs
+from mo.graph.graph import unique_id, Node
+from mo.utils.error import Error
+from mo.utils.utils import refer_to_faq_msg
+
+
+def read_counts_file(file_path):
+    with open(file_path, 'r') as f:
+        file_content = f.readlines()
+    if len(file_content) > 1:
+        raise Error('Expect counts file to be one-line file. ' +
+                    refer_to_faq_msg(90))
+
+    counts_line = file_content[0].strip().replace('[', '').replace(']', '')
+    try:
+        counts = np.fromstring(counts_line, dtype=float, sep=' ')  # float: the cutoff assignment below must not truncate to 0
+    except TypeError:
+        raise Error('Expect counts file to contain list of integers.' +
+                    refer_to_faq_msg(90))
+    cutoff = 1.00000001e-10
+    cutoff_idxs = np.where(counts < cutoff)
+    counts[cutoff_idxs] = cutoff
+    scale = 1.0 / np.sum(counts)
+    counts = np.log(counts * scale)
+    counts[cutoff_idxs] += np.finfo(np.float32).max / 2
+    return counts
+
+
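For reference, the arithmetic in the rewritten read_counts_file reduces to: clamp zero counts to a tiny cutoff, normalize into priors, take the log, then push the clamped entries far out of range so the corresponding classes are effectively suppressed downstream. A minimal standalone sketch of the same steps (the counts are invented):

    import numpy as np

    counts = np.fromstring('10 20 0 30', dtype=float, sep=' ')  # hypothetical one-line counts file

    cutoff = 1.00000001e-10
    cutoff_idxs = np.where(counts < cutoff)
    counts[cutoff_idxs] = cutoff                 # clamp zeros so log() stays finite
    scale = 1.0 / np.sum(counts)
    log_priors = np.log(counts * scale)
    log_priors[cutoff_idxs] += np.finfo(np.float32).max / 2  # push unseen classes out of range
    print(log_priors)
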
+def load_parallel_component(file_descr, graph: nx.MultiDiGraph, prev_layer_id):
+    """
+    Load a ParallelComponent of the Kaldi model.
+    A ParallelComponent contains several nested networks that run in parallel.
+    A Slice layer is inserted before the nested networks, and their outputs
+    are joined back together with a Concat layer.
+
+    :param file_descr: descriptor of the model file
+    :param graph: graph with the topology
+    :param prev_layer_id: id of the input layer for the parallel component
+    :return: id of the Concat layer, the last layer of the parallel component
+    """
+    nnet_count = read_token_value(file_descr, b'<NestedNnetCount>')
+    log.debug('Model contains parallel component with {} nested networks'.format(nnet_count))
+
+    slice_id = unique_id(graph, prefix='Slice')
+    graph.add_node(slice_id, parameters=None, op='slice', kind='op')
+
+    slice_node = Node(graph, slice_id)
+    graph.add_edge(prev_layer_id, slice_id, **create_edge_attrs(prev_layer_id, slice_id))
+    slices_points = []
+
+    outputs = []
+
+    for i in range(nnet_count):
+        read_token_value(file_descr, b'<NestedNnet>')
+        collect_until_token(file_descr, b'<Nnet>')
+        g, shape = load_kalid_nnet1_model(file_descr, 'Nested_net_{}'.format(i))
+        input_nodes = [n for n in g.nodes(data=True) if n[1]['op'] == 'Input']
+        if i != nnet_count - 1:
+            slices_points.append(shape[1])
+        g.remove_node(input_nodes[0][0])
+        mapping = {node: unique_id(graph, node) for node in g.nodes(data=False) if node in graph}
+        g = nx.relabel_nodes(g, mapping)
+        for val in mapping.values():
+            g.node[val]['name'] = val
+        graph.add_nodes_from(g.nodes(data=True))
+        graph.add_edges_from(g.edges(data=True))
+        sorted_nodes = tuple(nx.topological_sort(g))
+        edge_attrs = create_edge_attrs(slice_id, sorted_nodes[0])
+        edge_attrs['out'] = i
+        graph.add_edge(slice_id, sorted_nodes[0], **edge_attrs)
+        outputs.append(sorted_nodes[-1])
+    packed_sp = struct.pack("B", 4) + struct.pack("I", len(slices_points))
+    for i in slices_points:
+        packed_sp += struct.pack("I", i)
+    slice_node.parameters = io.BytesIO(packed_sp)
+    concat_id = unique_id(graph, prefix='Concat')
+    graph.add_node(concat_id, parameters=None, op='concat', kind='op')
+    for i, output in enumerate(outputs):
+        edge_attrs = create_edge_attrs(output, concat_id)
+        edge_attrs['in'] = i
+        graph.add_edge(output, concat_id, **edge_attrs)
+    return concat_id
+
+
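The Slice node gets its parameters serialized back in the same size-prefixed layout the rest of the loader reads: one byte holding the element width (4), a uint32 element count, then the uint32 slice points. A small round-trip sketch of that layout (slice points invented):

    import io
    import struct

    slice_points = [13, 26]  # e.g. output widths of all but the last nested net

    packed = struct.pack("B", 4) + struct.pack("I", len(slice_points))
    for p in slice_points:
        packed += struct.pack("I", p)

    buf = io.BytesIO(packed)
    elem_width = buf.read(1)[0]                        # 4 -> 4-byte elements
    count = struct.unpack("I", buf.read(4))[0]
    points = struct.unpack("{}I".format(count), buf.read(4 * count))
    print(elem_width, count, points)                   # 4 2 (13, 26)
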
+def load_kaldi_model(nnet_path):
+    """
+    Structure of the file is the following:
+    magic-number(16896)<Nnet> <Next Layer Name> weights etc.
+    :param nnet_path: path to the Kaldi model file or an already opened file object
+    :return: graph with the loaded topology and the input shape
+    """
+    nnet_name = None
+    if isinstance(nnet_path, str):
+        file_desc = open(nnet_path, "rb")
+        nnet_name = get_name_from_path(nnet_path)
+    elif isinstance(nnet_path, IOBase):
+        file_desc = nnet_path
+    else:
+        raise Error('Unsupported type of Kaldi model')
+
+    name = find_next_tag(file_desc)
+    # start new model / submodel
+    if name == '<Nnet>':
+        load_function = load_kalid_nnet1_model
+    elif name == '<TransitionModel>':
+        load_function = load_kalid_nnet2_model
+    else:
+        raise Error('Kaldi model should start with <Nnet> or <TransitionModel> tag. ',
+                    refer_to_faq_msg(89))
+    read_placeholder(file_desc, 1)
+
+    return load_function(file_desc, nnet_name)
+
+
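load_kaldi_model decides between the nnet1 and nnet2 readers purely from the first tag in the stream. A standalone sketch of that dispatch with fake in-memory headers; the tag scan is simplified to a regex here, unlike the byte-wise scanner in loader/utils.py:

    import re

    def peek_first_tag(data: bytes) -> str:
        match = re.search(rb'<[^>]+>', data)   # first <...> group wins
        return match.group(0).decode('ascii') if match else ''

    nnet1_header = b'\x42\x41<Nnet> <AffineTransform> ...'
    nnet2_header = b'\x42\x41<TransitionModel> ... <Nnet> ...'

    loaders = {'<Nnet>': 'nnet1 loader', '<TransitionModel>': 'nnet2 loader'}
    for header in (nnet1_header, nnet2_header):
        tag = peek_first_tag(header)
        print(tag, '->', loaders[tag])
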
+def load_kalid_nnet1_model(file_descr, name):
+    graph = nx.MultiDiGraph(name=name)
+
+    prev_layer_id = 'Input'
+    graph.add_node(prev_layer_id, name=prev_layer_id, kind='op', op='Input', parameters=None)
+    input_shape = []
+
+    while True:
+        component_type = find_next_component(file_descr)
+        if component_type == end_of_nnet_tag.lower()[1:-1]:
+            break
+
+        layer_o = read_binary_integer32_token(file_descr)
+        layer_i = read_binary_integer32_token(file_descr)
+
+        if component_type == 'parallelcomponent':
+            prev_layer_id = load_parallel_component(file_descr, graph, prev_layer_id)
+            continue
+
+        start_index = file_descr.tell()
+        end_tag, end_index = find_end_of_component(file_descr, component_type)
+        end_index -= len(end_tag)
+        layer_id = unique_id(graph, prefix=component_type)
+        graph.add_node(layer_id,
+                       parameters=get_parameters(file_descr, start_index, end_index),
+                       op=component_type,
+                       kind='op',
+                       layer_i=layer_i,
+                       layer_o=layer_o)
+
+        prev_node = Node(graph, prev_layer_id)
+        if prev_node.op == 'Input':
+            prev_node['shape'] = np.array([1, layer_i], dtype=np.int64)
+            input_shape = np.array([1, layer_i], dtype=np.int64)
+        graph.add_edge(prev_layer_id, layer_id, **create_edge_attrs(prev_layer_id, layer_id))
+        prev_layer_id = layer_id
+        log.debug('{} (type is {}) was loaded'.format(prev_layer_id, component_type))
+    return graph, input_shape
+
+
+def load_kalid_nnet2_model(file_descr, nnet_name):
+    graph = nx.MultiDiGraph(name=nnet_name)
+    input_name = 'Input'
+    graph.add_node(input_name, name=input_name, kind='op', op='Input', parameters=None, shape=None)
+
+    prev_layer_id = input_name
+    input_shape = []  # keep defined even for a model with no components
+
+    collect_until_token(file_descr, b'<Nnet>')
+    num_components = read_token_value(file_descr, b'<NumComponents>')
+    log.debug('Network contains {} components'.format(num_components))
+    collect_until_token(file_descr, b'<Components>')
+    for _ in range(num_components):
+        component_type = find_next_component(file_descr)
+
+        if component_type == end_of_nnet_tag.lower()[1:-1]:
+            break
+        start_index = file_descr.tell()
+        end_tag, end_index = find_end_of_component(file_descr, component_type)
+        layer_id = unique_id(graph, prefix=component_type)
+        graph.add_node(layer_id,
+                       parameters=get_parameters(file_descr, start_index, end_index),
+                       op=component_type,
+                       kind='op')
+
+        prev_node = Node(graph, prev_layer_id)
+        if prev_node.op == 'Input':
+            parameters = Node(graph, layer_id).parameters
+            input_dim = read_token_value(parameters, b'<InputDim>')
+            prev_node['shape'] = np.array([1, input_dim], dtype=np.int64)
+            input_shape = np.array([1, input_dim], dtype=np.int64)
+        graph.add_edge(prev_layer_id, layer_id, **create_edge_attrs(prev_layer_id, layer_id))
+        prev_layer_id = layer_id
+        log.debug('{} (type is {}) was loaded'.format(prev_layer_id, component_type))
+    return graph, input_shape
diff --git a/model-optimizer/mo/front/kaldi/loader/utils.py b/model-optimizer/mo/front/kaldi/loader/utils.py
new file mode 100644 (file)
index 0000000..4dbba94
--- /dev/null
@@ -0,0 +1,299 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import io
+
+import numpy as np
+import os
+import struct
+
+from mo.utils.error import Error
+from mo.utils.utils import refer_to_faq_msg
+
+end_of_nnet_tag = '</Nnet>'
+end_of_component_tag = '<!EndOfComponent>'
+
+supported_components = [
+    'addshift',
+    'affinecomponent',
+    'affinetransform',
+    'convolutional1dcomponent',
+    'convolutionalcomponent',
+    'copy',
+    'fixedaffinecomponent',
+    'lstmprojected',
+    'lstmprojectedstreams',
+    'maxpoolingcomponent',
+    'parallelcomponent',
+    'rescale',
+    'sigmoid',
+    'softmax',
+    'softmaxcomponent',
+    'splicecomponent',
+    'tanhcomponent',
+    'normalizecomponent',
+    'affinecomponentpreconditionedonline',
+    'rectifiedlinearcomponent'
+]
+
+
+def get_bool(s: bytes) -> bool:
+    """
+    Get bool value from bytes
+    :param s: bytes array contains bool value
+    :return: bool value from bytes array
+    """
+    return struct.unpack('?', s)[0]
+
+
+def get_uint16(s: bytes) -> int:
+    """
+    Get unsigned int16 value from bytes
+    :param s: bytes array contains unsigned int16 value
+    :return: unsigned int16 value from bytes array
+    """
+    return struct.unpack('H', s)[0]
+
+
+def get_uint32(s: bytes) -> int:
+    """
+    Get unsigned int32 value from bytes
+    :param s: bytes array contains unsigned int32 value
+    :return: unsigned int32 value from bytes array
+    """
+    return struct.unpack('I', s)[0]
+
+
+def get_uint64(s: bytes) -> int:
+    """
+    Get unsigned int64 value from bytes
+    :param s: bytes array contains unsigned int64 value
+    :return: unsigned int64 value from bytes array
+    """
+    return struct.unpack('Q', s)[0]  # 'Q': unsigned 64-bit, matching the docstring
+
+
+def read_binary_bool_token(file_desc: io.BufferedReader) -> bool:
+    """
+    Get the next bool value from the file.
+    The file position advances by 1 byte.
+    :param file_desc: file descriptor
+    :return: next boolean value in file
+    """
+    return get_bool(file_desc.read(1))
+
+
+def read_binary_integer32_token(file_desc: io.BufferedReader) -> int:
+    """
+    Get the next uint32 value from the file.
+    The file position advances by 5 bytes (1 size byte + 4 payload bytes).
+    :param file_desc: file descriptor
+    :return: next uint32 value in the file
+    """
+    buffer_size = file_desc.read(1)
+    return get_uint32(file_desc.read(buffer_size[0]))
+
+
+def read_binary_integer64_token(file_desc: io.BufferedReader) -> int:
+    """
+    Get the next uint64 value from the file.
+    The file position advances by 9 bytes (1 size byte + 8 payload bytes).
+    :param file_desc: file descriptor
+    :return: next uint64 value in the file
+    """
+    buffer_size = file_desc.read(1)
+    return get_uint64(file_desc.read(buffer_size[0]))
+
+
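Both integer-token readers rely on the same framing: a single byte announcing the payload width, then the raw integer bytes, which is why read_binary_integer32_token advances the position by five bytes in total. A round-trip sketch:

    import io
    import struct

    def write_int32_token(value: int) -> bytes:
        return struct.pack('B', 4) + struct.pack('I', value)  # width byte + payload

    buf = io.BytesIO(write_int32_token(440))
    width = buf.read(1)[0]                        # 4
    value = struct.unpack('I', buf.read(width))[0]
    print(width, value)                           # 4 440
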
+def find_next_tag(file_desc: io.BufferedReader) -> str:
+    """
+    Get next tag in the file
+    :param file_desc: file descriptor
+    :return: string like '<sometag>'
+    """
+    tag = b''
+    while True:
+        symbol = file_desc.read(1)
+        if symbol == b'':
+            raise Error('Unexpected end of Kaldi model')
+        if tag == b'' and symbol != b'<':
+            continue
+        elif symbol == b'<':
+            tag = b''
+        tag += symbol
+        if symbol != b'>':
+            continue
+        try:
+            return tag.decode('ascii')
+        except UnicodeDecodeError:
+            # Tag in Kaldi model always in ascii encoding
+            tag = b''
+
+
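find_next_tag is a small scanning state machine: skip bytes until '<', restart on a stray '<', and only return once a '>' arrives and the collected bytes decode as ASCII, so binary noise that happens to contain angle brackets is discarded. A compact standalone re-sketch with the same control flow:

    import io

    def next_tag(stream) -> str:
        tag = b''
        while True:
            symbol = stream.read(1)
            if symbol == b'':
                raise EOFError('no tag found')
            if tag == b'' and symbol != b'<':
                continue                  # still hunting for the opening '<'
            elif symbol == b'<':
                tag = b''                 # restart on a stray '<'
            tag += symbol
            if symbol != b'>':
                continue
            try:
                return tag.decode('ascii')
            except UnicodeDecodeError:
                tag = b''                 # binary noise, keep scanning

    print(next_tag(io.BytesIO(b'\xff<\xfe\xfd><Nnet> ')))  # '<Nnet>'
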
+def read_placeholder(file_desc: io.BufferedReader, size=3) -> bytes:
+    """
+    Read `size` bytes from the file
+    :param file_desc: file descriptor
+    :param size: number of bytes to read
+    :return: bytes
+    """
+    return file_desc.read(size)
+
+
+def find_next_component(file_desc: io.BufferedReader) -> str:
+    """
+    Read next component in the file.
+    All supported components are listed in supported_components
+    :param file_desc: file descriptor
+    :return: lower-case component name without the angle brackets
+    """
+    while True:
+        tag = find_next_tag(file_desc)
+        # The tag looks like <NameOfTheLayer>; strip '<' and '>' to get the component name
+        component_name = tag[1:-1].lower()
+        if component_name in supported_components or tag == end_of_nnet_tag:
+            # There is whitespace after component's name
+            read_placeholder(file_desc, 1)
+            return component_name
+
+
+def get_name_from_path(path: str) -> str:
+    """
+    Get name from path to the file
+    :param path: path to the file
+    :return: name of the file
+    """
+    return os.path.splitext(os.path.basename(path))[0]
+
+
+def find_end_of_component(file_desc: io.BufferedReader, component: str, end_tags: tuple = ()):
+    """
+    Find the tag and the index of the end of the component
+    :param file_desc: file descriptor
+    :param component: component from supported_components
+    :param end_tags: additional end tags to accept
+    :return: the end tag and the index of the end of the component
+    """
+    end_tags_of_component = ['</{}>'.format(component),
+                             end_of_component_tag.lower(),
+                             end_of_nnet_tag.lower(),
+                             *end_tags,
+                             *['<{}>'.format(c) for c in supported_components]]
+    next_tag = find_next_tag(file_desc)
+    while next_tag.lower() not in end_tags_of_component:
+        next_tag = find_next_tag(file_desc)
+    return next_tag, file_desc.tell()
+
+
+def get_parameters(file_desc: io.BufferedReader, start_index: int, end_index: int):
+    """
+    Get part of file
+    :param file_desc: file descriptor
+    :param start_index: index where reading starts
+    :param end_index: index where reading ends
+    :return: part of the file
+    """
+    file_desc.seek(start_index)
+    buffer = file_desc.read(end_index - start_index)
+    return io.BytesIO(buffer)
+
+
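find_end_of_component and get_parameters are used as a pair in the loaders: remember tell() right after the opening tag, scan tag-by-tag until something from the end-tag set shows up, then seek back and copy the bytes in between into a BytesIO for the per-op extractor (the nnet1 loader additionally trims len(end_tag) off the end). A standalone sketch of the slice-out pattern, with an invented payload:

    import io

    stream = io.BytesIO(b'<AffineTransform> PAYLOADBYTES <!EndOfComponent> <Softmax> ...')
    stream.read(len(b'<AffineTransform> '))      # pretend the tag was just consumed

    start_index = stream.tell()
    rest = stream.read()                         # scan forward for the end tag
    end_index = start_index + rest.index(b'<!EndOfComponent>')

    stream.seek(start_index)
    parameters = io.BytesIO(stream.read(end_index - start_index))
    print(parameters.getvalue())                 # b'PAYLOADBYTES '
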
+def read_token_value(file_desc: io.BufferedReader, token: bytes = b'', value_type: type = np.uint32):
+    """
+    Get value of the token.
+    Read the next token (until whitespace) and check that it matches the expected one
+    :param file_desc: file descriptor
+    :param token: expected token
+    :param value_type: type of the value to read
+    :return: value of the token
+    """
+    getters = {
+        np.uint32: read_binary_integer32_token,
+        np.uint64: read_binary_integer64_token,
+        bool: read_binary_bool_token
+    }
+    current_token = collect_until_whitespace(file_desc)
+    if token != b'' and token != current_token:
+        raise Error('Can not load token {} from Kaldi model'.format(token) +
+                    refer_to_faq_msg(94))
+    return getters[value_type](file_desc)
+
+
+def collect_until_whitespace(file_desc: io.BufferedReader):
+    """
+    Read from file until whitespace
+    :param file_desc: file descriptor
+    :return: the bytes read, excluding the terminating whitespace
+    """
+    res = b''
+    while True:
+        new_sym = file_desc.read(1)
+        if new_sym == b' ' or new_sym == b'':
+            break
+        res += new_sym
+    return res
+
+
+def collect_until_token(file_desc: io.BufferedReader, token):
+    """
+    Read from the file until the token is found
+    :param file_desc: file descriptor
+    :param token: token to look for
+    """
+    while True:
+        # tokens are often glued to the preceding value, e.g. <CellDim> DIM<ClipGradient> VALUEFM, hence the suffix check
+        res = collect_until_whitespace(file_desc)
+        if res == token or res[-len(token):] == token:
+            return
+        if isinstance(file_desc, io.BytesIO):
+            size = len(file_desc.getbuffer())
+        elif isinstance(file_desc, io.BufferedReader):
+            size = os.fstat(file_desc.fileno()).st_size
+        if file_desc.tell() == size:
+            raise Error('End of the file. Token {} not found. {}'.format(token, file_desc.tell()))
+
+
+def create_edge_attrs(prev_layer_id: str, next_layer_id: str) -> dict:
+    """
+    Create common edge's attributes
+    :param prev_layer_id: id of previous layer
+    :param next_layer_id: id of next layer
+    :return: dictionary with the common edge attributes
+    """
+    return {
+        'out': 0,
+        'in': 0,
+        'name': next_layer_id,
+        'fw_tensor_debug_info': [(prev_layer_id, next_layer_id)],
+        'in_attrs': ['in', 'name'],
+        'out_attrs': ['out', 'name'],
+        'data_attrs': ['fw_tensor_debug_info']
+    }
+
+
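create_edge_attrs produces the attribute dictionary every loader edge carries: 'in'/'out' are the ports, fw_tensor_debug_info keeps framework-side provenance, and the *_attrs lists mark which keys describe the consumer and producer sides. A quick standalone sketch of attaching such a dictionary in a MultiDiGraph (local copy of the helper, invented node ids):

    import networkx as nx

    def create_edge_attrs(prev_layer_id, next_layer_id):
        return {
            'out': 0, 'in': 0, 'name': next_layer_id,
            'fw_tensor_debug_info': [(prev_layer_id, next_layer_id)],
            'in_attrs': ['in', 'name'], 'out_attrs': ['out', 'name'],
            'data_attrs': ['fw_tensor_debug_info'],
        }

    graph = nx.MultiDiGraph()
    graph.add_node('Input', kind='op', op='Input')
    graph.add_node('affinetransform_0', kind='op', op='affinetransform')
    graph.add_edge('Input', 'affinetransform_0',
                   **create_edge_attrs('Input', 'affinetransform_0'))
    print(graph.get_edge_data('Input', 'affinetransform_0')[0]['fw_tensor_debug_info'])
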
+def read_blob(file_desc: io.BufferedReader, size: int, dtype=np.float32):
+    """
+    Read blob from the file
+    :param file_desc: file descriptor
+    :param size: size of the blob
+    :param dtype: type of values of the blob
+    :return: np array with the blob contents
+    """
+    dsizes = {
+        np.float32: 4,
+        np.int32: 4
+    }
+    data = file_desc.read(size * dsizes[dtype])
+    return np.fromstring(data, dtype=dtype)
index 1dfb545..237ee91 100644 (file)
@@ -1,5 +1,5 @@
 """
- Copyright (c) 2017-2018 Intel Corporation
+ Copyright (c) 2018 Intel Corporation
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
index ef251fe..f29a643 100644 (file)
  See the License for the specific language governing permissions and
  limitations under the License.
 """
+import io
+import numpy as np
+import os
+import logging as log
 
-import struct
+from mo.front.kaldi.loader.utils import read_placeholder, read_binary_integer32_token, read_blob, read_token_value, find_next_tag
+from mo.utils.error import Error
 
 
-def get_uint16(s):
-    return struct.unpack('H', s)[0]
+def read_binary_matrix(file_desc: io.BufferedReader, read_token: bool = True):
+    if read_token:
+        read_placeholder(file_desc)
+    rows_number = read_binary_integer32_token(file_desc)
+    cols_number = read_binary_integer32_token(file_desc)
+    # the matrix body is raw row-major float32 data
+    return read_blob(file_desc, rows_number * cols_number), (rows_number, cols_number)
 
 
-def get_uint32(s):
-    return struct.unpack('I', s)[0]
+def read_binary_vector(file_desc: io.BufferedReader, read_token: bool = True, dtype=np.float32):
+    if read_token:
+        read_placeholder(file_desc)
+    elements_number = read_binary_integer32_token(file_desc)
+    return read_blob(file_desc, elements_number, dtype)
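A Kaldi FM matrix is framed as two size-prefixed int32 tokens (rows, then cols) followed by the raw row-major float32 body, which is what read_binary_matrix unpacks; read_binary_vector does the same with a single element count. A round-trip sketch of the matrix framing (the placeholder bytes before the tokens are omitted, i.e. the read_token=False path):

    import io
    import struct
    import numpy as np

    def int32_token(v: int) -> bytes:
        return struct.pack('B', 4) + struct.pack('I', v)

    rows, cols = 2, 3
    body = np.arange(rows * cols, dtype=np.float32)
    blob = int32_token(rows) + int32_token(cols) + body.tobytes()

    buf = io.BytesIO(blob)
    r = struct.unpack('I', buf.read(5)[1:])[0]   # skip the width byte
    c = struct.unpack('I', buf.read(5)[1:])[0]
    matrix = np.frombuffer(buf.read(4 * r * c), dtype=np.float32).reshape(r, c)
    print(matrix)
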
 
 
-class KaldiNode:
-    def __init__(self, name):
-        self.name = name
-        self.blobs = [None, None]
+def read_learning_info(pb: io.BufferedReader):
+    while True:
+        read_placeholder(pb, 1)
+        first_char = pb.read(1)
+        pb.seek(-2, os.SEEK_CUR)
+        position = pb.tell()
+        if first_char == b'L':
+            cur_pos = pb.tell()
+            token = find_next_tag(pb)
+            pb.seek(cur_pos)
+            if token in ['<LearnRateCoef>', '<LearningRate>']:
+                token = bytes(token, 'ascii')
+            else:
+                log.debug('Unexpected tag: {}'.format(token))
+                break
+        elif first_char == b'B':
+            token = b'<BiasLearnRateCoef>'
+        elif first_char == b'M':
+            token = b'<MaxNorm>'
+        elif first_char == b'!':  # token = b'<EndOfComponent>'
+            break
+        else:
+            break
+        try:
+            read_token_value(pb, token)
+        except Error:
+            pb.seek(position)
+            break
 
-    def set_weight(self, w):
-        self.blobs[0] = w
-
-    def set_bias(self, b):
-        self.blobs[1] = b
-
-    def set_attrs(self, attrs: dict):
-        for k, v in attrs.items():
-            setattr(self, k, v)
index e9783de..ad613f8 100644 (file)
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-from mo.front.common.partial_infer.elemental import copy_shape_infer
+
 from mo.front.mxnet.extractors.batchnorm import batch_norm_ext
 from mo.front.mxnet.extractors.concat import concat_ext
 from mo.front.mxnet.extractors.crop import crop_ext
 from mo.front.mxnet.extractors.eltwise import eltwise_ext
-from mo.front.mxnet.extractors.flatten import flatten_ext
 from mo.front.mxnet.extractors.fully_connected import fully_connected_ext
 from mo.front.mxnet.extractors.l2_normalization import l2_normalization_ext
 from mo.front.mxnet.extractors.lrn import lrn_ext
 from mo.front.mxnet.extractors.multibox_detection import multi_box_detection_ext
 from mo.front.mxnet.extractors.multibox_prior import multi_box_prior_ext
 from mo.front.mxnet.extractors.null import null_ext
-from mo.front.mxnet.extractors.reshape import reshape_ext
 from mo.front.mxnet.extractors.scaleshift import scale_shift_ext
+from mo.front.mxnet.extractors.slice_axis import slice_axis_ext
 from mo.front.mxnet.extractors.transpose import transpose_ext
-from mo.front.mxnet.extractors.up_sampling import up_sampling_ext
 from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
-from mo.front.mxnet.extractors.slice_axis import slice_axis_ext
-from mo.utils.error import Error
 from mo.graph.graph import Node
+from mo.utils.error import Error
 from mo.utils.utils import refer_to_faq_msg
 
+
 def extractor_wrapper(mxnet_extractor):
     return lambda node: mxnet_extractor(get_mxnet_layer_attrs(node.symbol_dict))
 
@@ -49,15 +47,10 @@ mxnet_op_extractors = {
     'elemwise_add': extractor_wrapper(lambda attrs: eltwise_ext(attrs, infer=lambda a, b: a + b, op_type="sum")),
     'elemwise_mul': extractor_wrapper(lambda attrs: eltwise_ext(attrs, infer=lambda a, b: a * b, op_type="mul")),
     '_Plus': extractor_wrapper(lambda attrs: eltwise_ext(attrs, infer=lambda a, b: a + b, op_type="sum")),
-    'Flatten': extractor_wrapper(flatten_ext),
     'FullyConnected': extractor_wrapper(fully_connected_ext),
-    'Reshape': extractor_wrapper(reshape_ext),
-    'UpSampling': extractor_wrapper(up_sampling_ext),
     'transpose': extractor_wrapper(transpose_ext),
     'LRN': extractor_wrapper(lrn_ext),
     'L2Normalization': extractor_wrapper(l2_normalization_ext),
-    'Dropout': extractor_wrapper(lambda _: dict(infer=copy_shape_infer)),
-    '_copy': extractor_wrapper(lambda _: dict(infer=copy_shape_infer)),
     '_contrib_MultiBoxPrior': extractor_wrapper(multi_box_prior_ext),
     '_contrib_MultiBoxDetection': extractor_wrapper(multi_box_detection_ext),
     'broadcast_add': extractor_wrapper(lambda attrs: eltwise_ext(attrs, infer=lambda a, b: a + b, op_type="sum")),
@@ -84,6 +77,11 @@ def mxnet_op_extractor(node: Node):
             refer_to_faq_msg(86),
             op)
     result_attr = mxnet_op_extractors[op](node)
+
+    if result_attr is None:
+        raise Error('Model Optimizer does not support layer "{}". Please implement an extension. '.format(node.name) +
+                    refer_to_faq_msg(45))
+
     result.update(result_attr)
     supported = bool(result_attr)
     return supported, result
diff --git a/model-optimizer/mo/front/mxnet/extractors/reshape.py b/model-optimizer/mo/front/mxnet/extractors/reshape.py
deleted file mode 100644 (file)
index 6c5b65b..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-"""
- Copyright (c) 2018 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-
-import numpy as np
-
-from mo.front.common.partial_infer.elemental import single_output_infer
-from mo.front.common.partial_infer.reshape import tf_reshape_shape_infer
-
-
-def reshape_ext(attr):
-    dim = attr.tuple("shape", int, None)
-    node_attrs = {
-        'type': 'Reshape',
-        'axis': 0,
-        'num_axes': -1,
-        'dim': np.array(dim),
-        'infer': lambda node: single_output_infer(node, tf_reshape_shape_infer)
-    }
-    return node_attrs
diff --git a/model-optimizer/mo/front/mxnet/extractors/up_sampling.py b/model-optimizer/mo/front/mxnet/extractors/up_sampling.py
deleted file mode 100644 (file)
index 5804e19..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-"""
- Copyright (c) 2018 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-
-from mo.front.common.partial_infer.up_sampling import up_sampling_infer
-
-
-def up_sampling_ext(attrs):
-    node_attrs = {
-        'type': 'Resample',
-        'scale': attrs.int("scale", 1),
-        'sample_type': 'caffe.ResampleParameter.NEAREST',
-        'antialias': 0,
-        'infer': up_sampling_infer
-    }
-    return node_attrs
index e77306b..8c8d23d 100644 (file)
@@ -69,6 +69,8 @@ class AttrDictionary(object):
         if attr is None:
             return default
         if isinstance(attr, str):
+            if '(' not in attr and ')' not in attr and '[' not in attr and ']' not in attr:
+                return (valtype(attr),)
             if (not attr) or (not attr[1:-1].split(',')[0]):
                 return tuple([valtype(x) for x in default])
             return StrTo.tuple(valtype, attr)
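The new branch in the tuple parser covers MXNet attributes that arrive as a bare scalar string ('2') rather than a bracketed tuple ('(2, 2)'): with no brackets present it simply wraps the converted value into a one-element tuple. A standalone sketch of the two paths (simplified from the method; default handling omitted):

    def parse_tuple(valtype, attr: str):
        if '(' not in attr and ')' not in attr and '[' not in attr and ']' not in attr:
            return (valtype(attr),)                       # bare scalar -> 1-tuple
        return tuple(valtype(x) for x in attr[1:-1].split(','))

    print(parse_tuple(int, '2'))       # (2,)
    print(parse_tuple(int, '(2, 2)'))  # (2, 2)
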
@@ -104,9 +106,10 @@ def get_mxnet_node_edges(node: dict, node_id: [int, str], nodes_list: list, inde
     edge_list = []
     for in_port, src_node_id in enumerate(node['inputs']):
         src_node = src_node_id[0]
+        src_out_port = src_node_id[1]  # output port of the source node
         edge_attrs = {
             'in': in_port,
-            'out': 0,  # TODO Check if src_node_id[1] should be here (already used as fw_tensor_debug_info)
+            'out': src_out_port,
             # debug anchor for name of tensor consumed at this input port
             'fw_tensor_debug_info': [(nodes_list[src_node]['name'], src_node_id[1])],
             'in_attrs': ['in'],
@@ -152,6 +155,9 @@ def load_params(input_model, data_names = ('data',)):
             elif len(keys)>1 and 'arg' == keys[0]:
                 arg_keys.append(keys[1])
                 arg_params[keys[1]] = loaded_weight[key]
+            else:
+                arg_keys.append(key)
+                arg_params[key] = loaded_weight[key]
     elif file_format == 'nd':
         for key in loaded_weight:
             if 'auxs' in input_model:
index e4f6318..76a666c 100644 (file)
  limitations under the License.
 """
 
-import logging as log
-
 import numpy as np
 
+from mo.front.onnx.extractors.concat import concat_ext
 from mo.front.onnx.extractors.const import onnx_const_ext
 from mo.front.onnx.extractors.constant import onnx_constant_ext
+from mo.front.onnx.extractors.dropout import dropout_ext
 from mo.front.onnx.extractors.eltwise import make_tf_eltwise
 from mo.front.onnx.extractors.fused_bn import tf_fused_bn_extractor
 from mo.front.onnx.extractors.matmul import onnx_gemm_ext
 from mo.front.onnx.extractors.placeholder import onnx_placeholder_ext
-from mo.front.onnx.extractors.concat import concat_ext
-from mo.front.onnx.extractors.dropout import dropout_ext
 from mo.front.onnx.extractors.reshape import onnx_reshape_ext
-from mo.front.tf.extractors.softmax import tf_softmax_ext
 from mo.graph.graph import Node
 
 
@@ -48,14 +45,14 @@ onnx_op_extractors = {
         make_tf_eltwise(lambda a, b: a + b, attrs={'type': 'Eltwise', 'operation': 'sum', 'can_be_bias': True})),
     'Relu': node_pb_arg(make_tf_eltwise(lambda v: np.maximum(0, v), attrs={'type': 'ReLU'})),  # 0 is an integer
     'Reshape': onnx_reshape_ext,
-    'Softmax': node_pb_arg(tf_softmax_ext),
 }
 
 
 def common_onnx_fields(node: Node):
     return {
         'kind': 'op',
-        'name': node.id,  # no reliable name for an onnx node, name can be empty, so we use that surrogate built as ID in the loaader
+        'name': node.id,
+        # no reliable name for an ONNX node; the name can be empty, so we use this surrogate built as the ID in the loader
         'op': node.op if node.has_valid('op') else node.pb.op_type,
         'precision': 'FP32'  # TODO use real precision derived from the model
     }
index f798f15..8c35bc3 100644 (file)
@@ -36,6 +36,8 @@ def change_placeholders_types_to_FP32(graph: nx.MultiDiGraph):
             if all([is_node_casts_to_float(op) and len(op.in_nodes()) == 1 for op in next_ops]):
                 change_node_type(node, tf_types.DT_FLOAT)
                 remove_node_preserving_edges(node, next_ops)  # remove 'Cast' nodes
+            elif all([is_node_gather(op) for op in next_ops]):
+                change_node_type(node, tf_types.DT_FLOAT)
             else:
                 raise Error(
                     ('Cannot convert type of placeholder "{}" because not all of its outputs are "Cast" to float '
@@ -52,6 +54,11 @@ def is_node_casts_to_float(node: Node):
     return 'pb' in attrs and attrs['pb'].op == 'Cast' and attrs['pb'].attr['DstT'].type == tf_types.DT_FLOAT
 
 
+def is_node_gather(node: Node):
+    attrs = node.graph.node[node.id]
+    return 'pb' in attrs and attrs['pb'].op == 'GatherV2' and attrs['precision'] == 'FP32'
+
+
 def change_node_type(node: Node, new_type: type):
     node.graph.node[node.id]['pb'].attr['dtype'].type = new_type
 
index a653a25..d7af0d5 100644 (file)
@@ -34,7 +34,6 @@ from mo.front.tf.extractors.random_uniform import tf_random_uniform_ext
 from mo.front.tf.extractors.range import tf_range_ext
 from mo.front.tf.extractors.reshape import tf_reshape_ext
 from mo.front.tf.extractors.shape import tf_shape_ext
-from mo.front.tf.extractors.softmax import tf_softmax_ext
 from mo.front.tf.extractors.space_to_batch import tf_space_to_batch_ext, tf_batch_to_space_ext
 from mo.front.tf.extractors.split import tf_split_ext
 from mo.front.tf.extractors.squeeze import tf_squeeze_ext
@@ -95,7 +94,7 @@ tf_op_extractors = {
     'Prod': node_pb_arg(tf_reduce_prod_ext),
     'Const': node_pb_arg(tf_const_ext),
     'Placeholder': node_pb_arg(tf_placeholder_ext),
-    'Identity': node_pb_arg(make_tf_eltwise(lambda v: v)),
+    'Identity': node_pb_arg(make_tf_eltwise(lambda v: v, attrs={'identity': True})),
     'Add': node_pb_arg(
         make_tf_eltwise(lambda a, b: a + b, attrs={'type': 'Eltwise', 'operation': 'sum', 'can_be_bias': True})),
     'Mul': node_pb_arg(make_tf_eltwise(lambda a, b: a * b, attrs={'type': 'Eltwise', 'operation': 'mul'})),
@@ -111,18 +110,16 @@ tf_op_extractors = {
     'Reshape': node_pb_arg(tf_reshape_ext),
     'Squeeze': node_pb_arg(tf_squeeze_ext),
     'Shape': node_pb_arg(tf_shape_ext),
-    'Softmax': node_pb_arg(tf_softmax_ext),
     'SpaceToBatchND': node_pb_arg(tf_space_to_batch_ext),
     'BatchToSpaceND': node_pb_arg(tf_batch_to_space_ext),
-    'StopGradient': node_pb_arg(make_tf_eltwise(lambda v: v)),
     'Square': node_pb_arg(make_tf_eltwise(lambda a: a * a)),
     'Minimum': node_pb_arg(make_tf_eltwise(lambda a, b: np.minimum(a, b))),  # can use clamp if one argument is const
     'Maximum': node_pb_arg(make_tf_eltwise(lambda a, b: np.maximum(a, b), attrs={'type': 'Eltwise',
                                                                                  'operation': 'max'})),
     'Sum': node_pb_arg(tf_sum_ext),
     'Range': node_pb_arg(tf_range_ext),
-    'ReadVariableOp': node_pb_arg(make_tf_eltwise(lambda v: v, attrs={'op': 'Identity'})),
-    'PlaceholderWithDefault': node_pb_arg(make_tf_eltwise(lambda v: v, attrs={'op': 'Identity'}))
+    'ReadVariableOp': node_pb_arg(make_tf_eltwise(lambda v: v, attrs={'identity': True})),
+    'PlaceholderWithDefault': node_pb_arg(make_tf_eltwise(lambda v: v, attrs={'identity': True}))
 }
 
 
index a60d3e8..5b736df 100644 (file)
@@ -80,7 +80,7 @@ def tf_tensor_content(tf_dtype, shape, pb_tensor):
         # return np.array(type_helper[1](pb_tensor), dtype=type_helper[0])
     else:
         if pb_tensor.tensor_content:
-            flat = np.frombuffer(pb_tensor.tensor_content, type_helper[0])
+            flat = np.array(np.frombuffer(pb_tensor.tensor_content, type_helper[0]))
             if len(flat) == shape.prod():
                 return flat.reshape(shape)
             else:
index 67335a2..2a8454c 100644 (file)
@@ -133,7 +133,7 @@ def add_activation_function_after_node(graph: nx.MultiDiGraph, node: Node, activ
     """
     if activation_function == 'SOFTMAX':
         # softmax to be applied to the confidence
-        softmax_conf_op = Softmax(graph, dict(axis=1, nchw_layout=True))
+        softmax_conf_op = Softmax(graph, dict(axis=-1, nchw_layout=True))
         activation_node = softmax_conf_op.create_node([node], dict(name=node.name + '/softmax'))
     elif activation_function == 'SIGMOID':
         # sigmoid activation function to be applied to the confidence
index b8d8ca1..8310e0a 100644 (file)
  limitations under the License.
 """
 
+import logging as log
 import os
 import re
 
+import networkx as nx
+
 from mo.utils.error import Error, FrameworkError
 from mo.utils.utils import refer_to_faq_msg
 
@@ -31,6 +34,55 @@ from mo.graph.graph import create_graph_with_nodes
 from mo.utils.summarize_graph import summarize_graph
 
 
+def freeze_checkpoints(graph_def: tf.GraphDef, checkpoint_dir: str, output_node_names: list):
+    """
+    Loads all the variables in a graph and stores them in a separate dictionary. Freezes output nodes in the graph
+    :param graph_def: GraphDef object holding the network.
+    :param checkpoint_dir: path to directory with checkpoint files with values of graph variables.
+    :param output_node_names: list of output node names.
+    :return: GraphDef with the frozen graph and a dictionary with the values of the graph variables.
+    """
+    log.debug("Loading checkpoint files from directory: {}".format(checkpoint_dir))
+    checkpoint_files = []
+    for checkpoint_name in sorted(os.listdir(checkpoint_dir)):
+        checkpoint_path = os.path.join(checkpoint_dir, checkpoint_name)
+        if os.path.isfile(checkpoint_path):
+            checkpoint_files.append(checkpoint_path)
+            log.debug("File {} will be loaded".format(checkpoint_path))
+        else:
+            log.debug("Path {} is not a file. Skipping")
+
+    if len(checkpoint_files) == 0:
+        raise Error("There are no checkpoint files in directory: {}".format(checkpoint_dir))
+
+    tf.import_graph_def(graph_def, name='')
+
+    with tf.Session() as sess:
+        uninitialized_variables = [str(v, 'utf-8') for v in set(sess.run(tf.report_uninitialized_variables()))]
+        all_variables = [n.name for n in sess.graph.as_graph_def().node if n.op in ['Variable', 'VariableV2']]
+        white_list = [v for v in all_variables if v not in uninitialized_variables]
+        black_list = [v for v in all_variables if v in uninitialized_variables]
+        output_graph_def = tf.graph_util.convert_variables_to_constants(sess, graph_def, output_node_names,
+                                                                        variable_names_whitelist=white_list,
+                                                                        variable_names_blacklist=black_list)
+    variable_values = {}
+    for checkpoint_file in checkpoint_files:
+        log.debug("Loading {}".format(checkpoint_file))
+        with tf.Session() as sess:
+            var_list = {}
+            var_to_shape_map = tf.pywrap_tensorflow.NewCheckpointReader(checkpoint_file).get_variable_to_shape_map()
+            for key in var_to_shape_map:
+                try:
+                    tensor = sess.graph.get_operation_by_name(key).outputs[0]
+                except KeyError:
+                    continue
+                var_list[key] = tensor
+            tf.train.Saver(var_list=var_list).restore(sess, checkpoint_file)
+            for name, tensor in var_list.items():
+                variable_values[name] = sess.run(tensor)
+    return output_graph_def, variable_values
+
+
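freeze_checkpoints splits the graph variables into a whitelist (already initialized in the session, safe to freeze immediately) and a blacklist (uninitialized, to be filled in later from the checkpoint files). The partition itself is plain Python; a sketch with invented variable names:

    all_variables = ['conv1/weights', 'conv1/bias', 'fc/weights']
    uninitialized = {'fc/weights'}   # as reported by tf.report_uninitialized_variables()

    white_list = [v for v in all_variables if v not in uninitialized]  # freeze now
    black_list = [v for v in all_variables if v in uninitialized]      # restore from ckpt

    print(white_list)  # ['conv1/weights', 'conv1/bias']
    print(black_list)  # ['fc/weights']
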
 def freeze_checkpoint(graph_def, checkpoint, output_node_names):
     """
     Replaces all the variables in a graph with constants of the same values.
@@ -40,6 +92,7 @@ def freeze_checkpoint(graph_def, checkpoint, output_node_names):
     :return: GraphDef containing a simplified version of the original.
     """
     tf.import_graph_def(graph_def, name="")
+
     with tf.Session() as sess:
         var_list = {}
         var_to_shape_map = tf.pywrap_tensorflow.NewCheckpointReader(checkpoint).get_variable_to_shape_map()
@@ -54,7 +107,8 @@ def freeze_checkpoint(graph_def, checkpoint, output_node_names):
     return output_graph_def
 
 
-def read_file_to_graph_def(graph_def: [tf.GraphDef, tf.MetaGraphDef], graph_file_name: str = "", is_binary: bool = True):
+def read_file_to_graph_def(graph_def: [tf.GraphDef, tf.MetaGraphDef], graph_file_name: str = "",
+                           is_binary: bool = True):
     """
     Reads file to protobuf
     :param graph_def: GraphDef or MetaGraphDef object to store the network
@@ -141,16 +195,22 @@ def load_tf_graph_def(graph_file_name: str = "", is_binary: bool = True, checkpo
               '--input_checkpoint "path/to/*.ckpt"'
               '\n\n2. For "*.meta" file:'
               '\npython3 mo_tf.py --input_meta_graph "path/to/*.meta"')
-
+    variables_values = {}
     try:
         if graph_file_name and not meta_graph_file and not checkpoint:
             # frozen graph
-            return read_file_to_graph_def(graph_def, graph_file_name, is_binary)
+            return read_file_to_graph_def(graph_def, graph_file_name, is_binary), variables_values
         if graph_file_name and not meta_graph_file and checkpoint:
             # inference graph and checkpoint
             graph_def = read_file_to_graph_def(graph_def, graph_file_name, is_binary)
             outputs = get_output_node_names_list(graph_def, user_output_node_names_list)
-            return freeze_checkpoint(graph_def=graph_def, checkpoint=checkpoint, output_node_names=outputs)
+            if os.path.isfile(checkpoint):
+                graph_def = freeze_checkpoint(graph_def=graph_def, checkpoint=checkpoint, output_node_names=outputs)
+            elif os.path.isdir(checkpoint):
+                graph_def, variables_values = freeze_checkpoints(graph_def=graph_def, checkpoint_dir=checkpoint,
+                                                                 output_node_names=outputs)
+            # we are sure that the checkpoint is an existing file or directory thanks to the cli_parser checks
+            return graph_def, variables_values
         if not graph_file_name and meta_graph_file:
             meta_graph_file = deducing_metagraph_path(meta_graph_file)
             input_meta_graph_def = read_file_to_graph_def(tf.MetaGraphDef(), meta_graph_file, is_binary)
@@ -159,14 +219,16 @@ def load_tf_graph_def(graph_file_name: str = "", is_binary: bool = True, checkpo
                 restorer = tf.train.import_meta_graph(input_meta_graph_def)
                 restorer.restore(sess, re.sub('\.meta$', '', meta_graph_file))
                 outputs = get_output_node_names_list(input_meta_graph_def.graph_def, user_output_node_names_list)
-                return tf.graph_util.convert_variables_to_constants(sess, input_meta_graph_def.graph_def, outputs)
+                graph_def = tf.graph_util.convert_variables_to_constants(sess, input_meta_graph_def.graph_def, outputs)
+                return graph_def, variables_values
         if model_dir:
             # saved model directory
             tags = saved_model_tags if saved_model_tags is not None else [tf.saved_model.tag_constants.SERVING]
             with tf.Session() as sess:
                 meta_graph_def = tf.saved_model.loader.load(sess, tags, model_dir)
                 outputs = get_output_node_names_list(meta_graph_def.graph_def, user_output_node_names_list)
-                return tf.graph_util.convert_variables_to_constants(sess, meta_graph_def.graph_def, outputs)
+                graph_def = tf.graph_util.convert_variables_to_constants(sess, meta_graph_def.graph_def, outputs)
+                return graph_def, variables_values
     except Exception as e:
         raise FrameworkError('Cannot load input model: {}', e) from e
     raise Error("Unknown configuration of input model parameters")
@@ -194,3 +256,24 @@ def protobuf2nx(pb: tf.GraphDef):
                     index = index + 1
 
     return graph
+
+
+def variables_to_constants(graph: nx.MultiDiGraph, variables_values: dict):
+    """
+    Converts `Variable<V2>` operations to FakeConst operations with `value` from `variables_values` dictionary
+    :param graph: graph to operate on
+    :param variables_values: dictionary with variable names as keys and np.array data as values
+    """
+    variable_operations = ['Variable', 'VariableV2']
+    for node_name in graph.nodes():
+        node_attr_dict = graph.node[node_name]
+        if 'op' not in node_attr_dict:
+            continue
+        op_name = node_attr_dict['op']
+        if op_name not in variable_operations:
+            continue
+        if node_name not in variables_values:
+            log.debug("There is no value for '{}': {} in checkpoint variable values".format(op_name, node_name))
+            continue
+        graph.node[node_name]['op'] = 'FakeConst'
+        graph.node[node_name]['value'] = variables_values[node_name]
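variables_to_constants is a plain graph rewrite: any node whose op is Variable/VariableV2 and whose name has an entry in the checkpoint dictionary becomes a FakeConst carrying that value. A toy run on a two-node graph (names and values invented; uses the same networkx-2.x graph.node indexing as the surrounding code):

    import networkx as nx
    import numpy as np

    graph = nx.MultiDiGraph()
    graph.add_node('w', op='VariableV2')
    graph.add_node('relu', op='Relu')
    variables_values = {'w': np.ones((3, 3), dtype=np.float32)}

    for node_name in graph.nodes():
        attrs = graph.node[node_name]
        if attrs.get('op') not in ('Variable', 'VariableV2'):
            continue
        if node_name not in variables_values:
            continue                      # no checkpoint value, leave as-is
        attrs['op'] = 'FakeConst'
        attrs['value'] = variables_values[node_name]

    print(graph.node['w']['op'])          # FakeConst
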
index f1ef031..e44b108 100644 (file)
@@ -15,8 +15,8 @@
 """
 
 import collections
-from copy import deepcopy
 import logging as log
+from copy import deepcopy
 
 import networkx as nx
 import numpy as np
@@ -229,9 +229,23 @@ def get_graph_ops(graph: nx.MultiDiGraph):
     return [Node(graph, node) for node in graph.nodes() if Node(graph, node).soft_get('kind') == 'op']
 
 
+def dict_includes_compare_attrs(attr, attr_probe):
+    if callable(attr_probe) and not isinstance(attr_probe, type):
+        return attr_probe(attr)
+    else:
+        return attr == attr_probe
+
 def dict_includes(big: dict, sub_dict: dict):
-    ''' Searches attributes from sub_dict in big and ensures that all values match. '''
-    return all(big.get(attr, None) == sub_dict.get(attr, None) for attr in sub_dict.keys())
+    ''' Searches attributes from sub_dict in big and ensures that all values match.
+
+        Entries in sub_dict can be of two types: callable or not callable. If callable is specified
+        it is treated as probing function for attribute value from big dictionary by callable(attr) expression.
+        If it is not callable, the values are compared with == operator.
+    '''
+    return all(
+        dict_includes_compare_attrs(big.get(attr, None), sub_dict[attr])
+        for attr in sub_dict.keys()
+    )
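With callables allowed as probe values, dict_includes turns into a small predicate matcher: a callable probe is applied to the attribute, everything else is compared with ==; the isinstance(attr_probe, type) guard keeps class objects (which are callable too) on the == path. A demo with a standalone copy of the two helpers:

    def dict_includes_compare_attrs(attr, attr_probe):
        if callable(attr_probe) and not isinstance(attr_probe, type):
            return attr_probe(attr)
        return attr == attr_probe

    def dict_includes(big: dict, sub_dict: dict):
        return all(dict_includes_compare_attrs(big.get(attr, None), sub_dict[attr])
                   for attr in sub_dict.keys())

    node_attrs = {'op': 'Conv2D', 'bias_term': True}
    print(dict_includes(node_attrs, {'op': lambda op: op in ('Conv2D', 'Conv3D')}))  # True
    print(dict_includes(node_attrs, {'bias_term': False}))                           # False
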
 
 
 class NodeWrap:
@@ -330,11 +344,11 @@ class NodeWrap:
         return self[k] if self.has_valid(k) else '<UNKNOWN>'
 
     def edges(self, attrs: dict=None):
-        ''' Get a single edge with specified set of attributes.
+        ''' Get a list of all edges with specified set of attributes.
 
-            If none or multiple edges satisfies this criteria, exception is raised
             Edge is represented as tuple (u, v, d), where u is source node,
-            v is destination node and d is edge attributes.
+            v is destination node and d is edge attributes. The function
+            returns a list of such tuples.
         '''
         edges = list(self.graph.in_edges([self.id], data=True)) + list(self.graph.out_edges([self.id], data=True))
         return [(u, v, d) for u,v,d in edges if dict_includes(d, attrs)]
@@ -350,6 +364,112 @@ class NodeWrap:
         assert len(edges) == 1, 'edges: {}, required attributes: {}'.format(edges, attrs)
         return edges[0]
 
+    def insert_node_with_data_before(self, inp, new_op_class: callable, op_before_params: dict = None,
+                                     infer_current: bool = False):
+        """
+        Inserts operation node with op_before_params and data node before current operation
+
+        :param inp: input data node of current node
+        :param new_op_class: class of operation that will be inserted before current operation node
+        :param op_before_params: parameters to be added to operation that will be inserted before current operation
+        :param infer_current: whether to re-run shape inference for the current node after the insertion
+
+        Before calling:
+        [...] -> inp -> Cur_Op -> Cur_Data -> [...]
+
+        After calling:
+        [...] -> inp -> New_Op_bef -> New_Data_bef -> Cur_Op -> Cur_Data -> [...]
+                    [op_before_params]
+        """
+        graph = self.graph
+        node = Node(graph, self.node)
+        cls_name = new_op_class.op
+        op_before_params = {} if op_before_params is None else op_before_params
+
+        # operating with input
+        new_op_before = new_op_class(graph, op_before_params)
+        edge_attrs = deepcopy(graph.get_edge_data(inp.id, node.id)[0])
+        graph.remove_edge(inp.id, node.id)
+        new_inp = new_op_before.create_node_with_data([inp], {'name': node.name + cls_name + '/Before'})
+        graph.add_edge(new_inp.id, node.id, **edge_attrs)
+        if infer_current:
+            node.infer(node)
+
+    def insert_node_with_data_after(self, out, new_op_class: callable, op_after_params: dict = None):
+        """
+        Inserts operation node with op_after_params and data node after current operation
+
+        :param out: output data node of current node
+        :param new_op_class: class of operation that will be inserted after current operation node
+        :param op_after_params: parameters to be added to operation that will be inserted after current operation
+
+        Before calling:
+        [...] -> Cur_Op -> Cur_Data -> [...]
+
+        After calling:
+        [...] -> Cur_Op -> Cur_Data -> New_Op_aft -> New_Data_aft(==out) -> [...]
+                                   [op_after_params]
+        """
+        # we import it here because Op imports Node and unique_id from this file
+        from mo.ops.op import Op
+
+        graph = self.graph
+        node = Node(graph, self.node)
+        cls_name = new_op_class.op
+        op_after_params = {} if op_after_params is None else op_after_params
+
+        new_op_after = new_op_class(graph, op_after_params)
+        graph.remove_edge(node.id, out.id)
+        new_out = Op.create_data_node(graph, node)
+        node.infer(node)
+        new_op_after.create_node_with_data([new_out], {'name': node.name + cls_name + '/After'}, data_nodes=out)
+
+    def bracket_with_different_nodes_with_data(self, inp, out, new_op_class_before: callable,
+                                               new_op_class_after: callable,
+                                               op_before_params: dict = None, op_after_params: dict = None):
+        """
+        Inserts one operation node with op_before_params and data node before current operation node and
+        inserts one operation node with op_after_params and data node after current operation node
+        :param inp: input data node of self.node node
+        :param out: output data node of self.node node
+        :param new_op_class_before: class of operation that will be inserted before current operation node
+        :param new_op_class_after: class of operation that will be inserted after current operation node
+        :param op_before_params: parameters to be added to operation that will be inserted before current operation
+        :param op_after_params: parameters to be added to operation that will be inserted after current operation
+
+        Before calling:
+        [...] -> inp -> Cur_Op -> out -> [...]
+
+        After calling:
+        [...] -> inp -> New_Op_bef -> New_Data_bef -> Cur_Op -> Cur_Data -> New_Op_aft -> New_Data_aft(==out) -> [...]
+                    [op_before_params]                                  [op_after_params]
+        """
+        op_before_params = {} if op_before_params is None else op_before_params
+        op_after_params = {} if op_after_params is None else op_after_params
+        self.insert_node_with_data_before(inp, new_op_class_before, op_before_params)
+        self.insert_node_with_data_after(out, new_op_class_after, op_after_params)
+
+    def bracket_op_with_another_op(self, inp, out, new_op_class: callable,
+                                   op_before_params: dict = None, op_after_params: dict = None):
+        """
+        Wraps the current operation with two identical operations of class new_op_class:
+        :param inp: input data node of self.node node
+        :param out: output data node of self.node node
+        :param new_op_class: class of operation with which current operation will be covered
+        :param op_before_params: parameters to be added to operation that will be inserted before current operation
+        :param op_after_params: parameters to be added to operation that will be inserted after current operation
+
+        Before calling:
+        [...] -> inp -> Cur_Op -> out -> [...]
+
+        After calling:
+        [...] -> inp -> New_Op_bef -> New_Data_bef -> Cur_Op -> Cur_Data -> New_Op_aft -> New_Data_aft(==out) -> [...]
+                    [op_before_params]                                  [op_after_params]
+        """
+        self.bracket_with_different_nodes_with_data(inp=inp, out=out,
+                                                    new_op_class_before=new_op_class, new_op_class_after=new_op_class,
+                                                    op_before_params=op_before_params, op_after_params=op_after_params)
+
+
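
The insert/bracket helpers added above all rely on the same edge-rewiring trick: remember the attributes of the edge being cut, then re-attach them so port numbers survive. A minimal self-contained sketch of that pattern on a plain networkx MultiDiGraph (names are hypothetical, not part of the Model Optimizer API):

```python
import networkx as nx
from copy import deepcopy

def insert_before(graph, inp, node, new_node):
    # keep the 'in'/'out' port attributes of the edge being cut
    edge_attrs = deepcopy(graph.get_edge_data(inp, node)[0])
    graph.remove_edge(inp, node)
    graph.add_edge(inp, new_node, **{'in': 0, 'out': edge_attrs['out']})
    graph.add_edge(new_node, node, **edge_attrs)

g = nx.MultiDiGraph()
g.add_edge('inp', 'cur_op', **{'in': 0, 'out': 0})
insert_before(g, 'inp', 'cur_op', 'new_op_before')
print(list(g.edges(data=True)))
# [('inp', 'new_op_before', ...), ('new_op_before', 'cur_op', ...)]
```
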
 class Node(NodeWrap):
     pass
 
@@ -419,7 +539,8 @@ def insert_node_after(node: Node, new_node: Node, node_out_port: int = 0):
 def erase_node(node: Node):
     """
     Erases node from the graph and reconnect edges from input node(s) to output node(s)
-    Produces assertion error in case of multiple inputs and outputs node at the same time
+    Produces assertion error if the node being removed has multiple inputs or outputs.
+    The function can be used in the front phase only (when there are no data nodes in the graph).
     :param node: Node to erase
     """
     graph = node.graph
@@ -428,37 +549,32 @@ def erase_node(node: Node):
     inputs = list(graph.in_edges(node_id, data=True))
     outputs = list(graph.out_edges(node_id, data=True))
 
-    assert not (len(inputs) > 1 and len(outputs) > 1)
+    assert node.kind == 'op' and (len(node.out_nodes()) == 0 or list(node.out_nodes().values())[0].kind != 'data'), \
+        "The function must be used before the partial infer when the graph doesn't contain data nodes."
+    assert len(node.out_nodes()) <= 1, "The node {} must produce just one output tensor".format(node.soft_get('name'))
+    assert len(inputs) <= 1, "The node {} must have just one input".format(node.soft_get('name'))
 
     if len(outputs) == 0 and len(inputs) != 0:
-        for input, _, attrs in inputs:
-            if Node(graph, node_id).has_and_set('is_output'):
-                if graph.node[input]['kind'] == 'op':
-                    data_nodes = [u for u, v in graph.in_edges(input)]
+        for input_node_id, _, __ in inputs:
+            if node.has_and_set('is_output'):
+                if graph.node[input_node_id]['kind'] == 'op':
+                    data_nodes = [u for u, v in graph.in_edges(input_node_id)]
                     for data in data_nodes:
                         graph.node[data]['is_output'] = graph.node[node_id]['is_output']
                 else:
-                    graph.node[input]['is_output'] = graph.node[node_id]['is_output']
+                    graph.node[input_node_id]['is_output'] = graph.node[node_id]['is_output']
 
     if len(outputs) == 0 or len(inputs) == 0:
         graph.remove_node(node_id)
         return
 
-    if len(outputs) == 1:
-        output = outputs[0][1]
-        for src, noop, attrs in inputs:
-            graph.remove_edge(src, noop)
-            graph.add_edge(src, output, **attrs)
-        graph.remove_node(node_id)
-        return
-
-    if len(inputs) == 1:
-        input = inputs[0][0]
-        for noop, dst, attrs in outputs:
-            graph.remove_edge(noop, dst)
-            graph.add_edge(input, dst, **attrs)
-        graph.remove_node(node_id)
-        return
+    input_node_id = inputs[0][0]
+    for src, dst, attrs in outputs:
+        graph.remove_edge(src, dst)
+        # update the 'out' attribute of the edge from the node being removed
+        attrs['out'] = inputs[0][2]['out']
+        graph.add_edge(input_node_id, dst, **attrs)
+    graph.remove_node(node_id)
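
A hedged toy version of the rewiring the updated erase_node now performs: the 'out' port of the single input edge is copied onto every edge that used to leave the erased node, so consumers keep seeing the producer's output port.

```python
import networkx as nx

g = nx.MultiDiGraph()
g.add_edge('producer', 'noop', **{'out': 2, 'in': 0})
g.add_edge('noop', 'consumer', **{'out': 0, 'in': 1})

inputs = list(g.in_edges('noop', data=True))
outputs = list(g.out_edges('noop', data=True))
for _, dst, attrs in outputs:
    attrs['out'] = inputs[0][2]['out']   # preserve the producer's port number
    g.add_edge(inputs[0][0], dst, **attrs)
g.remove_node('noop')                    # also drops the two old edges
print(list(g.edges(data=True)))          # [('producer', 'consumer', {'out': 2, 'in': 1})]
```
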
 
 
 def replace_node(old_node: Node, new_node: Node, new_node_out_port: int=None):
@@ -475,7 +591,8 @@ def replace_node(old_node: Node, new_node: Node, new_node_out_port: int=None):
         new_edge_attrs = deepcopy(edge_attrs)
         if new_node_out_port is not None:
             assert 'out' not in edge_attrs or edge_attrs['out'] == 0, \
-                'replace_node function can replace old node with a single output port only if new_node_out_port is specified'
+                'replace_node function can replace old node with a single output port only if new_node_out_port is ' \
+                'specified'
             new_edge_attrs.update({'out': new_node_out_port})
         graph.add_edge(new_node.id, dst_node_name, **new_edge_attrs)
 
index b0076f4..9c6654f 100644 (file)
@@ -20,17 +20,14 @@ import logging as log
 import networkx as nx
 import numpy as np
 
-from mo.front.common.layout import indices_mapping
+from mo.front.common.layout import get_batch_dim, get_features_dim
 from mo.front.common.partial_infer.utils import assign_dims_to_weights
 from mo.front.extractor import add_attrs_props
 from mo.front.extractor import update_ie_fields
 from mo.graph.graph import Node, unique_id
 from mo.middle.passes.fusing.helpers import get_value_id, get_tensor_id
-from mo.middle.passes.shape import repack_fully_connected_weights_nhwc_to_nchw
 from mo.middle.pattern_match import apply_pattern
-from mo.ops.op import Op, PermuteAttrs
-from mo.ops.permute import Permute
-from mo.utils.error import Error
+from mo.ops.op import Op
 
 
 def pad_op_transform(graph: nx.MultiDiGraph, match: dict):
@@ -38,6 +35,22 @@ def pad_op_transform(graph: nx.MultiDiGraph, match: dict):
     pad_op = match['pad_op']
     input_data = pad_op.in_node(0)
     pads = pad_op.in_node(1).value if len(pad_op.in_nodes()) == 2 else pad_op.pads
+
+    if pad_op.mode != 'constant':
+        log.info('The pad node "{}" with pad mode "{}" cannot be fused.'.format(pad_op.soft_get('name'), pad_op.mode))
+        return
+
+    if pad_op.fill_value != 0.0:  # mode is guaranteed to be 'constant' by the check above
+        log.info('The pad node "{}" with non-zero fill value cannot be fused.'.format(pad_op.soft_get('name')))
+        return
+
+    input_tensor_dims = len(match['pad_output'].shape)
+    if np.any(pads[get_features_dim(op.graph.graph['layout'], input_tensor_dims)] != 0) or \
+            np.any(pads[get_batch_dim(op.graph.graph['layout'], input_tensor_dims)] != 0):
+        log.info('The pad node "{}" with padding over feature/batch dimension cannot be fused.'.format(
+            pad_op.soft_get('name')))
+        return
+
     op.pad += pads
     op.pad_spatial_shape = op.pad[op.spatial_dims]
     op['auto_pad'] = None
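
The guards added above encode when a Pad can be folded into a following windowed op: constant mode, zero fill value, and padding over spatial dimensions only. A toy check mirroring the last condition, assuming NHWC layout where batch is axis 0 and features axis 3 (the real code resolves these via get_batch_dim/get_features_dim):

```python
import numpy as np

pads = np.array([[0, 0], [1, 1], [1, 1], [0, 0]])  # per-dim [begin, end] for N, H, W, C
batch_dim, features_dim = 0, 3                      # NHWC assumption
fusable = not (np.any(pads[batch_dim] != 0) or np.any(pads[features_dim] != 0))
print(fusable)  # True: only spatial (H, W) dimensions are padded
```
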
@@ -83,11 +96,11 @@ def matmul_to_fully_connected_action(graph: nx.MultiDiGraph, match: dict):
         len(weights_consumers) if weights_consumers is not None else None))
 
     if not (weights.value is not None and
-            input.shape is not None and
-            len(input.shape) >= 2 and
-            weights.shape is not None and
-            len(weights.shape) == 2 and
-            len(weights_consumers) >= 1):
+            input.shape is not None and
+            len(input.shape) >= 2 and
+            weights.shape is not None and
+            len(weights.shape) == 2 and
+            len(weights_consumers) >= 1):
         matmul['can_be_fused'] = False
         return
 
@@ -130,11 +143,11 @@ def gemm_to_fully_connected_action(graph: nx.MultiDiGraph, match: dict):
     C_consumers = graph.out_edges(C.node)
 
     if not (B.value is not None and
-            C.value is not None and
-            A.shape is not None and
-            C.shape.size == 1 and
-            not gemm.transpose_a and
-            (len(B_consumers) == 1 or not gemm.transpose_b)):
+            C.value is not None and
+            A.shape is not None and
+            C.shape.size == 1 and
+            not gemm.transpose_a and
+            (len(B_consumers) == 1 or not gemm.transpose_b)):
         log.warning('Cannot convert Gemm to FullyConnected')
         return
 
@@ -292,7 +305,7 @@ def batch_norm_fuse_action(graph: nx.MultiDiGraph, match: dict):
     match['kernel'].value = match['kernel'].value * match['norm'].value
     graph.remove_edge(match['conv_output'].node, match['mul'].node)
     graph.remove_edge(match['mul'].node, match['mul_output'].node)
-    # graph.remove_node(match['mul'].node)    # if we remove a node, next iteration over isomorphisms gives an error
+    # graph.remove_node(match['mul'].node)  # if we remove a node, next iteration over isomorphisms gives an error
     graph.add_edge(match['conv'].node, match['mul_output'].node, out=0)
 
 
@@ -328,7 +341,8 @@ def convert_add_to_scaleshift(graph: nx.MultiDiGraph):
                 node.in_node(value_id).value = np.squeeze(node.in_node(value_id).value)
                 node.in_node(value_id).shape = node.in_node(value_id).value.shape
 
-                # if the node was created with eltwise then it has attribute 'operation' which should be removed from the IR
+                # if the node was created with eltwise then it has attribute 'operation' which should be removed from
+                # the IR
                 if node.has('operation'):
                     del graph.node[n]['operation']
 
@@ -366,7 +380,8 @@ def convert_mul_to_scaleshift(graph: nx.MultiDiGraph):
                 node.in_node(value_id).value = np.squeeze(node.in_node(value_id).value)
                 node.in_node(value_id).shape = node.in_node(value_id).value.shape
 
-                # if the node was created with eltwise then it has attribute 'operation' which should be removed from the IR
+                # if the node was created with eltwise then it has attribute 'operation' which should be removed from
+                # the IR
                 if node.has('operation'):
                     del graph.node[n]['operation']
 
@@ -398,11 +413,8 @@ def convert_nasnet_action(graph: nx.MultiDiGraph, matches: dict):
     This function converts the NasNet-specific subgraph Pad->StridedSlice->AvgPool to Conv->Crop->AvgPool
     """
     input = matches['input']
-    output = matches['output']
 
     pad_op = matches['pad_op']
-    pad_const = matches['pad_const']
-    pad_out = matches['pad_out']
 
     sslice = matches['sslice']
     sslice_out = matches['sslice_out']
@@ -414,9 +426,7 @@ def convert_nasnet_action(graph: nx.MultiDiGraph, matches: dict):
         end.append(s.stop)
         stride.append(s.step)
 
-    avg_pool = matches['avg_pool']
-
-    if not np.array_equal(pad_const.value, np.array([[0, 0], [0, 1], [0, 1], [0, 0]])):
+    if not np.array_equal(pad_op.pads, np.array([[0, 0], [0, 1], [0, 1], [0, 0]])):
         log.error(" Pad values doesn't match!")
         return
 
@@ -453,20 +463,18 @@ def convert_nasnet_action(graph: nx.MultiDiGraph, matches: dict):
              shape=np.array(conv_weights.shape),
              data_type=input.data_type, infer=None,
              spatial_dims=np.array([0, 1]),
-             input_channel_dim=np.array(2),
-             output_channel_dim=np.array(3),
-             dims_number=np.array(4), can_be_bias=True)))
+             input_channel_dim=2,
+             output_channel_dim=3,
+             dims_number=4, can_be_bias=True)))
     graph.add_node(conv_output, **add_attrs_props(
         dict(kind='data', precision="FP32", name=conv_output, value=None, shape=output_shape,
              data_type=input.data_type)))
 
     # StridedSlice -> Crop
-    Crop = Op.get_op_class_by_name('Crop')
-    crop = Crop(graph, dict(name=sslice.name + '/Crop_', axis=np.array([1, 2]),
-                            dim=np.array([output_shape[1] - 1, output_shape[2] - 1]), offset=np.array([1, 1])))
+    crop_cls = Op.get_op_class_by_name('Crop')
+    crop = crop_cls(graph, dict(name=sslice.name + '/Crop_', axis=np.array([1, 2]),
+                                dim=np.array([output_shape[1] - 1, output_shape[2] - 1]), offset=np.array([1, 1])))
     crop.create_node_with_data([Node(graph, conv_output)], data_nodes=sslice_out)
-    # graph.add_node(crop_node, **add_attrs_props(dict(kind='op', precision="FP32", type='Crop', name=crop_node,
-    #                                                 op='Crop', axis=[1,2], dim=[output_shape[1]-1, output_shape[2]-1], offset=[1,1])))
 
     # Connect : Conv->Crop->AvgPool
     graph.add_edges_from([
@@ -482,7 +490,6 @@ def convert_nasnet(graph: nx.MultiDiGraph):
         graph,
         nodes=[
             ('input', dict(kind='data')),
-            ('pad_const', dict(kind='data')),
             ('pad_op', dict(kind='op', op='Pad')),
             ('pad_out', dict(kind='data')),
 
@@ -498,7 +505,6 @@ def convert_nasnet(graph: nx.MultiDiGraph):
         ],
         edges=[
             ('input', 'pad_op', {'in': 0}),
-            ('pad_const', 'pad_op', {'in': 1}),
             ('pad_op', 'pad_out'),
 
             ('begin', 'sslice', {'in': 1}),
@@ -594,7 +600,7 @@ def convert_multi_input_conv(graph: nx.MultiDiGraph):
                 num_inputs = len(node.in_nodes()) - 1
                 w_node = node.in_node(len(node.in_nodes()) - 1)
 
-            for i in range(0, num_inputs - 1):
+            for i in range(1, num_inputs):
                 in_i = node.in_node(i)
                 out_i = node.out_node(i)
                 conv_id = unique_id(graph, node.id + '__')
@@ -616,4 +622,4 @@ def convert_multi_input_conv(graph: nx.MultiDiGraph):
                 graph.add_edges_from([
                     (in_i.id, conv_id, {'in': 0}),
                 ])
-                graph.add_edge(conv_id, out_i.id, out=3)
+                graph.add_edge(conv_id, out_i.id, **{'out': 0})
index 04c3fbf..2878add 100644 (file)
@@ -59,13 +59,14 @@ def mark_output_reachable_nodes(graph: nx.MultiDiGraph):
     Mark whether each node is output reachable. A node is considered output reachable if it is connected to
     one of the nodes that have the attribute is_output=True.
     """
-    nx.set_node_attributes(graph, name='is_output_reachable', values=False)
+    nx.set_node_attributes(G=graph, name='is_output_reachable', values=False)
     outputs = get_nodes_with_attributes(graph, is_output=True)
     log.debug('The following nodes are seeded as output reachable:\n{}'.format('\n'.join(sorted(map(str, outputs)))))
-    nx.set_node_attributes(graph, name='is_output_reachable', values={n: True for n in outputs})
+    nx.set_node_attributes(G=graph, name='is_output_reachable', values={n: True for n in outputs})
+    visited = set()
     for output_name in outputs:
         reverse_dfs(graph, output_name,
-                    lambda graph, node_name: mark_input_nodes(graph, node_name, 'is_output_reachable', True), set())
+                    lambda graph, node_name: mark_input_nodes(graph, node_name, 'is_output_reachable', True), visited)
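
The change above threads one `visited` set through every reverse_dfs call instead of a fresh set() per output. A self-contained sketch of why that matters: nodes marked while walking from the first output are skipped when walking from the second (simplified, omitting the real mark_input_nodes callback):

```python
import networkx as nx

def reverse_dfs(graph, node, visited):
    for pred in graph.predecessors(node):
        if pred not in visited:
            visited.add(pred)
            reverse_dfs(graph, pred, visited)

g = nx.MultiDiGraph()
g.add_edges_from([('a', 'b'), ('b', 'out1'), ('b', 'out2')])
visited = set()
for out in ('out1', 'out2'):
    reverse_dfs(g, out, visited)  # the second call stops at 'b' immediately
print(sorted(visited))            # ['a', 'b']
```
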
 
 
 def mark_undead_nodes(graph: nx.MultiDiGraph, undead_types: list):
@@ -76,16 +77,16 @@ def mark_undead_nodes(graph: nx.MultiDiGraph, undead_types: list):
     :param undead_types: list of node types that should be marked as undead.
     :return: updated graph where each node has the attribute 'is_undead'.
     """
-    nx.set_node_attributes(graph, name='is_undead', values=False)
+    nx.set_node_attributes(G=graph, name='is_undead', values=False)
 
     # mark output nodes as undead
     outputs = get_nodes_with_attributes(graph, is_output=True)
-    nx.set_node_attributes(graph, name='is_undead', values={n: True for n in outputs})
+    nx.set_node_attributes(G=graph, name='is_undead', values={n: True for n in outputs})
 
     # mark specifically defined with node type set of nodes
     for type in undead_types:
         node_of_specific_type = get_nodes_with_attributes(graph, type=type)
-        nx.set_node_attributes(graph, name='is_undead', values={n: True for n in node_of_specific_type})
+        nx.set_node_attributes(G=graph, name='is_undead', values={n: True for n in node_of_specific_type})
 
     undead_nodes = get_nodes_with_attributes(graph, is_undead=True)
     # propagate 'undead' attribute to children nodes of undead nodes if the node produces constant value
@@ -98,7 +99,7 @@ def mark_undead_nodes(graph: nx.MultiDiGraph, undead_types: list):
 
     # mark input nodes as undead
     inputs = get_nodes_with_attributes(graph, is_input=True)
-    nx.set_node_attributes(graph, name='is_undead', values={n: True for n in inputs})
+    nx.set_node_attributes(G=graph, name='is_undead', values={n: True for n in inputs})
 
 
 def mark_const_producer_nodes(graph: nx.MultiDiGraph):
@@ -107,7 +108,7 @@ def mark_const_producer_nodes(graph: nx.MultiDiGraph):
     :param graph: graph to operate on.
     :return: None. The graph is updated in place.
     """
-    nx.set_node_attributes(graph, name='is_const_producer', values=True)
+    nx.set_node_attributes(G=graph, name='is_const_producer', values=True)
 
     for n in pseudo_topological_sort(graph):
         node = Node(graph, n)
@@ -142,7 +143,7 @@ def graph_clean_up_tf(graph: nx.MultiDiGraph):
 
 
 def remove_identity_action(graph: nx.MultiDiGraph, matches: dict):
-    remove_op_node(graph, matches['identity'])
+    remove_op_node_with_data_node(graph, matches['identity'])
 
 
 # TODO: unit tests
@@ -171,21 +172,22 @@ def merge_data_nodes(graph: nx.MultiDiGraph, survived: Node, removed: Node):
 
 
 # TODO: unit tests
-def remove_op_node(graph: nx.MultiDiGraph, identity: Node):
-    input = identity.in_node()
-    output = [v for _, v in graph.out_edges(identity.id)]
-    assert len(output) == 1
-    output = Node(graph, output[0])
+def remove_op_node_with_data_node(graph: nx.MultiDiGraph, node_to_remove: Node):
+    assert node_to_remove.kind == 'op'
+    input_data_node = node_to_remove.in_node()
+    output_node = [v for _, v in graph.out_edges(node_to_remove.id)]
+    assert len(output_node) == 1, "Cannot remove node producing two or more output tensors"
+    output_node = Node(graph, output_node[0])
+    assert output_node.kind == 'data', "The function must be used after partial infer"
 
-    graph.remove_edge(input.id, identity.id)
-    graph.remove_edge(identity.id, output.id)
+    graph.remove_edge(input_data_node.id, node_to_remove.id)
+    graph.remove_edge(node_to_remove.id, output_node.id)
 
-    merge_data_nodes(graph, output, input)
+    merge_data_nodes(graph, output_node, input_data_node)
 
     # we just have saved all output edges from 'input' by reconnecting them to 'output', now we can delete 'input'
-    log.debug('Removing op node: {}'.format(identity.id))
-    graph.remove_node(identity.id)
-    graph.remove_node(input.id)
+    log.debug('Removing op node: {}'.format(node_to_remove.id))
+    graph.remove_nodes_from([node_to_remove.id, input_data_node.id])
 
 
 def remove_op_nodes(graph: nx.MultiDiGraph, attrs: dict):
@@ -228,15 +230,3 @@ def remove_useless_split(graph: nx.MultiDiGraph):
         edges=[],
         action=remove_useless_split_action
     )
-
-
-def remove_node_from_graph(graph: nx.MultiDiGraph, previous_node: Node, removing_node: Node):
-    if len(removing_node.out_nodes()) > 0:
-        last_node_out = removing_node.out_node(0)
-        edge_data = graph.get_edge_data(removing_node.id, last_node_out.id)
-        out_port = edge_data[0]['out']
-        in_port = edge_data[0]['in']
-        graph.remove_edge(previous_node.id, removing_node.id)
-        graph.remove_edge(removing_node.id, last_node_out.id)
-        create_edge(previous_node, last_node_out, out_port=out_port, in_port=in_port)
-        graph.remove_node(removing_node.id)
index c51ac28..737074f 100644 (file)
@@ -24,6 +24,7 @@ from mo.middle.passes.eliminate import merge_data_nodes
 from mo.middle.pattern_match import apply_pattern
 from mo.ops.lin_op import Mul, Add
 from mo.ops.op import Op
+from mo.ops.reshape import Reshape
 
 
 def convert_batch_norm(graph: nx.MultiDiGraph):
@@ -115,7 +116,7 @@ def _fused_batch_norm_decomposition(graph: nx.MultiDiGraph, tinput: Node, toutpu
 def convert_scale_shift_to_mul_add(graph: nx.MultiDiGraph):
     nodes = [Node(graph, node) for node in graph.nodes() if Node(graph, node).soft_get('op') == 'ScaleShift']
     for node in nodes:
-        if node.soft_get('can_be_fused') == False:
+        if node.soft_get('can_be_fused') is False:
             continue
 
         has_biases = True
@@ -128,7 +129,7 @@ def convert_scale_shift_to_mul_add(graph: nx.MultiDiGraph):
         shift_node = node.in_node(2) if has_biases else None
         output_node = node.out_node()
 
-        if all([x == 1 for x in scale_node.value]):
+        if scale_node.has_valid("value") and all([x == 1 for x in scale_node.value]):
             has_weights = False
 
         mul_node = Mul(graph, dict(name=node.name + "/Mul_"))
@@ -140,7 +141,21 @@ def convert_scale_shift_to_mul_add(graph: nx.MultiDiGraph):
 
         # Expand dims for current layout
         broadcast_dims_cnt = len(input_node.shape) - 2 if graph.graph['layout'] == 'NCHW' else 0
-        Op.expand_node_shape(scale_node, broadcast_dims_cnt)
+        if scale_node.has_valid("value"):
+            Op.expand_node_shape(scale_node, broadcast_dims_cnt)
+        else:
+            # insert reshape to make shapes similar
+            reshape_dims = np.zeros(len(input_node.shape), dtype=np.int64)
+            for i in range(0, node.axis):
+                reshape_dims[i] = 1
+            for i in range(node.axis, node.axis + len(scale_node.shape)):
+                reshape_dims[i] = scale_node.shape[i-node.axis]
+            for i in range(node.axis + len(scale_node.shape), len(input_node.shape)):
+                reshape_dims[i] = 1
+            reshape = Reshape(graph, dict(name=scale_node.name+"/Broadcast_",
+                                          dim=reshape_dims))
+            scale_node = reshape.create_node_with_data(inputs=[scale_node])
+
         Op.expand_node_shape(shift_node, broadcast_dims_cnt)
 
         # Connect input->mul->out->add->out
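
A worked instance of the broadcast-reshape dims computed in the loop above, assuming a [N, C, H, W] input, a per-channel scale of shape [C] and axis=1 (values are illustrative):

```python
import numpy as np

input_shape, scale_shape, axis = [2, 16, 32, 32], [16], 1

reshape_dims = np.zeros(len(input_shape), dtype=np.int64)
reshape_dims[:axis] = 1
reshape_dims[axis:axis + len(scale_shape)] = scale_shape
reshape_dims[axis + len(scale_shape):] = 1
print(reshape_dims)  # [ 1 16  1  1], broadcastable against [2, 16, 32, 32]
```
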
index c3ef2bb..e6f46e8 100644 (file)
@@ -21,13 +21,14 @@ import numpy as np
 
 # TODO remove it
 from mo.front.extractor import update_ie_fields
-from mo.graph.graph import Node, get_outputs, get_node_id_by_name
+from mo.graph.graph import Node, get_outputs, get_node_id_by_name, dump_graph_for_graphviz
 from mo.middle.passes.eliminate import get_nodes_with_attributes
 from mo.middle.pattern_match import apply_pattern, for_each_sub_graph
 from mo.ops.lin_op import Mul, Add
 from mo.ops.op import Op
 from mo.utils.error import Error
 from mo.utils.utils import refer_to_faq_msg
+from mo.graph.graph import dict_includes
 
 
 def log_debug_dict(nodes_per_port: dict, direction_name: str):
@@ -93,6 +94,19 @@ def delete_control_flow_edges(graph: nx.MultiDiGraph):
             log.debug('Removing control flow edge from {} to {}'.format(u, v))
 
 
+def exit_bound_edges(graph: nx.MultiDiGraph, sources: list, end_node_attrs: dict):
+    """
+    Finds, for each node from 'sources', all descendant nodes whose attributes include end_node_attrs.
+    For each found node, creates an edge tuple (source, descendant, 0, {}) suitable for graph.add_edges_from.
+    """
+    result = []
+    for node in sources:
+        for end_node in nx.descendants(graph, node):
+            if dict_includes(big=graph.node[end_node], sub_dict=end_node_attrs):
+                result.append((node, end_node, 0, {}))
+    return result
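
A minimal illustration of what exit_bound_edges returns, using a plain attribute lookup in place of dict_includes (node names hypothetical):

```python
import networkx as nx

g = nx.MultiDiGraph()
g.add_node('enter', op='Enter')
g.add_node('body', op='Add')
g.add_node('exit', op='Exit')
g.add_edges_from([('enter', 'body'), ('body', 'exit')])

result = [(src, d, 0, {})
          for src in ['enter']
          for d in nx.descendants(g, src)
          if g.node[d].get('op') == 'Exit']
print(result)  # [('enter', 'exit', 0, {})] -- ready for graph.add_edges_from
```
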
+
+
 def partial_infer(graph: nx.MultiDiGraph, start_node: str = None):
     """
     Tries to execute constant parts of the graph and deduce as much as possible
@@ -102,25 +116,28 @@ def partial_infer(graph: nx.MultiDiGraph, start_node: str = None):
     """
     cycle_nodes = get_nodes_with_attributes(graph, is_cyclic=True)
     cycle_nodes = [Node(graph, node).out_node().id for node in cycle_nodes]
-    ebunch = list(graph.out_edges(nbunch=cycle_nodes, data=True, keys=True))
-    graph.remove_edges_from(ebunch)
+    ebunch_cyclic = list(graph.out_edges(nbunch=cycle_nodes, data=True, keys=True))
+    ebunch_reconnected = exit_bound_edges(graph, sources=cycle_nodes, end_node_attrs={'op': 'Exit'})
+    graph.remove_edges_from(ebunch_cyclic)
+    graph.add_edges_from(ebunch_reconnected)
 
     try:
         nodes = list(nx.topological_sort(graph))
     except:
         raise Error('Graph contains a cycle. Can not proceed. ' + refer_to_faq_msg(97))
 
-    graph.add_edges_from(ebunch)
+    graph.remove_edges_from(ebunch_reconnected)
+    graph.add_edges_from(ebunch_cyclic)
 
     # Mark all nodes as not inferred yet
     if not start_node is None:
         start_index = nodes.index(start_node)
-        nx.set_node_attributes(graph.subgraph(nodes[start_index:]), name='is_partial_inferred', values=False)
+        nx.set_node_attributes(G=graph.subgraph(nodes[start_index:]), name='is_partial_inferred', values=False)
     else:
-        nx.set_node_attributes(graph, name='is_partial_inferred', values=False)
+        nx.set_node_attributes(G=graph, name='is_partial_inferred', values=False)
     debug_logger = log.getLogger().isEnabledFor(log.DEBUG)
 
-    nx.set_node_attributes(graph, name='executable',
+    nx.set_node_attributes(G=graph, name='executable',
                            values={n: True for n in get_nodes_with_attributes(graph, kind='data')})
 
     for n in nodes:
@@ -217,10 +234,10 @@ def check_for_cycle(graph: nx.MultiDiGraph):
 
 
 def mark_outputs(graph: nx.MultiDiGraph):
-    nx.set_node_attributes(graph, name='is_output', values=False)
+    nx.set_node_attributes(G=graph, name='is_output', values=False)
     for node in graph.nodes():
         if graph.node[node]['kind'] == 'data' and len(get_outputs(graph, node)) == 0:
-            nx.set_node_attributes(graph, name='is_output', values={node: True})
+            nx.set_node_attributes(G=graph, name='is_output', values={node: True})
 
 
 def override_batch(graph: nx.MultiDiGraph, batch: int):
@@ -249,29 +266,31 @@ def override_batch(graph: nx.MultiDiGraph, batch: int):
 
 def override_placeholder_shapes(graph: nx.MultiDiGraph, user_shapes: dict, batch=None):
     """
-    Overrides shapes for nodes defined by user
-    Or overrides shapes for nodes with 'op' param set to 'Placeholder'
-    Parameters
-    ----------
-    graph: graph to operate on
-    user_shapes: dictionary, that represents user defined nodes and shapes
-    batch: user defined integer value to override batch
+    This function overrides shapes of nodes with the 'op' attribute set to 'Placeholder' with user-defined shapes
+    (only for inputs without an in/out port specified).
+    It also overrides the batch dimension if a batch was specified and the input shape is not None.
+    :param graph: graph to operate on
+    :param user_shapes: dictionary, that represents user defined nodes and shapes
+    :param batch: user defined integer value to override batch
     """
     if user_shapes is None:
         # DON'T MOVE UPPER!!! WE NEED TO SET BATCH FIRST
         # user specified neither shapes nor inputs, keep the model's values
         return
-    if isinstance(user_shapes, dict):
-        for node_id, values in user_shapes.items():
+    placeholders = get_nodes_with_attributes(graph, kind='op', op='Placeholder')
+    for node_id in placeholders:
+        node_attrs = graph.node[node_id]
+        shape = None
+        if node_id in user_shapes:
+            values = user_shapes[node_id]
             for value in values:
-                shape = value['shape'] if 'shape' in value else None
-                if shape is not None:
-                    graph.node[node_id]['shape'] = shape
-                if 'shape' in graph.node[node_id] and graph.node[node_id]['shape'] is not None:
-                    if batch:
-                        old_batch = graph.node[node_id]['shape'][0]
-                        if old_batch != batch:
-                            graph.node[node_id]['shape'] = np.array([batch, *graph.node[node_id]['shape'][1:]])
+                if 'in' not in value and 'out' not in value:
+                    shape = value['shape'] if value['shape'] is not None else None
+                    break  # we assume only one specified shape for one input
+        if shape is not None:
+            node_attrs['shape'] = shape
+        if batch is not None and node_attrs['shape'] is not None and len(node_attrs['shape']) > 0:
+            node_attrs['shape'][0] = batch
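
A hedged sketch of the shape of the user_shapes argument the rewritten loop consumes: each placeholder id maps to a list of records, only records without 'in'/'out' ports override the shape, and batch then rewrites dimension 0.

```python
import numpy as np

user_shapes = {'input_1': [{'shape': np.array([1, 224, 224, 3])}]}
batch = 8

node_attrs = {'shape': None}
for value in user_shapes['input_1']:
    if 'in' not in value and 'out' not in value:
        node_attrs['shape'] = value['shape']
        break  # only one shape per input is assumed
if batch is not None and node_attrs['shape'] is not None and len(node_attrs['shape']) > 0:
    node_attrs['shape'][0] = batch
print(node_attrs['shape'])  # [  8 224 224   3]
```
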
 
 
 def _scale_input_action_mul(graph: nx.MultiDiGraph, match: dict, scale: float):
index b411200..a819cda 100644 (file)
@@ -28,10 +28,13 @@ def mean_to_avgpool_action(graph: nx.MultiDiGraph, matches: dict):
         return
     dims = len(matches['input'].shape)
     ones = np.ones(dims, dtype=np.int64)
+    axis = np.array(matches['axis'].value)
+    axis = axis if axis.ndim != 0 else np.array([axis], dtype=np.int64)
+
     mean = graph.node[matches['mean'].node]
     mean['stride'] = np.array(ones)
     # TODO: need to check axis with real layout
-    spatial_dims = np.array(matches['axis'].value)
+    spatial_dims = np.array(axis)
     mean['spatial_dims'] = spatial_dims
     mean['pad'] = np.zeros((dims, 2), np.int64)
     mean['pad_spatial_shape'] = np.array(mean['pad'][spatial_dims])
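
The axis normalization added above guards against reduce ops that carry a scalar (0-d) axis, which must be promoted to a 1-d array before it can index spatial dimensions. A two-line demonstration:

```python
import numpy as np

for raw in (np.array(1), np.array([1, 2])):
    axis = raw if raw.ndim != 0 else np.array([raw], dtype=np.int64)
    print(axis, axis.ndim)  # [1] 1 and [1 2] 1 -- always usable as an index array
```
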
index b0285ed..647502b 100644 (file)
@@ -21,7 +21,7 @@ import numpy as np
 
 from mo.front.extractor import update_attrs
 from mo.graph.graph import Node, create_edge
-from mo.middle.passes.eliminate import remove_op_node, merge_data_nodes, graph_clean_up_tf
+from mo.middle.passes.eliminate import remove_op_node_with_data_node, merge_data_nodes, graph_clean_up_tf, get_nodes_with_attributes
 from mo.middle.passes.fusing.helpers import get_next_operation
 from mo.middle.pattern_match import apply_pattern
 from mo.ops.op import PermuteAttrs, Op
@@ -38,12 +38,10 @@ def reshape_squeeze_transform(graph: nx.MultiDiGraph, match: dict):
     reshape['shape'] = output.shape
     reshape.op = 'Reshape'
     reshape['type'] = 'Reshape'
-    reshape['axis'] = 0  # TODO what does it mean?
     if not reshape.has_valid('dim'):
         # do not override value 'dim' if it is set. It may contain specific values like -1 and 0
         reshape['dim'] = reshape.shape.copy()
     update_attrs(reshape, 'shape_attrs', 'dim')
-    reshape['num_axes'] = -1  # TODO what does it mean?
     if 'shape' in match:
         graph.remove_edge(match['shape'].node, match['reshape'].node)
 
@@ -71,60 +69,81 @@ def convert_reshape(graph: nx.MultiDiGraph):
     )
 
 
+def can_repack_fully_connected_weights_nhwc_to_nchw(fc_node: Node):
+    """
+    Checks that it is possible to repack weights of the FullyConnected layer if the Reshape layer is the input of the
+    FullyConnected and satisfies several conditions.
+    :param fc_node: the FullyConnected node to check
+    :return: the result of the check
+    """
+    if len(fc_node.in_node(0).in_nodes()) != 1:
+        return False
+
+    reshape_node = fc_node.in_node(0).in_node(0)
+    if not reshape_node.has_valid('type') or reshape_node.type != 'Reshape':
+        return False
+
+    if not reshape_node.in_node(0).has_valid('shape') or not reshape_node.out_node().has_valid('shape'):
+        return False
+
+    orig_shape = reshape_node.in_node(0).shape
+    new_shape = reshape_node.out_node().shape
+
+    # TODO a bit conservative condition; relax it by checking the specific dimensions that are involved in the
+    # NHWC to NCHW translation
+    if len(orig_shape) == len(new_shape) and all(orig_shape == new_shape):
+        return False
+
+    # TODO here are a couple of limitations that make this pass simpler; consider relaxing them
+    if len(orig_shape) == 4 and len(new_shape) == 2 and orig_shape[0] == new_shape[0]:
+        # that means orig_shape is in NCHW and new_shape is in NC
+        # and we need to map CHW part to C after HWC to CHW transform
+        # Assuming that FullyConnected weights haven't been converted from IO to OI yet.
+        # So format is IO.
+        return True
+    else:
+        log.warning("Cannot do the complete NHWC to NCHW translation for FullyConnected weights. "
+                    "The final model can be broken.")
+        return False
+
+
 def repack_fully_connected_weights_nhwc_to_nchw(graph: nx.MultiDiGraph):
     """
     Repack weights of FullyConnected layer as a part of nhwc_to_nchw translation if Reshape of
     that involves dimensions that we are repacking appears right before FullyConnected layer.
     """
-    for node in graph.nodes():
-        node = Node(graph, node)
-        if node.has_valid('type') and node.type == 'FullyConnected':
-            assert node.in_node(0).kind == 'data'
-            if len(node.in_node(0).in_nodes()) == 1:
-                input = node.in_node(0).in_node(0)
-                if input.has_valid('type') and input.type == 'Reshape':
-                    assert len(input.in_nodes()) > 0
-                    if input.in_node(0).has_valid('shape') and input.out_node().has_valid('shape'):
-
-                        orig_shape = input.in_node(0).shape
-                        new_shape = input.out_node().shape
-
-                        # TODO a bit conservative condition; relax it checking specific dimensions
-                        # that are involved in NHWC to NCWH translation
-                        if len(orig_shape) != len(new_shape) or any(orig_shape != new_shape):
-                            # OK, here we are; need to repack node.in_node(1) to maintain it compatible with original
-                            # input order
-
-                            # TODO here is a couple of limitations that makes this pass simpler; consider to relax them
-                            if len(orig_shape) == 4 and len(new_shape) == 2 and orig_shape[0] == new_shape[0]:
-                                # that means orig_shape is in NCHW and new_shape is in NC
-                                # and we need to map CHW part to C after HWC to CHW transform
-                                # Assuming that FullyConnected weights haven't been converted from IO to OI yet.
-                                # So format is IO.
-
-                                assert all(orig_shape != -1), 'Input shape for {} can not be negative.'.format(node.id)
-                                assert all(new_shape != -1), 'Output shape for {} can not be negative.'.format(node.id)
-                                assert orig_shape[1] * orig_shape[2] * orig_shape[3] == new_shape[1], \
-                                    'Input shape does not correspond to output shape for layer {}.'.format(node.id)
-                                assert node.in_node(1).has_valid('value'), 'Node {} does not have value.'.format(node.id)
-
-                                weights = node.in_node(1)
-
-                                log.debug("orig_shape = {}".format(orig_shape))
-                                log.debug("new_shape = {}".format(new_shape))
-                                log.debug("weights.shape = {}".format(weights.shape))
-                                log.debug("weights.shape[1] = {}, new_shape[1] = {}".format(weights.shape[1], new_shape[1]))
-
-                                assert weights.shape[0] == new_shape[1], \
-                                    'First dim of weights does not correspond to output shape of {}'.format(node.id)
-                                # interpret I dimension of the weights as packed HWC
-                                # orig shape is already converted to NCHW, so provide transposed order for I repacking
-                                tmp_shape = (orig_shape[2], orig_shape[3], orig_shape[1], weights.shape[1])
-                                weights.value = np.transpose(weights.value.reshape(tmp_shape), (2, 0, 1, 3)).reshape(
-                                    weights.shape)
-                            else:
-                                log.warning("Cannot do the complete NHWC to NCHW translation for FullyConnected weights. "
-                                            "The final model can be broken.")
+    for node_id in get_nodes_with_attributes(graph, type='FullyConnected'):
+        fc_node = Node(graph, node_id)
+
+        if not can_repack_fully_connected_weights_nhwc_to_nchw(fc_node):
+            continue
+
+        reshape_node = fc_node.in_node(0).in_node(0)
+
+        orig_shape = reshape_node.in_node(0).shape
+        new_shape = reshape_node.out_node().shape
+
+        # OK, here we are; need to repack fc_node.in_node(1) to keep it compatible with the original input order
+
+        assert all(orig_shape != -1), 'Input shape for {} can not be negative.'.format(fc_node.id)
+        assert all(new_shape != -1), 'Output shape for {} can not be negative.'.format(fc_node.id)
+        assert orig_shape[1] * orig_shape[2] * orig_shape[3] == new_shape[1], \
+            'Input shape does not correspond to output shape for layer {}.'.format(fc_node.id)
+        assert fc_node.in_node(1).has_valid('value'), 'Node {} does not have value.'.format(fc_node.id)
+
+        weights = fc_node.in_node(1)
+
+        log.debug("orig_shape = {}".format(orig_shape))
+        log.debug("new_shape = {}".format(new_shape))
+        log.debug("weights.shape = {}".format(weights.shape))
+        log.debug("weights.shape[1] = {}, new_shape[1] = {}".format(weights.shape[1], new_shape[1]))
+
+        assert weights.shape[0] == new_shape[1], \
+            'First dim of weights does not correspond to output shape of {}'.format(fc_node.id)
+        # interpret I dimension of the weights as packed HWC
+        # orig shape is already converted to NCHW, so provide transposed order for I repacking
+        tmp_shape = (orig_shape[2], orig_shape[3], orig_shape[1], weights.shape[1])
+        weights.value = np.transpose(weights.value.reshape(tmp_shape), (2, 0, 1, 3)).reshape(weights.shape)
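
A numeric sketch of the repack performed above: the I dimension of the IO weights packs HWC values, so reshape it to (H, W, C, O) and transpose to (C, H, W, O) to match inputs already converted to NCHW (shapes below are illustrative):

```python
import numpy as np

orig_shape = (1, 3, 2, 2)                    # NCHW after layout conversion
weights = np.arange(12 * 5).reshape(12, 5)   # IO layout, I = C*H*W = 12, O = 5

# reshape I as (H, W, C), move C first, flatten back: rows reorder HWC -> CHW
tmp_shape = (orig_shape[2], orig_shape[3], orig_shape[1], weights.shape[1])
repacked = np.transpose(weights.reshape(tmp_shape), (2, 0, 1, 3)).reshape(weights.shape)
print(repacked.shape)  # (12, 5) -- same shape, rows now indexed as CHW
```
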
 
 
 def apply_nhwc_to_nchw_permutation(graph: nx.MultiDiGraph):
@@ -230,7 +249,10 @@ def permute_op_nodes_attrs(graph: nx.MultiDiGraph):
     for node in graph.nodes():
         node = Node(graph, node)
         if node.kind == 'op' and node.has_valid('permute_attrs'):
-            node.permute_attrs.permute_attrs(node)
+            try:
+                node.permute_attrs.permute_attrs(node)
+            except Exception as e:
+                raise Error('Can\'t permute attrs for node {}. Error message: {}'.format(node.id, e))
 
 
 def reverse_input_channels(graph: nx.MultiDiGraph):
@@ -333,20 +355,30 @@ def reverse_input_channels(graph: nx.MultiDiGraph):
 
 
 def conv_flatten_concat_action(graph: nx.MultiDiGraph, match: dict):
+    assert graph.graph['layout'] == 'NHWC'
     reshape_node = match['reshape']
     reshape_data_node = match['reshape_data']
-    concat_node = match['concat']
-    concat_data_node = match['concat_data']
     conv_name = match['conv'].name
     conv_data_node = match['conv_data']
+    # the pattern should be applied only when the reshape operation changes the number of dimensions
+    if len(reshape_data_node.shape) == len(conv_data_node.shape) or reshape_node.has_and_set('nchw_layout'):
+        return
+
+    if len(reshape_data_node.out_nodes()) == 1 and reshape_data_node.out_node().has_valid('type') and \
+            reshape_data_node.out_node().type == 'FullyConnected' and \
+            can_repack_fully_connected_weights_nhwc_to_nchw(reshape_data_node.out_node()):
+        log.info('There is a FullyConnected layer after the node "{}" whose weights will be repacked, so there is no '
+                 'need to insert a Permute'.format(reshape_node.soft_get('name')))
+        return
     assert len(graph.in_edges(reshape_node.id)) == 1
     graph.remove_edge(conv_data_node.id, reshape_node.id)
-    new_permute_op = Permute(graph, {'order': np.array([0, 2, 3, 1])})
+
+    permutation_order = PermuteAttrs.get_nchw_to_nhwc_permutation(len(conv_data_node.shape)).perm
+    new_permute_op = Permute(graph, {'order': permutation_order})
     permute_data_node = new_permute_op.create_node_with_data([conv_data_node], dict(name=conv_name + '/Permute_'))
     create_edge(permute_data_node, reshape_node)
-    # Disable permutation for Reshape and Concat
+    # Disable permutation for Reshape and Concat layers attributes
     PermuteAttrs.set_permutation(reshape_node, reshape_data_node, None)
-    PermuteAttrs.set_permutation(concat_node, concat_data_node, None, skip_if_exists=True)
 
 
 def conv_flatten_concat(graph: nx.MultiDiGraph):
@@ -357,15 +389,31 @@ def conv_flatten_concat(graph: nx.MultiDiGraph):
             ('conv_data', dict(kind='data')),
             ('reshape', dict(kind='op', type='Reshape')),
             ('reshape_data', dict(kind='data')),
-            ('concat', dict(kind='op', type='Concat')),
-            ('concat_data', dict(kind='data'))
         ],
         edges=[
             ('conv', 'conv_data'),
             ('conv_data', 'reshape'),
             ('reshape', 'reshape_data'),
-            ('reshape_data', 'concat'),
-            ('concat', 'concat_data')
+        ],
+        action=conv_flatten_concat_action
+    )
+
+    apply_pattern(
+        graph,
+        nodes=[
+            ('real_conv', dict(kind='op', type='Convolution')),
+            ('real_conv_data', dict(kind='data')),
+            ('conv', dict(kind='op', type='ReLU')),
+            ('conv_data', dict(kind='data')),
+            ('reshape', dict(kind='op', type='Reshape')),
+            ('reshape_data', dict(kind='data')),
+        ],
+        edges=[
+            ('real_conv', 'real_conv_data'),
+            ('real_conv_data', 'conv'),
+            ('conv', 'conv_data'),
+            ('conv_data', 'reshape'),
+            ('reshape', 'reshape_data'),
         ],
         action=conv_flatten_concat_action
     )
@@ -390,4 +438,4 @@ def fuse_sequence_of_reshapes(graph: nx.MultiDiGraph):
                 # Detected Reshape1 --> data --> Reshape2 pattern without side edges
                 # Remove Reshape1
                 log.debug('Second phase for Reshape: {}'.format(node.name))
-                remove_op_node(graph, node)
+                remove_op_node_with_data_node(graph, node)
index fc08c4b..e6bcdee 100644 (file)
@@ -83,14 +83,12 @@ class Convolution(Op):
         return float_spatial_val_wo_stride / stride_spatial_shape + 1
 
     @staticmethod
-    def calc_deconvolution(node, input_spatial_shape, pad_spatial_shape, kernel_extent, output_padding=None):
+    def calc_deconvolution(node, input_spatial_shape, pad_spatial_shape, kernel_extent):
         ''' Calculates output shape for Deconvolution.
             Verified to be applicable for both Caffe and ONNX with explicitly defined pads.
             If pads are not specified for ONNX operator, this function is not applicable.
         '''
         shape = node.stride[node.spatial_dims] * (input_spatial_shape - 1) + kernel_extent - pad_spatial_shape
-        if output_padding is not None:
-            shape += output_padding
         return shape
 
     @staticmethod
@@ -128,7 +126,7 @@ class Convolution(Op):
         if node.has_valid('reshape_kernel') and node.reshape_kernel:
             if not (node.has_valid('output') and node.has_valid('channel_dims') and node.has_valid(
                     'group') and node.has_valid('kernel_spatial')):
-                log.error('Can\'t reshape kernel due to not all required attrs was set to {} node'.format(node.id))
+                log.error('Cannot reshape kernel because not all required attrs are set for the {} node'.format(node.id))
                 return
             # layout for Convolution weights is OIHW
             kernel_shape = np.array([node.output, input_shape[node.channel_dims].item() / node.group,
@@ -136,6 +134,11 @@ class Convolution(Op):
             if node.type == 'Deconvolution':  # layout for Deconvolution weights is IOHW
                 kernel_shape[[0, 1]] = kernel_shape[[1, 0]]
 
+            if np.prod(kernel_shape) != np.prod(node.in_node(weights_index).value.shape):
+                log.error("Size of weights {} does not match kernel shape: {}\n".format(np.prod(node.in_node(weights_index).value.shape), kernel_shape) +
+                          "    Possible reason is wrong channel number in input shape\n")
+                raise Error("Cannot reshape weights to kernel shape")
+
             node.in_node(weights_index).shape = np.array(kernel_shape)
             node.in_node(weights_index).value = np.reshape(node.in_node(weights_index).value, kernel_shape)
             node.reshape_kernel = False
@@ -154,7 +157,7 @@ class Convolution(Op):
         node['kernel_spatial'] = kernel_shape[node.kernel_spatial_idx]
 
         if not node.has_valid('output'):
-            # restore the number of output feature maps from the scond argument that is weights
+            # restore the number of output feature maps from the second argument that is weights
             if node.type in ['Convolution', 'Deconvolution']:
                 node['output'] = kernel_shape[node.output_feature_channel]
             else:
@@ -171,7 +174,7 @@ class Convolution(Op):
             node['stride'] = np.full([len(input_shape)], 1, dtype=np.int64)
         if not node.has_valid('pad'):
             node['pad'] = np.array([[0, 0]] * len(input_shape), dtype=np.int64)
-            node['pad_spatial_shape'] = node.pad[node.spatial_dims]
+        node['pad_spatial_shape'] = node.pad[node.spatial_dims]
 
         input_spatial_shape = input_shape[node.spatial_dims]
         stride_spatial_shape = node.stride[node.spatial_dims]
@@ -205,9 +208,15 @@ class Convolution(Op):
                         return
                 else:
                     output_padding = node.output_padding[node.spatial_dims] if node.has_valid('output_padding') else None
+                    if output_padding is not None:
+                        pad_spatial_shape -= output_padding
+                        for dim in range(len(pad_spatial_shape)):
+                            node.pad_spatial_shape[dim][1] -= pad_spatial_shape[dim]
+                        node.pad[node.spatial_dims] = node.pad_spatial_shape
+                        node['output_padding'] = None
+
                     float_spatial = Convolution.calc_deconvolution(node, input_spatial_shape, pad_spatial_shape,
-                                                                   kernel_extent,
-                                                                   output_padding)
+                                                                   kernel_extent)
                     node['output_spatial_shape'] = int64_array(float_spatial)
             else:
                 return
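
A numeric check of calc_deconvolution with the new handling that first folds output_padding into the end pads, using the formula from the method above (values are illustrative):

```python
import numpy as np

stride = np.array([2, 2])
input_spatial_shape = np.array([5, 5])
kernel_extent = np.array([3, 3])
pad_spatial_shape = np.array([2, 2])  # begin + end per spatial dim

out = stride * (input_spatial_shape - 1) + kernel_extent - pad_spatial_shape
print(out)  # [9 9]
```
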
@@ -249,5 +258,4 @@ class Convolution(Op):
                                                        ])
 
         PermuteAttrs.set_permutation(node.in_node(weights_index), node,
-                                     node.get_weights_permute if node.has_valid('get_weights_permute') else None,
-                                     skip_if_exists=True)
+                                     node.get_weights_permute if node.has_valid('get_weights_permute') else None)
index 9a31c1b..b4fe12b 100644 (file)
@@ -133,5 +133,4 @@ class Deconvolution(Op):
                                                        ])
 
         PermuteAttrs.set_permutation(node.in_node(1), node,
-                                     node.get_weights_permute if node.has_valid('get_weights_permute') else None,
-                                     skip_if_exists=True)
+                                     node.get_weights_permute if node.has_valid('get_weights_permute') else None)
index f343067..18185f6 100644 (file)
@@ -28,7 +28,7 @@ class Eltwise(Op):
         operations = {
             'sum': ('Add', lambda a, b: a + b),
             'mul': ('Mul', lambda a, b: a * b),
-            'max': ('Max', lambda a, b: np.max(a, b))
+            'max': ('Max', lambda a, b: np.maximum(a, b))
         }
 
         super().__init__(graph, {
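
The one-line fix above is a real bug fix, not a rename: np.max(a, b) interprets the second argument as the axis, while np.maximum(a, b) is the elementwise maximum the 'max' Eltwise operation needs.

```python
import numpy as np

a, b = np.array([1, 5, 2]), np.array([3, 4, 6])
print(np.maximum(a, b))  # [3 5 6] -- elementwise maximum, as intended
# np.max(a, b) raises a TypeError here: the second argument is taken as an axis
```
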
index 94f6a87..ce790bb 100644 (file)
@@ -32,4 +32,4 @@ class ExpandDims(Op):
 
     def supported_attrs(self):
         # TODO ugly copying from Reshape op
-        return ['axis', ('dim', lambda node: ', '.join(map(str, node['dim']))), 'num_axes']
+        return [('dim', lambda node: ', '.join(map(str, node['dim'])))]
diff --git a/model-optimizer/mo/ops/flatten.py b/model-optimizer/mo/ops/flatten.py
new file mode 100644 (file)
index 0000000..96408e4
--- /dev/null
@@ -0,0 +1,54 @@
+"""
+ Copyright (c) 2018 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import networkx as nx
+import numpy as np
+import logging as log
+
+from mo.front.caffe.extractors.utils import get_canonical_axis_index
+from mo.front.common.partial_infer.utils import int64_array
+from mo.ops.op import Op
+
+
+class Flatten(Op):
+    op = 'Flatten'
+    enabled = True
+
+    def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
+        super().__init__(graph, {
+            'type': __class__.op,
+            'op': __class__.op,
+            'infer': __class__.infer,
+        }, attrs)
+
+    def supported_attrs(self):
+        return ['axis', 'end_axis']
+
+    @staticmethod
+    def infer(node):
+        input_shape = node.in_node(0).shape
+        if input_shape is None:
+            log.debug('The input shape for the layer "{}" is not defined'.format(node.soft_get('name')))
+            return
+
+        axis = get_canonical_axis_index(input_shape, node.axis)
+        end_axis = node.end_axis if node.has('end_axis') else -1
+        end_axis = get_canonical_axis_index(input_shape, end_axis)
+        prod_axes = np.prod(input_shape[axis: end_axis + 1])
+        node.out_node(0).shape = int64_array([*input_shape[0: axis], prod_axes, *input_shape[end_axis + 1:]])
+
+        if node.in_node().has_valid('value'):
+            node.out_node().value = node.in_node().value.copy().reshape(node.out_node(0).shape)
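
A worked instance of the shape rule implemented in Flatten.infer above, for axis=1 and end_axis=-1 (canonicalized to 3) on an input of shape [2, 3, 4, 5]:

```python
import numpy as np

input_shape = np.array([2, 3, 4, 5])
axis, end_axis = 1, 3   # end_axis=-1 canonicalized against rank 4
prod_axes = np.prod(input_shape[axis: end_axis + 1])
out = np.array([*input_shape[:axis], prod_axes, *input_shape[end_axis + 1:]])
print(out)  # [ 2 60]
```
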
index 0e8125d..07a40c7 100644 (file)
@@ -22,7 +22,7 @@ from mo.ops.op import Op
 
 
 class FlattenONNX(Op):
-    op = 'Flatten'
+    op = 'FlattenONNX'
     enabled = True
 
     def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
@@ -33,7 +33,7 @@ class FlattenONNX(Op):
         }, attrs)
 
     def supported_attrs(self):
-        return ['axis', ('dim', lambda node: ','.join(map(str, node['dim'])))]
+        return [('dim', lambda node: ','.join(map(str, node['dim'])))]
 
     @staticmethod
     def infer(node):
index a840497..ff1ec6b 100644 (file)
@@ -31,6 +31,8 @@ class LinOp(Op):
             'infer': None,
         }, attrs)
 
+    def supported_attrs(self):
+        return ['operation']
 
 class Add(LinOp):
     enabled = False
index e726381..745efff 100644 (file)
@@ -15,9 +15,7 @@
 """
 
 import networkx as nx
-import numpy as np
 
-from mo.front.common.partial_infer.eltwise import eltwise_infer
 from mo.graph.graph import Node
 from mo.ops.op import Op
 from mo.front.common.partial_infer.elemental import copy_shape_infer
@@ -44,7 +42,6 @@ class Memory(Op):
 
     @staticmethod
     def infer(node: Node):
-        outn = node.out_node(0)  # data
         if len(node.in_nodes()) > 0:
             # In case this is a memory node with input,
             # It should not have output
@@ -53,18 +50,16 @@ class Memory(Op):
             # node that will be removed later in pipeline
             copy_shape_infer(node)
             return
-        data_outs = outn.out_nodes()  # children
-        for out in data_outs:
-            if len(out.pb.blobs) == 0 or not isinstance(out.pb.blobs[0], np.ndarray):
-                continue
-            blob_shape = out.pb.blobs[0].shape[0]
-            if out.type == 'FullyConnected':
-                outn.shape = np.int64(np.array([1, blob_shape / out.pb.num_output]))
-                break
-            elif out.type == 'ScaleShift':
-                outn.shape = np.int64(np.array([1, blob_shape]))
-                break
+        elif node.has_valid('shape'):
+            # For Memory nodes without inputs, inferring the shape is very difficult,
+            # but often the shape is known at attribute-extraction time,
+            # so the 'shape' attribute can be set during extraction
+            batch = 1
+            for out_node in node.out_nodes().values():
+                out_node.shape = [batch, *node.shape[:]]
+            return
         else:
             raise Error('Model Optimizer is unable to calculate output shape of Memory node {}. ' +
                         refer_to_faq_msg(88),
                         node.id)
+
index 378d8a3..83d80fb 100644 (file)
@@ -71,6 +71,7 @@ class Op(object):
         """
         backend_attrs_mapping = {
             None: self.backend_attrs,
+            4: self.backend_attrs,
             3: self.backend_attrs,
             2: self.backend_attrs_v2
         }
@@ -162,6 +163,8 @@ class Op(object):
         # TODO Preserve debug infor
         inputs_with_edge_attrs = []
         for i, inp in enumerate(inputs):
+            if inp is None:
+                continue
             edge_attr = {'in': i}
             if edge_attrs is not None and i < len(edge_attrs):
                 edge_attr.update(edge_attrs[i])
@@ -194,7 +197,11 @@ class Op(object):
             assert all(old_value is None for old_value in old_data_value) or all(
                 [np.array_equal(old_data_value[id], data_node.value) for id, data_node in enumerate(data_nodes)])
             assert all(old_shape is None for old_shape in old_data_shape) or all(
-                [np.array_equal(old_data_shape[id], data_node.shape) for id, data_node in enumerate(data_nodes)])
+                [np.array_equal(old_data_shape[id], data_node.shape) for id, data_node in enumerate(data_nodes)]), \
+                "After re-inference of {} node, old and new shapes do not match. Old shapes: {}, new shapes: {}.".format(
+                    new_op_node.soft_get('name'),
+                    [old_data_shape[id] for id in range(len(data_nodes))],
+                    [data_node.shape for data_node in data_nodes])
             for data_node in data_nodes:
                 log.debug(
                     'Finished running infer function, data nodes attributes: {}'.format(
@@ -323,6 +330,7 @@ class PermuteAttrs:
     common_attrs_permutation = {
             'dim': common_permutation,
             'pad': common_permutation,
+            'pads': common_permutation,
             'shape': common_permutation,
             'order': lambda node, permutation, attr: permutation.inv[node[attr][permutation.perm]],
             'stride': common_permutation,
@@ -393,7 +401,7 @@ class PermuteAttrs:
         node['permute_attrs'].update_attrs(attrs)
 
     @staticmethod
-    def set_permutation(node1, node2, permutation, skip_if_exists=False):
+    def set_permutation(node1, node2, permutation):
         # This function creates permutation on edge between node1->node2
         edge_attrs = node1.graph.get_edge_data(node1.id, node2.id)[0]
         if 'permutation' not in edge_attrs:
@@ -401,9 +409,10 @@ class PermuteAttrs:
                                    values={(node1.id, node2.id, 0): permutation},
                                    name='permutation')
         else:
-            if skip_if_exists:
-                return
-            raise Error('Permutation already exists in edge between {} and {}'.format(node1.name, node2.name))
+            # If permutation exists we check that given and already set permutations are equal
+            if (edge_attrs['permutation'] is None and permutation is not None) or \
+                not np.array_equal(edge_attrs['permutation'], permutation):
+                raise Error('Permutation already exists in edge between {} and {}'.format(node1.name, node2.name))
 
     @staticmethod
     def get_inverse_permutation(perm):
index f9e5e3a..739b886 100644 (file)
  limitations under the License.
 """
 
+import logging as log
+
 import networkx as nx
 import numpy as np
 
-from mo.front.common.partial_infer.transpose import transpose_infer
-from mo.front.extractor import attr_getter
-from mo.ops.op import Op
+from mo.ops.op import Op, PermuteAttrs
 
 
 class Pad(Op):
-    ''' Pad operation that explicitly extends an input tensor at edges.
+    """ Pad operation that explicitly extends an input tensor at edges.
         
         This operation frequently appears in TF and rarely in ONNX models
         followed by some windowed operation like convolution or pooling.
@@ -45,15 +45,15 @@ class Pad(Op):
         where pad_begin_dim1 etc. are padding margins in elements. If the second
         input argument is omitted, then it is in 'pads' attribute in the same
         format.
-    '''
+    """
 
     op = 'Pad'
     enabled = True
 
     def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
         super().__init__(graph, {
-            # no 'type' as this operation is not directly supported by IE
             'op': __class__.op,
+            'type': __class__.op,
             'infer': __class__.infer,
             'mode': 'constant',
             'fill_value': float(0),
@@ -64,21 +64,31 @@ class Pad(Op):
         return ['mode', 'fill_value', 'pads']
 
     def backend_attrs(self):
-        # it shouldn't be translated to IE layer
-        return []
+        return [('pad_mode', 'mode'),
+                ('pad_value', 'fill_value'),
+                ('pads_begin', lambda node: ','.join(map(str, node.pads[:, 0]))),
+                ('pads_end', lambda node: ','.join(map(str, node.pads[:, 1]))),
+                ]
 
     @staticmethod
     def infer(node):
+        PermuteAttrs.create_permute_attrs(node, attrs=[('pads', 'input:0')])
+
         if node.has_valid('pads'):
-            assert len(node.in_nodes()) == 1, "Pad operation has pads attribute and unexpected additional input argument for node {}.".format(node.name)
-            padding = node.pads
+            assert len(node.in_nodes()) == 1, "Pad operation has pads attribute and unexpected additional input " \
+                                              "argument for node {}.".format(node.name)
         else:
-            assert len(node.in_nodes()) == 2, "Missing required second input argument for node {} and pads attribute is missing.".format(node.name)
-            padding = node.in_node(1).value
+            assert len(node.in_nodes()) >= 2, "Missing required second input argument for node {} and pads attribute " \
+                                              "is missing.".format(node.name)
+            node.pads = node.in_node(1).value
+            if len(node.in_nodes()) == 3:  # the third input contains the fill value
+                node.fill_value = node.in_node(2).value
+        padding = node.pads
 
         input_shape = node.in_node(0).shape
         if padding is None or input_shape is None:
-            return None
+            log.error('The paddings are not defined for node "{}"'.format(node.soft_get('name')))
+            return
 
         # paddings can be defined, partially defined or undefined
         # TODO for now we only handle fully defined paddings
@@ -99,7 +109,8 @@ class Pad(Op):
 
         # preserve non-positive values in the input shape, because it has a special meaning
         shape = np.array(
-            [shape_change[i] + input_shape[i] if input_shape[i] > 0 else input_shape[i] for i in range(len(input_shape))])
+            [shape_change[i] + input_shape[i] if input_shape[i] > 0 else input_shape[i] for i in
+             range(len(input_shape))])
 
         assert len(node.out_nodes()) == 1
 
index 6c9299b..a26ab7d 100644 (file)
@@ -92,7 +92,9 @@ class Pooling(Op):
             node['stride'] = np.array([1 for x in range(len(input_shape))], dtype=np.int64)
 
         if node.has_and_set('global_pool'):
+            node['window'] = np.zeros(len(input_shape), dtype=np.int64)
             node.window[node.spatial_dims] = input_spatial_shape
+
         window_spatial_shape = node.window[node.spatial_dims]
         stride_spatial = node.stride[node.spatial_dims]
         assert any(stride_spatial), 'Stride can not be zero in node {}'.format(node.id)
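
The `np.zeros` line added above guarantees that a global pooling node has a window array to write into before its spatial entries are set to the input's spatial size. A sketch of that computation with hypothetical shapes:

    import numpy as np

    input_shape = np.array([1, 3, 224, 224], dtype=np.int64)
    spatial_dims = np.array([2, 3])
    window = np.zeros(len(input_shape), dtype=np.int64)
    window[spatial_dims] = input_shape[spatial_dims]
    print(window)  # [  0   0 224 224]
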
index 2213537..c4d1ca0 100644 (file)
@@ -26,11 +26,12 @@ from mo.ops.op import Op
 
 class Power(Op):
     enabled = False
+    op = 'Power'
 
     def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
         super().__init__(graph, {
             'type': 'Power',
-            'op': 'Power',
+            'op': __class__.op,
             'power': 1,
             'scale': 1,
             'shift': 0,
index 0bb2504..1237928 100644 (file)
@@ -19,14 +19,21 @@ import logging as log
 import networkx as nx
 import numpy as np
 
-from mo.front.common.partial_infer.eltwise import eltwise_infer
+from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node
 from mo.ops.op import Op
+from mo.utils.error import Error
 
 
 class Reduce(Op):
     enabled = False
 
+    reduce_method_map = {
+        'max': np.max,
+        'mean': np.mean,
+        'sum': np.sum,
+    }
+
     def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
         super().__init__(graph, {
             'op': 'Reduce',
@@ -36,32 +43,45 @@ class Reduce(Op):
 
     @staticmethod
     def infer(node: Node):
+        if len(node.in_nodes()) == 2:
+            reduction_indices_data = node.in_node(1)
+            if reduction_indices_data.has_valid('value'):
+                node['axis'] = reduction_indices_data.value
+            else:
+                raise Error("Can not deduce `reduction_indices` for node {}. It should be deduced out of first port "
+                            "input due to absence of value in this node".format(node.id))
+            node.graph.remove_edge(reduction_indices_data.id, node.id)
+
         input_node = node.in_node()
         input_shape = np.array(node.in_node().shape, dtype=np.int64)
         output_node = node.out_node()
 
+        # If axis is None, the reduction is performed along each dimension
+        if node.axis is None:
+            node.axis = int64_array(list(range(len(input_shape))))
+
         if not node.has_valid('reduce_type'):
             log.error('Reduce type for node {} not specified!'.format(node.id))
             return
 
         reduce_type = node.reduce_type
-
         if input_node.has_valid('value'):
-            if reduce_type.lower() == 'mean':
+            if reduce_type.lower() in ['mean', 'max']:
                 # Value and Shape propagation for constant path
-                output_node.value = np.mean(input_node.value, axis=tuple(node.axis), keepdims=node.keep_dims)
+                output_node.value = Reduce.reduce_method_map[reduce_type.lower()](input_node.value,
+                                                                                  axis=tuple(node.axis),
+                                                                                  keepdims=node.keep_dims)
                 output_node.shape = output_node.value.shape
             else:
                 log.error('Reduce type {} is not supported for node {}'.format(reduce_type, node.id))
                 return
         else:
-            # In case if axis is None it means that reduction comes along each dimension
-            if node.axis is None:
-                node.axis = np.array(range(len(input_shape)))
-
             used_dims = np.zeros(len(input_shape), dtype=np.bool)
             output_shape = input_shape
 
+            if node.axis.size == 1:
+                node.axis = int64_array([node.axis.item()])
+
             for dim in node.axis:
                 used_dims[dim] = True
                 output_shape[dim] = 1
@@ -71,4 +91,3 @@ class Reduce(Op):
                 output_shape = output_shape[np.invert(used_dims)]
 
             output_node.shape = output_shape
-
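
The new `reduce_method_map` above lets the constant path fold a Reduce node at conversion time by dispatching to the matching NumPy reduction. A short usage sketch:

    import numpy as np

    reduce_method_map = {'max': np.max, 'mean': np.mean, 'sum': np.sum}
    value = np.arange(12, dtype=np.float32).reshape(3, 4)
    result = reduce_method_map['mean'](value, axis=(1,), keepdims=True)
    print(result.shape)  # (3, 1): the reduced axis is kept with size 1
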
index 1ceaee1..f616c8d 100644 (file)
  See the License for the specific language governing permissions and
  limitations under the License.
 """
+import math
 
 import networkx as nx
+import numpy as np
 
 from mo.front.common.partial_infer.elemental import single_output_infer
 from mo.front.common.partial_infer.reshape import tf_reshape_shape_infer
+from mo.graph.graph import Node
 from mo.ops.op import Op
+from mo.utils.error import Error
 
 
 class Reshape(Op):
@@ -27,13 +31,47 @@ class Reshape(Op):
 
     def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
         super().__init__(graph, {
-            'axis': 0,
-            'num_axes': -1,
             'kind': 'op',
             'type': __class__.op,
             'op': __class__.op,
-            'infer': lambda node: single_output_infer(node, tf_reshape_shape_infer)
+            'infer': lambda node: single_output_infer(node, tf_reshape_shape_infer,
+                                                      lambda node: np.reshape(node.in_node().value,
+                                                                              node.out_node().shape))
         }, attrs)
 
     def supported_attrs(self):
-        return ['axis', ('dim', lambda node: ','.join(map(str, node['dim']))), 'num_axes']
+        return [('dim', lambda node: ','.join(map(str, node['dim'])))]
+
+    @staticmethod
+    def kaldi_infer(node: Node):
+        in_node = node.in_node().in_node()  # prev_layer_node -> data -> this_node
+        input_shape = node.in_node().shape
+        # The Kaldi Reshape heavily depends on the layers that precede or succeed it
+        # (Convolution/Pooling layers), so there are 4 cases with different
+        # partial inference.
+        batch = input_shape[0]
+        if in_node.op == 'Convolution' or in_node.op == 'Pooling':
+            output_spatial = np.array([batch, np.prod(input_shape[1:])], dtype=np.int64)
+            return Reshape.set_shape_and_dim(node, output_spatial)
+        # Supports ONLY NCHW and NH layouts
+        if len(input_shape) not in [4, 2]:
+            raise Error('Reshape in Kaldi supports only 1d or 3d shapes')
+        spatial_shape = input_shape[1]
+        if len(input_shape) in [4]:
+            spatial_shape = input_shape[2:3]
+        out_node = node.out_node().out_node()
+        if out_node.op == 'Convolution':
+            output_spatial = np.array(
+                [batch, math.ceil(spatial_shape / out_node.patch_stride), 1, out_node.patch_stride], dtype=np.int64)
+            return Reshape.set_shape_and_dim(node, output_spatial)
+        elif out_node.op == 'Pooling':
+            if out_node.pool_step is None:
+                out_node.stride = np.array([1, 1, out_node.window[-1], out_node.window[-1]], dtype=np.int64)
+            output_spatial = np.array(
+                [batch, out_node.pool_stride, 1, math.ceil(spatial_shape / out_node.pool_stride)], dtype=np.int64)
+            return Reshape.set_shape_and_dim(node, output_spatial)
+
+    @staticmethod
+    def set_shape_and_dim(node: Node, reshape_dim):
+        Reshape.update_node_stat(node, {'dim': reshape_dim})
+        node.out_node().shape = reshape_dim
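
For the first `kaldi_infer` case above (a Reshape whose producer is a Convolution or Pooling layer), the output shape is simply the batch dimension followed by everything else flattened. A worked sketch with a hypothetical input shape:

    import numpy as np

    input_shape = np.array([8, 16, 1, 32], dtype=np.int64)
    batch = input_shape[0]
    output_spatial = np.array([batch, np.prod(input_shape[1:])], dtype=np.int64)
    print(output_spatial)  # [  8 512]
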
index 19f34fa..5f6145d 100644 (file)
@@ -29,6 +29,7 @@ class Slice(Op):
 
     def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
         super().__init__(graph, {
+            'type': __class__.op,
             'op': 'Slice',
             'infer': __class__.infer
         }, attrs)
index 5a5d038..eaf6bc0 100644 (file)
@@ -22,13 +22,14 @@ from mo.ops.op import Op
 
 
 class Softmax(Op):
-    op = 'Softmax'
+    op = 'SoftMax'
     enabled = True
 
     def __init__(self, graph: nx.MultiDiGraph, attrs: dict):
         super().__init__(graph, {
             'infer': Softmax.infer,
             'kind': 'op',
+            'axis': 1,
             'type': __class__.op,
             'op': __class__.op,
         }, attrs)
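
The new `'axis': 1` default above means SoftMax normalizes over the channel dimension unless the model specifies otherwise. A plain NumPy sketch of that convention (not the Inference Engine kernel):

    import numpy as np

    def softmax(x, axis=1):
        e = np.exp(x - x.max(axis=axis, keepdims=True))  # shift for numerical stability
        return e / e.sum(axis=axis, keepdims=True)

    print(softmax(np.ones((2, 3))).sum(axis=1))  # [1. 1.]
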
index 145680b..5ce6b0f 100644 (file)
@@ -42,7 +42,7 @@ class Split(Op):
         input_node = node.in_node(0)
         outputs = node.out_nodes()
         out_shape = copy.copy(input_node.shape)
-        out_shape[node.axis] = np.int64(input_node.shape[node.axis] / node.pb.num_split)
+        out_shape[node.axis] = np.int64(input_node.shape[node.axis] / node.num_split)
         for idx, output in outputs.items():
             output.shape = out_shape
         PermuteAttrs.create_permute_attrs(node, attrs=[('axis', 'input:0')])
index a935e15..ef215c9 100644 (file)
@@ -24,8 +24,6 @@ class Squeeze(Op):
 
     def __init__(self, graph, attrs: dict):
         super().__init__(graph, {
-            'axis': 0,
-            'num_axes': -1,
             'dim': None,
             'kind': 'op',
             'type': 'Reshape',
@@ -34,4 +32,4 @@ class Squeeze(Op):
         }, attrs)
 
     def supported_attrs(self):
-        return ['axis', ('dim', lambda node: ', '.join(map(str, node['dim']))), 'num_axes']
+        return [('dim', lambda node: ', '.join(map(str, node['dim'])))]
index 2d42418..99195a3 100644 (file)
@@ -26,8 +26,6 @@ class Unsqueeze(Op):
 
     def __init__(self, graph, attrs: dict):
         super().__init__(graph, {
-            'axis': 0,
-            'num_axes': -1,
             'kind': 'op',
             'type': 'Reshape',
             'op': __class__.op,
@@ -35,7 +33,7 @@ class Unsqueeze(Op):
         }, attrs)
 
     def supported_attrs(self):
-        return ['axis', ('dim', lambda node: ', '.join(map(str, node['dim']))), 'num_axes']
+        return [('dim', lambda node: ', '.join(map(str, node['dim'])))]
 
     @staticmethod
     def infer(node):
index 107cc36..d334396 100644 (file)
@@ -62,20 +62,7 @@ def driver(argv: argparse.Namespace, proto_file_name: str, model_file_name: str,
 
     FusePermutesSequence.enabled = False
 
-    try:
-        proto, model = loader.load_caffe_proto_model(proto_file_name, model_file_name)
-    except Error as e:
-        raise
-    except Exception as e:
-        raise Error('Model Optimizer is not able to read {}. Possible reasons: '.format(proto_file_name) +
-                    '1. your caffemodel contains custom layers that are not supported in Model Optimizer by default. ' +
-                    '2. your prototxt does not have a valid structure, e.g you downloaded it as html. ' +
-                    'In particular the first unknown field is {} '.format(str(e).split(' ')[-1]) +
-                    'After you made sure that prototxt has a valid structure and still see this issue, then ' +
-                    'you need to generate a python parser for caffe.proto that was used when the model ' +
-                    'was created. ' +
-                    'Run "python3 generate_caffe_pb2.py --input_proto ${PATH_TO_CAFFE}/src/caffe/proto/caffe.proto". ' +
-                    refer_to_faq_msg(1)) from e
+    proto, model = loader.load_caffe_proto_model(proto_file_name, model_file_name)
 
     update_extractors_with_extensions(
         caffe_type_extractors,
@@ -99,7 +86,7 @@ def driver(argv: argparse.Namespace, proto_file_name: str, model_file_name: str,
     graph.graph['layout'] = 'NCHW'
     graph.graph['cmd_params'] = argv
     graph.graph['fw'] = 'caffe'
-    graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 3
+    graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 4
 
     extract_node_attrs(graph, lambda node: (True, common_caffe_fields(node)))
 
@@ -182,13 +169,14 @@ def driver(argv: argparse.Namespace, proto_file_name: str, model_file_name: str,
     # Mark nodes with attr 'can_be_fused': False to disable fusing for specified nodes
     mark_unfused_nodes(graph, argv.finegrain_fusing)
 
+    # Need this pass even without fusing to convert scale with 2 inputs
+    convert_scale_shift_to_mul_add(graph)
+    graph_clean_up(graph)
+
     if not argv.disable_fusing:
         convert_bn_to_mul_add(graph)
         graph_clean_up(graph)
 
-        convert_scale_shift_to_mul_add(graph)
-        graph_clean_up(graph)
-
         fuse_mul_add_sequence(graph)
         graph_clean_up(graph)
 
index 7baa301..fcb3faa 100644 (file)
@@ -23,13 +23,12 @@ from extensions.front.kaldi.fuse_repeated_reshape import FuseRepeatedReshapes
 from extensions.middle.EltwiseChecker import EltwiseChecker
 from mo.front.common.register_custom_ops import update_extractors_with_extensions
 from mo.front.extractor import create_tensor_nodes, extract_node_attrs, add_output_ops, remove_output_ops
-from mo.front.kaldi import loader
 from mo.front.kaldi.extractor import kaldi_extractor, kaldi_type_extractors
+from mo.front.kaldi.loader.loader import load_kaldi_model, read_counts_file
 from mo.utils import class_registration
 from mo.utils.cli_parser import get_meta_info
 from mo.utils.error import Error
 from mo.utils.find_inputs import find_outputs
-
 from mo.graph.graph import print_graph_stat, Node, check_empty_graph
 from mo.middle.passes.eliminate import graph_clean_up
 from mo.middle.passes.infer import override_placeholder_shapes, partial_infer, mark_outputs, override_batch
@@ -104,34 +103,27 @@ def driver(argv, input_model, output_model_name, outputs, output_dir, scale, pla
     meta_info = get_meta_info(argv)
 
     EltwiseChecker.enabled = False
-    
+
     try:
-        graph, input_shapes = loader.load_kaldi_nnet_model(input_model)
+        graph, input_shapes = load_kaldi_model(input_model)
     except Exception as e:
         raise Error('Model Optimizer is not able to read Kaldi model {}. '.format(input_model) +
                     refer_to_faq_msg(91)) from e
     check_empty_graph(graph, 'load_kaldi_nnet_model')
-
     graph.graph['cmd_params'] = argv
     graph.graph['fw'] = 'kaldi'
-    graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 3
-
-    if argv.counts:
-        try:
-            counts = loader.read_counts_file(argv.counts)
-        except Exception as e:
-            raise Error('Model Optimizer is not able to read counts file {}'.format(argv.counts) +
-                        refer_to_faq_msg(92)) from e
+    graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 4
 
     update_extractors_with_extensions(kaldi_type_extractors)
-    # Intentionally before extracting attributes
 
-    class_registration.apply_replacements(graph, class_registration.ClassType.FRONT_REPLACER)
     extract_node_attrs(graph, lambda node: kaldi_extractor(node))
 
+    class_registration.apply_replacements(graph, class_registration.ClassType.FRONT_REPLACER)
+
     output_op_nodes = add_output_ops(graph, outputs)  # TODO pass real outputs instead of None
     log.debug("After adding specific nodes for outputs")
     print_graph_stat(graph)
+
     check_empty_graph(graph, 'add_output_ops')
     create_tensor_nodes(graph)
 
@@ -153,16 +145,22 @@ def driver(argv, input_model, output_model_name, outputs, output_dir, scale, pla
     # You need to pass required network outputs here
     # but we don't have a way yet, so just passing all discovered sinks
     mark_outputs(graph)
-
     graph_clean_up(graph)
     log.debug("After graph_cleanup")
     print_graph_stat(graph)
     graph = partial_infer(graph)
+
     # The order is intentional: first eliminate repeated Reshapes, then remove redundant ones
     FuseRepeatedReshapes().find_and_replace_pattern(graph)
     EliminateRedundantReshape().find_and_replace_pattern(graph)
     check_empty_graph(graph, 'partial_infer')
     if argv.counts:
+        try:
+            counts = read_counts_file(argv.counts)
+        except Exception as e:
+            raise Error('Model Optimizer is not able to read counts file {}'.format(argv.counts) +
+                        refer_to_faq_msg(92)) from e
+
         apply_biases_to_last_layer(graph, counts)
 
     if argv.remove_output_softmax:
index d4d7ae1..03ac18f 100644 (file)
@@ -52,7 +52,6 @@ from mo.front.common.register_custom_ops import update_extractors_with_extension
 from mo.front.mxnet.extractor import mxnet_op_extractors
 from mo.utils import class_registration
 from mo.utils.cli_parser import get_meta_info
-from extensions.middle.PadToPoolingMiddleReplacer import PadToPoolingMiddleReplacer
 from extensions.middle.EltwiseInputNormalization import EltwiseInputNormalize
 
 
@@ -88,7 +87,7 @@ def add_input_data_to_prior_boxes(graph: nx.MultiDiGraph, input_names: str = '')
 # TODO Remove this function after 'add_output_ops' is moved to a front replacer.
 def check_softmax_node_inputs(graph: nx.MultiDiGraph):
     for i, attrs in list(graph.nodes(data=True)):
-        if 'op' in attrs and attrs['op'] == 'Softmax':
+        if 'op' in attrs and attrs['op'] == 'SoftMax':
             node = Node(graph, i)
             if len(node.in_nodes()) > 1:
                 graph.remove_node(node.in_node(1).id)
@@ -125,7 +124,8 @@ def driver(argv: argparse.Namespace, input_model: str, output_model_name: str, o
     graph.graph['layout'] = 'NCHW'
     graph.graph['cmd_params'] = argv
     graph.graph['fw'] = 'mxnet'
-    graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 3
+    graph.graph['feature_dim'] = 1 if graph.graph['layout'] == 'NCHW' else 3
+    graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 4
     graph = extract_node_attrs(graph, mxnet_op_extractor)
     check_softmax_node_inputs(graph)
 
@@ -168,8 +168,7 @@ def driver(argv: argparse.Namespace, input_model: str, output_model_name: str, o
     scale_input(graph, scale)
     add_mean_scale_values(graph, mean_scale_values)
 
-    remove_op_nodes(graph, {'op': 'Dropout'})
-    remove_op_nodes(graph, {'op': '_copy'})
+    remove_op_nodes(graph, {'identity': True})
 
     graph_clean_up(graph)
 
@@ -200,7 +199,6 @@ def driver(argv: argparse.Namespace, input_model: str, output_model_name: str, o
         stride_optimization(graph)
 
     fuse_pad(graph)
-    PadToPoolingMiddleReplacer().find_and_replace_pattern(graph)
 
     # Converting Mul->Add to ScaleShift node
     convert_muladd_to_scaleshift_or_power(graph)
index fcb6dc2..88fd356 100644 (file)
@@ -78,7 +78,7 @@ def driver(argv: argparse.Namespace, model_file_name: str, output_model_name: st
         graph.graph['cmd_params'] = argv
         graph.graph['fw'] = 'onnx'
         graph.graph['feature_dim'] = 1 if graph.graph['layout'] == 'NCHW' else 3
-        graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 3
+        graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 4
         # extract basic attributes earlier to enable some passes that relies on them before full attribute
         # extractor is called
         extract_node_attrs(graph, lambda node: (True, common_onnx_fields(node)))
index af31736..f6e1503 100644 (file)
@@ -22,14 +22,22 @@ import networkx as nx
 import numpy as np
 import tensorflow as tf
 
-from extensions.middle.EltwiseInputNormalization import EltwiseInputNormalize
-from extensions.middle.TensorIteratorConditionChecker import ConditionChecks
-from mo.middle.pattern_match import for_each_sub_graph, for_graph_and_each_sub_graph_recursively
+try:
+    import tensorflow.contrib
+except Exception:
+    pass  # we try to import contrib for loading models that use contrib operations
+
 import mo.front.tf.custom_subgraph_call as csc
 from extensions.front.freeze_placeholder_value import FreezePlaceholderValue
+from extensions.front.tf.basic_lstm_cell import BasicLSTMCell
+from extensions.middle.AddIsCyclicAttribute import AddIsCyclicAttribute
+from extensions.middle.EltwiseInputNormalization import EltwiseInputNormalize
+from extensions.middle.GemmResolver import GemmResolver
 from extensions.middle.TensorIteratorBackEdge import BackEdgesMatching
-from extensions.middle.TensorIteratorCondition import LoopConditionMatcher
-from extensions.middle.TensorIteratorInput import SmartInputMatcher, SimpleInputMatcher
+from extensions.middle.TensorIteratorCondition import LoopConditionMatcher, \
+    SimpleConditionMather
+from extensions.middle.TensorIteratorConditionChecker import ConditionChecks
+from extensions.middle.TensorIteratorInput import SmartInputMatcher, SimpleInputMatcher, BackEdgeSimpleInputMatcher
 from extensions.middle.TensorIteratorMerge import TensorIteratorMerge
 from extensions.middle.TensorIteratorOutput import SmartOutputMatcher
 from extensions.middle.TensorIterator_utils import DeleteSelect
@@ -41,7 +49,7 @@ from mo.front.extractor import restore_edges, add_output_ops, add_input_ops, \
     extract_node_attrs, create_tensor_nodes, remove_output_ops, user_data_repack, remove_control_dependency_inputs
 from mo.front.tf.change_placeholder_type import change_placeholders_types_to_FP32
 from mo.front.tf.extractor import get_tf_edges, common_tf_fields, tf_op_extractor, tf_op_extractors
-from mo.front.tf.loader import load_tf_graph_def, protobuf2nx
+from mo.front.tf.loader import load_tf_graph_def, protobuf2nx, variables_to_constants
 from mo.front.tf.register_custom_ops import update_registration
 from mo.front.tf.replacement import FrontReplacementFromConfigFileOp
 from mo.graph.graph import check_empty_graph
@@ -55,7 +63,8 @@ from mo.middle.passes.fusing.fuse_linear_ops import fuse_linear_ops
 from mo.middle.passes.fusing.fuse_linear_seq import fuse_mul_add_sequence
 from mo.middle.passes.fusing.mark_unfused_nodes import mark_unfused_nodes
 from mo.middle.passes.infer import scale_input, override_placeholder_shapes, partial_infer, convert_mul_add_to_power, \
-    update_fully_connected_shapes, add_mean_scale_values, override_batch, check_for_cycle, delete_not_executable, delete_control_flow_edges
+    update_fully_connected_shapes, add_mean_scale_values, override_batch, check_for_cycle, delete_not_executable, \
+    delete_control_flow_edges
 from mo.middle.passes.l2normalization import l2_norm_to_norm
 from mo.middle.passes.leaky_relu import convert_mul_eltwise_to_leaky_relu
 from mo.middle.passes.mean_scale_values import move_scaleshift_to_preprocess
@@ -64,6 +73,7 @@ from mo.middle.passes.shape import convert_squeeze, convert_reshape, reverse_inp
     conv_flatten_concat, fuse_sequence_of_reshapes, repack_fully_connected_weights_nhwc_to_nchw, \
     apply_nhwc_to_nchw_permutation, permute_data_nodes_attrs, permute_op_nodes_attrs, merge_nodes_permutations
 from mo.middle.passes.shared_weights_duplication import duplicate_shared_weights
+from mo.middle.pattern_match import for_each_sub_graph, for_graph_and_each_sub_graph_recursively
 from mo.pipeline.common import prepare_emit_ir
 from mo.utils import class_registration, tensorboard
 from mo.utils.cli_parser import get_meta_info
@@ -109,12 +119,12 @@ def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str
             log.info('Loading library "{}" with custom operations'.format(library))
             tf.load_op_library(library)
 
-    graph_def = load_tf_graph_def(graph_file_name=model_file_name, is_binary=is_binary,
-                                  checkpoint=argv.input_checkpoint,
-                                  user_output_node_names_list=outputs,
-                                  model_dir=argv.saved_model_dir,
-                                  meta_graph_file=argv.input_meta_graph,
-                                  saved_model_tags=argv.saved_model_tags)
+    graph_def, variables_values = load_tf_graph_def(graph_file_name=model_file_name, is_binary=is_binary,
+                                                    checkpoint=argv.input_checkpoint,
+                                                    user_output_node_names_list=outputs,
+                                                    model_dir=argv.saved_model_dir,
+                                                    meta_graph_file=argv.input_meta_graph,
+                                                    saved_model_tags=argv.saved_model_tags)
 
     try:
         tf.import_graph_def(graph_def, name='')
@@ -140,7 +150,16 @@ def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str
         graph.graph['layout'] = 'NCHW' if argv.disable_nhwc_to_nchw else 'NHWC'
         graph.graph['cmd_params'] = argv
         graph.graph['fw'] = 'tf'
-        graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 3
+        graph.graph['ir_version'] = 2 if argv.generate_deprecated_IR_V2 else 4
+
+        if graph.graph['ir_version'] == 2:
+            # When the deprecated IR version was requested,
+            # we configure only those phases that can lead to
+            # functional regressions in the version 2.
+            # BasicLSTMCell is one such transformation; when it is turned off,
+            # the body of TF basic_lstm_cell is converted as-is in a decomposed form,
+            # and should work in version 2.
+            BasicLSTMCell.enabled = False
 
         # placeholder for request from a transformation pass to repeat the entire conversion
         graph.graph['repeat_conversion'] = False
@@ -168,6 +187,8 @@ def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str
         FreezePlaceholderValue.replacement_dict = freeze_placeholder
         update_registration()
 
+    GemmResolver.enabled = False
+
     inputs = list(packed_user_shapes.keys()) if packed_user_shapes is not None and isinstance(packed_user_shapes,
                                                                                               dict) else None
     graph.graph['inputs'] = inputs  # save user defined inputs for other extensions
@@ -177,9 +198,14 @@ def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str
 
     # this call of 'graph_clean_up' removes child nodes of outputs which is useful when custom output is specified
     graph_clean_up_tf(graph)
+
     check_empty_graph(graph, 'add_output_ops and add_input_ops. It may happen due to absence of \'Placeholder\' layer '
                              'in the model')
 
+    variables_to_constants(graph, variables_values)
+    del variables_values
+    graph_clean_up_tf(graph)
+
     if argv.tensorflow_custom_operations_config_update:
         if update_custom_replacement_config_file(graph, argv.tensorflow_custom_operations_config_update):
             return 0
@@ -230,42 +256,50 @@ def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str
         partial_infer(graph)
         delete_control_flow_edges(graph)
 
-        # TENSOR ITERATOR CREATING BEGINS
-        replacer = DeleteSelect()
+        replacer = AddIsCyclicAttribute()
         replacer.find_and_replace_pattern(graph)
 
-        replacer = SmartInputMatcher()
-        replacer.find_and_replace_pattern(graph)
+        # TENSOR ITERATOR CREATING BEGINS
+        if graph.graph['is_cyclic']:
+            replacer = DeleteSelect()
+            replacer.find_and_replace_pattern(graph)
 
-        replacer = SmartOutputMatcher()
-        replacer.find_and_replace_pattern(graph)
+            replacer = SmartInputMatcher()
+            replacer.find_and_replace_pattern(graph)
 
-        replacer = LoopConditionMatcher()
-        replacer.find_and_replace_pattern(graph)
+            replacer = SmartOutputMatcher()
+            replacer.find_and_replace_pattern(graph)
 
-        replacer = BackEdgesMatching()
-        replacer.find_and_replace_pattern(graph)
+            replacer = LoopConditionMatcher()
+            replacer.find_and_replace_pattern(graph)
 
-        replacer = ConditionChecks()
-        replacer.find_and_replace_pattern(graph)
+            replacer = SimpleConditionMather()
+            replacer.find_and_replace_pattern(graph)
+
+            replacer = BackEdgesMatching()
+            replacer.find_and_replace_pattern(graph)
+
+            replacer = ConditionChecks()
+            replacer.find_and_replace_pattern(graph)
 
         delete_not_executable(graph)
         graph_clean_up_tf(graph)
+        if graph.graph['is_cyclic']:
+            replacer = SimpleInputMatcher()
+            replacer.find_and_replace_pattern(graph)
 
-        replacer = SimpleInputMatcher()
-        replacer.find_and_replace_pattern(graph)
+            replacer = BackEdgeSimpleInputMatcher()
+            replacer.find_and_replace_pattern(graph)
 
-        # Here will be optimizing path (ops afer Enter and before body take out of body)
-
-        replacer = TensorIteratorMerge()
-        replacer.find_and_replace_pattern(graph)
+            # An optimization pass will go here (moving ops after Enter and before the body out of the body)
 
+            replacer = TensorIteratorMerge()
+            replacer.find_and_replace_pattern(graph)
         # TENSOR ITERATOR CREATING ENDS
 
         check_for_cycle(graph)
 
-        graph_clean_up_tf(graph)
-        for_each_sub_graph(graph, graph_clean_up_tf)
+        for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf)
         check_empty_graph(graph, 'partial_infer')
 
         csc.prepare_tf_call_nodes(graph)
@@ -283,14 +317,12 @@ def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str
         add_mean_scale_values(graph, mean_scale_values)
 
         convert_dilated_convolution(graph)
-        graph_clean_up_tf(graph)
-        for_each_sub_graph(graph, graph_clean_up_tf)
+        for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf)
 
         l2_norm_to_norm(graph)
         graph_clean_up_tf(graph)
 
-        remove_op_nodes(graph, {'op': 'Identity'})
-        remove_op_nodes(graph, {'op': 'StopGradient'})
+        remove_op_nodes(graph, {'identity': True})
         remove_useless_split(graph)
 
         class_registration.apply_replacements(graph, class_registration.ClassType.MIDDLE_REPLACER)
@@ -304,8 +336,7 @@ def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str
         convert_matmul_to_fully_connected(graph)
 
         # Mark nodes with attr 'can_be_fused': False to disable fusing for specified nodes
-        mark_unfused_nodes(graph, argv.finegrain_fusing)
-        for_each_sub_graph(graph, lambda graph: mark_unfused_nodes(graph, argv.finegrain_fusing))
+        for_graph_and_each_sub_graph_recursively(graph, lambda graph: mark_unfused_nodes(graph, argv.finegrain_fusing))
 
         # Converting FusedBatchNorm layer to Mul->Add->Mul->Add sequence
         # IE doesn't support BN with 4 inputs, so we have to split it to two ScaleShift
@@ -314,20 +345,16 @@ def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str
 
         if not argv.disable_fusing:
             # Converting ScaleShift layer to Mul->Add
-            convert_scale_shift_to_mul_add(graph)
-            graph_clean_up_tf(graph)
+            for_graph_and_each_sub_graph_recursively(graph, convert_scale_shift_to_mul_add)
+            for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf)
 
             # Fusing the sequences of Mul/Add operations
-            fuse_mul_add_sequence(graph)
-            for_each_sub_graph(graph, fuse_mul_add_sequence)
-            graph_clean_up_tf(graph)
-            for_each_sub_graph(graph, graph_clean_up_tf)
+            for_graph_and_each_sub_graph_recursively(graph, fuse_mul_add_sequence)
+            for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf)
 
             # Fusing linear operation to Convolution
-            fuse_linear_ops(graph)
-            for_each_sub_graph(graph, fuse_linear_ops)
-            graph_clean_up_tf(graph)
-            for_each_sub_graph(graph, graph_clean_up_tf)
+            for_graph_and_each_sub_graph_recursively(graph, fuse_linear_ops)
+            for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf)
 
         if not argv.disable_gfusing:
             grouped_convolutions_fusing(graph)
@@ -337,27 +364,29 @@ def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str
                 graph_clean_up_tf(graph)
 
         # Converting Mul->Add to ScaleShift node
-        convert_muladd_to_scaleshift_or_power(graph)
-        graph_clean_up_tf(graph)
+        for_graph_and_each_sub_graph_recursively(graph, convert_muladd_to_scaleshift_or_power)
+        for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf)
 
-        convert_mul_add_to_power(graph)
+        for_graph_and_each_sub_graph_recursively(graph, convert_mul_add_to_power)
 
         # Need to eliminate dead nodes before doing update_fully_connected_shapes
         # because update_fully_connected_shapes does partial inference and dead
         # nodes will lead to sporadic failures.
-        graph_clean_up_tf(graph)
-        update_fully_connected_shapes(graph)
+        for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf)
+        for_graph_and_each_sub_graph_recursively(graph, update_fully_connected_shapes)
 
-        convert_mul_eltwise_to_leaky_relu(graph)
+        for_graph_and_each_sub_graph_recursively(graph, convert_mul_eltwise_to_leaky_relu)
         graph_clean_up_tf(graph)
+        for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf)
 
-        fuse_pad(graph)
-        graph_clean_up_tf(graph)
+        for_graph_and_each_sub_graph_recursively(graph, fuse_pad)
+        for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf)
 
-        convert_reshape(graph)
-        convert_squeeze(graph)
-        convert_add_to_scaleshift(graph)  # scale = 1
-        convert_mul_to_scaleshift(graph)  # biases = 0
+        for_graph_and_each_sub_graph_recursively(graph, convert_reshape)
+        for_graph_and_each_sub_graph_recursively(graph, convert_squeeze)
+
+        for_graph_and_each_sub_graph_recursively(graph, convert_add_to_scaleshift)  # scale = 1
+        for_graph_and_each_sub_graph_recursively(graph, convert_mul_to_scaleshift)  # biases = 0
 
         if argv.reverse_input_channels:
             reverse_input_channels(graph)
@@ -366,28 +395,22 @@ def tf2nx(argv: argparse.Namespace, model_file_name: str, output_model_name: str
             move_scaleshift_to_preprocess(graph)
             graph_clean_up_tf(graph)
 
-        fuse_sequence_of_reshapes(graph)
+        for_graph_and_each_sub_graph_recursively(graph, fuse_sequence_of_reshapes)
 
         pattern = EltwiseInputNormalize()
         pattern.find_and_replace_pattern(graph)
 
         conv_flatten_concat(graph)
 
-        apply_nhwc_to_nchw_permutation(graph)
-        for_each_sub_graph(graph, apply_nhwc_to_nchw_permutation)
-        merge_nodes_permutations(graph)
-        for_each_sub_graph(graph, merge_nodes_permutations)
-        permute_data_nodes_attrs(graph)
-        for_each_sub_graph(graph, permute_data_nodes_attrs)
-        permute_op_nodes_attrs(graph)
-        for_each_sub_graph(graph, permute_op_nodes_attrs)
+        for_graph_and_each_sub_graph_recursively(graph, apply_nhwc_to_nchw_permutation)
+        for_graph_and_each_sub_graph_recursively(graph, merge_nodes_permutations)
+        for_graph_and_each_sub_graph_recursively(graph, permute_data_nodes_attrs)
+        for_graph_and_each_sub_graph_recursively(graph, permute_op_nodes_attrs)
 
-        repack_fully_connected_weights_nhwc_to_nchw(graph)
-        for_each_sub_graph(graph, repack_fully_connected_weights_nhwc_to_nchw)
-        transpose_fully_connected_weights(graph)
-        for_each_sub_graph(graph, transpose_fully_connected_weights)
+        for_graph_and_each_sub_graph_recursively(graph, repack_fully_connected_weights_nhwc_to_nchw)
+        for_graph_and_each_sub_graph_recursively(graph, transpose_fully_connected_weights)
 
-        graph_clean_up_tf(graph)
+        for_graph_and_each_sub_graph_recursively(graph, graph_clean_up_tf)
 
         if argv.offload_unsupported_operations_to_tf:
             unsupported_ops_to_offload_to_tf = find_unsupported_ops(graph)
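
Much of the hunk above replaces paired `pass(graph)` plus `for_each_sub_graph(graph, pass)` calls with the single recursive helper, so that nested TensorIterator bodies are processed too. A toy sketch of that pattern (the real helper lives in mo.middle.pattern_match; the `G` class here is illustrative only):

    class G:
        def __init__(self, name, sub_graphs=()):
            self.name = name
            self.sub_graphs = list(sub_graphs)

    def for_graph_and_each_sub_graph_recursively(graph, func):
        func(graph)  # apply the pass to the graph itself ...
        for sub_graph in graph.sub_graphs:  # ... then recurse into every sub-graph
            for_graph_and_each_sub_graph_recursively(sub_graph, func)

    root = G('main', [G('ti_body', [G('nested')])])
    for_graph_and_each_sub_graph_recursively(root, lambda g: print(g.name))  # main, ti_body, nested
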
index f69978b..8d4c834 100644 (file)
@@ -114,8 +114,14 @@ def apply_replacements(graph: nx.MultiDiGraph, replacements_type):
             replacer.find_and_replace_pattern(graph)
             check_empty_graph(graph, replacer_cls)
         except Error as err:
-            raise Error('Exception occurred during running replacer "{}": {}'.format(replacement_id, str(err).replace(
-                '[REPLACEMENT_ID]', replacement_id))) from err
+            raise Error('Exception occurred during running replacer "{}" ({}): {}'.format(
+                replacement_id,
+                replacer_cls,
+                str(err).replace('[REPLACEMENT_ID]', replacement_id),
+            )) from err
         except Exception as err:
-            raise Exception('Exception occurred during running replacer "{}": {}'.format(
-                replacement_id, str(err).replace('[REPLACEMENT_ID]', replacement_id))) from err
+            raise Exception('Exception occurred during running replacer "{} ({})": {}'.format(
+                replacement_id,
+                replacer_cls,
+                str(err).replace('[REPLACEMENT_ID]', replacement_id),
+            )) from err
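
The change above adds the replacer class to the wrapped message, and `raise ... from err` keeps the original traceback chained. A stripped-down sketch of the pattern, assuming a hypothetical replacer class with a one-argument `find_and_replace_pattern`:

    def apply_one_replacer(replacer_cls, replacement_id, graph):
        try:
            replacer_cls().find_and_replace_pattern(graph)
        except Exception as err:
            # The original exception stays reachable via __cause__.
            raise Exception('Exception occurred during running replacer "{}" ({}): {}'.format(
                replacement_id, replacer_cls, err)) from err
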
index a3a6460..48558b2 100644 (file)
@@ -141,138 +141,6 @@ def writable_dir(path: str):
         else:
             raise Error('The directory "{}" is not writable'.format(cur_path))
 
-
-def get_caffe_legacy_cli_parser(parser: argparse.ArgumentParser = None):
-    if not parser:
-        parser = argparse.ArgumentParser()
-    forever_legacy_group = parser.add_argument_group('Caffe* legacy parameters from Beta2 release ' +
-                                                     'that are no longer supported')
-    forever_legacy_group.add_argument('-ListA',
-                                      action='store_true',
-                                      help='List supported precisions')
-    forever_legacy_group.add_argument('-ListF',
-                                      action='store_true',
-                                      help='List supported frameworks')
-    forever_legacy_group.add_argument('-ListN',
-                                      action='store_true',
-                                      help='List supported classes of topologies')
-    forever_legacy_group.add_argument('-l',
-                                      action='store_true',
-                                      help='Learn network statistics and find the best normalization factor')
-    forever_legacy_group.add_argument('-t',
-                                      help='File name of a training/validation network topology')
-    forever_legacy_group.add_argument('-nl',
-                                      help='Number of learning iterations')
-    forever_legacy_group.add_argument('-v',
-                                      action='store_true',
-                                      help='Validate a normalized network')
-    forever_legacy_group.add_argument('-nv',
-                                      help='Number of validation iterations')
-    forever_legacy_group.add_argument('-dm',
-                                      help='Dump a normalized and converted model to a binary file')
-    forever_legacy_group.add_argument('-dr',
-                                      help='Dump report of Model Optimizer run')
-    forever_legacy_group.add_argument('-q',
-                                      help='Quantization file (used for low precision)')
-    forever_legacy_group.add_argument('-c',
-                                      action='store_true',
-                                      help='Generate OpenVX* code')
-    forever_legacy_group.add_argument('-IRCode',
-                                      action='store_true',
-                                      help='Generate OpenVX* code from IR model')
-    forever_legacy_group.add_argument('-ds',
-                                      help='Dump net statistics into .csv files')
-    forever_legacy_group.add_argument('--hfuse',
-                                      choices=['NONE', 'PARTIAL', 'FULL'],
-                                      default='NONE',
-                                      help='<NONE|PARTIAL|FULL>: enable/disable optimization of the layers' +
-                                           ' horizontal fusion when applicibale ("NONE" to disable. ' +
-                                           '"PARTIAL" to enable only native branches and ' +
-                                           '"FULL" for copy the layer')
-    forever_legacy_group.add_argument('--target',
-                                      choices=['XEON'],
-                                      default='XEON',
-                                      help='Target configuration')
-    forever_legacy_group.add_argument('--network',
-                                      help='Generated network type supported networks: ' +
-                                           '<CLASSIFICATION,LOCALIZATION,SEGMENTATION>')
-    forever_legacy_group.add_argument('--code-cfg',
-                                      help='OpenVX* code generation configuration (DEBUG or RELEASE)')
-    forever_legacy_group.add_argument('-mx',
-                                      help='Enable/disable mixed precision handler in a generated OpenVX* code ' +
-                                           '("0"|"false" to disable. "1"|"true" to enable)')
-
-    common_group = parser.add_argument_group('Caffe*-specific parameters in a format of Beta2 release')
-    # Common parameters
-    common_group.add_argument('--version',
-                              action='store_true',
-                              help='List versions of Model Optimizer for TensorFlow*')
-    common_group.add_argument('-w',
-                              type=readable_file,
-                              help='Path to a binary weights file (.caffemodel) including the file name',
-                              action=CanonicalizePathCheckExistenceAction)
-    common_group.add_argument('-d',
-                              type=readable_file,
-                              help='Path to a model proto file (.prototxt) including the file name',
-                              action=CanonicalizePathCheckExistenceAction)
-    common_group.add_argument('-b',
-                              type=int,
-                              default=1,
-                              help='Batch size. Default value is 1')
-    common_group.add_argument('-f',
-                              type=int,
-                              default=1,
-                              help='Network normalization factor')
-    common_group.add_argument('-p',
-                              choices=['FP32', 'FP16'],
-                              default='FP32',
-                              help='Precision of weights of the generated IR')
-    common_group.add_argument('-i',
-                              action='store_true',
-                              help='Generates IR. Currently does nothing, used for compatibility with the previous version of the Model Optimizer.')
-    common_group.add_argument('-ms',
-                              help='Mean image values in the following format: "-ms x,y,z"')
-    common_group.add_argument('-mf',
-                              type=readable_file,
-                              help='Path to a mean image file (.binaryproto)',
-                              action=CanonicalizePathCheckExistenceAction)
-    common_group.add_argument('-mo',
-                              help='Offsets for the mean image file in the following format: "-mo x,y"')
-    common_group.add_argument('--scale',
-                              type=tuple,
-                              default=(),
-                              help='Scale values per channel in the following format: "--scale x,y,z". Floating point ' +
-                                   'values are also accepted.')
-    # TODO: was it true or false by default?
-    common_group.add_argument('--fuse',
-                              choices=['false', '0', 'true', '1'],
-                              default='false',
-                              help='Enable/disable the layers fusion optimization ("0"|"false" to disable. ' +
-                                   '"1"|"true" to enable)')
-    common_group.add_argument('--framework',
-                              type=str,
-                              choices=['CAFFE'],
-                              default='CAFFE',
-                              help='Name of the framework used to train the input model: <CAFFE>')
-    # TODO: do we want to keep compatibility here and put IR in 'Artifacts/NAME_OF_MODEL/' directory
-    common_group.add_argument('-o',
-                              default=get_absolute_path('.'),
-                              help='Output the directory path. By default, the output directory ' +
-                                   'is set to "Artifacts"',
-                              action=CanonicalizePathAction,
-                              type=writable_dir)
-    common_group.add_argument('-k',
-                              help='Path to the mapping file ("CustomLayersMapping.xml") used for ' +
-                                   'registering custom layers',
-                              action=CanonicalizePathCheckExistenceAction,
-                              type=readable_dir)
-    common_group.add_argument('-*',
-                              action='store_true',
-                              help='Do all of the above. Currently does nothing, used for compatibility with the previous version of the Model Optimizer.')
-
-    return parser
-
-
 def get_common_cli_parser(parser: argparse.ArgumentParser = None):
     if not parser:
         parser = argparse.ArgumentParser()
@@ -459,11 +327,11 @@ def get_tf_cli_options():
 
 def get_mxnet_cli_options():
     d = {
-        'input_symbol': ' Deploy-ready symbol file',
+        'input_symbol': '- Deploy-ready symbol file',
         'nd_prefix_name': '- Prefix name for args.nd and argx.nd files',
-        'pretrained_model_name': '- Pretrained model which will be merged with .nd files',
-        'save_params_from_nd': '- Enable save built params file from nd files',
-        'legacy_mxnet_model': '- Load the model trained with MXNet with version lower than 1.0.0'
+        'pretrained_model_name': '- Pretrained model to be merged with the .nd files',
+        'save_params_from_nd': '- Enable saving built parameters file from .nd files',
+        'legacy_mxnet_model': '- Enable MXNet loader for models trained with MXNet version lower than 1.0.0'
     }
 
     return OrderedDict(sorted(d.items(), key=lambda t: t[0]))
@@ -471,8 +339,8 @@ def get_mxnet_cli_options():
 
 def get_kaldi_cli_options():
     d = {
-        'counts': '- Path to the counts file',
-        'remove_output_softmax': '- Removes the Softmax layer that is the output layer'
+        'counts': '- A file name with full path to the counts file',
+        'remove_output_softmax': '- Removes the SoftMax layer that is the output layer'
     }
 
     return OrderedDict(sorted(d.items(), key=lambda t: t[0]))
@@ -627,14 +495,14 @@ def get_mxnet_cli_parser(parser: argparse.ArgumentParser = None):
                           help="Prefix name for args.nd and argx.nd files.",
                           default=None)
     mx_group.add_argument("--pretrained_model_name",
-                          help="Pretrained model without extension and epoch number which will be merged with args.nd and argx.nd files.",
+                          help="Name of a pretrained MXNet model without extension and epoch number. This model will be merged with args.nd and argx.nd files",
                           default=None)
     mx_group.add_argument("--save_params_from_nd",
                           action='store_true',
-                          help="Enable save built params file from nd files.")
+                          help="Enable saving built parameters file from .nd files")
     mx_group.add_argument("--legacy_mxnet_model",
                           action='store_true',
-                          help="Load the model trained with less version of MXNet than 1.0.0")
+                          help="Enable MXNet loader to make a model compatible with the latest MXNet version. Use only if your model was trained with MXNet version lower than 1.0.0")
     return parser
 
 
@@ -658,7 +526,7 @@ def get_kaldi_cli_parser(parser: argparse.ArgumentParser = None):
                              action=CanonicalizePathCheckExistenceAction)
 
     kaldi_group.add_argument("--remove_output_softmax",
-                             help="Removes the Softmax layer that is the output layer",
+                             help="Removes the SoftMax layer that is the output layer",
                              action='store_true')
     return parser
 
index 43417f2..b651228 100644 (file)
@@ -181,7 +181,7 @@ def sub_graph_between_nodes(graph: nx.MultiDiGraph, start_nodes: list, end_nodes
     d = deque(start_nodes)
     extra_start_nodes = []
 
-    nx.set_node_attributes(graph, name='prev', values=None)
+    nx.set_node_attributes(G=graph, name='prev', values=None)
     while len(d) != 0:
         cur_node_name = d.popleft()
         sub_graph_nodes.append(cur_node_name)
index 620ebc3..1149c71 100644 (file)
@@ -26,5 +26,7 @@ def guess_framework_by_ext(input_model_path: str) -> int:
         return 'mxnet'
     elif re.match('^.*\.nnet$', input_model_path):
         return 'kaldi'
+    elif re.match('^.*\.mdl$', input_model_path):
+        return 'kaldi'
     elif re.match('^.*\.onnx$', input_model_path):
         return 'onnx'
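
A condensed sketch of the extension dispatch above, including the new '.mdl' to 'kaldi' rule (only the Kaldi and ONNX branches are shown here, not the full function):

    import re

    def guess(input_model_path):
        if re.match(r'^.*\.nnet$', input_model_path) or re.match(r'^.*\.mdl$', input_model_path):
            return 'kaldi'
        if re.match(r'^.*\.onnx$', input_model_path):
            return 'onnx'

    print(guess('final.mdl'))  # kaldi
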
index 9009f6b..cfdbf28 100644 (file)
@@ -187,7 +187,7 @@ class SimpleProtoParser(object):
         """
         self._split_to_tokens(file_content)
         if not self._convert_tokens_to_dict():
-            log.error('Failed to generate dictionary representation of file with content: {}'.format(file_content))
+            log.error('Failed to generate dictionary representation of file.')
             return None
         return self._result
 
index bfaea4d..8d69718 100644 (file)
@@ -78,9 +78,9 @@ if __name__ == "__main__":  # pragma: no cover
     if argv.input_model and argv.saved_model_dir:
         print("[ ERROR ] Both keys were provided --input_model and --input_dir. Please, provide only one of them")
         sys.exit(1)
-    graph_def = load_tf_graph_def(graph_file_name=argv.input_model, is_binary=not argv.text,
-                                  checkpoint=argv.input_checkpoint,
-                                  model_dir=argv.saved_model_dir, saved_model_tags=argv.saved_model_tags)
+    graph_def, _ = load_tf_graph_def(graph_file_name=argv.input_model, is_binary=not argv.text,
+                                     checkpoint=argv.input_checkpoint,
+                                     model_dir=argv.saved_model_dir, saved_model_tags=argv.saved_model_tags)
     summary = summarize_graph(graph_def)
     print("{} input(s) detected:".format(len(summary['inputs'])))
     for input in summary['inputs']:
index ebbd0a4..9ca78ec 100644 (file)
 """
 
 import tensorflow as tf
-
+try:
+    import tensorflow.contrib
+except Exception:
+    pass  # we try to import contrib for loading models that use contrib operations
 from mo.utils.error import Error
 from mo.utils.utils import refer_to_faq_msg
 
index ad5c3f4..c4f089c 100644 (file)
@@ -31,10 +31,17 @@ class NamedAttrsClass:
 
 
 def match_shapes(pattern: np.array, shape: np.array):
-    '''Check if shape matches shape pattern handling -1 and 0 in the pattern.'''
+    """ Check if shape matches shape pattern handling -1 and 0 in the pattern. """
     # Elements with values -1 and 0 in pattern are just ignored.
     # Other elements should match.
     if pattern.size != shape.size:
         return False
     indices = [i for i, n in enumerate(pattern) if n not in [0, -1]]
     return np.array_equal(pattern[indices], shape[indices])
+
+
+def symm_match_shapes(shape1: np.array, shape2: np.array):
+    """ Check if shape matches shape pattern handling -1 and 0 in the pattern. """
+    # Elements with values -1 and 0 in both shapes are just ignored.
+    # Other elements should match. Undefined elements can be one side only.
+    return match_shapes(shape1, shape2) or match_shapes(shape2, shape1)
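
A usage sketch for the new helper, assuming `match_shapes` and `symm_match_shapes` from the module above are in scope; -1 and 0 act as wildcards on the pattern side, and the symmetric variant tries both directions:

    import numpy as np

    a = np.array([1, -1, 224, 224])
    b = np.array([1, 3, 224, 224])
    print(match_shapes(a, b))       # True: -1 in the pattern is ignored
    print(match_shapes(b, a))       # False: 3 != -1 when b is the pattern
    print(symm_match_shapes(a, b))  # True: matches in at least one direction
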
index 328ff44..09f3105 100644 (file)
@@ -141,6 +141,13 @@ def check_requirements(framework = None):
             not_satisfied_versions.append((name, 'not installed', 'required: {}'.format(required_version)))
             exit_code = 1
             continue
+        except Exception as e:
+            log.error('Error occurred while importing the {} module. This may be caused by unsatisfied '
+                      'requirements of that module. Please run the requirements installation script once more.\n'
+                      'Details of the module import failure: {}'.format(name, e))
+            not_satisfied_versions.append((name, 'package error', 'required: {}'.format(required_version)))
+            exit_code = 1
+            continue
 
     if len(not_satisfied_versions) != 0:
         extension = 'bat' if os.name == 'nt' else 'sh'
index 1d1b83b..7583c33 100644 (file)
@@ -1,6 +1,6 @@
-tensorflow==1.9.0
-mxnet==1.0.0
+tensorflow>=1.2.0
+mxnet>=1.0.0,<=1.3.1
 networkx>=1.11
 numpy>=1.12.0
-protobuf==3.5.1
+protobuf==3.6.1
 onnx>=1.1.2
index 350f2f0..2acb120 100644 (file)
@@ -1,3 +1,3 @@
 networkx>=1.11
 numpy>=1.12.0
-protobuf==3.5.1
+protobuf==3.6.1
index 7924075..ae4ec3c 100644 (file)
@@ -1,3 +1,3 @@
-mxnet==1.0.0
+mxnet>=1.0.0,<=1.3.1
 networkx>=1.11
 numpy>=1.12.0
index 3f43a23..c700465 100644 (file)
@@ -1,3 +1,3 @@
-05:16AM November 07, 2018
-1.4.292.6ef7232d
-6ef7232dff4640b40a63f278e56139613bb2cf96
+06:46PM December 13, 2018
+1.5.12.49d067a0
+49d067a07dedf8e95920e9649e890a76451ca648